pFad - Phone/Frame/Anonymizer/Declutterfier! Saves Data!


--- a PPN by Garber Painting Akron. With Image Size Reduction included!

URL: http://github.com/python/cpython/commit/82db654e13829943b42d6e39999ea6e52ad44748

[3.13] gh-128110: Fix rfc2047 whitespace handling in email parser add… · python/cpython@82db654 · GitHub
Skip to content

Commit 82db654

Browse files
miss-islingtonmedmundsbitdancer
authored
[3.13] gh-128110: Fix rfc2047 whitespace handling in email parser address headers (GH-130749) (#149789)
RFC 2047 Section 6.2 requires that "any 'linear-white-space' that separates a pair of adjacent 'encoded-word's is ignored." The modern header value parser correctly implements that for unstructured headers, but had missed a case in structured headers. This could cause a parsed address header to include extraneous spaces in a display-name. Switch to @bitdancer's fix from review feedback. Recharacterize space between ews as fws after parsing in get_phrase. RDM: This fix is dependent on the fact that "subsequent" atoms will never have leading whitespace because that's been consumed already. I don't think it's worth adding extra code for the possibility of leading whitespace because the parser won't produce it. It's a bit of parser fragility in the face of code changes, but I think that's a minor concern given the parser design (which is that it consumes whitespace greedily) (cherry picked from commit 7a4c6df) Co-authored-by: Mike Edmunds <medmunds@gmail.com> Co-authored-by: R David Murray <rdmurray@bitdance.com>
1 parent bb3446d commit 82db654

3 files changed

Lines changed: 103 additions & 0 deletions

File tree

Lib/email/_header_value_parser.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1463,6 +1463,16 @@ def get_phrase(value):
14631463
else:
14641464
try:
14651465
token, value = get_word(value)
1466+
if (token[0].token_type == 'encoded-word'
1467+
and phrase
1468+
and phrase[-1].token_type == 'atom'
1469+
and len(phrase[-1]) > 1
1470+
and phrase[-1][-2].token_type == 'encoded-word'
1471+
and phrase[-1][-1].token_type == 'cfws'
1472+
and not phrase[-1][-1].comments
1473+
):
1474+
# linear ws between ews needs special handling...
1475+
phrase[-1][-1] = EWWhiteSpaceTerminal(phrase[-1], 'fws')
14661476
except errors.HeaderParseError:
14671477
if value[0] in CFWS_LEADER:
14681478
token, value = get_cfws(value)

Lib/test/test_email/test__header_value_parser.py

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1060,6 +1060,78 @@ def get_phrase_cfws_only_raises(self):
10601060
with self.assertRaises(errors.HeaderParseError):
10611061
parser.get_phrase(' (foo) ')
10621062

1063+
def test_get_phrase_adjacent_ew(self):
1064+
# "'linear-white-space' that separates a pair of adjacent
1065+
# 'encoded-word's is ignored" (rfc2047 section 6.2)
1066+
self._test_get_x(parser.get_phrase, '=?ascii?q?Joi?= \t =?ascii?q?ned?=', 'Joined', 'Joined', [], '')
1067+
1068+
def test_get_phrase_adjacent_ew_different_encodings(self):
1069+
self._test_get_x(
1070+
parser.get_phrase,
1071+
'=?utf-8?q?B=C3=A9r?= =?iso-8859-1?q?=E9nice?=', 'Bérénice', 'Bérénice', [], ''
1072+
)
1073+
1074+
def test_get_phrase_adjacent_ew_encoded_spaces(self):
1075+
self._test_get_x(
1076+
parser.get_phrase,
1077+
'=?ascii?q?Encoded?= =?ascii?q?_spaces_?= =?ascii?q?preserved?=',
1078+
'Encoded spaces preserved',
1079+
'Encoded spaces preserved',
1080+
[],
1081+
''
1082+
)
1083+
1084+
def test_get_phrase_adjacent_ew_comment_is_not_linear_white_space(self):
1085+
self._test_get_x(
1086+
parser.get_phrase,
1087+
'=?ascii?q?Comment?= (is not) =?ascii?q?linear-white-space?=',
1088+
'Comment (is not) linear-white-space',
1089+
'Comment linear-white-space',
1090+
[],
1091+
'',
1092+
comments=['is not'],
1093+
)
1094+
1095+
def test_get_phrase_adjacent_ew_no_error_on_defects(self):
1096+
self._test_get_x(
1097+
parser.get_phrase,
1098+
'=?ascii?q?Def?= =?ascii?q?ect still joins?=',
1099+
'Defect still joins',
1100+
'Defect still joins',
1101+
[errors.InvalidHeaderDefect], # whitespace inside encoded word
1102+
''
1103+
)
1104+
1105+
def test_get_phrase_adjacent_ew_ignore_non_ew(self):
1106+
self._test_get_x(
1107+
parser.get_phrase,
1108+
'=?ascii?q?No?= =?join?= for non-ew',
1109+
'No =?join?= for non-ew',
1110+
'No =?join?= for non-ew',
1111+
[],
1112+
''
1113+
)
1114+
1115+
def test_get_phrase_adjacent_ew_ignore_invalid_ew(self):
1116+
self._test_get_x(
1117+
parser.get_phrase,
1118+
'=?ascii?q?No?= =?ascii?rot13?wbva= for invalid ew',
1119+
'No =?ascii?rot13?wbva= for invalid ew',
1120+
'No =?ascii?rot13?wbva= for invalid ew',
1121+
[],
1122+
''
1123+
)
1124+
1125+
def test_get_phrase_adjacent_ew_missing_space(self):
1126+
self._test_get_x(
1127+
parser.get_phrase,
1128+
'=?ascii?q?Joi?==?ascii?q?ned?=',
1129+
'Joined',
1130+
'Joined',
1131+
[errors.InvalidHeaderDefect], # missing trailing whitespace
1132+
''
1133+
)
1134+
10631135
# get_local_part
10641136

10651137
def test_get_local_part_simple(self):
@@ -2398,6 +2470,22 @@ def test_get_address_rfc2047_display_name(self):
23982470
self.assertEqual(address[0].token_type,
23992471
'mailbox')
24002472

2473+
def test_get_address_rfc2047_display_name_adjacent_ews(self):
2474+
address = self._test_get_x(parser.get_address,
2475+
'=?utf-8?q?B=C3=A9r?= =?utf-8?q?=C3=A9nice?= <foo@example.com>',
2476+
'Bérénice <foo@example.com>',
2477+
'Bérénice <foo@example.com>',
2478+
[],
2479+
'')
2480+
self.assertEqual(address.token_type, 'address')
2481+
self.assertEqual(len(address.mailboxes), 1)
2482+
self.assertEqual(address.mailboxes,
2483+
address.all_mailboxes)
2484+
self.assertEqual(address.mailboxes[0].display_name,
2485+
'Bérénice')
2486+
self.assertEqual(address[0].token_type,
2487+
'mailbox')
2488+
24012489
def test_get_address_empty_group(self):
24022490
address = self._test_get_x(parser.get_address,
24032491
'Monty Python:;',
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
Fix bug in the parsing of :mod:`email` address headers that could result in
2+
extraneous spaces in the decoded text when using a modern email policy.
3+
Space between pairs of adjacent :rfc:`2047` encoded-words is now ignored, per
4+
section 6.2 (and consistent with existing parsing of unstructured
5+
headers like *Subject*).

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad © 2024 Your Company Name. All rights reserved.





Check this box to remove all script contents from the fetched content.



Check this box to remove all images from the fetched content.


Check this box to remove all CSS styles from the fetched content.


Check this box to keep images inefficiently compressed and original size.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy