pFad - Phone/Frame/Anonymizer/Declutterfier! Saves Data!


--- a PPN by Garber Painting Akron. With Image Size Reduction included!

URL: http://github.com/python/cpython/commit/b7038817fee37fea81183d64c9dd957bab571fde

#19480: merge with 3.3. · python/cpython@b703881 · GitHub
Skip to content

Commit b703881

Browse files
committed
#19480: merge with 3.3.
2 parents 589327e + 7165d8b commit b703881

File tree

3 files changed

+28
-13
lines changed

3 files changed

+28
-13
lines changed

Lib/html/parser.py

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -25,16 +25,16 @@
2525
starttagopen = re.compile('<[a-zA-Z]')
2626
piclose = re.compile('>')
2727
commentclose = re.compile(r'--\s*>')
28-
tagfind = re.compile('([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*')
29-
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
30-
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
31-
tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')
3228
# Note:
3329
# 1) the strict attrfind isn't really strict, but we can't make it
3430
# correctly strict without breaking backward compatibility;
35-
# 2) if you change attrfind remember to update locatestarttagend too;
36-
# 3) if you change attrfind and/or locatestarttagend the parser will
31+
# 2) if you change tagfind/attrfind remember to update locatestarttagend too;
32+
# 3) if you change tagfind/attrfind and/or locatestarttagend the parser will
3733
# explode, so don't do it.
34+
tagfind = re.compile('([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*')
35+
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
36+
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
37+
tagfind_tolerant = re.compile('([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
3838
attrfind = re.compile(
3939
r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
4040
r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
@@ -56,7 +56,7 @@
5656
\s* # trailing whitespace
5757
""", re.VERBOSE)
5858
locatestarttagend_tolerant = re.compile(r"""
59-
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
59+
<[a-zA-Z][^\t\n\r\f />\x00]* # tag name
6060
(?:[\s/]* # optional whitespace before attribute name
6161
(?:(?<=['"\s/])[^\s/>][^\s/=>]* # attribute name
6262
(?:\s*=+\s* # value indicator
@@ -336,7 +336,10 @@ def parse_starttag(self, i):
336336

337337
# Now parse the data between i+1 and j into a tag and attrs
338338
attrs = []
339-
match = tagfind.match(rawdata, i+1)
339+
if self.strict:
340+
match = tagfind.match(rawdata, i+1)
341+
else:
342+
match = tagfind_tolerant.match(rawdata, i+1)
340343
assert match, 'unexpected call to parse_starttag()'
341344
k = match.end()
342345
self.lasttag = tag = match.group(1).lower()
@@ -448,7 +451,7 @@ def parse_endtag(self, i):
448451
return i+3
449452
else:
450453
return self.parse_bogus_comment(i)
451-
tagname = namematch.group().lower()
454+
tagname = namematch.group(1).lower()
452455
# consume and ignore other stuff between the name and the >
453456
# Note: this is not 100% correct, since we might have things like
454457
# </tag attr=">">, but looking for > after tha name should cover

Lib/test/test_htmlparser.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,11 @@ def test_starttag_junk_chars(self):
231231
self._parse_error("<a foo='bar")
232232
self._parse_error("<a foo='>'")
233233
self._parse_error("<a foo='>")
234+
self._parse_error("<a$>")
235+
self._parse_error("<a$b>")
236+
self._parse_error("<a$b/>")
237+
self._parse_error("<a$b >")
238+
self._parse_error("<a$b />")
234239

235240
def test_valid_doctypes(self):
236241
# from http://www.w3.org/QA/2002/04/valid-dtd-list.html
@@ -379,8 +384,8 @@ def test_tolerant_parsing(self):
379384
('starttag', 'html', [('<html', None)]),
380385
('data', 'te>>xt'),
381386
('entityref', 'a'),
382-
('data', '<<bc'),
383-
('endtag', 'a'),
387+
('data', '<'),
388+
('starttag', 'bc<', [('a', None)]),
384389
('endtag', 'html'),
385390
('data', '\n<img src="URL>'),
386391
('comment', '/img'),
@@ -391,15 +396,19 @@ def test_starttag_junk_chars(self):
391396
self._run_check("</$>", [('comment', '$')])
392397
self._run_check("</", [('data', '</')])
393398
self._run_check("</a", [('data', '</a')])
394-
# XXX this might be wrong
395-
self._run_check("<a<a>", [('data', '<a'), ('starttag', 'a', [])])
399+
self._run_check("<a<a>", [('starttag', 'a<a', [])])
396400
self._run_check("</a<a>", [('endtag', 'a<a')])
397401
self._run_check("<!", [('data', '<!')])
398402
self._run_check("<a", [('data', '<a')])
399403
self._run_check("<a foo='bar'", [('data', "<a foo='bar'")])
400404
self._run_check("<a foo='bar", [('data', "<a foo='bar")])
401405
self._run_check("<a foo='>'", [('data', "<a foo='>'")])
402406
self._run_check("<a foo='>", [('data', "<a foo='>")])
407+
self._run_check("<a$>", [('starttag', 'a$', [])])
408+
self._run_check("<a$b>", [('starttag', 'a$b', [])])
409+
self._run_check("<a$b/>", [('startendtag', 'a$b', [])])
410+
self._run_check("<a$b >", [('starttag', 'a$b', [])])
411+
self._run_check("<a$b />", [('startendtag', 'a$b', [])])
403412

404413
def test_slashes_in_starttag(self):
405414
self._run_check('<a foo="var"/>', [('startendtag', 'a', [('foo', 'var')])])

Misc/NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,9 @@ Library
6464
- Issue #19286: Directories in ``package_data`` are no longer added to
6565
the filelist, preventing failure outlined in the ticket.
6666

67+
- Issue #19480: HTMLParser now accepts all valid start-tag names as defined
68+
by the HTML5 standard.
69+
6770
- Issue #15114: The html.parser module now raises a DeprecationWarning when the
6871
strict argument of HTMLParser or the HTMLParser.error method are used.
6972

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad © 2024 Your Company Name. All rights reserved.





Check this box to remove all script contents from the fetched content.



Check this box to remove all images from the fetched content.


Check this box to remove all CSS styles from the fetched content.


Check this box to keep images inefficiently compressed and original size.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy