pFad - Phone/Frame/Anonymizer/Declutterfier! Saves Data!


--- a PPN by Garber Painting Akron. With Image Size Reduction included!

URL: http://github.com/python/cpython/commit/7165d8b9ba7df402fb167ff20dc6d1a35e7386ed

#19480: HTMLParser now accepts all valid start-tag names as defined b… · python/cpython@7165d8b · GitHub
Skip to content

Commit 7165d8b

Browse files
committed
#19480: HTMLParser now accepts all valid start-tag names as defined by the HTML5 standard.
1 parent d5a2f0b commit 7165d8b

File tree

3 files changed

+28
-13
lines changed

3 files changed

+28
-13
lines changed

Lib/html/parser.py

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -23,16 +23,16 @@
2323
starttagopen = re.compile('<[a-zA-Z]')
2424
piclose = re.compile('>')
2525
commentclose = re.compile(r'--\s*>')
26-
tagfind = re.compile('([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*')
27-
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
28-
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
29-
tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')
3026
# Note:
3127
# 1) the strict attrfind isn't really strict, but we can't make it
3228
# correctly strict without breaking backward compatibility;
33-
# 2) if you change attrfind remember to update locatestarttagend too;
34-
# 3) if you change attrfind and/or locatestarttagend the parser will
29+
# 2) if you change tagfind/attrfind remember to update locatestarttagend too;
30+
# 3) if you change tagfind/attrfind and/or locatestarttagend the parser will
3531
# explode, so don't do it.
32+
tagfind = re.compile('([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*')
33+
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
34+
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
35+
tagfind_tolerant = re.compile('([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
3636
attrfind = re.compile(
3737
r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
3838
r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
@@ -54,7 +54,7 @@
5454
\s* # trailing whitespace
5555
""", re.VERBOSE)
5656
locatestarttagend_tolerant = re.compile(r"""
57-
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
57+
<[a-zA-Z][^\t\n\r\f />\x00]* # tag name
5858
(?:[\s/]* # optional whitespace before attribute name
5959
(?:(?<=['"\s/])[^\s/>][^\s/=>]* # attribute name
6060
(?:\s*=+\s* # value indicator
@@ -328,7 +328,10 @@ def parse_starttag(self, i):
328328

329329
# Now parse the data between i+1 and j into a tag and attrs
330330
attrs = []
331-
match = tagfind.match(rawdata, i+1)
331+
if self.strict:
332+
match = tagfind.match(rawdata, i+1)
333+
else:
334+
match = tagfind_tolerant.match(rawdata, i+1)
332335
assert match, 'unexpected call to parse_starttag()'
333336
k = match.end()
334337
self.lasttag = tag = match.group(1).lower()
@@ -440,7 +443,7 @@ def parse_endtag(self, i):
440443
return i+3
441444
else:
442445
return self.parse_bogus_comment(i)
443-
tagname = namematch.group().lower()
446+
tagname = namematch.group(1).lower()
444447
# consume and ignore other stuff between the name and the >
445448
# Note: this is not 100% correct, since we might have things like
446449
# </tag attr=">">, but looking for > after the name should cover

Lib/test/test_htmlparser.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,11 @@ def test_starttag_junk_chars(self):
229229
self._parse_error("<a foo='bar")
230230
self._parse_error("<a foo='>'")
231231
self._parse_error("<a foo='>")
232+
self._parse_error("<a$>")
233+
self._parse_error("<a$b>")
234+
self._parse_error("<a$b/>")
235+
self._parse_error("<a$b >")
236+
self._parse_error("<a$b />")
232237

233238
def test_valid_doctypes(self):
234239
# from http://www.w3.org/QA/2002/04/valid-dtd-list.html
@@ -368,8 +373,8 @@ def test_tolerant_parsing(self):
368373
('starttag', 'html', [('<html', None)]),
369374
('data', 'te>>xt'),
370375
('entityref', 'a'),
371-
('data', '<<bc'),
372-
('endtag', 'a'),
376+
('data', '<'),
377+
('starttag', 'bc<', [('a', None)]),
373378
('endtag', 'html'),
374379
('data', '\n<img src="URL>'),
375380
('comment', '/img'),
@@ -380,15 +385,19 @@ def test_starttag_junk_chars(self):
380385
self._run_check("</$>", [('comment', '$')])
381386
self._run_check("</", [('data', '</')])
382387
self._run_check("</a", [('data', '</a')])
383-
# XXX this might be wrong
384-
self._run_check("<a<a>", [('data', '<a'), ('starttag', 'a', [])])
388+
self._run_check("<a<a>", [('starttag', 'a<a', [])])
385389
self._run_check("</a<a>", [('endtag', 'a<a')])
386390
self._run_check("<!", [('data', '<!')])
387391
self._run_check("<a", [('data', '<a')])
388392
self._run_check("<a foo='bar'", [('data', "<a foo='bar'")])
389393
self._run_check("<a foo='bar", [('data', "<a foo='bar")])
390394
self._run_check("<a foo='>'", [('data', "<a foo='>'")])
391395
self._run_check("<a foo='>", [('data', "<a foo='>")])
396+
self._run_check("<a$>", [('starttag', 'a$', [])])
397+
self._run_check("<a$b>", [('starttag', 'a$b', [])])
398+
self._run_check("<a$b/>", [('startendtag', 'a$b', [])])
399+
self._run_check("<a$b >", [('starttag', 'a$b', [])])
400+
self._run_check("<a$b />", [('startendtag', 'a$b', [])])
392401

393402
def test_slashes_in_starttag(self):
394403
self._run_check('<a foo="var"/>', [('startendtag', 'a', [('foo', 'var')])])

Misc/NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@ Core and Builtins
1313
Library
1414
-------
1515

16+
- Issue #19480: HTMLParser now accepts all valid start-tag names as defined
17+
by the HTML5 standard.
18+
1619
- Issue #6157: Fixed tkinter.Text.debug(). Original patch by Guilherme Polo.
1720

1821
- Issue #6160: The bbox() method of tkinter.Spinbox now returns a tuple of

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad © 2024 Your Company Name. All rights reserved.





Check this box to remove all script contents from the fetched content.



Check this box to remove all images from the fetched content.


Check this box to remove all CSS styles from the fetched content.


Check this box to keep images inefficiently compressed and original size.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy