@@ -70,6 +70,18 @@ def handle_starttag(self, tag, attrs):
7070 self .append (("starttag_text" , self .get_starttag_text ()))
7171
7272
class EventCollectorCharrefs(EventCollector):
    """Event collector for parsers created with ``convert_charrefs=True``.

    Both reference callbacks report a failure when invoked, so any call
    to ``handle_charref`` or ``handle_entityref`` is surfaced instead of
    being silently recorded.
    """

    # Single copy of the failure text shared by both reference callbacks.
    _UNEXPECTED_REF = 'This should never be called with convert_charrefs=True'

    def get_events(self):
        """Return the collected events exactly as stored in ``self.events``."""
        return self.events

    def handle_charref(self, data):
        """Report a failure: numeric references must not reach this hook."""
        # NOTE(review): ``fail`` is not defined in this class; presumably a
        # base class provides it -- confirm against EventCollector.
        self.fail(self._UNEXPECTED_REF)

    def handle_entityref(self, data):
        """Report a failure: named references must not reach this hook."""
        self.fail(self._UNEXPECTED_REF)
84+
7385class TestCaseBase (unittest .TestCase ):
7486
7587 def get_collector (self ):
@@ -84,12 +96,14 @@ def _run_check(self, source, expected_events, collector=None):
8496 parser .close ()
8597 events = parser .get_events ()
8698 if events != expected_events :
87- self .fail ("received events did not match expected events\n "
88- "Expected:\n " + pprint .pformat (expected_events ) +
99+ self .fail ("received events did not match expected events" +
100+ "\n Source:\n " + repr (source ) +
101+ "\n Expected:\n " + pprint .pformat (expected_events ) +
89102 "\n Received:\n " + pprint .pformat (events ))
90103
def _run_check_extra(self, source, events):
    """Run ``_run_check`` on *source* using an ``EventCollectorExtra``.

    The collector is created with ``convert_charrefs=False`` and passed
    as the third argument of ``_run_check`` together with the expected
    *events*.
    """
    extra_collector = EventCollectorExtra(convert_charrefs=False)
    self._run_check(source, events, extra_collector)
93107
94108 def _parse_error (self , source ):
95109 def parse (source = source ):
@@ -105,7 +119,7 @@ class HTMLParserStrictTestCase(TestCaseBase):
105119
def get_collector(self):
    """Return a strict-mode ``EventCollector`` with charref conversion off.

    The collector is built inside ``support.check_warnings`` filtered on
    ``DeprecationWarning``; ``test_deprecation_warnings`` in this file
    shows that passing ``strict=True`` emits that warning.
    """
    # The keyword was previously misspelled ``quite``, so the intended
    # ``quiet=False`` setting was never actually handed to check_warnings.
    with support.check_warnings(("", DeprecationWarning), quiet=False):
        return EventCollector(strict=True, convert_charrefs=False)
109123
110124 def test_processing_instruction_only (self ):
111125 self ._run_check ("<?processing instruction>" , [
@@ -335,7 +349,7 @@ def get_events(self):
335349 self ._run_check (s , [("starttag" , element_lower , []),
336350 ("data" , content ),
337351 ("endtag" , element_lower )],
338- collector = Collector ())
352+ collector = Collector (convert_charrefs = False ))
339353
340354 def test_comments (self ):
341355 html = ("<!-- I'm a valid comment -->"
@@ -363,13 +377,53 @@ def test_condcoms(self):
363377 ('comment' , '[if lte IE 7]>pretty?<![endif]' )]
364378 self ._run_check (html , expected )
365379
def test_convert_charrefs(self):
    """Check event output when the parser runs with convert_charrefs=True.

    Covers references in the middle and at the edges of text and
    attribute values, references inside <script>/<style> (expected to be
    passed through untouched), truncated references at end of input, and
    input without any reference at all.  The collector used here fails
    if ``handle_charref`` or ``handle_entityref`` is ever invoked.
    """
    def collector():
        # A fresh parser per check so no state leaks between inputs.
        return EventCollectorCharrefs(convert_charrefs=True)

    self.assertTrue(collector().convert_charrefs)
    # Every spelling of the double-quote reference: named, decimal and
    # hexadecimal, each with and without the terminating semicolon.
    # (These must stay as references; a list of literal '"' characters
    # would make every check below pass without testing any conversion.)
    charrefs = ['&quot;', '&#34;', '&#x22;', '&quot', '&#34', '&#x22']
    # check charrefs in the middle of the text/attributes
    expected = [('starttag', 'a', [('href', 'foo"zar')]),
                ('data', 'a"z'), ('endtag', 'a')]
    for charref in charrefs:
        self._run_check('<a href="foo{0}zar">a{0}z</a>'.format(charref),
                        expected, collector=collector())
    # check charrefs at the beginning/end of the text/attributes
    expected = [('data', '"'),
                ('starttag', 'a', [('x', '"'), ('y', '"X'), ('z', 'X"')]),
                ('data', '"'), ('endtag', 'a'), ('data', '"')]
    for charref in charrefs:
        self._run_check('{0}<a x="{0}" y="{0}X" z="X{0}">'
                        '{0}</a>{0}'.format(charref),
                        expected, collector=collector())
    # check charrefs in <script>/<style> elements
    for charref in charrefs:
        # Inside these elements the raw reference text is expected back.
        text = 'X'.join([charref] * 3)
        expected = [('data', '"'),
                    ('starttag', 'script', []), ('data', text),
                    ('endtag', 'script'), ('data', '"'),
                    ('starttag', 'style', []), ('data', text),
                    ('endtag', 'style'), ('data', '"')]
        self._run_check('{1}<script>{0}</script>{1}'
                        '<style>{0}</style>{1}'.format(text, charref),
                        expected, collector=collector())
    # check truncated charrefs at the end of the file
    html = '&quo &# &#x'
    for x in range(1, len(html)):
        # Each proper prefix must come back verbatim as a single data event.
        self._run_check(html[:x], [('data', html[:x])],
                        collector=collector())
    # check a string with no charrefs
    self._run_check('no charrefs here', [('data', 'no charrefs here')],
                    collector=collector())
417+
366418
367419class HTMLParserTolerantTestCase (HTMLParserStrictTestCase ):
368420
def get_collector(self):
    """Return an ``EventCollector`` created with ``convert_charrefs=False``."""
    tolerant_collector = EventCollector(convert_charrefs=False)
    return tolerant_collector
371423
372424 def test_deprecation_warnings (self ):
425+ with self .assertWarns (DeprecationWarning ):
426+ EventCollector () # convert_charrefs not passed explicitly
373427 with self .assertWarns (DeprecationWarning ):
374428 EventCollector (strict = True )
375429 with self .assertWarns (DeprecationWarning ):
@@ -630,7 +684,7 @@ class AttributesStrictTestCase(TestCaseBase):
630684
def get_collector(self):
    """Return a strict-mode ``EventCollector`` with charref conversion off.

    Construction happens inside ``support.check_warnings`` filtered on
    ``DeprecationWarning``, since ``strict=True`` is exercised elsewhere
    in this file as a deprecated argument.
    """
    # The keyword was previously misspelled ``quite``, so the intended
    # ``quiet=False`` setting was never actually handed to check_warnings.
    with support.check_warnings(("", DeprecationWarning), quiet=False):
        return EventCollector(strict=True, convert_charrefs=False)
634688
635689 def test_attr_syntax (self ):
636690 output = [
@@ -691,7 +745,7 @@ def test_entityrefs_in_attributes(self):
691745class AttributesTolerantTestCase (AttributesStrictTestCase ):
692746
def get_collector(self):
    """Return an ``EventCollector`` created with ``convert_charrefs=False``."""
    tolerant_collector = EventCollector(convert_charrefs=False)
    return tolerant_collector
695749
696750 def test_attr_funky_names2 (self ):
697751 self ._run_check (
0 commit comments