From 786da2964783f2c91ec1dfc18e9fd48d731fb4fa Mon Sep 17 00:00:00 2001
From: Padraic Fanning sample
+text
+“
+
+
+""", [
+ ("data", "\n"),
+ ("decl", "DOCTYPE html PUBLIC 'foo'"),
+ ("data", "\n"),
+ ("starttag", "html", []),
+ ("entityref", "entity"),
+ ("charref", "32"),
+ ("data", "\n"),
+ ("comment", "comment1a\n->
', + 'foo = "";', + 'foo = "";', + 'foo = <\n/script> ', + '', + ('\n//<\\/s\'+\'cript>\');\n//]]>'), + '\n\n', + 'foo = "";', + '', + # these two should be invalid according to the HTML 5 spec, + # section 8.1.2.2 + #'foo = \nscript>', + #'foo = script>', + ] + elements = ['script', 'style', 'SCRIPT', 'STYLE', 'Script', 'Style'] + for content in contents: + for element in elements: + element_lower = element.lower() + s = '<{element}>{content}{element}>'.format(element=element, + content=content) + self._run_check(s, [("starttag", element_lower, []), + ("data", content), + ("endtag", element_lower)]) + + def test_cdata_with_closing_tags(self): + # see issue #13358 + # make sure that HTMLParser calls handle_data only once for each CDATA. + # The normal event collector normalizes the events in get_events, + # so we override it to return the original list of events. + class Collector(EventCollector): + def get_events(self): + return self.events + + content = """ ¬-an-entity-ref; +
+ ''"""
+ for element in [' script', 'script ', ' script ',
+ '\nscript', 'script\n', '\nscript\n']:
+ element_lower = element.lower().strip()
+ s = '{1}'
+ '{1}'.format(text, charref),
+ expected, collector=collector())
+ # check truncated charrefs at the end of the file
+ html = '&quo '
+ for x in range(1, len(html)):
+ self._run_check(html[:x], [('data', html[:x])],
+ collector=collector())
+ # check a string with no charrefs
+ self._run_check('no charrefs here', [('data', 'no charrefs here')],
+ collector=collector())
+
+ # the remaining tests were for the "tolerant" parser (which is now
+ # the default), and check various kind of broken markup
+ def test_tolerant_parsing(self):
+ self._run_check('te>>xt&a<'),
+ ('comment', '/img'),
+ ('endtag', 'html<')])
+
+ def test_starttag_junk_chars(self):
+ self._run_check(">", [])
+ self._run_check("$>", [('comment', '$')])
+ self._run_check("", [('data', '')])
+ self._run_check("
'
+ 'foo'
+ '
')
+ # According to the HTML5 specs sections "8.2.4.44 Bogus comment state"
+ # and "8.2.4.45 Markup declaration open state", comment tokens should
+ # be emitted instead of 'unknown decl', but calling unknown_decl
+ # provides more flexibility.
+ # See also Lib/_markupbase.py:parse_declaration
+ expected = [
+ ('unknown decl', 'if !(IE)'),
+ ('data', 'broken condcom'),
+ ('unknown decl', 'endif'),
+ ('unknown decl', 'if ! IE'),
+ ('startendtag', 'link', [('href', 'favicon.tiff')]),
+ ('unknown decl', 'endif'),
+ ('unknown decl', 'if !IE 6'),
+ ('startendtag', 'img', [('src', 'firefox.png')]),
+ ('unknown decl', 'endif'),
+ ('unknown decl', 'if !ie 6'),
+ ('starttag', 'b', []),
+ ('data', 'foo'),
+ ('endtag', 'b'),
+ ('unknown decl', 'endif'),
+ ('unknown decl', 'if (!IE)|(lt IE 9)'),
+ ('startendtag', 'img', [('src', 'mammoth.bmp')]),
+ ('unknown decl', 'endif')
+ ]
+ self._run_check(html, expected)
+
+    def test_convert_charrefs_dropped_text(self):
+        # #23144: make sure that all the events are triggered when
+        # convert_charrefs is True, even if we don't call .close()
+        parser = EventCollector(convert_charrefs=True)
+        # The input has to carry the <a> element and the &amp; reference
+        # that the expected events below describe; with convert_charrefs
+        # enabled the reference is delivered as a plain "&" inside the data.
+        # before the fix, bar & baz was missing
+        parser.feed("foo <a>link</a> bar &amp; baz")
+        self.assertEqual(
+            parser.get_events(),
+            [('data', 'foo '), ('starttag', 'a', []), ('data', 'link'),
+             ('endtag', 'a'), ('data', ' bar & baz')]
+        )
+
+
+class AttributesTestCase(TestCaseBase):
+
+    def test_attr_syntax(self):
+        # The same four attributes (single-quoted, double-quoted, unquoted
+        # and value-less) are written with different whitespace around the
+        # names and "=" signs; every spelling must yield the same start tag.
+        output = [
+          ("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", None)])
+        ]
+        self._run_check("""<a b='v' c="v" d=v e>""", output)
+        self._run_check("""<a  b = 'v' c = "v" d = v e>""", output)
+        self._run_check("""<a\nb\n=\n'v'\nc\n=\n"v"\nd\n=\nv\ne>""", output)
+        self._run_check("""<a\tb\t=\t'v'\tc\t=\t"v"\td\t=\tv\te>""", output)
+
+    def test_attr_values(self):
+        # Whitespace (newlines, tabs) inside quoted attribute values must be
+        # preserved verbatim.
+        self._run_check("""<a b='xxx\n\txxx' c="yyy\t\nyyy" d='\txyz\n'>""",
+                        [("starttag", "a", [("b", "xxx\n\txxx"),
+                                            ("c", "yyy\t\nyyy"),
+                                            ("d", "\txyz\n")])])
+        # Empty quoted values are reported as empty strings, not None.
+        self._run_check("""<a b='' c="">""",
+                        [("starttag", "a", [("b", ""), ("c", "")])])
+        # Regression test for SF patch #669683.
+        self._run_check("<e a=rgb(1,2,3)>",
+                        [("starttag", "e", [("a", "rgb(1,2,3)")])])
+        # Regression test for SF bug #921657.
+        self._run_check(
+            "<a href=mailto:xyz@example.com>",
+            [("starttag", "a", [("href", "mailto:xyz@example.com")])])
+
+    def test_attr_nonascii(self):
+        # see issue 7311
+        # Non-ASCII attribute values must survive unquoted, single-quoted
+        # and double-quoted.
+        self._run_check(
+            "<img src=/foo/bar.png alt=\u4e2d\u6587>",
+            [("starttag", "img", [("src", "/foo/bar.png"),
+                                  ("alt", "\u4e2d\u6587")])])
+        self._run_check(
+            "<a title='\u30c6\u30b9\u30c8' href='\u30c6\u30b9\u30c8.html'>",
+            [("starttag", "a", [("title", "\u30c6\u30b9\u30c8"),
+                                ("href", "\u30c6\u30b9\u30c8.html")])])
+        self._run_check(
+            '<a title="\u30c6\u30b9\u30c8" href="\u30c6\u30b9\u30c8.html">',
+            [("starttag", "a", [("title", "\u30c6\u30b9\u30c8"),
+                                ("href", "\u30c6\u30b9\u30c8.html")])])
+
+    def test_attr_entity_replacement(self):
+        # Named character references inside an attribute value are replaced
+        # by the characters they stand for before the value is reported.
+        self._run_check(
+            "<a b='&amp;&gt;&lt;&quot;&apos;'>",
+            [("starttag", "a", [("b", "&><\"'")])])
+
+    def test_attr_funky_names(self):
+        # Attribute names may contain ".", ":" and "-".
+        self._run_check(
+            "<a a.b='v' c:d=v e-f=v>",
+            [("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")])])
+
+    def test_entityrefs_in_attributes(self):
+        # Named, decimal and hexadecimal references are all resolved inside
+        # attribute values; an unknown entity name is passed through as-is.
+        self._run_check(
+            "<html foo='&euro;&amp;&#97;&#x61;&unsupported;'>",
+            [("starttag", "html", [("foo", "\u20AC&aa&unsupported;")])])
+
+
+ def test_attr_funky_names2(self):
+ self._run_check(
+ r"| " + "- software-and-i" + "- library |