diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
new file mode 100644
index 000000000..8364f9240
--- /dev/null
+++ b/Lib/test/test_htmlparser.py
@@ -0,0 +1,826 @@
+"""Tests for HTMLParser.py."""
+
+import html.parser
+import pprint
+import unittest
+
+
+class EventCollector(html.parser.HTMLParser):
+    """Parser that logs every handler callback as a tuple in ``self.events``.
+
+    Each tuple starts with the event kind (``"starttag"``, ``"data"``,
+    ``"comment"``, ...) followed by the arguments the handler received.
+    """
+
+    def __init__(self, *args, **kw):
+        self.events = []
+        super().__init__(*args, **kw)
+
+    def append(self, event):
+        """Record *event* on the list currently stored in ``self.events``.
+
+        The list is looked up on every call, so events keep being recorded
+        after ``get_events()`` has replaced ``self.events`` with the
+        normalized list.
+        """
+        self.events.append(event)
+
+    def get_events(self):
+        """Return the recorded events with adjacent "data" events merged.
+
+        The merged list also replaces ``self.events``.
+        """
+        # Normalize the list of events so that buffer artefacts don't
+        # separate runs of contiguous characters.
+        merged = []
+        for event in self.events:
+            if merged and event[0] == "data" == merged[-1][0]:
+                merged[-1] = ("data", merged[-1][1] + event[1])
+            else:
+                merged.append(event)
+        self.events = merged
+        return merged
+
+    # structure markup
+
+    def handle_starttag(self, tag, attrs):
+        self.append(("starttag", tag, attrs))
+
+    def handle_startendtag(self, tag, attrs):
+        self.append(("startendtag", tag, attrs))
+
+    def handle_endtag(self, tag):
+        self.append(("endtag", tag))
+
+    # all other markup
+
+    def handle_comment(self, data):
+        self.append(("comment", data))
+
+    def handle_charref(self, data):
+        self.append(("charref", data))
+
+    def handle_data(self, data):
+        self.append(("data", data))
+
+    def handle_decl(self, data):
+        self.append(("decl", data))
+
+    def handle_entityref(self, data):
+        self.append(("entityref", data))
+
+    def handle_pi(self, data):
+        self.append(("pi", data))
+
+    def unknown_decl(self, decl):
+        self.append(("unknown decl", decl))
+
+
+class EventCollectorExtra(EventCollector):
+    """Collector that also logs the raw text of each start tag."""
+
+    def handle_starttag(self, tag, attrs):
+        # Record the regular "starttag" event first, then the source text
+        # of that same tag as reported by the parser.
+        super().handle_starttag(tag, attrs)
+        raw_text = self.get_starttag_text()
+        self.append(("starttag_text", raw_text))
+
+
+class EventCollectorCharrefs(EventCollector):
+    """Collector for runs with ``convert_charrefs=True``.
+
+    Both reference handlers raise AssertionError, so any call to them is
+    reported as a failure.
+    """
+
+    def handle_charref(self, data):
+        # The parser has no ``fail`` method (that belongs to
+        # unittest.TestCase), so raise the assertion directly.
+        raise AssertionError(
+            'This should never be called with convert_charrefs=True')
+
+    def handle_entityref(self, data):
+        raise AssertionError(
+            'This should never be called with convert_charrefs=True')
+
+
+class TestCaseBase(unittest.TestCase):
+    """Shared helpers that feed markup to a collector and compare events."""
+
+    def get_collector(self):
+        """Return the collector used when a check does not supply one."""
+        return EventCollector(convert_charrefs=False)
+
+    def _run_check(self, source, expected_events, collector=None):
+        """Feed *source* piecewise and fail unless the events match.
+
+        *source* is iterated and each item is fed separately, so a plain
+        string is handed to the parser one character at a time.
+        """
+        parser = self.get_collector() if collector is None else collector
+        for piece in source:
+            parser.feed(piece)
+        parser.close()
+        received = parser.get_events()
+        if received == expected_events:
+            return
+        report = "".join([
+            "received events did not match expected events",
+            "\nSource:\n", repr(source),
+            "\nExpected:\n", pprint.pformat(expected_events),
+            "\nReceived:\n", pprint.pformat(received),
+        ])
+        self.fail(report)
+
+    def _run_check_extra(self, source, events):
+        """Run ``_run_check`` with a collector that also logs start-tag text."""
+        extra_collector = EventCollectorExtra(convert_charrefs=False)
+        self._run_check(source, events, extra_collector)
+
+
+class HTMLParserTestCase(TestCaseBase):
+
+    def test_processing_instruction_only(self):
+        """A lone processing instruction is reported through ``handle_pi``.
+
+        The text between ``<?`` and the closing ``>`` is passed on verbatim,
+        so a trailing ``?`` stays part of the reported data.
+        """
+        # The inputs must contain the markup itself; an empty string
+        # cannot produce the expected "pi" events.
+        self._run_check("<?processing instruction>", [
+            ("pi", "processing instruction"),
+            ])
+        self._run_check("<?processing instruction ?>", [
+            ("pi", "processing instruction ?"),
+            ])
+
+ # TODO: RUSTPYTHON
+ @unittest.expectedFailure
+ def test_simple_html(self):
+ self._run_check("""
+
+&entity;
+
+sample
+text
+“
+
+
+""", [
+ ("data", "\n"),
+ ("decl", "DOCTYPE html PUBLIC 'foo'"),
+ ("data", "\n"),
+ ("starttag", "html", []),
+ ("entityref", "entity"),
+ ("charref", "32"),
+ ("data", "\n"),
+ ("comment", "comment1a\n->
', + 'foo = "";', + 'foo = "";', + 'foo = <\n/script> ', + '', + ('\n//<\\/s\'+\'cript>\');\n//]]>'), + '\n\n', + 'foo = "";', + '', + # these two should be invalid according to the HTML 5 spec, + # section 8.1.2.2 + #'foo = \nscript>', + #'foo = script>', + ] + elements = ['script', 'style', 'SCRIPT', 'STYLE', 'Script', 'Style'] + for content in contents: + for element in elements: + element_lower = element.lower() + s = '<{element}>{content}{element}>'.format(element=element, + content=content) + self._run_check(s, [("starttag", element_lower, []), + ("data", content), + ("endtag", element_lower)]) + + def test_cdata_with_closing_tags(self): + # see issue #13358 + # make sure that HTMLParser calls handle_data only once for each CDATA. + # The normal event collector normalizes the events in get_events, + # so we override it to return the original list of events. + class Collector(EventCollector): + def get_events(self): + return self.events + + content = """ ¬-an-entity-ref; +
+ ''"""
+ for element in [' script', 'script ', ' script ',
+ '\nscript', 'script\n', '\nscript\n']:
+ element_lower = element.lower().strip()
+ s = '{1}'
+ '{1}'.format(text, charref),
+ expected, collector=collector())
+ # check truncated charrefs at the end of the file
+ html = '&quo '
+ for x in range(1, len(html)):
+ self._run_check(html[:x], [('data', html[:x])],
+ collector=collector())
+ # check a string with no charrefs
+ self._run_check('no charrefs here', [('data', 'no charrefs here')],
+ collector=collector())
+
+ # the remaining tests were for the "tolerant" parser (which is now
+ # the default), and check various kinds of broken markup
+ def test_tolerant_parsing(self):
+ self._run_check('te>>xt&a<'),
+ ('comment', '/img'),
+ ('endtag', 'html<')])
+
+ def test_starttag_junk_chars(self):
+ self._run_check(">", [])
+ self._run_check("$>", [('comment', '$')])
+ self._run_check("", [('data', '')])
+ self._run_check("
'
+ 'foo'
+ '
')
+ # According to the HTML5 specs sections "8.2.4.44 Bogus comment state"
+ # and "8.2.4.45 Markup declaration open state", comment tokens should
+ # be emitted instead of 'unknown decl', but calling unknown_decl
+ # provides more flexibility.
+ # See also Lib/_markupbase.py:parse_declaration
+ expected = [
+ ('unknown decl', 'if !(IE)'),
+ ('data', 'broken condcom'),
+ ('unknown decl', 'endif'),
+ ('unknown decl', 'if ! IE'),
+ ('startendtag', 'link', [('href', 'favicon.tiff')]),
+ ('unknown decl', 'endif'),
+ ('unknown decl', 'if !IE 6'),
+ ('startendtag', 'img', [('src', 'firefox.png')]),
+ ('unknown decl', 'endif'),
+ ('unknown decl', 'if !ie 6'),
+ ('starttag', 'b', []),
+ ('data', 'foo'),
+ ('endtag', 'b'),
+ ('unknown decl', 'endif'),
+ ('unknown decl', 'if (!IE)|(lt IE 9)'),
+ ('startendtag', 'img', [('src', 'mammoth.bmp')]),
+ ('unknown decl', 'endif')
+ ]
+ self._run_check(html, expected)
+
+    def test_convert_charrefs_dropped_text(self):
+        """Trailing text is reported without ``close()`` when converting refs."""
+        # #23144: make sure that all the events are triggered when
+        # convert_charrefs is True, even if we don't call .close()
+        parser = EventCollector(convert_charrefs=True)
+        # before the fix, bar & baz was missing
+        # The input needs the <a> element and the &amp; reference for the
+        # expected starttag/endtag events and the converted "&" to appear.
+        parser.feed("foo <a>link</a> bar &amp; baz")
+        self.assertEqual(
+            parser.get_events(),
+            [('data', 'foo '), ('starttag', 'a', []), ('data', 'link'),
+             ('endtag', 'a'), ('data', ' bar & baz')]
+        )
+
+
+class AttributesTestCase(TestCaseBase):
+
+    # TODO: RUSTPYTHON
+    @unittest.expectedFailure
+    def test_attr_syntax(self):
+        """Quoted, unquoted and valueless attributes parse the same way.
+
+        The four inputs differ only in the whitespace (none, spaces,
+        newlines, tabs) placed around the ``=`` signs and between attributes.
+        """
+        output = [
+          ("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", None)])
+        ]
+        self._run_check("""<a b='v' c="v" d=v e>""", output)
+        self._run_check("""<a  b = 'v' c = "v" d = v e>""", output)
+        self._run_check("""<a\nb\n=\n'v'\nc\n=\n"v"\nd\n=\nv\ne>""", output)
+        self._run_check("""<a\tb\t=\t'v'\tc\t=\t"v"\td\t=\tv\te>""", output)
+
+ # TODO: RUSTPYTHON
+ @unittest.expectedFailure
+ def test_attr_values(self):
+ self._run_check("""""",
+ [("starttag", "a", [("b", "xxx\n\txxx"),
+ ("c", "yyy\t\nyyy"),
+ ("d", "\txyz\n")])])
+ self._run_check("""""",
+ [("starttag", "a", [("b", ""), ("c", "")])])
+ # Regression test for SF patch #669683.
+ self._run_check("
",
+ [("starttag", "img", [("src", "/foo/bar.png"),
+ ("alt", "\u4e2d\u6587")])])
+ self._run_check(
+ "",
+ [("starttag", "a", [("title", "\u30c6\u30b9\u30c8"),
+ ("href", "\u30c6\u30b9\u30c8.html")])])
+ self._run_check(
+ '',
+ [("starttag", "a", [("title", "\u30c6\u30b9\u30c8"),
+ ("href", "\u30c6\u30b9\u30c8.html")])])
+
+    def test_attr_entity_replacement(self):
+        """Named entity references inside an attribute value are decoded."""
+        # The value spells each character as an entity; the expected
+        # attribute holds the decoded characters.
+        self._run_check(
+            "<a b='&amp;&gt;&lt;&quot;&apos;'>",
+            [("starttag", "a", [("b", "&><\"'")])])
+
+    # TODO: RUSTPYTHON
+    @unittest.expectedFailure
+    def test_attr_funky_names(self):
+        """Attribute names containing ``.``, ``:`` and ``-`` are kept intact."""
+        self._run_check(
+            "<a a.b='v' c:d=v e-f=v>",
+            [("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")])])
+
+    def test_entityrefs_in_attributes(self):
+        """Known references in attribute values decode; unknown ones stay.
+
+        The value mixes a named entity, ``&amp;``, a decimal and a hex
+        character reference, and an entity name that is not defined.
+        """
+        self._run_check(
+            "<html foo='&euro;&amp;&#97;&#x61;&unsupported;'>",
+            [("starttag", "html", [("foo", "\u20AC&aa&unsupported;")])])
+
+
+ def test_attr_funky_names2(self):
+ self._run_check(
+ r"| " + "- software-and-i" + "- library |