Merge pull request #4009 from youknowone/htmlurllib

Update html/urllib and their tests
2026-06-02 19:39:49 +09:00 · 2022-08-07 17:23:14 +09:00
parent cd75df546d ad573574d2
commit 8b7a3ea8ea
10 changed files with 121 additions and 118 deletions
--- a/Lib/html/entities.py
+++ b/Lib/html/entities.py
@@ -4,6 +4,7 @@ __all__ = ['html5', 'name2codepoint', 'codepoint2name', 'entitydefs']


 # maps the HTML entity name to the Unicode code point
+# from https://html.spec.whatwg.org/multipage/named-characters.html
 name2codepoint = {
    'AElig':    0x00c6, # latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1
    'Aacute':   0x00c1, # latin capital letter A with acute, U+00C1 ISOlat1
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -9,7 +9,6 @@


 import re
-import warnings
 import _markupbase

 from html import unescape
@@ -47,7 +46,7 @@ locatestarttagend_tolerant = re.compile(r"""
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
-         (?:\s*,)*                   # possibly followed by a comma
+        \s*                          # possibly followed by a space
       )?(?:\s|/(?!>))*
     )*
   )?
@@ -406,7 +405,7 @@ class HTMLParser(_markupbase.ParserBase):
            tagname = namematch.group(1).lower()
            # consume and ignore other stuff between the name and the >
            # Note: this is not 100% correct, since we might have things like
-            # </tag attr=">">, but looking for > after tha name should cover
+            # </tag attr=">">, but looking for > after the name should cover
            # most of the cases and is much simpler
            gtpos = rawdata.find('>', namematch.end())
            self.handle_endtag(tagname)
@@ -418,7 +417,7 @@ class HTMLParser(_markupbase.ParserBase):
                self.handle_data(rawdata[i:gtpos])
                return gtpos

-        self.handle_endtag(elem.lower())
+        self.handle_endtag(elem)
        self.clear_cdata_mode()
        return gtpos

@@ -461,10 +460,3 @@ class HTMLParser(_markupbase.ParserBase):

    def unknown_decl(self, data):
        pass
-
-    # Internal -- helper to remove special character quoting
-    def unescape(self, s):
-        warnings.warn('The unescape method is deprecated and will be removed '
-                      'in 3.5, use html.unescape() instead.',
-                      DeprecationWarning, stacklevel=2)
-        return unescape(s)
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -537,13 +537,6 @@ text
        for html, expected in data:
            self._run_check(html, expected)

-    def test_unescape_method(self):
-        from html import unescape
-        p = self.get_collector()
-        with self.assertWarns(DeprecationWarning):
-            s = '&quot;&#34;&#x22;&quot&#34&#x22&#bad;'
-            self.assertEqual(p.unescape(s), unescape(s))
-
    def test_broken_comments(self):
        html = ('<! not really a comment >'
                '<! not a comment either -->'
@@ -761,8 +754,6 @@ class AttributesTestCase(TestCaseBase):
        ]
        self._run_check(html, expected)

-    # TODO: RUSTPYTHON
-    @unittest.expectedFailure
    def test_comma_between_attributes(self):
        # see bpo 41478
        # HTMLParser preserves duplicate attributes, leaving the task of
--- a/Lib/test/test_urllib.py
+++ b/Lib/test/test_urllib.py
@@ -9,7 +9,8 @@ import io
 import unittest
 from unittest.mock import patch
 from test import support
-from test.support import os_helper, warnings_helper
+from test.support import os_helper
+from test.support import warnings_helper
 import os
 try:
    import ssl
--- a/Lib/test/test_urllib2.py
+++ b/Lib/test/test_urllib2.py
@@ -141,8 +141,6 @@ class RequestHdrsTests(unittest.TestCase):
        req.remove_header("Unredirected-spam")
        self.assertFalse(req.has_header("Unredirected-spam"))

-    # TODO: RUSTPYTHON, AssertionError: Tuples differ: ('foo', 'ni') != (None, None)
-    @unittest.expectedFailure
    def test_password_manager(self):
        mgr = urllib.request.HTTPPasswordMgr()
        add = mgr.add_password
--- a/Lib/test/test_urlparse.py
+++ b/Lib/test/test_urlparse.py
@@ -613,8 +613,8 @@ class UrlParseTestCase(unittest.TestCase):
            p.port

    def test_urlsplit_remove_unsafe_bytes(self):
-        # Remove ASCII tabs and newlines from input, for http common case scenario.
-        url = "h\nttp://www.python\n.org\t/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment"
+        # Remove ASCII tabs and newlines from input
+        url = "http\t://www.python\n.org\t/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment"
        p = urllib.parse.urlsplit(url)
        self.assertEqual(p.scheme, "http")
        self.assertEqual(p.netloc, "www.python.org")
@@ -627,8 +627,8 @@ class UrlParseTestCase(unittest.TestCase):
        self.assertEqual(p.port, None)
        self.assertEqual(p.geturl(), "http://www.python.org/javascript:alert('msg')/?query=something#fragment")

-        # Remove ASCII tabs and newlines from input as bytes, for http common case scenario.
-        url = b"h\nttp://www.python\n.org\t/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment"
+        # Remove ASCII tabs and newlines from input as bytes.
+        url = b"http\t://www.python\n.org\t/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment"
        p = urllib.parse.urlsplit(url)
        self.assertEqual(p.scheme, b"http")
        self.assertEqual(p.netloc, b"www.python.org")
@@ -641,24 +641,13 @@ class UrlParseTestCase(unittest.TestCase):
        self.assertEqual(p.port, None)
        self.assertEqual(p.geturl(), b"http://www.python.org/javascript:alert('msg')/?query=something#fragment")

-        # any scheme
-        url = "x-new-scheme\t://www.python\n.org\t/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment"
-        p = urllib.parse.urlsplit(url)
-        self.assertEqual(p.geturl(), "x-new-scheme://www.python.org/javascript:alert('msg')/?query=something#fragment")
-
-        # Remove ASCII tabs and newlines from input as bytes, any scheme.
-        url = b"x-new-scheme\t://www.python\n.org\t/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment"
-        p = urllib.parse.urlsplit(url)
-        self.assertEqual(p.geturl(), b"x-new-scheme://www.python.org/javascript:alert('msg')/?query=something#fragment")
-
-        # Unsafe bytes is not returned from urlparse cache.
-        # scheme is stored after parsing, sending an scheme with unsafe bytes *will not* return an unsafe scheme
-        url = "https://www.python\n.org\t/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment"
-        scheme = "htt\nps"
+        # Unsafe bytes are stripped from scheme before it is used as cache key
+        url = "http://www.python.org/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment"
+        scheme = "ht\ntp"
        for _ in range(2):
            p = urllib.parse.urlsplit(url, scheme=scheme)
-            self.assertEqual(p.scheme, "https")
-            self.assertEqual(p.geturl(), "https://www.python.org/javascript:alert('msg')/?query=something#fragment")
+            self.assertEqual(p.scheme, "http")
+            self.assertEqual(p.geturl(), "http://www.python.org/javascript:alert('msg')/?query=something#fragment")

    def test_attributes_bad_port(self):
        """Check handling of invalid ports."""
@@ -745,15 +734,17 @@ class UrlParseTestCase(unittest.TestCase):

    def test_portseparator(self):
        # Issue 754016 makes changes for port separator ':' from scheme separator
-        self.assertEqual(urllib.parse.urlparse("path:80"),
-                ('','','path:80','','',''))
+        self.assertEqual(urllib.parse.urlparse("http:80"), ('http','','80','','',''))
+        self.assertEqual(urllib.parse.urlparse("https:80"), ('https','','80','','',''))
+        self.assertEqual(urllib.parse.urlparse("path:80"), ('path','','80','','',''))
        self.assertEqual(urllib.parse.urlparse("http:"),('http','','','','',''))
        self.assertEqual(urllib.parse.urlparse("https:"),('https','','','','',''))
        self.assertEqual(urllib.parse.urlparse("http://www.python.org:80"),
                ('http','www.python.org:80','','','',''))
        # As usual, need to check bytes input as well
-        self.assertEqual(urllib.parse.urlparse(b"path:80"),
-                (b'',b'',b'path:80',b'',b'',b''))
+        self.assertEqual(urllib.parse.urlparse(b"http:80"), (b'http',b'',b'80',b'',b'',b''))
+        self.assertEqual(urllib.parse.urlparse(b"https:80"), (b'https',b'',b'80',b'',b'',b''))
+        self.assertEqual(urllib.parse.urlparse(b"path:80"), (b'path',b'',b'80',b'',b'',b''))
        self.assertEqual(urllib.parse.urlparse(b"http:"),(b'http',b'',b'',b'',b'',b''))
        self.assertEqual(urllib.parse.urlparse(b"https:"),(b'https',b'',b'',b'',b'',b''))
        self.assertEqual(urllib.parse.urlparse(b"http://www.python.org:80"),
--- a/Lib/urllib/parse.py
+++ b/Lib/urllib/parse.py
@@ -29,6 +29,7 @@ test_urlparse.py provides a good indicator of parsing behavior.

 import re
 import sys
+import types
 import collections
 import warnings

@@ -179,6 +180,8 @@ class _NetlocResultMixinBase(object):
                raise ValueError("Port out of range 0-65535")
        return port

+    __class_getitem__ = classmethod(types.GenericAlias)
+

 class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
    __slots__ = ()
@@ -369,9 +372,23 @@ del _fix_result_transcoding
 def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
-    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
-    Note that we don't break the components up in smaller bits
-    (e.g. netloc is a single string) and we don't expand % escapes."""
+
+    The result is a named 6-tuple with fields corresponding to the
+    above. It is either a ParseResult or ParseResultBytes object,
+    depending on the type of the url parameter.
+
+    The username, password, hostname, and port sub-components of netloc
+    can also be accessed as attributes of the returned object.
+
+    The scheme argument provides the default value of the scheme
+    component when no scheme is found in url.
+
+    If allow_fragments is False, no attempt is made to separate the
+    fragment component from the previous component, which can be either
+    path or query.
+
+    Note that % escapes are not expanded.
+    """
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    splitresult = urlsplit(url, scheme, allow_fragments)
    scheme, netloc, url, query, fragment = splitresult
@@ -417,20 +434,33 @@ def _checknetloc(netloc):
            raise ValueError("netloc '" + netloc + "' contains invalid " +
                             "characters under NFKC normalization")

-def _remove_unsafe_bytes_from_url(url):
-    for b in _UNSAFE_URL_BYTES_TO_REMOVE:
-        url = url.replace(b, "")
-    return url
-
 def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
-    Return a 5-tuple: (scheme, netloc, path, query, fragment).
-    Note that we don't break the components up in smaller bits
-    (e.g. netloc is a single string) and we don't expand % escapes."""
+
+    The result is a named 5-tuple with fields corresponding to the
+    above. It is either a SplitResult or SplitResultBytes object,
+    depending on the type of the url parameter.
+
+    The username, password, hostname, and port sub-components of netloc
+    can also be accessed as attributes of the returned object.
+
+    The scheme argument provides the default value of the scheme
+    component when no scheme is found in url.
+
+    If allow_fragments is False, no attempt is made to separate the
+    fragment component from the previous component, which can be either
+    path or query.
+
+    Note that % escapes are not expanded.
+    """
+
    url, scheme, _coerce_result = _coerce_args(url, scheme)
-    url = _remove_unsafe_bytes_from_url(url)
-    scheme = _remove_unsafe_bytes_from_url(scheme)
+
+    for b in _UNSAFE_URL_BYTES_TO_REMOVE:
+        url = url.replace(b, "")
+        scheme = scheme.replace(b, "")
+
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
@@ -441,31 +471,11 @@ def urlsplit(url, scheme='', allow_fragments=True):
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
-        if url[:i] == 'http': # optimize the common case
-            url = url[i+1:]
-            if url[:2] == '//':
-                netloc, url = _splitnetloc(url, 2)
-                if (('[' in netloc and ']' not in netloc) or
-                        (']' in netloc and '[' not in netloc)):
-                    raise ValueError("Invalid IPv6 URL")
-            if allow_fragments and '#' in url:
-                url, fragment = url.split('#', 1)
-            if '?' in url:
-                url, query = url.split('?', 1)
-            _checknetloc(netloc)
-            v = SplitResult('http', netloc, url, query, fragment)
-            _parse_cache[key] = v
-            return _coerce_result(v)
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
-            # make sure "url" is not actually a port number (in which case
-            # "scheme" is really part of the path)
-            rest = url[i+1:]
-            if not rest or any(c not in '0123456789' for c in rest):
-                # not a port number
-                scheme, url = url[:i].lower(), rest
+            scheme, url = url[:i].lower(), url[i+1:]

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
@@ -642,7 +652,7 @@ def unquote(string, encoding='utf-8', errors='replace'):
    unquote('abc%20def') -> 'abc def'.
    """
    if isinstance(string, bytes):
-        raise TypeError('Expected str, got bytes')
+        return unquote_to_bytes(string).decode(encoding, errors)
    if '%' not in string:
        string.split
        return string
@@ -744,9 +754,8 @@ def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
        if max_num_fields < num_fields:
            raise ValueError('Max number of fields exceeded')

-    pairs = [s1 for s1 in qs.split(separator)]
    r = []
-    for name_value in pairs:
+    for name_value in qs.split(separator):
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
--- a/Lib/urllib/request.py
+++ b/Lib/urllib/request.py
@@ -64,7 +64,7 @@ opener = urllib.request.build_opener(proxy_support, authinfo,
 # install it
 urllib.request.install_opener(opener)

-f = urllib.request.urlopen('http://www.python.org/')
+f = urllib.request.urlopen('https://www.python.org/')
 """

 # XXX issues:
@@ -163,18 +163,10 @@ def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,

    The *cadefault* parameter is ignored.

-    This function always returns an object which can work as a context
-    manager and has methods such as

-    * geturl() - return the URL of the resource retrieved, commonly used to
-      determine if a redirect was followed
-
-    * info() - return the meta-information of the page, such as headers, in the
-      form of an email.message_from_string() instance (see Quick Reference to
-      HTTP Headers)
-
-    * getcode() - return the HTTP status code of the response.  Raises URLError
-      on errors.
+    This function always returns an object which can work as a
+    context manager and has the properties url, headers, and status.
+    See urllib.response.addinfourl for more detail on these properties.

    For HTTP and HTTPS URLs, this function returns a http.client.HTTPResponse
    object slightly modified. In addition to the three new methods above, the
@@ -210,6 +202,8 @@ def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH,
                                             cafile=cafile,
                                             capath=capath)
+        # send ALPN extension to indicate HTTP/1.1 protocol
+        context.set_alpn_protocols(['http/1.1'])
        https_handler = HTTPSHandler(context=context)
        opener = build_opener(https_handler)
    elif context:
@@ -895,10 +889,10 @@ class HTTPPasswordMgr:
            return True
        if base[0] != test[0]:
            return False
-        common = posixpath.commonprefix((base[1], test[1]))
-        if len(common) == len(base[1]):
-            return True
-        return False
+        prefix = base[1]
+        if prefix[-1:] != '/':
+            prefix += '/'
+        return test[1].startswith(prefix)


 class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
@@ -1823,7 +1817,7 @@ class URLopener:
                hdrs = fp.info()
                fp.close()
                return url2pathname(_splithost(url1)[1]), hdrs
-            except OSError as msg:
+            except OSError:
                pass
        fp = self.open(url, data)
        try:
@@ -2680,22 +2674,26 @@ elif os.name == 'nt':
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(winreg.QueryValueEx(internetSettings,
                                                       'ProxyServer')[0])
-                if '=' in proxyServer:
-                    # Per-protocol settings
-                    for p in proxyServer.split(';'):
-                        protocol, address = p.split('=', 1)
-                        # See if address has a type:// prefix
-                        if not re.match('(?:[^/:]+)://', address):
-                            address = '%s://%s' % (protocol, address)
-                        proxies[protocol] = address
-                else:
-                    # Use one setting for all protocols
-                    if proxyServer[:5] == 'http:':
-                        proxies['http'] = proxyServer
-                    else:
-                        proxies['http'] = 'http://%s' % proxyServer
-                        proxies['https'] = 'https://%s' % proxyServer
-                        proxies['ftp'] = 'ftp://%s' % proxyServer
+                if '=' not in proxyServer and ';' not in proxyServer:
+                    # Use one setting for all protocols.
+                    proxyServer = 'http={0};https={0};ftp={0}'.format(proxyServer)
+                for p in proxyServer.split(';'):
+                    protocol, address = p.split('=', 1)
+                    # See if address has a type:// prefix
+                    if not re.match('(?:[^/:]+)://', address):
+                        # Add type:// prefix to address without specifying type
+                        if protocol in ('http', 'https', 'ftp'):
+                            # The default proxy type of Windows is HTTP
+                            address = 'http://' + address
+                        elif protocol == 'socks':
+                            address = 'socks://' + address
+                    proxies[protocol] = address
+                # Use the SOCKS proxy for HTTP(S) unless one is already set
+                if proxies.get('socks'):
+                    # The default SOCKS proxy type of Windows is SOCKS4
+                    address = re.sub(r'^socks://', 'socks4://', proxies['socks'])
+                    proxies['http'] = proxies.get('http') or address
+                    proxies['https'] = proxies.get('https') or address
            internetSettings.Close()
        except (OSError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
--- a/Lib/urllib/response.py
+++ b/Lib/urllib/response.py
@@ -73,6 +73,10 @@ class addinfourl(addinfo):
        self.url = url
        self.code = code

+    @property
+    def status(self):
+        return self.code
+
    def getcode(self):
        return self.code

--- a/stdlib/src/ssl.rs
+++ b/stdlib/src/ssl.rs
@@ -26,7 +26,7 @@ mod _ssl {
    use crate::{
        common::{
            ascii,
-            lock::{PyRwLock, PyRwLockWriteGuard},
+            lock::{PyMutex, PyRwLock, PyRwLockWriteGuard},
        },
        socket::{self, PySocket},
        vm::{
@@ -423,6 +423,7 @@ mod _ssl {
        ctx: PyRwLock<SslContextBuilder>,
        check_hostname: AtomicCell<bool>,
        protocol: SslVersion,
+        post_handshake_auth: PyMutex<bool>,
    }

    impl fmt::Debug for PySslContext {
@@ -491,6 +492,7 @@ mod _ssl {
                ctx: PyRwLock::new(builder),
                check_hostname: AtomicCell::new(check_hostname),
                protocol: proto,
+                post_handshake_auth: PyMutex::new(false),
            }
            .into_ref_with_type(vm, cls)
            .map(Into::into)
@@ -510,6 +512,22 @@ mod _ssl {
            func(builder_as_ctx(&c))
        }

+        #[pyproperty]
+        fn post_handshake_auth(&self) -> bool {
+            *self.post_handshake_auth.lock()
+        }
+        #[pyproperty(setter)]
+        fn set_post_handshake_auth(
+            &self,
+            value: Option<PyObjectRef>,
+            vm: &VirtualMachine,
+        ) -> PyResult<()> {
+            let value = value
+                .ok_or_else(|| vm.new_attribute_error("cannot delete attribute".to_owned()))?;
+            *self.post_handshake_auth.lock() = value.is_true(vm)?;
+            Ok(())
+        }
+
        #[pymethod]
        fn set_ciphers(&self, cipherlist: PyStrRef, vm: &VirtualMachine) -> PyResult<()> {
            let ciphers = cipherlist.as_str();