Copy updated urllib from CPython 3.8.3.

2020-07-19 13:53:04 +12:00
parent 9b48296fd7
commit cfc7a7c734
4 changed files with 386 additions and 182 deletions
--- a/Lib/urllib/error.py
+++ b/Lib/urllib/error.py
@@ -16,14 +16,10 @@ import urllib.response
 __all__ = ['URLError', 'HTTPError', 'ContentTooShortError']


-# do these error classes make sense?
-# make sure all of the OSError stuff is overridden.  we just want to be
-# subtypes.
-
 class URLError(OSError):
    # URLError is a sub-type of OSError, but it doesn't share any of
    # the implementation.  need to override __init__ and __str__.
-    # It sets self.args for compatibility with other EnvironmentError
+    # It sets self.args for compatibility with other OSError
    # subclasses, but args doesn't have the typical format with errno in
    # slot 0 and strerror in slot 1.  This may be better than nothing.
    def __init__(self, reason, filename=None):
--- a/Lib/urllib/parse.py
+++ b/Lib/urllib/parse.py
@@ -30,6 +30,7 @@ test_urlparse.py provides a good indicator of parsing behavior.
 import re
 import sys
 import collections
+import warnings

 __all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs",
@@ -38,29 +39,37 @@ __all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "DefragResult", "ParseResult", "SplitResult",
           "DefragResultBytes", "ParseResultBytes", "SplitResultBytes"]

-# A classification of schemes ('' means apply by default)
-uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
+# A classification of schemes.
+# The empty string classifies URLs with no scheme specified,
+# being the default value returned by “urlsplit” and “urlparse”.
+
+uses_relative = ['', 'ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
-                 'prospero', 'rtsp', 'rtspu', '', 'sftp',
+                 'prospero', 'rtsp', 'rtspu', 'sftp',
                 'svn', 'svn+ssh', 'ws', 'wss']
-uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
+
+uses_netloc = ['', 'ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
-               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
+               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
               'ws', 'wss']
-uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
+
+uses_params = ['', 'ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
-               'mms', '', 'sftp', 'tel']
+               'mms', 'sftp', 'tel']

 # These are not actually used anymore, but should stay for backwards
 # compatibility.  (They are undocumented, but have a public-looking name.)
+
 non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
-uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
-              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
-uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
+
+uses_query = ['', 'http', 'wais', 'imap', 'https', 'shttp', 'mms',
+              'gopher', 'rtsp', 'rtspu', 'sip', 'sips']
+
+uses_fragment = ['', 'ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
-                 'file', 'prospero', '']
+                 'file', 'prospero']

 # Characters valid in scheme names
 scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
@@ -147,16 +156,22 @@ class _NetlocResultMixinBase(object):
    def hostname(self):
        hostname = self._hostinfo[0]
        if not hostname:
-            hostname = None
-        elif hostname is not None:
-            hostname = hostname.lower()
-        return hostname
+            return None
+        # Scoped IPv6 address may have zone info, which must not be lowercased
+        # like http://[fe80::822a:a8ff:fe49:470c%tESt]:1234/keys
+        separator = '%' if isinstance(hostname, str) else b'%'
+        hostname, percent, zone = hostname.partition(separator)
+        return hostname.lower() + percent + zone

    @property
    def port(self):
        port = self._hostinfo[1]
        if port is not None:
-            port = int(port, 10)
+            try:
+                port = int(port, 10)
+            except ValueError:
+                message = f'Port could not be cast to integer value as {port!r}'
+                raise ValueError(message) from None
            if not ( 0 <= port <= 65535):
                raise ValueError("Port out of range 0-65535")
        return port
@@ -274,7 +289,7 @@ by reference to a primary resource and additional identifying information.
 """

 _ParseResultBase.__doc__ = """
-ParseResult(scheme, netloc, path, params,  query, fragment)
+ParseResult(scheme, netloc, path, params, query, fragment)

 A 6-tuple that contains components of a parsed URL.
 """
@@ -381,6 +396,24 @@ def _splitnetloc(url, start=0):
            delim = min(delim, wdelim)     # use earliest delim position
    return url[start:delim], url[delim:]   # return (domain, rest)

+def _checknetloc(netloc):
+    if not netloc or netloc.isascii():
+        return
+    # looking for characters like \u2100 that expand to 'a/c'
+    # IDNA uses NFKC equivalence, so normalize for this check
+    import unicodedata
+    n = netloc.replace('@', '')   # ignore characters already included
+    n = n.replace(':', '')        # but not the surrounding text
+    n = n.replace('#', '')
+    n = n.replace('?', '')
+    netloc2 = unicodedata.normalize('NFKC', n)
+    if n == netloc2:
+        return
+    for c in '/?#@:':
+        if c in netloc2:
+            raise ValueError("netloc '" + netloc + "' contains invalid " +
+                             "characters under NFKC normalization")
+
 def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
@@ -399,7 +432,6 @@ def urlsplit(url, scheme='', allow_fragments=True):
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http': # optimize the common case
-            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
@@ -410,7 +442,8 @@ def urlsplit(url, scheme='', allow_fragments=True):
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
-            v = SplitResult(scheme, netloc, url, query, fragment)
+            _checknetloc(netloc)
+            v = SplitResult('http', netloc, url, query, fragment)
            _parse_cache[key] = v
            return _coerce_result(v)
        for c in url[:i]:
@@ -433,6 +466,7 @@ def urlsplit(url, scheme='', allow_fragments=True):
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
+    _checknetloc(netloc)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return _coerce_result(v)
@@ -574,7 +608,7 @@ def unquote_to_bytes(string):
    # if the function is never called
    global _hextobyte
    if _hextobyte is None:
-        _hextobyte = {(a + b).encode(): bytes([int(a + b, 16)])
+        _hextobyte = {(a + b).encode(): bytes.fromhex(a + b)
                      for a in _hexdig for b in _hexdig}
    for item in bits[1:]:
        try:
@@ -612,8 +646,9 @@ def unquote(string, encoding='utf-8', errors='replace'):
        append(bits[i + 1])
    return ''.join(res)

+
 def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
-             encoding='utf-8', errors='replace'):
+             encoding='utf-8', errors='replace', max_num_fields=None):
    """Parse a query given as a string argument.

        Arguments:
@@ -633,10 +668,16 @@ def parse_qs(qs, keep_blank_values=False, strict_parsing=False,

        encoding and errors: specify how to decode percent-encoded sequences
            into Unicode characters, as accepted by the bytes.decode() method.
+
+        max_num_fields: int. If set, a ValueError is raised if more than
+            max_num_fields fields are read by parse_qsl().
+
+        Returns a dictionary.
    """
    parsed_result = {}
    pairs = parse_qsl(qs, keep_blank_values, strict_parsing,
-                      encoding=encoding, errors=errors)
+                      encoding=encoding, errors=errors,
+                      max_num_fields=max_num_fields)
    for name, value in pairs:
        if name in parsed_result:
            parsed_result[name].append(value)
@@ -644,30 +685,43 @@ def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
            parsed_result[name] = [value]
    return parsed_result

+
 def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
-              encoding='utf-8', errors='replace'):
+              encoding='utf-8', errors='replace', max_num_fields=None):
    """Parse a query given as a string argument.

-    Arguments:
+        Arguments:

-    qs: percent-encoded query string to be parsed
+        qs: percent-encoded query string to be parsed

-    keep_blank_values: flag indicating whether blank values in
-        percent-encoded queries should be treated as blank strings.  A
-        true value indicates that blanks should be retained as blank
-        strings.  The default false value indicates that blank values
-        are to be ignored and treated as if they were  not included.
+        keep_blank_values: flag indicating whether blank values in
+            percent-encoded queries should be treated as blank strings.
+            A true value indicates that blanks should be retained as blank
+            strings.  The default false value indicates that blank values
+            are to be ignored and treated as if they were not included.

-    strict_parsing: flag indicating what to do with parsing errors. If
-        false (the default), errors are silently ignored. If true,
-        errors raise a ValueError exception.
+        strict_parsing: flag indicating what to do with parsing errors. If
+            false (the default), errors are silently ignored. If true,
+            errors raise a ValueError exception.

-    encoding and errors: specify how to decode percent-encoded sequences
-        into Unicode characters, as accepted by the bytes.decode() method.
+        encoding and errors: specify how to decode percent-encoded sequences
+            into Unicode characters, as accepted by the bytes.decode() method.

-    Returns a list, as G-d intended.
+        max_num_fields: int. If set, a ValueError is raised if more than
+            max_num_fields fields are read by parse_qsl().
+
+        Returns a list, as G-d intended.
    """
    qs, _coerce_result = _coerce_args(qs)
+
+    # If max_num_fields is defined then check that the number of fields
+    # does not exceed max_num_fields. This prevents a memory exhaustion DOS
+    # attack via post bodies with many fields.
+    if max_num_fields is not None:
+        num_fields = 1 + qs.count('&') + qs.count(';')
+        if max_num_fields < num_fields:
+            raise ValueError('Max number of fields exceeded')
+
    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    r = []
    for name_value in pairs:
@@ -704,7 +758,7 @@ def unquote_plus(string, encoding='utf-8', errors='replace'):
 _ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                         b'abcdefghijklmnopqrstuvwxyz'
                         b'0123456789'
-                         b'_.-')
+                         b'_.-~')
 _ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
 _safe_quoters = {}

@@ -734,22 +788,32 @@ def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
-    different set of reserved characters that must be quoted.
+    different set of reserved characters that must be quoted. The
+    quote function offers a cautious (not minimal) way to quote a
+    string for most of these parts.

-    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
-    the following reserved characters.
+    RFC 3986 Uniform Resource Identifier (URI): Generic Syntax lists
+    the following (un)reserved characters.

-    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
-                  "$" | ","
+    unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
+    reserved      = gen-delims / sub-delims
+    gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
+    sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
+                  / "*" / "+" / "," / ";" / "="

-    Each of these characters is reserved in some component of a URL,
+    Each of the reserved characters is reserved in some component of a URL,
    but not necessarily in all of them.

-    By default, the quote function is intended for quoting the path
-    section of a URL.  Thus, it will not encode '/'.  This character
-    is reserved, but in typical usage the quote function is being
-    called on a path where the existing slash characters are used as
-    reserved characters.
+    The quote function %-escapes all characters that are neither in the
+    unreserved chars ("always safe") nor the additional chars set via the
+    safe arg.
+
+    The default for the safe arg is '/'. The character is reserved, but in
+    typical usage the quote function is being called on a path where the
+    existing slash characters are to be preserved.
+
+    Python 3.7 updates from using RFC 2396 to RFC 3986 to quote URL strings.
+    Now, "~" is included in the set of unreserved characters.

    string and safe may be either str or bytes objects. encoding and errors
    must not be specified if string is a bytes object.
@@ -893,7 +957,14 @@ def urlencode(query, doseq=False, safe='', encoding=None, errors=None,
                        l.append(k + '=' + elt)
    return '&'.join(l)

+
 def to_bytes(url):
+    warnings.warn("urllib.parse.to_bytes() is deprecated as of 3.8",
+                  DeprecationWarning, stacklevel=2)
+    return _to_bytes(url)
+
+
+def _to_bytes(url):
    """to_bytes(u"URL") --> 'URL'."""
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
@@ -906,16 +977,29 @@ def to_bytes(url):
                               " contains non-ASCII characters")
    return url

+
 def unwrap(url):
-    """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
+    """Transform a string like '<URL:scheme://host/path>' into 'scheme://host/path'.
+
+    The string is returned unchanged if it's not a wrapped URL.
+    """
    url = str(url).strip()
    if url[:1] == '<' and url[-1:] == '>':
        url = url[1:-1].strip()
-    if url[:4] == 'URL:': url = url[4:].strip()
+    if url[:4] == 'URL:':
+        url = url[4:].strip()
    return url

-_typeprog = None
+
 def splittype(url):
+    warnings.warn("urllib.parse.splittype() is deprecated as of 3.8, "
+                  "use urllib.parse.urlparse() instead",
+                  DeprecationWarning, stacklevel=2)
+    return _splittype(url)
+
+
+_typeprog = None
+def _splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
    global _typeprog
    if _typeprog is None:
@@ -927,12 +1011,20 @@ def splittype(url):
        return scheme.lower(), data
    return None, url

-_hostprog = None
+
 def splithost(url):
+    warnings.warn("urllib.parse.splithost() is deprecated as of 3.8, "
+                  "use urllib.parse.urlparse() instead",
+                  DeprecationWarning, stacklevel=2)
+    return _splithost(url)
+
+
+_hostprog = None
+def _splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
    global _hostprog
    if _hostprog is None:
-        _hostprog = re.compile('//([^/?]*)(.*)', re.DOTALL)
+        _hostprog = re.compile('//([^/#?]*)(.*)', re.DOTALL)

    match = _hostprog.match(url)
    if match:
@@ -942,32 +1034,64 @@ def splithost(url):
        return host_port, path
    return None, url

+
 def splituser(host):
+    warnings.warn("urllib.parse.splituser() is deprecated as of 3.8, "
+                  "use urllib.parse.urlparse() instead",
+                  DeprecationWarning, stacklevel=2)
+    return _splituser(host)
+
+
+def _splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
    user, delim, host = host.rpartition('@')
    return (user if delim else None), host

+
 def splitpasswd(user):
+    warnings.warn("urllib.parse.splitpasswd() is deprecated as of 3.8, "
+                  "use urllib.parse.urlparse() instead",
+                  DeprecationWarning, stacklevel=2)
+    return _splitpasswd(user)
+
+
+def _splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'."""
    user, delim, passwd = user.partition(':')
    return user, (passwd if delim else None)

+
+def splitport(host):
+    warnings.warn("urllib.parse.splitport() is deprecated as of 3.8, "
+                  "use urllib.parse.urlparse() instead",
+                  DeprecationWarning, stacklevel=2)
+    return _splitport(host)
+
+
 # splittag('/path#tag') --> '/path', 'tag'
 _portprog = None
-def splitport(host):
+def _splitport(host):
    """splitport('host:port') --> 'host', 'port'."""
    global _portprog
    if _portprog is None:
-        _portprog = re.compile('(.*):([0-9]*)$', re.DOTALL)
+        _portprog = re.compile('(.*):([0-9]*)', re.DOTALL)

-    match = _portprog.match(host)
+    match = _portprog.fullmatch(host)
    if match:
        host, port = match.groups()
        if port:
            return host, port
    return host, None

+
 def splitnport(host, defport=-1):
+    warnings.warn("urllib.parse.splitnport() is deprecated as of 3.8, "
+                  "use urllib.parse.urlparse() instead",
+                  DeprecationWarning, stacklevel=2)
+    return _splitnport(host, defport)
+
+
+def _splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number are found after ':'.
@@ -983,27 +1107,59 @@ def splitnport(host, defport=-1):
        return host, nport
    return host, defport

+
 def splitquery(url):
+    warnings.warn("urllib.parse.splitquery() is deprecated as of 3.8, "
+                  "use urllib.parse.urlparse() instead",
+                  DeprecationWarning, stacklevel=2)
+    return _splitquery(url)
+
+
+def _splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'."""
    path, delim, query = url.rpartition('?')
    if delim:
        return path, query
    return url, None

+
 def splittag(url):
+    warnings.warn("urllib.parse.splittag() is deprecated as of 3.8, "
+                  "use urllib.parse.urlparse() instead",
+                  DeprecationWarning, stacklevel=2)
+    return _splittag(url)
+
+
+def _splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'."""
    path, delim, tag = url.rpartition('#')
    if delim:
        return path, tag
    return url, None

+
 def splitattr(url):
+    warnings.warn("urllib.parse.splitattr() is deprecated as of 3.8, "
+                  "use urllib.parse.urlparse() instead",
+                  DeprecationWarning, stacklevel=2)
+    return _splitattr(url)
+
+
+def _splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...]."""
    words = url.split(';')
    return words[0], words[1:]

+
 def splitvalue(attr):
+    warnings.warn("urllib.parse.splitvalue() is deprecated as of 3.8, "
+                  "use urllib.parse.parse_qsl() instead",
+                  DeprecationWarning, stacklevel=2)
+    return _splitvalue(attr)
+
+
+def _splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'."""
    attr, delim, value = attr.partition('=')
    return attr, (value if delim else None)
--- a/Lib/urllib/request.py
+++ b/Lib/urllib/request.py
@@ -94,7 +94,6 @@ import socket
 import string
 import sys
 import time
-import collections
 import tempfile
 import contextlib
 import warnings
@@ -103,8 +102,8 @@ import warnings
 from urllib.error import URLError, HTTPError, ContentTooShortError
 from urllib.parse import (
    urlparse, urlsplit, urljoin, unwrap, quote, unquote,
-    splittype, splithost, splitport, splituser, splitpasswd,
-    splitattr, splitquery, splitvalue, splittag, to_bytes,
+    _splittype, _splithost, _splitport, _splituser, _splitpasswd,
+    _splitattr, _splitquery, _splitvalue, _splittag, _to_bytes,
    unquote_to_bytes, urlunparse)
 from urllib.response import addinfourl, addclosehook

@@ -199,7 +198,7 @@ def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
    global _opener
    if cafile or capath or cadefault:
        import warnings
-        warnings.warn("cafile, cpath and cadefault are deprecated, use a "
+        warnings.warn("cafile, capath and cadefault are deprecated, use a "
                      "custom context instead.", DeprecationWarning, 2)
        if context is not None:
            raise ValueError(
@@ -243,7 +242,7 @@ def urlretrieve(url, filename=None, reporthook=None, data=None):
    Returns a tuple containing the path to the newly created
    data file as well as the resulting HTTPMessage object.
    """
-    url_type, path = splittype(url)
+    url_type, path = _splittype(url)

    with contextlib.closing(urlopen(url, data)) as fp:
        headers = fp.info()
@@ -351,7 +350,7 @@ class Request:
    def full_url(self, url):
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self._full_url = unwrap(url)
-        self._full_url, self.fragment = splittag(self._full_url)
+        self._full_url, self.fragment = _splittag(self._full_url)
        self._parse()

    @full_url.deleter
@@ -379,10 +378,10 @@ class Request:
        self.data = None

    def _parse(self):
-        self.type, rest = splittype(self._full_url)
+        self.type, rest = _splittype(self._full_url)
        if self.type is None:
            raise ValueError("unknown url type: %r" % self.full_url)
-        self.host, self.selector = splithost(rest)
+        self.host, self.selector = _splithost(rest)
        if self.host:
            self.host = unquote(self.host)

@@ -427,8 +426,7 @@ class Request:
        self.unredirected_hdrs.pop(header_name, None)

    def header_items(self):
-        hdrs = self.unredirected_hdrs.copy()
-        hdrs.update(self.headers)
+        hdrs = {**self.unredirected_hdrs, **self.headers}
        return list(hdrs.items())

 class OpenerDirector:
@@ -523,6 +521,7 @@ class OpenerDirector:
            meth = getattr(processor, meth_name)
            req = meth(req)

+        sys.audit('urllib.Request', req.full_url, req.data, req.headers, req.get_method())
        response = self._open(req, data)

        # post-process response
@@ -684,8 +683,8 @@ class HTTPRedirectHandler(BaseHandler):
        newurl = newurl.replace(' ', '%20')

        CONTENT_HEADERS = ("content-length", "content-type")
-        newheaders = dict((k, v) for k, v in req.headers.items()
-                          if k.lower() not in CONTENT_HEADERS)
+        newheaders = {k: v for k, v in req.headers.items()
+                      if k.lower() not in CONTENT_HEADERS}
        return Request(newurl,
                       headers=newheaders,
                       origin_req_host=req.origin_req_host,
@@ -769,7 +768,7 @@ def _parse_proxy(proxy):
    According to RFC 3986, having an authority component means the URL must
    have two slashes after the scheme.
    """
-    scheme, r_scheme = splittype(proxy)
+    scheme, r_scheme = _splittype(proxy)
    if not r_scheme.startswith("/"):
        # authority
        scheme = None
@@ -784,9 +783,9 @@ def _parse_proxy(proxy):
        if end == -1:
            end = None
        authority = r_scheme[2:end]
-    userinfo, hostport = splituser(authority)
+    userinfo, hostport = _splituser(authority)
    if userinfo is not None:
-        user, password = splitpasswd(userinfo)
+        user, password = _splitpasswd(userinfo)
    else:
        user = password = None
    return scheme, user, password, hostport
@@ -801,6 +800,7 @@ class ProxyHandler(BaseHandler):
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        for type, url in proxies.items():
+            type = type.lower()
            setattr(self, '%s_open' % type,
                    lambda r, proxy=url, type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
@@ -846,7 +846,7 @@ class HTTPPasswordMgr:
            self.passwd[realm] = {}
        for default_port in True, False:
            reduced_uri = tuple(
-                [self.reduce_uri(u, default_port) for u in uri])
+                self.reduce_uri(u, default_port) for u in uri)
            self.passwd[realm][reduced_uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
@@ -873,7 +873,7 @@ class HTTPPasswordMgr:
            scheme = None
            authority = uri
            path = '/'
-        host, port = splitport(authority)
+        host, port = _splitport(authority)
        if default_port and port is None and scheme is not None:
            dport = {"http": 80,
                     "https": 443,
@@ -945,8 +945,15 @@ class AbstractBasicAuthHandler:

    # allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild)
-    rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+'
-                    'realm=(["\']?)([^"\']*)\\2', re.I)
+    rx = re.compile('(?:^|,)'   # start of the string or ','
+                    '[ \t]*'    # optional whitespaces
+                    '([^ \t]+)' # scheme like "Basic"
+                    '[ \t]+'    # mandatory whitespaces
+                    # realm=xxx
+                    # realm='xxx'
+                    # realm="xxx"
+                    'realm=(["\']?)([^"\']*)\\2',
+                    re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
@@ -958,27 +965,51 @@ class AbstractBasicAuthHandler:
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password

+    def _parse_realm(self, header):
+        # parse WWW-Authenticate header: accept multiple challenges per header
+        found_challenge = False
+        for mo in AbstractBasicAuthHandler.rx.finditer(header):
+            scheme, quote, realm = mo.groups()
+            if quote not in ['"', "'"]:
+                warnings.warn("Basic Auth Realm was unquoted",
+                              UserWarning, 3)
+
+            yield (scheme, realm)
+
+            found_challenge = True
+
+        if not found_challenge:
+            if header:
+                scheme = header.split()[0]
+            else:
+                scheme = ''
+            yield (scheme, None)
+
    def http_error_auth_reqed(self, authreq, host, req, headers):
        # host may be an authority (without userinfo) or a URL with an
        # authority
-        # XXX could be multiple headers
-        authreq = headers.get(authreq, None)
+        headers = headers.get_all(authreq)
+        if not headers:
+            # no header found
+            return

-        if authreq:
-            scheme = authreq.split()[0]
-            if scheme.lower() != 'basic':
-                raise ValueError("AbstractBasicAuthHandler does not"
-                                 " support the following scheme: '%s'" %
-                                 scheme)
-            else:
-                mo = AbstractBasicAuthHandler.rx.search(authreq)
-                if mo:
-                    scheme, quote, realm = mo.groups()
-                    if quote not in ['"',"'"]:
-                        warnings.warn("Basic Auth Realm was unquoted",
-                                      UserWarning, 2)
-                    if scheme.lower() == 'basic':
-                        return self.retry_http_basic_auth(host, req, realm)
+        unsupported = None
+        for header in headers:
+            for scheme, realm in self._parse_realm(header):
+                if scheme.lower() != 'basic':
+                    unsupported = scheme
+                    continue
+
+                if realm is not None:
+                    # Use the first matching Basic challenge.
+                    # Ignore following challenges even if they use the Basic
+                    # scheme.
+                    return self.retry_http_basic_auth(host, req, realm)
+
+        if unsupported is not None:
+            raise ValueError("AbstractBasicAuthHandler does not "
+                             "support the following scheme: %r"
+                             % (scheme,))

    def retry_http_basic_auth(self, host, req, realm):
        user, pw = self.passwd.find_user_password(realm, host)
@@ -1144,7 +1175,11 @@ class AbstractDigestAuthHandler:
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.selector)
-        if qop == 'auth':
+        # NOTE: As per RFC 2617, when the server sends "auth,auth-int", the client may respond
+        #     with either `auth` or `auth-int`; we always respond with `auth`.
+        if qop is None:
+            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
+        elif 'auth' in qop.split(','):
            if nonce == self.last_nonce:
                self.nonce_count += 1
            else:
@@ -1152,10 +1187,8 @@ class AbstractDigestAuthHandler:
                self.last_nonce = nonce
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
-            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
+            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, 'auth', H(A2))
            respdig = KD(H(A1), noncebit)
-        elif qop is None:
-            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        else:
            # XXX handle auth-int.
            raise URLError("qop '%s' is not supported." % qop)
@@ -1262,8 +1295,8 @@ class AbstractHTTPHandler(BaseHandler):

        sel_host = host
        if request.has_proxy():
-            scheme, sel = splittype(request.selector)
-            sel_host, sel_path = splithost(sel)
+            scheme, sel = _splittype(request.selector)
+            sel_host, sel_path = _splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host)
        for name, value in self.parent.addheaders:
@@ -1287,8 +1320,8 @@ class AbstractHTTPHandler(BaseHandler):
        h.set_debuglevel(self._debuglevel)

        headers = dict(req.unredirected_hdrs)
-        headers.update(dict((k, v) for k, v in req.headers.items()
-                            if k not in headers))
+        headers.update({k: v for k, v in req.headers.items()
+                        if k not in headers})

        # TODO(jhylton): Should this be redesigned to handle
        # persistent connections?
@@ -1300,7 +1333,7 @@ class AbstractHTTPHandler(BaseHandler):
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
-        headers = dict((name.title(), val) for name, val in headers.items())
+        headers = {name.title(): val for name, val in headers.items()}

        if req._tunnel_host:
            tunnel_headers = {}
@@ -1479,7 +1512,7 @@ class FileHandler(BaseHandler):
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', size, modified))
            if host:
-                host, port = splitport(host)
+                host, port = _splitport(host)
            if not host or \
                (not port and _safe_gethostbyname(host) in self.get_names()):
                if host:
@@ -1488,7 +1521,6 @@ class FileHandler(BaseHandler):
                    origurl = 'file://' + filename
                return addinfourl(open(localfile, 'rb'), headers, origurl)
        except OSError as exp:
-            # users shouldn't expect OSErrors coming from urlopen()
            raise URLError(exp)
        raise URLError('file not on local host')

@@ -1505,16 +1537,16 @@ class FTPHandler(BaseHandler):
        host = req.host
        if not host:
            raise URLError('ftp error: no host given')
-        host, port = splitport(host)
+        host, port = _splitport(host)
        if port is None:
            port = ftplib.FTP_PORT
        else:
            port = int(port)

        # username/password handling
-        user, host = splituser(host)
+        user, host = _splituser(host)
        if user:
-            user, passwd = splitpasswd(user)
+            user, passwd = _splitpasswd(user)
        else:
            passwd = None
        host = unquote(host)
@@ -1525,7 +1557,7 @@ class FTPHandler(BaseHandler):
            host = socket.gethostbyname(host)
        except OSError as msg:
            raise URLError(msg)
-        path, attrs = splitattr(req.selector)
+        path, attrs = _splitattr(req.selector)
        dirs = path.split('/')
        dirs = list(map(unquote, dirs))
        dirs, file = dirs[:-1], dirs[-1]
@@ -1535,7 +1567,7 @@ class FTPHandler(BaseHandler):
            fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
            type = file and 'I' or 'D'
            for attr in attrs:
-                attr, value = splitvalue(attr)
+                attr, value = _splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
@@ -1658,14 +1690,10 @@ else:
        of the 'file' scheme; not recommended for general use."""
        return quote(pathname)

-# This really consists of two pieces:
-# (1) a class which handles opening of all sorts of URLs
-#     (plus assorted utilities etc.)
-# (2) a set of functions for parsing URLs
-# XXX Should these be separated out into different modules?
-

 ftpcache = {}
+
+
 class URLopener:
    """Class to open URLs.
    This is a class rather than just a subroutine because we may need
@@ -1733,26 +1761,26 @@ class URLopener:
    # External interface
    def open(self, fullurl, data=None):
        """Use URLopener().open(file) instead of open(file, 'r')."""
-        fullurl = unwrap(to_bytes(fullurl))
+        fullurl = unwrap(_to_bytes(fullurl))
        fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|")
        if self.tempcache and fullurl in self.tempcache:
            filename, headers = self.tempcache[fullurl]
            fp = open(filename, 'rb')
            return addinfourl(fp, headers, fullurl)
-        urltype, url = splittype(fullurl)
+        urltype, url = _splittype(fullurl)
        if not urltype:
            urltype = 'file'
        if urltype in self.proxies:
            proxy = self.proxies[urltype]
-            urltype, proxyhost = splittype(proxy)
-            host, selector = splithost(proxyhost)
+            urltype, proxyhost = _splittype(proxy)
+            host, selector = _splithost(proxyhost)
            url = (host, fullurl) # Signal special case to open_*()
        else:
            proxy = None
        name = 'open_' + urltype
        self.type = urltype
        name = name.replace('-', '_')
-        if not hasattr(self, name):
+        if not hasattr(self, name) or name == 'open_local_file':
            if proxy:
                return self.open_unknown_proxy(proxy, fullurl, data)
            else:
@@ -1769,28 +1797,28 @@ class URLopener:

    def open_unknown(self, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
-        type, url = splittype(fullurl)
+        type, url = _splittype(fullurl)
        raise OSError('url error', 'unknown url type', type)

    def open_unknown_proxy(self, proxy, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
-        type, url = splittype(fullurl)
+        type, url = _splittype(fullurl)
        raise OSError('url error', 'invalid proxy for %s' % type, proxy)

    # External interface
    def retrieve(self, url, filename=None, reporthook=None, data=None):
        """retrieve(url) returns (filename, headers) for a local object
        or (tempfilename, headers) for a remote object."""
-        url = unwrap(to_bytes(url))
+        url = unwrap(_to_bytes(url))
        if self.tempcache and url in self.tempcache:
            return self.tempcache[url]
-        type, url1 = splittype(url)
+        type, url1 = _splittype(url)
        if filename is None and (not type or type == 'file'):
            try:
                fp = self.open_local_file(url1)
                hdrs = fp.info()
                fp.close()
-                return url2pathname(splithost(url1)[1]), hdrs
+                return url2pathname(_splithost(url1)[1]), hdrs
            except OSError as msg:
                pass
        fp = self.open(url, data)
@@ -1799,11 +1827,10 @@ class URLopener:
            if filename:
                tfp = open(filename, 'wb')
            else:
-                import tempfile
-                garbage, path = splittype(url)
-                garbage, path = splithost(path or "")
-                path, garbage = splitquery(path or "")
-                path, garbage = splitattr(path or "")
+                garbage, path = _splittype(url)
+                garbage, path = _splithost(path or "")
+                path, garbage = _splitquery(path or "")
+                path, garbage = _splitattr(path or "")
                suffix = os.path.splitext(path)[1]
                (fd, filename) = tempfile.mkstemp(suffix)
                self.__tempfiles.append(filename)
@@ -1860,25 +1887,25 @@ class URLopener:
        user_passwd = None
        proxy_passwd= None
        if isinstance(url, str):
-            host, selector = splithost(url)
+            host, selector = _splithost(url)
            if host:
-                user_passwd, host = splituser(host)
+                user_passwd, host = _splituser(host)
                host = unquote(host)
            realhost = host
        else:
            host, selector = url
            # check whether the proxy contains authorization information
-            proxy_passwd, host = splituser(host)
+            proxy_passwd, host = _splituser(host)
            # now we proceed with the url we want to obtain
-            urltype, rest = splittype(selector)
+            urltype, rest = _splittype(selector)
            url = rest
            user_passwd = None
            if urltype.lower() != 'http':
                realhost = None
            else:
-                realhost, rest = splithost(rest)
+                realhost, rest = _splithost(rest)
                if realhost:
-                    user_passwd, realhost = splituser(realhost)
+                    user_passwd, realhost = _splituser(realhost)
                if user_passwd:
                    selector = "%s://%s%s" % (urltype, realhost, rest)
                if proxy_bypass(realhost):
@@ -1984,7 +2011,7 @@ class URLopener:
        """Use local file."""
        import email.utils
        import mimetypes
-        host, file = splithost(url)
+        host, file = _splithost(url)
        localname = url2pathname(file)
        try:
            stats = os.stat(localname)
@@ -2001,7 +2028,7 @@ class URLopener:
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'), headers, urlfile)
-        host, port = splitport(host)
+        host, port = _splitport(host)
        if (not port
           and socket.gethostbyname(host) in ((localhost(),) + thishost())):
            urlfile = file
@@ -2017,11 +2044,11 @@ class URLopener:
        if not isinstance(url, str):
            raise URLError('ftp error: proxy support for ftp protocol currently not implemented')
        import mimetypes
-        host, path = splithost(url)
+        host, path = _splithost(url)
        if not host: raise URLError('ftp error: no host given')
-        host, port = splitport(host)
-        user, host = splituser(host)
-        if user: user, passwd = splitpasswd(user)
+        host, port = _splitport(host)
+        user, host = _splituser(host)
+        if user: user, passwd = _splitpasswd(user)
        else: passwd = None
        host = unquote(host)
        user = unquote(user or '')
@@ -2032,7 +2059,7 @@ class URLopener:
            port = ftplib.FTP_PORT
        else:
            port = int(port)
-        path, attrs = splitattr(path)
+        path, attrs = _splitattr(path)
        path = unquote(path)
        dirs = path.split('/')
        dirs, file = dirs[:-1], dirs[-1]
@@ -2054,7 +2081,7 @@ class URLopener:
            if not file: type = 'D'
            else: type = 'I'
            for attr in attrs:
-                attr, value = splitvalue(attr)
+                attr, value = _splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
@@ -2237,11 +2264,11 @@ class FancyURLopener(URLopener):
            return getattr(self,name)(url, realm, data)

    def retry_proxy_http_basic_auth(self, url, realm, data=None):
-        host, selector = splithost(url)
+        host, selector = _splithost(url)
        newurl = 'http://' + host + selector
        proxy = self.proxies['http']
-        urltype, proxyhost = splittype(proxy)
-        proxyhost, proxyselector = splithost(proxyhost)
+        urltype, proxyhost = _splittype(proxy)
+        proxyhost, proxyselector = _splithost(proxyhost)
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
@@ -2255,11 +2282,11 @@ class FancyURLopener(URLopener):
            return self.open(newurl, data)

    def retry_proxy_https_basic_auth(self, url, realm, data=None):
-        host, selector = splithost(url)
+        host, selector = _splithost(url)
        newurl = 'https://' + host + selector
        proxy = self.proxies['https']
-        urltype, proxyhost = splittype(proxy)
-        proxyhost, proxyselector = splithost(proxyhost)
+        urltype, proxyhost = _splittype(proxy)
+        proxyhost, proxyselector = _splithost(proxyhost)
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
@@ -2273,7 +2300,7 @@ class FancyURLopener(URLopener):
            return self.open(newurl, data)

    def retry_http_basic_auth(self, url, realm, data=None):
-        host, selector = splithost(url)
+        host, selector = _splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
@@ -2287,7 +2314,7 @@ class FancyURLopener(URLopener):
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
-        host, selector = splithost(url)
+        host, selector = _splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
@@ -2504,23 +2531,26 @@ def proxy_bypass_environment(host, proxies=None):
    try:
        no_proxy = proxies['no']
    except KeyError:
-        return 0
+        return False
    # '*' is special case for always bypass
    if no_proxy == '*':
-        return 1
+        return True
+    host = host.lower()
    # strip port off host
-    hostonly, port = splitport(host)
+    hostonly, port = _splitport(host)
    # check if the host ends with any of the DNS suffixes
-    no_proxy_list = [proxy.strip() for proxy in no_proxy.split(',')]
-    for name in no_proxy_list:
+    for name in no_proxy.split(','):
+        name = name.strip()
        if name:
-            name = re.escape(name)
-            pattern = r'(.+\.)?%s$' % name
-            if (re.match(pattern, hostonly, re.I)
-                    or re.match(pattern, host, re.I)):
-                return 1
+            name = name.lstrip('.')  # ignore leading dots
+            name = name.lower()
+            if hostonly == name or host == name:
+                return True
+            name = '.' + name
+            if hostonly.endswith(name) or host.endswith(name):
+                return True
    # otherwise, don't bypass
-    return 0
+    return False


 # This code tests an OSX specific data structure but is testable on all
@@ -2539,7 +2569,7 @@ def _proxy_bypass_macosx_sysconf(host, proxy_settings):
    """
    from fnmatch import fnmatch

-    hostonly, port = splitport(host)
+    hostonly, port = _splitport(host)

    def ip2num(ipAddr):
        parts = ipAddr.split('.')
@@ -2646,7 +2676,7 @@ elif os.name == 'nt':
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        # See if address has a type:// prefix
-                        if not re.match('^([^/:]+)://', address):
+                        if not re.match('(?:[^/:]+)://', address):
                            address = '%s://%s' % (protocol, address)
                        proxies[protocol] = address
                else:
@@ -2693,7 +2723,7 @@ elif os.name == 'nt':
        if not proxyEnable or not proxyOverride:
            return 0
        # try to make a host list from name and IP address.
-        rawHost, port = splitport(host)
+        rawHost, port = _splitport(host)
        host = [rawHost]
        try:
            addr = socket.gethostbyname(rawHost)
--- a/Lib/urllib/robotparser.py
+++ b/Lib/urllib/robotparser.py
@@ -16,6 +16,9 @@ import urllib.request

 __all__ = ["RobotFileParser"]

+RequestRate = collections.namedtuple("RequestRate", "requests seconds")
+
+
 class RobotFileParser:
    """ This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.
@@ -24,6 +27,7 @@ class RobotFileParser:

    def __init__(self, url=''):
        self.entries = []
+        self.sitemaps = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
@@ -136,12 +140,14 @@ class RobotFileParser:
                        # check if all values are sane
                        if (len(numbers) == 2 and numbers[0].strip().isdigit()
                            and numbers[1].strip().isdigit()):
-                            req_rate = collections.namedtuple('req_rate',
-                                                              'requests seconds')
-                            entry.req_rate = req_rate
-                            entry.req_rate.requests = int(numbers[0])
-                            entry.req_rate.seconds = int(numbers[1])
+                            entry.req_rate = RequestRate(int(numbers[0]), int(numbers[1]))
                        state = 2
+                elif line[0] == "sitemap":
+                    # According to http://www.sitemaps.org/protocol.html
+                    # "This directive is independent of the user-agent line,
+                    #  so it doesn't matter where you place it in your file."
+                    # Therefore we do not change the state of the parser.
+                    self.sitemaps.append(line[1])
        if state == 2:
            self._add_entry(entry)

@@ -180,7 +186,9 @@ class RobotFileParser:
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.delay
-        return self.default_entry.delay
+        if self.default_entry:
+            return self.default_entry.delay
+        return None

    def request_rate(self, useragent):
        if not self.mtime():
@@ -188,10 +196,20 @@ class RobotFileParser:
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.req_rate
-        return self.default_entry.req_rate
+        if self.default_entry:
+            return self.default_entry.req_rate
+        return None
+
+    def site_maps(self):
+        if not self.sitemaps:
+            return None
+        return self.sitemaps

    def __str__(self):
-        return ''.join([str(entry) + "\n" for entry in self.entries])
+        entries = self.entries
+        if self.default_entry is not None:
+            entries = entries + [self.default_entry]
+        return '\n\n'.join(map(str, entries))


 class RuleLine:
@@ -223,10 +241,14 @@ class Entry:
    def __str__(self):
        ret = []
        for agent in self.useragents:
-            ret.extend(["User-agent: ", agent, "\n"])
-        for line in self.rulelines:
-            ret.extend([str(line), "\n"])
-        return ''.join(ret)
+            ret.append(f"User-agent: {agent}")
+        if self.delay is not None:
+            ret.append(f"Crawl-delay: {self.delay}")
+        if self.req_rate is not None:
+            rate = self.req_rate
+            ret.append(f"Request-rate: {rate.requests}/{rate.seconds}")
+        ret.extend(map(str, self.rulelines))
+        return '\n'.join(ret)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""