Copy updated urllib from CPython 3.8.3.

This commit is contained in:
Ben Lewis
2020-07-19 13:53:04 +12:00
parent 9b48296fd7
commit cfc7a7c734
4 changed files with 386 additions and 182 deletions

View File

@@ -16,14 +16,10 @@ import urllib.response
__all__ = ['URLError', 'HTTPError', 'ContentTooShortError']
# do these error classes make sense?
# make sure all of the OSError stuff is overridden. we just want to be
# subtypes.
class URLError(OSError):
# URLError is a sub-type of OSError, but it doesn't share any of
# the implementation. need to override __init__ and __str__.
# It sets self.args for compatibility with other EnvironmentError
# It sets self.args for compatibility with other OSError
# subclasses, but args doesn't have the typical format with errno in
# slot 0 and strerror in slot 1. This may be better than nothing.
def __init__(self, reason, filename=None):

View File

@@ -30,6 +30,7 @@ test_urlparse.py provides a good indicator of parsing behavior.
import re
import sys
import collections
import warnings
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
"urlsplit", "urlunsplit", "urlencode", "parse_qs",
@@ -38,29 +39,37 @@ __all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
"DefragResult", "ParseResult", "SplitResult",
"DefragResultBytes", "ParseResultBytes", "SplitResultBytes"]
# A classification of schemes ('' means apply by default)
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
# A classification of schemes.
# The empty string classifies URLs with no scheme specified,
# being the default value returned by “urlsplit” and “urlparse”.
uses_relative = ['', 'ftp', 'http', 'gopher', 'nntp', 'imap',
'wais', 'file', 'https', 'shttp', 'mms',
'prospero', 'rtsp', 'rtspu', '', 'sftp',
'prospero', 'rtsp', 'rtspu', 'sftp',
'svn', 'svn+ssh', 'ws', 'wss']
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
uses_netloc = ['', 'ftp', 'http', 'gopher', 'nntp', 'telnet',
'imap', 'wais', 'file', 'mms', 'https', 'shttp',
'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
'snews', 'prospero', 'rtsp', 'rtspu', 'rsync',
'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
'ws', 'wss']
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
uses_params = ['', 'ftp', 'hdl', 'prospero', 'http', 'imap',
'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
'mms', '', 'sftp', 'tel']
'mms', 'sftp', 'tel']
# These are not actually used anymore, but should stay for backwards
# compatibility. (They are undocumented, but have a public-looking name.)
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
uses_query = ['', 'http', 'wais', 'imap', 'https', 'shttp', 'mms',
'gopher', 'rtsp', 'rtspu', 'sip', 'sips']
uses_fragment = ['', 'ftp', 'hdl', 'http', 'gopher', 'news',
'nntp', 'wais', 'https', 'shttp', 'snews',
'file', 'prospero', '']
'file', 'prospero']
# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
@@ -147,16 +156,22 @@ class _NetlocResultMixinBase(object):
def hostname(self):
hostname = self._hostinfo[0]
if not hostname:
hostname = None
elif hostname is not None:
hostname = hostname.lower()
return hostname
return None
# Scoped IPv6 address may have zone info, which must not be lowercased
# like http://[fe80::822a:a8ff:fe49:470c%tESt]:1234/keys
separator = '%' if isinstance(hostname, str) else b'%'
hostname, percent, zone = hostname.partition(separator)
return hostname.lower() + percent + zone
@property
def port(self):
port = self._hostinfo[1]
if port is not None:
port = int(port, 10)
try:
port = int(port, 10)
except ValueError:
message = f'Port could not be cast to integer value as {port!r}'
raise ValueError(message) from None
if not ( 0 <= port <= 65535):
raise ValueError("Port out of range 0-65535")
return port
@@ -274,7 +289,7 @@ by reference to a primary resource and additional identifying information.
"""
_ParseResultBase.__doc__ = """
ParseResult(scheme, netloc, path, params, query, fragment)
ParseResult(scheme, netloc, path, params, query, fragment)
A 6-tuple that contains components of a parsed URL.
"""
@@ -381,6 +396,24 @@ def _splitnetloc(url, start=0):
delim = min(delim, wdelim) # use earliest delim position
return url[start:delim], url[delim:] # return (domain, rest)
def _checknetloc(netloc):
if not netloc or netloc.isascii():
return
# looking for characters like \u2100 that expand to 'a/c'
# IDNA uses NFKC equivalence, so normalize for this check
import unicodedata
n = netloc.replace('@', '') # ignore characters already included
n = n.replace(':', '') # but not the surrounding text
n = n.replace('#', '')
n = n.replace('?', '')
netloc2 = unicodedata.normalize('NFKC', n)
if n == netloc2:
return
for c in '/?#@:':
if c in netloc2:
raise ValueError("netloc '" + netloc + "' contains invalid " +
"characters under NFKC normalization")
def urlsplit(url, scheme='', allow_fragments=True):
"""Parse a URL into 5 components:
<scheme>://<netloc>/<path>?<query>#<fragment>
@@ -399,7 +432,6 @@ def urlsplit(url, scheme='', allow_fragments=True):
i = url.find(':')
if i > 0:
if url[:i] == 'http': # optimize the common case
scheme = url[:i].lower()
url = url[i+1:]
if url[:2] == '//':
netloc, url = _splitnetloc(url, 2)
@@ -410,7 +442,8 @@ def urlsplit(url, scheme='', allow_fragments=True):
url, fragment = url.split('#', 1)
if '?' in url:
url, query = url.split('?', 1)
v = SplitResult(scheme, netloc, url, query, fragment)
_checknetloc(netloc)
v = SplitResult('http', netloc, url, query, fragment)
_parse_cache[key] = v
return _coerce_result(v)
for c in url[:i]:
@@ -433,6 +466,7 @@ def urlsplit(url, scheme='', allow_fragments=True):
url, fragment = url.split('#', 1)
if '?' in url:
url, query = url.split('?', 1)
_checknetloc(netloc)
v = SplitResult(scheme, netloc, url, query, fragment)
_parse_cache[key] = v
return _coerce_result(v)
@@ -574,7 +608,7 @@ def unquote_to_bytes(string):
# if the function is never called
global _hextobyte
if _hextobyte is None:
_hextobyte = {(a + b).encode(): bytes([int(a + b, 16)])
_hextobyte = {(a + b).encode(): bytes.fromhex(a + b)
for a in _hexdig for b in _hexdig}
for item in bits[1:]:
try:
@@ -612,8 +646,9 @@ def unquote(string, encoding='utf-8', errors='replace'):
append(bits[i + 1])
return ''.join(res)
def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
encoding='utf-8', errors='replace'):
encoding='utf-8', errors='replace', max_num_fields=None):
"""Parse a query given as a string argument.
Arguments:
@@ -633,10 +668,16 @@ def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
encoding and errors: specify how to decode percent-encoded sequences
into Unicode characters, as accepted by the bytes.decode() method.
max_num_fields: int. If set, then throws a ValueError if there
are more than n fields read by parse_qsl().
Returns a dictionary.
"""
parsed_result = {}
pairs = parse_qsl(qs, keep_blank_values, strict_parsing,
encoding=encoding, errors=errors)
encoding=encoding, errors=errors,
max_num_fields=max_num_fields)
for name, value in pairs:
if name in parsed_result:
parsed_result[name].append(value)
@@ -644,30 +685,43 @@ def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
parsed_result[name] = [value]
return parsed_result
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
encoding='utf-8', errors='replace'):
encoding='utf-8', errors='replace', max_num_fields=None):
"""Parse a query given as a string argument.
Arguments:
Arguments:
qs: percent-encoded query string to be parsed
qs: percent-encoded query string to be parsed
keep_blank_values: flag indicating whether blank values in
percent-encoded queries should be treated as blank strings. A
true value indicates that blanks should be retained as blank
strings. The default false value indicates that blank values
are to be ignored and treated as if they were not included.
keep_blank_values: flag indicating whether blank values in
percent-encoded queries should be treated as blank strings.
A true value indicates that blanks should be retained as blank
strings. The default false value indicates that blank values
are to be ignored and treated as if they were not included.
strict_parsing: flag indicating what to do with parsing errors. If
false (the default), errors are silently ignored. If true,
errors raise a ValueError exception.
strict_parsing: flag indicating what to do with parsing errors. If
false (the default), errors are silently ignored. If true,
errors raise a ValueError exception.
encoding and errors: specify how to decode percent-encoded sequences
into Unicode characters, as accepted by the bytes.decode() method.
encoding and errors: specify how to decode percent-encoded sequences
into Unicode characters, as accepted by the bytes.decode() method.
Returns a list, as G-d intended.
max_num_fields: int. If set, then throws a ValueError
if there are more than n fields read by parse_qsl().
Returns a list, as G-d intended.
"""
qs, _coerce_result = _coerce_args(qs)
# If max_num_fields is defined then check that the number of fields
# is less than max_num_fields. This prevents a memory exhaustion DOS
# attack via post bodies with many fields.
if max_num_fields is not None:
num_fields = 1 + qs.count('&') + qs.count(';')
if max_num_fields < num_fields:
raise ValueError('Max number of fields exceeded')
pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
r = []
for name_value in pairs:
@@ -704,7 +758,7 @@ def unquote_plus(string, encoding='utf-8', errors='replace'):
_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
b'abcdefghijklmnopqrstuvwxyz'
b'0123456789'
b'_.-')
b'_.-~')
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
_safe_quoters = {}
@@ -734,22 +788,32 @@ def quote(string, safe='/', encoding=None, errors=None):
"""quote('abc def') -> 'abc%20def'
Each part of a URL, e.g. the path info, the query, etc., has a
different set of reserved characters that must be quoted.
different set of reserved characters that must be quoted. The
quote function offers a cautious (not minimal) way to quote a
string for most of these parts.
RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
the following reserved characters.
RFC 3986 Uniform Resource Identifier (URI): Generic Syntax lists
the following (un)reserved characters.
reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
"$" | ","
unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
reserved = gen-delims / sub-delims
gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
/ "*" / "+" / "," / ";" / "="
Each of these characters is reserved in some component of a URL,
Each of the reserved characters is reserved in some component of a URL,
but not necessarily in all of them.
By default, the quote function is intended for quoting the path
section of a URL. Thus, it will not encode '/'. This character
is reserved, but in typical usage the quote function is being
called on a path where the existing slash characters are used as
reserved characters.
The quote function %-escapes all characters that are neither in the
unreserved chars ("always safe") nor the additional chars set via the
safe arg.
The default for the safe arg is '/'. The character is reserved, but in
typical usage the quote function is being called on a path where the
existing slash characters are to be preserved.
Python 3.7 updates from using RFC 2396 to RFC 3986 to quote URL strings.
Now, "~" is included in the set of unreserved characters.
string and safe may be either str or bytes objects. encoding and errors
must not be specified if string is a bytes object.
@@ -893,7 +957,14 @@ def urlencode(query, doseq=False, safe='', encoding=None, errors=None,
l.append(k + '=' + elt)
return '&'.join(l)
def to_bytes(url):
warnings.warn("urllib.parse.to_bytes() is deprecated as of 3.8",
DeprecationWarning, stacklevel=2)
return _to_bytes(url)
def _to_bytes(url):
"""to_bytes(u"URL") --> 'URL'."""
# Most URL schemes require ASCII. If that changes, the conversion
# can be relaxed.
@@ -906,16 +977,29 @@ def to_bytes(url):
" contains non-ASCII characters")
return url
def unwrap(url):
"""unwrap('<URL:type://host/path>') --> 'type://host/path'."""
"""Transform a string like '<URL:scheme://host/path>' into 'scheme://host/path'.
The string is returned unchanged if it's not a wrapped URL.
"""
url = str(url).strip()
if url[:1] == '<' and url[-1:] == '>':
url = url[1:-1].strip()
if url[:4] == 'URL:': url = url[4:].strip()
if url[:4] == 'URL:':
url = url[4:].strip()
return url
_typeprog = None
def splittype(url):
warnings.warn("urllib.parse.splittype() is deprecated as of 3.8, "
"use urllib.parse.urlparse() instead",
DeprecationWarning, stacklevel=2)
return _splittype(url)
_typeprog = None
def _splittype(url):
"""splittype('type:opaquestring') --> 'type', 'opaquestring'."""
global _typeprog
if _typeprog is None:
@@ -927,12 +1011,20 @@ def splittype(url):
return scheme.lower(), data
return None, url
_hostprog = None
def splithost(url):
warnings.warn("urllib.parse.splithost() is deprecated as of 3.8, "
"use urllib.parse.urlparse() instead",
DeprecationWarning, stacklevel=2)
return _splithost(url)
_hostprog = None
def _splithost(url):
"""splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
global _hostprog
if _hostprog is None:
_hostprog = re.compile('//([^/?]*)(.*)', re.DOTALL)
_hostprog = re.compile('//([^/#?]*)(.*)', re.DOTALL)
match = _hostprog.match(url)
if match:
@@ -942,32 +1034,64 @@ def splithost(url):
return host_port, path
return None, url
def splituser(host):
warnings.warn("urllib.parse.splituser() is deprecated as of 3.8, "
"use urllib.parse.urlparse() instead",
DeprecationWarning, stacklevel=2)
return _splituser(host)
def _splituser(host):
"""splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
user, delim, host = host.rpartition('@')
return (user if delim else None), host
def splitpasswd(user):
warnings.warn("urllib.parse.splitpasswd() is deprecated as of 3.8, "
"use urllib.parse.urlparse() instead",
DeprecationWarning, stacklevel=2)
return _splitpasswd(user)
def _splitpasswd(user):
"""splitpasswd('user:passwd') -> 'user', 'passwd'."""
user, delim, passwd = user.partition(':')
return user, (passwd if delim else None)
def splitport(host):
warnings.warn("urllib.parse.splitport() is deprecated as of 3.8, "
"use urllib.parse.urlparse() instead",
DeprecationWarning, stacklevel=2)
return _splitport(host)
# splittag('/path#tag') --> '/path', 'tag'
_portprog = None
def splitport(host):
def _splitport(host):
"""splitport('host:port') --> 'host', 'port'."""
global _portprog
if _portprog is None:
_portprog = re.compile('(.*):([0-9]*)$', re.DOTALL)
_portprog = re.compile('(.*):([0-9]*)', re.DOTALL)
match = _portprog.match(host)
match = _portprog.fullmatch(host)
if match:
host, port = match.groups()
if port:
return host, port
return host, None
def splitnport(host, defport=-1):
warnings.warn("urllib.parse.splitnport() is deprecated as of 3.8, "
"use urllib.parse.urlparse() instead",
DeprecationWarning, stacklevel=2)
return _splitnport(host, defport)
def _splitnport(host, defport=-1):
"""Split host and port, returning numeric port.
Return given default port if no ':' found; defaults to -1.
Return numerical port if a valid number are found after ':'.
@@ -983,27 +1107,59 @@ def splitnport(host, defport=-1):
return host, nport
return host, defport
def splitquery(url):
warnings.warn("urllib.parse.splitquery() is deprecated as of 3.8, "
"use urllib.parse.urlparse() instead",
DeprecationWarning, stacklevel=2)
return _splitquery(url)
def _splitquery(url):
"""splitquery('/path?query') --> '/path', 'query'."""
path, delim, query = url.rpartition('?')
if delim:
return path, query
return url, None
def splittag(url):
warnings.warn("urllib.parse.splittag() is deprecated as of 3.8, "
"use urllib.parse.urlparse() instead",
DeprecationWarning, stacklevel=2)
return _splittag(url)
def _splittag(url):
"""splittag('/path#tag') --> '/path', 'tag'."""
path, delim, tag = url.rpartition('#')
if delim:
return path, tag
return url, None
def splitattr(url):
warnings.warn("urllib.parse.splitattr() is deprecated as of 3.8, "
"use urllib.parse.urlparse() instead",
DeprecationWarning, stacklevel=2)
return _splitattr(url)
def _splitattr(url):
"""splitattr('/path;attr1=value1;attr2=value2;...') ->
'/path', ['attr1=value1', 'attr2=value2', ...]."""
words = url.split(';')
return words[0], words[1:]
def splitvalue(attr):
warnings.warn("urllib.parse.splitvalue() is deprecated as of 3.8, "
"use urllib.parse.parse_qsl() instead",
DeprecationWarning, stacklevel=2)
return _splitvalue(attr)
def _splitvalue(attr):
"""splitvalue('attr=value') --> 'attr', 'value'."""
attr, delim, value = attr.partition('=')
return attr, (value if delim else None)

View File

@@ -94,7 +94,6 @@ import socket
import string
import sys
import time
import collections
import tempfile
import contextlib
import warnings
@@ -103,8 +102,8 @@ import warnings
from urllib.error import URLError, HTTPError, ContentTooShortError
from urllib.parse import (
urlparse, urlsplit, urljoin, unwrap, quote, unquote,
splittype, splithost, splitport, splituser, splitpasswd,
splitattr, splitquery, splitvalue, splittag, to_bytes,
_splittype, _splithost, _splitport, _splituser, _splitpasswd,
_splitattr, _splitquery, _splitvalue, _splittag, _to_bytes,
unquote_to_bytes, urlunparse)
from urllib.response import addinfourl, addclosehook
@@ -199,7 +198,7 @@ def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
global _opener
if cafile or capath or cadefault:
import warnings
warnings.warn("cafile, cpath and cadefault are deprecated, use a "
warnings.warn("cafile, capath and cadefault are deprecated, use a "
"custom context instead.", DeprecationWarning, 2)
if context is not None:
raise ValueError(
@@ -243,7 +242,7 @@ def urlretrieve(url, filename=None, reporthook=None, data=None):
Returns a tuple containing the path to the newly created
data file as well as the resulting HTTPMessage object.
"""
url_type, path = splittype(url)
url_type, path = _splittype(url)
with contextlib.closing(urlopen(url, data)) as fp:
headers = fp.info()
@@ -351,7 +350,7 @@ class Request:
def full_url(self, url):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
self._full_url = unwrap(url)
self._full_url, self.fragment = splittag(self._full_url)
self._full_url, self.fragment = _splittag(self._full_url)
self._parse()
@full_url.deleter
@@ -379,10 +378,10 @@ class Request:
self.data = None
def _parse(self):
self.type, rest = splittype(self._full_url)
self.type, rest = _splittype(self._full_url)
if self.type is None:
raise ValueError("unknown url type: %r" % self.full_url)
self.host, self.selector = splithost(rest)
self.host, self.selector = _splithost(rest)
if self.host:
self.host = unquote(self.host)
@@ -427,8 +426,7 @@ class Request:
self.unredirected_hdrs.pop(header_name, None)
def header_items(self):
hdrs = self.unredirected_hdrs.copy()
hdrs.update(self.headers)
hdrs = {**self.unredirected_hdrs, **self.headers}
return list(hdrs.items())
class OpenerDirector:
@@ -523,6 +521,7 @@ class OpenerDirector:
meth = getattr(processor, meth_name)
req = meth(req)
sys.audit('urllib.Request', req.full_url, req.data, req.headers, req.get_method())
response = self._open(req, data)
# post-process response
@@ -684,8 +683,8 @@ class HTTPRedirectHandler(BaseHandler):
newurl = newurl.replace(' ', '%20')
CONTENT_HEADERS = ("content-length", "content-type")
newheaders = dict((k, v) for k, v in req.headers.items()
if k.lower() not in CONTENT_HEADERS)
newheaders = {k: v for k, v in req.headers.items()
if k.lower() not in CONTENT_HEADERS}
return Request(newurl,
headers=newheaders,
origin_req_host=req.origin_req_host,
@@ -769,7 +768,7 @@ def _parse_proxy(proxy):
According to RFC 3986, having an authority component means the URL must
have two slashes after the scheme.
"""
scheme, r_scheme = splittype(proxy)
scheme, r_scheme = _splittype(proxy)
if not r_scheme.startswith("/"):
# authority
scheme = None
@@ -784,9 +783,9 @@ def _parse_proxy(proxy):
if end == -1:
end = None
authority = r_scheme[2:end]
userinfo, hostport = splituser(authority)
userinfo, hostport = _splituser(authority)
if userinfo is not None:
user, password = splitpasswd(userinfo)
user, password = _splitpasswd(userinfo)
else:
user = password = None
return scheme, user, password, hostport
@@ -801,6 +800,7 @@ class ProxyHandler(BaseHandler):
assert hasattr(proxies, 'keys'), "proxies must be a mapping"
self.proxies = proxies
for type, url in proxies.items():
type = type.lower()
setattr(self, '%s_open' % type,
lambda r, proxy=url, type=type, meth=self.proxy_open:
meth(r, proxy, type))
@@ -846,7 +846,7 @@ class HTTPPasswordMgr:
self.passwd[realm] = {}
for default_port in True, False:
reduced_uri = tuple(
[self.reduce_uri(u, default_port) for u in uri])
self.reduce_uri(u, default_port) for u in uri)
self.passwd[realm][reduced_uri] = (user, passwd)
def find_user_password(self, realm, authuri):
@@ -873,7 +873,7 @@ class HTTPPasswordMgr:
scheme = None
authority = uri
path = '/'
host, port = splitport(authority)
host, port = _splitport(authority)
if default_port and port is None and scheme is not None:
dport = {"http": 80,
"https": 443,
@@ -945,8 +945,15 @@ class AbstractBasicAuthHandler:
# allow for double- and single-quoted realm values
# (single quotes are a violation of the RFC, but appear in the wild)
rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+'
'realm=(["\']?)([^"\']*)\\2', re.I)
rx = re.compile('(?:^|,)' # start of the string or ','
'[ \t]*' # optional whitespaces
'([^ \t]+)' # scheme like "Basic"
'[ \t]+' # mandatory whitespaces
# realm=xxx
# realm='xxx'
# realm="xxx"
'realm=(["\']?)([^"\']*)\\2',
re.I)
# XXX could pre-emptively send auth info already accepted (RFC 2617,
# end of section 2, and section 1.2 immediately after "credentials"
@@ -958,27 +965,51 @@ class AbstractBasicAuthHandler:
self.passwd = password_mgr
self.add_password = self.passwd.add_password
def _parse_realm(self, header):
# parse WWW-Authenticate header: accept multiple challenges per header
found_challenge = False
for mo in AbstractBasicAuthHandler.rx.finditer(header):
scheme, quote, realm = mo.groups()
if quote not in ['"', "'"]:
warnings.warn("Basic Auth Realm was unquoted",
UserWarning, 3)
yield (scheme, realm)
found_challenge = True
if not found_challenge:
if header:
scheme = header.split()[0]
else:
scheme = ''
yield (scheme, None)
def http_error_auth_reqed(self, authreq, host, req, headers):
# host may be an authority (without userinfo) or a URL with an
# authority
# XXX could be multiple headers
authreq = headers.get(authreq, None)
headers = headers.get_all(authreq)
if not headers:
# no header found
return
if authreq:
scheme = authreq.split()[0]
if scheme.lower() != 'basic':
raise ValueError("AbstractBasicAuthHandler does not"
" support the following scheme: '%s'" %
scheme)
else:
mo = AbstractBasicAuthHandler.rx.search(authreq)
if mo:
scheme, quote, realm = mo.groups()
if quote not in ['"',"'"]:
warnings.warn("Basic Auth Realm was unquoted",
UserWarning, 2)
if scheme.lower() == 'basic':
return self.retry_http_basic_auth(host, req, realm)
unsupported = None
for header in headers:
for scheme, realm in self._parse_realm(header):
if scheme.lower() != 'basic':
unsupported = scheme
continue
if realm is not None:
# Use the first matching Basic challenge.
# Ignore following challenges even if they use the Basic
# scheme.
return self.retry_http_basic_auth(host, req, realm)
if unsupported is not None:
raise ValueError("AbstractBasicAuthHandler does not "
"support the following scheme: %r"
% (scheme,))
def retry_http_basic_auth(self, host, req, realm):
user, pw = self.passwd.find_user_password(realm, host)
@@ -1144,7 +1175,11 @@ class AbstractDigestAuthHandler:
A2 = "%s:%s" % (req.get_method(),
# XXX selector: what about proxies and full urls
req.selector)
if qop == 'auth':
# NOTE: As per RFC 2617, when server sends "auth,auth-int", the client could use either `auth`
# or `auth-int` to the response back. we use `auth` to send the response back.
if qop is None:
respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
elif 'auth' in qop.split(','):
if nonce == self.last_nonce:
self.nonce_count += 1
else:
@@ -1152,10 +1187,8 @@ class AbstractDigestAuthHandler:
self.last_nonce = nonce
ncvalue = '%08x' % self.nonce_count
cnonce = self.get_cnonce(nonce)
noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, 'auth', H(A2))
respdig = KD(H(A1), noncebit)
elif qop is None:
respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
else:
# XXX handle auth-int.
raise URLError("qop '%s' is not supported." % qop)
@@ -1262,8 +1295,8 @@ class AbstractHTTPHandler(BaseHandler):
sel_host = host
if request.has_proxy():
scheme, sel = splittype(request.selector)
sel_host, sel_path = splithost(sel)
scheme, sel = _splittype(request.selector)
sel_host, sel_path = _splithost(sel)
if not request.has_header('Host'):
request.add_unredirected_header('Host', sel_host)
for name, value in self.parent.addheaders:
@@ -1287,8 +1320,8 @@ class AbstractHTTPHandler(BaseHandler):
h.set_debuglevel(self._debuglevel)
headers = dict(req.unredirected_hdrs)
headers.update(dict((k, v) for k, v in req.headers.items()
if k not in headers))
headers.update({k: v for k, v in req.headers.items()
if k not in headers})
# TODO(jhylton): Should this be redesigned to handle
# persistent connections?
@@ -1300,7 +1333,7 @@ class AbstractHTTPHandler(BaseHandler):
# So make sure the connection gets closed after the (only)
# request.
headers["Connection"] = "close"
headers = dict((name.title(), val) for name, val in headers.items())
headers = {name.title(): val for name, val in headers.items()}
if req._tunnel_host:
tunnel_headers = {}
@@ -1479,7 +1512,7 @@ class FileHandler(BaseHandler):
'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
(mtype or 'text/plain', size, modified))
if host:
host, port = splitport(host)
host, port = _splitport(host)
if not host or \
(not port and _safe_gethostbyname(host) in self.get_names()):
if host:
@@ -1488,7 +1521,6 @@ class FileHandler(BaseHandler):
origurl = 'file://' + filename
return addinfourl(open(localfile, 'rb'), headers, origurl)
except OSError as exp:
# users shouldn't expect OSErrors coming from urlopen()
raise URLError(exp)
raise URLError('file not on local host')
@@ -1505,16 +1537,16 @@ class FTPHandler(BaseHandler):
host = req.host
if not host:
raise URLError('ftp error: no host given')
host, port = splitport(host)
host, port = _splitport(host)
if port is None:
port = ftplib.FTP_PORT
else:
port = int(port)
# username/password handling
user, host = splituser(host)
user, host = _splituser(host)
if user:
user, passwd = splitpasswd(user)
user, passwd = _splitpasswd(user)
else:
passwd = None
host = unquote(host)
@@ -1525,7 +1557,7 @@ class FTPHandler(BaseHandler):
host = socket.gethostbyname(host)
except OSError as msg:
raise URLError(msg)
path, attrs = splitattr(req.selector)
path, attrs = _splitattr(req.selector)
dirs = path.split('/')
dirs = list(map(unquote, dirs))
dirs, file = dirs[:-1], dirs[-1]
@@ -1535,7 +1567,7 @@ class FTPHandler(BaseHandler):
fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
type = file and 'I' or 'D'
for attr in attrs:
attr, value = splitvalue(attr)
attr, value = _splitvalue(attr)
if attr.lower() == 'type' and \
value in ('a', 'A', 'i', 'I', 'd', 'D'):
type = value.upper()
@@ -1658,14 +1690,10 @@ else:
of the 'file' scheme; not recommended for general use."""
return quote(pathname)
# This really consists of two pieces:
# (1) a class which handles opening of all sorts of URLs
# (plus assorted utilities etc.)
# (2) a set of functions for parsing URLs
# XXX Should these be separated out into different modules?
ftpcache = {}
class URLopener:
"""Class to open URLs.
This is a class rather than just a subroutine because we may need
@@ -1733,26 +1761,26 @@ class URLopener:
# External interface
def open(self, fullurl, data=None):
"""Use URLopener().open(file) instead of open(file, 'r')."""
fullurl = unwrap(to_bytes(fullurl))
fullurl = unwrap(_to_bytes(fullurl))
fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|")
if self.tempcache and fullurl in self.tempcache:
filename, headers = self.tempcache[fullurl]
fp = open(filename, 'rb')
return addinfourl(fp, headers, fullurl)
urltype, url = splittype(fullurl)
urltype, url = _splittype(fullurl)
if not urltype:
urltype = 'file'
if urltype in self.proxies:
proxy = self.proxies[urltype]
urltype, proxyhost = splittype(proxy)
host, selector = splithost(proxyhost)
urltype, proxyhost = _splittype(proxy)
host, selector = _splithost(proxyhost)
url = (host, fullurl) # Signal special case to open_*()
else:
proxy = None
name = 'open_' + urltype
self.type = urltype
name = name.replace('-', '_')
if not hasattr(self, name):
if not hasattr(self, name) or name == 'open_local_file':
if proxy:
return self.open_unknown_proxy(proxy, fullurl, data)
else:
@@ -1769,28 +1797,28 @@ class URLopener:
def open_unknown(self, fullurl, data=None):
"""Overridable interface to open unknown URL type."""
type, url = splittype(fullurl)
type, url = _splittype(fullurl)
raise OSError('url error', 'unknown url type', type)
def open_unknown_proxy(self, proxy, fullurl, data=None):
"""Overridable interface to open unknown URL type."""
type, url = splittype(fullurl)
type, url = _splittype(fullurl)
raise OSError('url error', 'invalid proxy for %s' % type, proxy)
# External interface
def retrieve(self, url, filename=None, reporthook=None, data=None):
"""retrieve(url) returns (filename, headers) for a local object
or (tempfilename, headers) for a remote object."""
url = unwrap(to_bytes(url))
url = unwrap(_to_bytes(url))
if self.tempcache and url in self.tempcache:
return self.tempcache[url]
type, url1 = splittype(url)
type, url1 = _splittype(url)
if filename is None and (not type or type == 'file'):
try:
fp = self.open_local_file(url1)
hdrs = fp.info()
fp.close()
return url2pathname(splithost(url1)[1]), hdrs
return url2pathname(_splithost(url1)[1]), hdrs
except OSError as msg:
pass
fp = self.open(url, data)
@@ -1799,11 +1827,10 @@ class URLopener:
if filename:
tfp = open(filename, 'wb')
else:
import tempfile
garbage, path = splittype(url)
garbage, path = splithost(path or "")
path, garbage = splitquery(path or "")
path, garbage = splitattr(path or "")
garbage, path = _splittype(url)
garbage, path = _splithost(path or "")
path, garbage = _splitquery(path or "")
path, garbage = _splitattr(path or "")
suffix = os.path.splitext(path)[1]
(fd, filename) = tempfile.mkstemp(suffix)
self.__tempfiles.append(filename)
@@ -1860,25 +1887,25 @@ class URLopener:
user_passwd = None
proxy_passwd= None
if isinstance(url, str):
host, selector = splithost(url)
host, selector = _splithost(url)
if host:
user_passwd, host = splituser(host)
user_passwd, host = _splituser(host)
host = unquote(host)
realhost = host
else:
host, selector = url
# check whether the proxy contains authorization information
proxy_passwd, host = splituser(host)
proxy_passwd, host = _splituser(host)
# now we proceed with the url we want to obtain
urltype, rest = splittype(selector)
urltype, rest = _splittype(selector)
url = rest
user_passwd = None
if urltype.lower() != 'http':
realhost = None
else:
realhost, rest = splithost(rest)
realhost, rest = _splithost(rest)
if realhost:
user_passwd, realhost = splituser(realhost)
user_passwd, realhost = _splituser(realhost)
if user_passwd:
selector = "%s://%s%s" % (urltype, realhost, rest)
if proxy_bypass(realhost):
@@ -1984,7 +2011,7 @@ class URLopener:
"""Use local file."""
import email.utils
import mimetypes
host, file = splithost(url)
host, file = _splithost(url)
localname = url2pathname(file)
try:
stats = os.stat(localname)
@@ -2001,7 +2028,7 @@ class URLopener:
if file[:1] == '/':
urlfile = 'file://' + file
return addinfourl(open(localname, 'rb'), headers, urlfile)
host, port = splitport(host)
host, port = _splitport(host)
if (not port
and socket.gethostbyname(host) in ((localhost(),) + thishost())):
urlfile = file
@@ -2017,11 +2044,11 @@ class URLopener:
if not isinstance(url, str):
raise URLError('ftp error: proxy support for ftp protocol currently not implemented')
import mimetypes
host, path = splithost(url)
host, path = _splithost(url)
if not host: raise URLError('ftp error: no host given')
host, port = splitport(host)
user, host = splituser(host)
if user: user, passwd = splitpasswd(user)
host, port = _splitport(host)
user, host = _splituser(host)
if user: user, passwd = _splitpasswd(user)
else: passwd = None
host = unquote(host)
user = unquote(user or '')
@@ -2032,7 +2059,7 @@ class URLopener:
port = ftplib.FTP_PORT
else:
port = int(port)
path, attrs = splitattr(path)
path, attrs = _splitattr(path)
path = unquote(path)
dirs = path.split('/')
dirs, file = dirs[:-1], dirs[-1]
@@ -2054,7 +2081,7 @@ class URLopener:
if not file: type = 'D'
else: type = 'I'
for attr in attrs:
attr, value = splitvalue(attr)
attr, value = _splitvalue(attr)
if attr.lower() == 'type' and \
value in ('a', 'A', 'i', 'I', 'd', 'D'):
type = value.upper()
@@ -2237,11 +2264,11 @@ class FancyURLopener(URLopener):
return getattr(self,name)(url, realm, data)
def retry_proxy_http_basic_auth(self, url, realm, data=None):
host, selector = splithost(url)
host, selector = _splithost(url)
newurl = 'http://' + host + selector
proxy = self.proxies['http']
urltype, proxyhost = splittype(proxy)
proxyhost, proxyselector = splithost(proxyhost)
urltype, proxyhost = _splittype(proxy)
proxyhost, proxyselector = _splithost(proxyhost)
i = proxyhost.find('@') + 1
proxyhost = proxyhost[i:]
user, passwd = self.get_user_passwd(proxyhost, realm, i)
@@ -2255,11 +2282,11 @@ class FancyURLopener(URLopener):
return self.open(newurl, data)
def retry_proxy_https_basic_auth(self, url, realm, data=None):
host, selector = splithost(url)
host, selector = _splithost(url)
newurl = 'https://' + host + selector
proxy = self.proxies['https']
urltype, proxyhost = splittype(proxy)
proxyhost, proxyselector = splithost(proxyhost)
urltype, proxyhost = _splittype(proxy)
proxyhost, proxyselector = _splithost(proxyhost)
i = proxyhost.find('@') + 1
proxyhost = proxyhost[i:]
user, passwd = self.get_user_passwd(proxyhost, realm, i)
@@ -2273,7 +2300,7 @@ class FancyURLopener(URLopener):
return self.open(newurl, data)
def retry_http_basic_auth(self, url, realm, data=None):
host, selector = splithost(url)
host, selector = _splithost(url)
i = host.find('@') + 1
host = host[i:]
user, passwd = self.get_user_passwd(host, realm, i)
@@ -2287,7 +2314,7 @@ class FancyURLopener(URLopener):
return self.open(newurl, data)
def retry_https_basic_auth(self, url, realm, data=None):
host, selector = splithost(url)
host, selector = _splithost(url)
i = host.find('@') + 1
host = host[i:]
user, passwd = self.get_user_passwd(host, realm, i)
@@ -2504,23 +2531,26 @@ def proxy_bypass_environment(host, proxies=None):
try:
no_proxy = proxies['no']
except KeyError:
return 0
return False
# '*' is special case for always bypass
if no_proxy == '*':
return 1
return True
host = host.lower()
# strip port off host
hostonly, port = splitport(host)
hostonly, port = _splitport(host)
# check if the host ends with any of the DNS suffixes
no_proxy_list = [proxy.strip() for proxy in no_proxy.split(',')]
for name in no_proxy_list:
for name in no_proxy.split(','):
name = name.strip()
if name:
name = re.escape(name)
pattern = r'(.+\.)?%s$' % name
if (re.match(pattern, hostonly, re.I)
or re.match(pattern, host, re.I)):
return 1
name = name.lstrip('.') # ignore leading dots
name = name.lower()
if hostonly == name or host == name:
return True
name = '.' + name
if hostonly.endswith(name) or host.endswith(name):
return True
# otherwise, don't bypass
return 0
return False
# This code tests an OSX specific data structure but is testable on all
@@ -2539,7 +2569,7 @@ def _proxy_bypass_macosx_sysconf(host, proxy_settings):
"""
from fnmatch import fnmatch
hostonly, port = splitport(host)
hostonly, port = _splitport(host)
def ip2num(ipAddr):
parts = ipAddr.split('.')
@@ -2646,7 +2676,7 @@ elif os.name == 'nt':
for p in proxyServer.split(';'):
protocol, address = p.split('=', 1)
# See if address has a type:// prefix
if not re.match('^([^/:]+)://', address):
if not re.match('(?:[^/:]+)://', address):
address = '%s://%s' % (protocol, address)
proxies[protocol] = address
else:
@@ -2693,7 +2723,7 @@ elif os.name == 'nt':
if not proxyEnable or not proxyOverride:
return 0
# try to make a host list from name and IP address.
rawHost, port = splitport(host)
rawHost, port = _splitport(host)
host = [rawHost]
try:
addr = socket.gethostbyname(rawHost)

View File

@@ -16,6 +16,9 @@ import urllib.request
__all__ = ["RobotFileParser"]
RequestRate = collections.namedtuple("RequestRate", "requests seconds")
class RobotFileParser:
""" This class provides a set of methods to read, parse and answer
questions about a single robots.txt file.
@@ -24,6 +27,7 @@ class RobotFileParser:
def __init__(self, url=''):
self.entries = []
self.sitemaps = []
self.default_entry = None
self.disallow_all = False
self.allow_all = False
@@ -136,12 +140,14 @@ class RobotFileParser:
# check if all values are sane
if (len(numbers) == 2 and numbers[0].strip().isdigit()
and numbers[1].strip().isdigit()):
req_rate = collections.namedtuple('req_rate',
'requests seconds')
entry.req_rate = req_rate
entry.req_rate.requests = int(numbers[0])
entry.req_rate.seconds = int(numbers[1])
entry.req_rate = RequestRate(int(numbers[0]), int(numbers[1]))
state = 2
elif line[0] == "sitemap":
# According to http://www.sitemaps.org/protocol.html
# "This directive is independent of the user-agent line,
# so it doesn't matter where you place it in your file."
# Therefore we do not change the state of the parser.
self.sitemaps.append(line[1])
if state == 2:
self._add_entry(entry)
@@ -180,7 +186,9 @@ class RobotFileParser:
for entry in self.entries:
if entry.applies_to(useragent):
return entry.delay
return self.default_entry.delay
if self.default_entry:
return self.default_entry.delay
return None
def request_rate(self, useragent):
if not self.mtime():
@@ -188,10 +196,20 @@ class RobotFileParser:
for entry in self.entries:
if entry.applies_to(useragent):
return entry.req_rate
return self.default_entry.req_rate
if self.default_entry:
return self.default_entry.req_rate
return None
def site_maps(self):
if not self.sitemaps:
return None
return self.sitemaps
def __str__(self):
return ''.join([str(entry) + "\n" for entry in self.entries])
entries = self.entries
if self.default_entry is not None:
entries = entries + [self.default_entry]
return '\n\n'.join(map(str, entries))
class RuleLine:
@@ -223,10 +241,14 @@ class Entry:
def __str__(self):
ret = []
for agent in self.useragents:
ret.extend(["User-agent: ", agent, "\n"])
for line in self.rulelines:
ret.extend([str(line), "\n"])
return ''.join(ret)
ret.append(f"User-agent: {agent}")
if self.delay is not None:
ret.append(f"Crawl-delay: {self.delay}")
if self.req_rate is not None:
rate = self.req_rate
ret.append(f"Request-rate: {rate.requests}/{rate.seconds}")
ret.extend(map(str, self.rulelines))
return '\n'.join(ret)
def applies_to(self, useragent):
"""check if this entry applies to the specified agent"""