forked from Rust-related/RustPython
Copy updated urllib from CPython 3.8.3.
This commit is contained in:
@@ -16,14 +16,10 @@ import urllib.response
|
||||
__all__ = ['URLError', 'HTTPError', 'ContentTooShortError']
|
||||
|
||||
|
||||
# do these error classes make sense?
|
||||
# make sure all of the OSError stuff is overridden. we just want to be
|
||||
# subtypes.
|
||||
|
||||
class URLError(OSError):
|
||||
# URLError is a sub-type of OSError, but it doesn't share any of
|
||||
# the implementation. need to override __init__ and __str__.
|
||||
# It sets self.args for compatibility with other EnvironmentError
|
||||
# It sets self.args for compatibility with other OSError
|
||||
# subclasses, but args doesn't have the typical format with errno in
|
||||
# slot 0 and strerror in slot 1. This may be better than nothing.
|
||||
def __init__(self, reason, filename=None):
|
||||
|
||||
@@ -30,6 +30,7 @@ test_urlparse.py provides a good indicator of parsing behavior.
|
||||
import re
|
||||
import sys
|
||||
import collections
|
||||
import warnings
|
||||
|
||||
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
|
||||
"urlsplit", "urlunsplit", "urlencode", "parse_qs",
|
||||
@@ -38,29 +39,37 @@ __all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
|
||||
"DefragResult", "ParseResult", "SplitResult",
|
||||
"DefragResultBytes", "ParseResultBytes", "SplitResultBytes"]
|
||||
|
||||
# A classification of schemes ('' means apply by default)
|
||||
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
|
||||
# A classification of schemes.
|
||||
# The empty string classifies URLs with no scheme specified,
|
||||
# being the default value returned by “urlsplit” and “urlparse”.
|
||||
|
||||
uses_relative = ['', 'ftp', 'http', 'gopher', 'nntp', 'imap',
|
||||
'wais', 'file', 'https', 'shttp', 'mms',
|
||||
'prospero', 'rtsp', 'rtspu', '', 'sftp',
|
||||
'prospero', 'rtsp', 'rtspu', 'sftp',
|
||||
'svn', 'svn+ssh', 'ws', 'wss']
|
||||
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
|
||||
|
||||
uses_netloc = ['', 'ftp', 'http', 'gopher', 'nntp', 'telnet',
|
||||
'imap', 'wais', 'file', 'mms', 'https', 'shttp',
|
||||
'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
|
||||
'snews', 'prospero', 'rtsp', 'rtspu', 'rsync',
|
||||
'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
|
||||
'ws', 'wss']
|
||||
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
|
||||
|
||||
uses_params = ['', 'ftp', 'hdl', 'prospero', 'http', 'imap',
|
||||
'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
|
||||
'mms', '', 'sftp', 'tel']
|
||||
'mms', 'sftp', 'tel']
|
||||
|
||||
# These are not actually used anymore, but should stay for backwards
|
||||
# compatibility. (They are undocumented, but have a public-looking name.)
|
||||
|
||||
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
|
||||
'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
|
||||
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
|
||||
'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
|
||||
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
|
||||
|
||||
uses_query = ['', 'http', 'wais', 'imap', 'https', 'shttp', 'mms',
|
||||
'gopher', 'rtsp', 'rtspu', 'sip', 'sips']
|
||||
|
||||
uses_fragment = ['', 'ftp', 'hdl', 'http', 'gopher', 'news',
|
||||
'nntp', 'wais', 'https', 'shttp', 'snews',
|
||||
'file', 'prospero', '']
|
||||
'file', 'prospero']
|
||||
|
||||
# Characters valid in scheme names
|
||||
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
|
||||
@@ -147,16 +156,22 @@ class _NetlocResultMixinBase(object):
|
||||
def hostname(self):
|
||||
hostname = self._hostinfo[0]
|
||||
if not hostname:
|
||||
hostname = None
|
||||
elif hostname is not None:
|
||||
hostname = hostname.lower()
|
||||
return hostname
|
||||
return None
|
||||
# Scoped IPv6 address may have zone info, which must not be lowercased
|
||||
# like http://[fe80::822a:a8ff:fe49:470c%tESt]:1234/keys
|
||||
separator = '%' if isinstance(hostname, str) else b'%'
|
||||
hostname, percent, zone = hostname.partition(separator)
|
||||
return hostname.lower() + percent + zone
|
||||
|
||||
@property
|
||||
def port(self):
|
||||
port = self._hostinfo[1]
|
||||
if port is not None:
|
||||
port = int(port, 10)
|
||||
try:
|
||||
port = int(port, 10)
|
||||
except ValueError:
|
||||
message = f'Port could not be cast to integer value as {port!r}'
|
||||
raise ValueError(message) from None
|
||||
if not ( 0 <= port <= 65535):
|
||||
raise ValueError("Port out of range 0-65535")
|
||||
return port
|
||||
@@ -274,7 +289,7 @@ by reference to a primary resource and additional identifying information.
|
||||
"""
|
||||
|
||||
_ParseResultBase.__doc__ = """
|
||||
ParseResult(scheme, netloc, path, params, query, fragment)
|
||||
ParseResult(scheme, netloc, path, params, query, fragment)
|
||||
|
||||
A 6-tuple that contains components of a parsed URL.
|
||||
"""
|
||||
@@ -381,6 +396,24 @@ def _splitnetloc(url, start=0):
|
||||
delim = min(delim, wdelim) # use earliest delim position
|
||||
return url[start:delim], url[delim:] # return (domain, rest)
|
||||
|
||||
def _checknetloc(netloc):
    """Reject netlocs that gain URL delimiters under NFKC normalization.

    Returns None for empty or pure-ASCII input, and whenever normalization
    leaves the netloc (with its existing '@', ':', '#' and '?' removed)
    unchanged or free of delimiters.  Raises ValueError if the normalized
    form contains any of the characters '/?#@:'.
    """
    if not netloc or netloc.isascii():
        return
    # looking for characters like \u2100 that expand to 'a/c'
    # IDNA uses NFKC equivalence, so normalize for this check
    import unicodedata
    # Ignore delimiters that are already present, so that only characters
    # introduced by the normalization itself are examined.
    stripped = netloc
    for delimiter in ('@', ':', '#', '?'):
        stripped = stripped.replace(delimiter, '')
    normalized = unicodedata.normalize('NFKC', stripped)
    if stripped == normalized:
        return
    if any(c in normalized for c in '/?#@:'):
        raise ValueError("netloc '" + netloc + "' contains invalid " +
                         "characters under NFKC normalization")
|
||||
|
||||
def urlsplit(url, scheme='', allow_fragments=True):
|
||||
"""Parse a URL into 5 components:
|
||||
<scheme>://<netloc>/<path>?<query>#<fragment>
|
||||
@@ -399,7 +432,6 @@ def urlsplit(url, scheme='', allow_fragments=True):
|
||||
i = url.find(':')
|
||||
if i > 0:
|
||||
if url[:i] == 'http': # optimize the common case
|
||||
scheme = url[:i].lower()
|
||||
url = url[i+1:]
|
||||
if url[:2] == '//':
|
||||
netloc, url = _splitnetloc(url, 2)
|
||||
@@ -410,7 +442,8 @@ def urlsplit(url, scheme='', allow_fragments=True):
|
||||
url, fragment = url.split('#', 1)
|
||||
if '?' in url:
|
||||
url, query = url.split('?', 1)
|
||||
v = SplitResult(scheme, netloc, url, query, fragment)
|
||||
_checknetloc(netloc)
|
||||
v = SplitResult('http', netloc, url, query, fragment)
|
||||
_parse_cache[key] = v
|
||||
return _coerce_result(v)
|
||||
for c in url[:i]:
|
||||
@@ -433,6 +466,7 @@ def urlsplit(url, scheme='', allow_fragments=True):
|
||||
url, fragment = url.split('#', 1)
|
||||
if '?' in url:
|
||||
url, query = url.split('?', 1)
|
||||
_checknetloc(netloc)
|
||||
v = SplitResult(scheme, netloc, url, query, fragment)
|
||||
_parse_cache[key] = v
|
||||
return _coerce_result(v)
|
||||
@@ -574,7 +608,7 @@ def unquote_to_bytes(string):
|
||||
# if the function is never called
|
||||
global _hextobyte
|
||||
if _hextobyte is None:
|
||||
_hextobyte = {(a + b).encode(): bytes([int(a + b, 16)])
|
||||
_hextobyte = {(a + b).encode(): bytes.fromhex(a + b)
|
||||
for a in _hexdig for b in _hexdig}
|
||||
for item in bits[1:]:
|
||||
try:
|
||||
@@ -612,8 +646,9 @@ def unquote(string, encoding='utf-8', errors='replace'):
|
||||
append(bits[i + 1])
|
||||
return ''.join(res)
|
||||
|
||||
|
||||
def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
|
||||
encoding='utf-8', errors='replace'):
|
||||
encoding='utf-8', errors='replace', max_num_fields=None):
|
||||
"""Parse a query given as a string argument.
|
||||
|
||||
Arguments:
|
||||
@@ -633,10 +668,16 @@ def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
|
||||
|
||||
encoding and errors: specify how to decode percent-encoded sequences
|
||||
into Unicode characters, as accepted by the bytes.decode() method.
|
||||
|
||||
max_num_fields: int. If set, then throws a ValueError if there
|
||||
are more than n fields read by parse_qsl().
|
||||
|
||||
Returns a dictionary.
|
||||
"""
|
||||
parsed_result = {}
|
||||
pairs = parse_qsl(qs, keep_blank_values, strict_parsing,
|
||||
encoding=encoding, errors=errors)
|
||||
encoding=encoding, errors=errors,
|
||||
max_num_fields=max_num_fields)
|
||||
for name, value in pairs:
|
||||
if name in parsed_result:
|
||||
parsed_result[name].append(value)
|
||||
@@ -644,30 +685,43 @@ def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
|
||||
parsed_result[name] = [value]
|
||||
return parsed_result
|
||||
|
||||
|
||||
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
|
||||
encoding='utf-8', errors='replace'):
|
||||
encoding='utf-8', errors='replace', max_num_fields=None):
|
||||
"""Parse a query given as a string argument.
|
||||
|
||||
Arguments:
|
||||
Arguments:
|
||||
|
||||
qs: percent-encoded query string to be parsed
|
||||
qs: percent-encoded query string to be parsed
|
||||
|
||||
keep_blank_values: flag indicating whether blank values in
|
||||
percent-encoded queries should be treated as blank strings. A
|
||||
true value indicates that blanks should be retained as blank
|
||||
strings. The default false value indicates that blank values
|
||||
are to be ignored and treated as if they were not included.
|
||||
keep_blank_values: flag indicating whether blank values in
|
||||
percent-encoded queries should be treated as blank strings.
|
||||
A true value indicates that blanks should be retained as blank
|
||||
strings. The default false value indicates that blank values
|
||||
are to be ignored and treated as if they were not included.
|
||||
|
||||
strict_parsing: flag indicating what to do with parsing errors. If
|
||||
false (the default), errors are silently ignored. If true,
|
||||
errors raise a ValueError exception.
|
||||
strict_parsing: flag indicating what to do with parsing errors. If
|
||||
false (the default), errors are silently ignored. If true,
|
||||
errors raise a ValueError exception.
|
||||
|
||||
encoding and errors: specify how to decode percent-encoded sequences
|
||||
into Unicode characters, as accepted by the bytes.decode() method.
|
||||
encoding and errors: specify how to decode percent-encoded sequences
|
||||
into Unicode characters, as accepted by the bytes.decode() method.
|
||||
|
||||
Returns a list, as G-d intended.
|
||||
max_num_fields: int. If set, then throws a ValueError
|
||||
if there are more than n fields read by parse_qsl().
|
||||
|
||||
Returns a list, as G-d intended.
|
||||
"""
|
||||
qs, _coerce_result = _coerce_args(qs)
|
||||
|
||||
# If max_num_fields is defined then check that the number of fields
|
||||
# is less than max_num_fields. This prevents a memory exhaustion DOS
|
||||
# attack via post bodies with many fields.
|
||||
if max_num_fields is not None:
|
||||
num_fields = 1 + qs.count('&') + qs.count(';')
|
||||
if max_num_fields < num_fields:
|
||||
raise ValueError('Max number of fields exceeded')
|
||||
|
||||
pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
|
||||
r = []
|
||||
for name_value in pairs:
|
||||
@@ -704,7 +758,7 @@ def unquote_plus(string, encoding='utf-8', errors='replace'):
|
||||
_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
|
||||
b'abcdefghijklmnopqrstuvwxyz'
|
||||
b'0123456789'
|
||||
b'_.-')
|
||||
b'_.-~')
|
||||
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
|
||||
_safe_quoters = {}
|
||||
|
||||
@@ -734,22 +788,32 @@ def quote(string, safe='/', encoding=None, errors=None):
|
||||
"""quote('abc def') -> 'abc%20def'
|
||||
|
||||
Each part of a URL, e.g. the path info, the query, etc., has a
|
||||
different set of reserved characters that must be quoted.
|
||||
different set of reserved characters that must be quoted. The
|
||||
quote function offers a cautious (not minimal) way to quote a
|
||||
string for most of these parts.
|
||||
|
||||
RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
|
||||
the following reserved characters.
|
||||
RFC 3986 Uniform Resource Identifier (URI): Generic Syntax lists
|
||||
the following (un)reserved characters.
|
||||
|
||||
reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
|
||||
"$" | ","
|
||||
unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
|
||||
reserved = gen-delims / sub-delims
|
||||
gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
|
||||
sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
|
||||
/ "*" / "+" / "," / ";" / "="
|
||||
|
||||
Each of these characters is reserved in some component of a URL,
|
||||
Each of the reserved characters is reserved in some component of a URL,
|
||||
but not necessarily in all of them.
|
||||
|
||||
By default, the quote function is intended for quoting the path
|
||||
section of a URL. Thus, it will not encode '/'. This character
|
||||
is reserved, but in typical usage the quote function is being
|
||||
called on a path where the existing slash characters are used as
|
||||
reserved characters.
|
||||
The quote function %-escapes all characters that are neither in the
|
||||
unreserved chars ("always safe") nor the additional chars set via the
|
||||
safe arg.
|
||||
|
||||
The default for the safe arg is '/'. The character is reserved, but in
|
||||
typical usage the quote function is being called on a path where the
|
||||
existing slash characters are to be preserved.
|
||||
|
||||
Python 3.7 updates from using RFC 2396 to RFC 3986 to quote URL strings.
|
||||
Now, "~" is included in the set of unreserved characters.
|
||||
|
||||
string and safe may be either str or bytes objects. encoding and errors
|
||||
must not be specified if string is a bytes object.
|
||||
@@ -893,7 +957,14 @@ def urlencode(query, doseq=False, safe='', encoding=None, errors=None,
|
||||
l.append(k + '=' + elt)
|
||||
return '&'.join(l)
|
||||
|
||||
|
||||
def to_bytes(url):
    """Deprecated public alias of _to_bytes().

    Issues a DeprecationWarning attributed to the caller (stacklevel=2) and
    returns whatever _to_bytes(url) returns.
    """
    message = "urllib.parse.to_bytes() is deprecated as of 3.8"
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _to_bytes(url)
|
||||
|
||||
|
||||
def _to_bytes(url):
|
||||
"""to_bytes(u"URL") --> 'URL'."""
|
||||
# Most URL schemes require ASCII. If that changes, the conversion
|
||||
# can be relaxed.
|
||||
@@ -906,16 +977,29 @@ def to_bytes(url):
|
||||
" contains non-ASCII characters")
|
||||
return url
|
||||
|
||||
|
||||
def unwrap(url):
|
||||
"""unwrap('<URL:type://host/path>') --> 'type://host/path'."""
|
||||
"""Transform a string like '<URL:scheme://host/path>' into 'scheme://host/path'.
|
||||
|
||||
The string is returned unchanged if it's not a wrapped URL.
|
||||
"""
|
||||
url = str(url).strip()
|
||||
if url[:1] == '<' and url[-1:] == '>':
|
||||
url = url[1:-1].strip()
|
||||
if url[:4] == 'URL:': url = url[4:].strip()
|
||||
if url[:4] == 'URL:':
|
||||
url = url[4:].strip()
|
||||
return url
|
||||
|
||||
_typeprog = None
|
||||
|
||||
def splittype(url):
    """Deprecated public alias of _splittype().

    Issues a DeprecationWarning attributed to the caller (stacklevel=2) and
    returns whatever _splittype(url) returns.
    """
    message = ("urllib.parse.splittype() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splittype(url)
|
||||
|
||||
|
||||
_typeprog = None
|
||||
def _splittype(url):
|
||||
"""splittype('type:opaquestring') --> 'type', 'opaquestring'."""
|
||||
global _typeprog
|
||||
if _typeprog is None:
|
||||
@@ -927,12 +1011,20 @@ def splittype(url):
|
||||
return scheme.lower(), data
|
||||
return None, url
|
||||
|
||||
_hostprog = None
|
||||
|
||||
def splithost(url):
    """Deprecated public alias of _splithost().

    Issues a DeprecationWarning attributed to the caller (stacklevel=2) and
    returns whatever _splithost(url) returns.
    """
    message = ("urllib.parse.splithost() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splithost(url)
|
||||
|
||||
|
||||
_hostprog = None
|
||||
def _splithost(url):
|
||||
"""splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
|
||||
global _hostprog
|
||||
if _hostprog is None:
|
||||
_hostprog = re.compile('//([^/?]*)(.*)', re.DOTALL)
|
||||
_hostprog = re.compile('//([^/#?]*)(.*)', re.DOTALL)
|
||||
|
||||
match = _hostprog.match(url)
|
||||
if match:
|
||||
@@ -942,32 +1034,64 @@ def splithost(url):
|
||||
return host_port, path
|
||||
return None, url
|
||||
|
||||
|
||||
def splituser(host):
    """Deprecated public alias of _splituser().

    Issues a DeprecationWarning attributed to the caller (stacklevel=2) and
    returns whatever _splituser(host) returns.
    """
    message = ("urllib.parse.splituser() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splituser(host)


def _splituser(host):
    """Split 'user[:passwd]@host[:port]' into ('user[:passwd]', 'host[:port]').

    The split happens at the last '@'.  The user part is None when the
    input contains no '@'.
    """
    userinfo, at_sign, hostport = host.rpartition('@')
    if not at_sign:
        userinfo = None
    return userinfo, hostport
|
||||
|
||||
|
||||
def splitpasswd(user):
    """Deprecated public alias of _splitpasswd().

    Issues a DeprecationWarning attributed to the caller (stacklevel=2) and
    returns whatever _splitpasswd(user) returns.
    """
    message = ("urllib.parse.splitpasswd() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splitpasswd(user)


def _splitpasswd(user):
    """Split 'user:passwd' at the first ':' into ('user', 'passwd').

    The password is None when the input contains no ':'.
    """
    name, colon, secret = user.partition(':')
    if colon:
        return name, secret
    return name, None
|
||||
|
||||
|
||||
def splitport(host):
    """Deprecated public alias of _splitport().

    Issues a DeprecationWarning attributed to the caller (stacklevel=2) and
    returns whatever _splitport(host) returns.
    """
    message = ("urllib.parse.splitport() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splitport(host)
|
||||
|
||||
|
||||
# Lazily compiled by _splitport(); matches 'host:port' (the port may be empty).
|
||||
_portprog = None
|
||||
def splitport(host):
|
||||
def _splitport(host):
|
||||
"""splitport('host:port') --> 'host', 'port'."""
|
||||
global _portprog
|
||||
if _portprog is None:
|
||||
_portprog = re.compile('(.*):([0-9]*)$', re.DOTALL)
|
||||
_portprog = re.compile('(.*):([0-9]*)', re.DOTALL)
|
||||
|
||||
match = _portprog.match(host)
|
||||
match = _portprog.fullmatch(host)
|
||||
if match:
|
||||
host, port = match.groups()
|
||||
if port:
|
||||
return host, port
|
||||
return host, None
|
||||
|
||||
|
||||
def splitnport(host, defport=-1):
    """Deprecated public alias of _splitnport().

    Issues a DeprecationWarning attributed to the caller (stacklevel=2) and
    returns whatever _splitnport(host, defport) returns.
    """
    message = ("urllib.parse.splitnport() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splitnport(host, defport)
|
||||
|
||||
|
||||
def _splitnport(host, defport=-1):
|
||||
"""Split host and port, returning numeric port.
|
||||
Return given default port if no ':' found; defaults to -1.
|
||||
Return numerical port if a valid number is found after ':'.
|
||||
@@ -983,27 +1107,59 @@ def splitnport(host, defport=-1):
|
||||
return host, nport
|
||||
return host, defport
|
||||
|
||||
|
||||
def splitquery(url):
    """Deprecated public alias of _splitquery().

    Issues a DeprecationWarning attributed to the caller (stacklevel=2) and
    returns whatever _splitquery(url) returns.
    """
    message = ("urllib.parse.splitquery() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splitquery(url)


def _splitquery(url):
    """Split '/path?query' at the last '?' into ('/path', 'query').

    Returns (url, None) when the input contains no '?'.
    """
    head, question_mark, query = url.rpartition('?')
    if not question_mark:
        return url, None
    return head, query
|
||||
|
||||
|
||||
def splittag(url):
    """Deprecated public alias of _splittag().

    Issues a DeprecationWarning attributed to the caller (stacklevel=2) and
    returns whatever _splittag(url) returns.
    """
    message = ("urllib.parse.splittag() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splittag(url)


def _splittag(url):
    """Split '/path#tag' at the last '#' into ('/path', 'tag').

    Returns (url, None) when the input contains no '#'.
    """
    head, hash_mark, fragment = url.rpartition('#')
    if not hash_mark:
        return url, None
    return head, fragment
|
||||
|
||||
|
||||
def splitattr(url):
    """Deprecated public alias of _splitattr().

    Issues a DeprecationWarning attributed to the caller (stacklevel=2) and
    returns whatever _splitattr(url) returns.
    """
    message = ("urllib.parse.splitattr() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splitattr(url)


def _splitattr(url):
    """Split '/path;attr1=value1;attr2=value2;...' on ';'.

    Returns ('/path', ['attr1=value1', 'attr2=value2', ...]); the list is
    empty when the input contains no ';'.
    """
    path, *attributes = url.split(';')
    return path, attributes
|
||||
|
||||
|
||||
def splitvalue(attr):
    """Deprecated public alias of _splitvalue().

    Issues a DeprecationWarning attributed to the caller (stacklevel=2) and
    returns whatever _splitvalue(attr) returns.
    """
    message = ("urllib.parse.splitvalue() is deprecated as of 3.8, "
               "use urllib.parse.parse_qsl() instead")
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splitvalue(attr)


def _splitvalue(attr):
    """Split 'attr=value' at the first '=' into ('attr', 'value').

    The value is None when the input contains no '='.
    """
    key, equals, value = attr.partition('=')
    if equals:
        return key, value
    return key, None
|
||||
|
||||
@@ -94,7 +94,6 @@ import socket
|
||||
import string
|
||||
import sys
|
||||
import time
|
||||
import collections
|
||||
import tempfile
|
||||
import contextlib
|
||||
import warnings
|
||||
@@ -103,8 +102,8 @@ import warnings
|
||||
from urllib.error import URLError, HTTPError, ContentTooShortError
|
||||
from urllib.parse import (
|
||||
urlparse, urlsplit, urljoin, unwrap, quote, unquote,
|
||||
splittype, splithost, splitport, splituser, splitpasswd,
|
||||
splitattr, splitquery, splitvalue, splittag, to_bytes,
|
||||
_splittype, _splithost, _splitport, _splituser, _splitpasswd,
|
||||
_splitattr, _splitquery, _splitvalue, _splittag, _to_bytes,
|
||||
unquote_to_bytes, urlunparse)
|
||||
from urllib.response import addinfourl, addclosehook
|
||||
|
||||
@@ -199,7 +198,7 @@ def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
|
||||
global _opener
|
||||
if cafile or capath or cadefault:
|
||||
import warnings
|
||||
warnings.warn("cafile, cpath and cadefault are deprecated, use a "
|
||||
warnings.warn("cafile, capath and cadefault are deprecated, use a "
|
||||
"custom context instead.", DeprecationWarning, 2)
|
||||
if context is not None:
|
||||
raise ValueError(
|
||||
@@ -243,7 +242,7 @@ def urlretrieve(url, filename=None, reporthook=None, data=None):
|
||||
Returns a tuple containing the path to the newly created
|
||||
data file as well as the resulting HTTPMessage object.
|
||||
"""
|
||||
url_type, path = splittype(url)
|
||||
url_type, path = _splittype(url)
|
||||
|
||||
with contextlib.closing(urlopen(url, data)) as fp:
|
||||
headers = fp.info()
|
||||
@@ -351,7 +350,7 @@ class Request:
|
||||
def full_url(self, url):
|
||||
# unwrap('<URL:type://host/path>') --> 'type://host/path'
|
||||
self._full_url = unwrap(url)
|
||||
self._full_url, self.fragment = splittag(self._full_url)
|
||||
self._full_url, self.fragment = _splittag(self._full_url)
|
||||
self._parse()
|
||||
|
||||
@full_url.deleter
|
||||
@@ -379,10 +378,10 @@ class Request:
|
||||
self.data = None
|
||||
|
||||
def _parse(self):
|
||||
self.type, rest = splittype(self._full_url)
|
||||
self.type, rest = _splittype(self._full_url)
|
||||
if self.type is None:
|
||||
raise ValueError("unknown url type: %r" % self.full_url)
|
||||
self.host, self.selector = splithost(rest)
|
||||
self.host, self.selector = _splithost(rest)
|
||||
if self.host:
|
||||
self.host = unquote(self.host)
|
||||
|
||||
@@ -427,8 +426,7 @@ class Request:
|
||||
self.unredirected_hdrs.pop(header_name, None)
|
||||
|
||||
def header_items(self):
|
||||
hdrs = self.unredirected_hdrs.copy()
|
||||
hdrs.update(self.headers)
|
||||
hdrs = {**self.unredirected_hdrs, **self.headers}
|
||||
return list(hdrs.items())
|
||||
|
||||
class OpenerDirector:
|
||||
@@ -523,6 +521,7 @@ class OpenerDirector:
|
||||
meth = getattr(processor, meth_name)
|
||||
req = meth(req)
|
||||
|
||||
sys.audit('urllib.Request', req.full_url, req.data, req.headers, req.get_method())
|
||||
response = self._open(req, data)
|
||||
|
||||
# post-process response
|
||||
@@ -684,8 +683,8 @@ class HTTPRedirectHandler(BaseHandler):
|
||||
newurl = newurl.replace(' ', '%20')
|
||||
|
||||
CONTENT_HEADERS = ("content-length", "content-type")
|
||||
newheaders = dict((k, v) for k, v in req.headers.items()
|
||||
if k.lower() not in CONTENT_HEADERS)
|
||||
newheaders = {k: v for k, v in req.headers.items()
|
||||
if k.lower() not in CONTENT_HEADERS}
|
||||
return Request(newurl,
|
||||
headers=newheaders,
|
||||
origin_req_host=req.origin_req_host,
|
||||
@@ -769,7 +768,7 @@ def _parse_proxy(proxy):
|
||||
According to RFC 3986, having an authority component means the URL must
|
||||
have two slashes after the scheme.
|
||||
"""
|
||||
scheme, r_scheme = splittype(proxy)
|
||||
scheme, r_scheme = _splittype(proxy)
|
||||
if not r_scheme.startswith("/"):
|
||||
# authority
|
||||
scheme = None
|
||||
@@ -784,9 +783,9 @@ def _parse_proxy(proxy):
|
||||
if end == -1:
|
||||
end = None
|
||||
authority = r_scheme[2:end]
|
||||
userinfo, hostport = splituser(authority)
|
||||
userinfo, hostport = _splituser(authority)
|
||||
if userinfo is not None:
|
||||
user, password = splitpasswd(userinfo)
|
||||
user, password = _splitpasswd(userinfo)
|
||||
else:
|
||||
user = password = None
|
||||
return scheme, user, password, hostport
|
||||
@@ -801,6 +800,7 @@ class ProxyHandler(BaseHandler):
|
||||
assert hasattr(proxies, 'keys'), "proxies must be a mapping"
|
||||
self.proxies = proxies
|
||||
for type, url in proxies.items():
|
||||
type = type.lower()
|
||||
setattr(self, '%s_open' % type,
|
||||
lambda r, proxy=url, type=type, meth=self.proxy_open:
|
||||
meth(r, proxy, type))
|
||||
@@ -846,7 +846,7 @@ class HTTPPasswordMgr:
|
||||
self.passwd[realm] = {}
|
||||
for default_port in True, False:
|
||||
reduced_uri = tuple(
|
||||
[self.reduce_uri(u, default_port) for u in uri])
|
||||
self.reduce_uri(u, default_port) for u in uri)
|
||||
self.passwd[realm][reduced_uri] = (user, passwd)
|
||||
|
||||
def find_user_password(self, realm, authuri):
|
||||
@@ -873,7 +873,7 @@ class HTTPPasswordMgr:
|
||||
scheme = None
|
||||
authority = uri
|
||||
path = '/'
|
||||
host, port = splitport(authority)
|
||||
host, port = _splitport(authority)
|
||||
if default_port and port is None and scheme is not None:
|
||||
dport = {"http": 80,
|
||||
"https": 443,
|
||||
@@ -945,8 +945,15 @@ class AbstractBasicAuthHandler:
|
||||
|
||||
# allow for double- and single-quoted realm values
|
||||
# (single quotes are a violation of the RFC, but appear in the wild)
|
||||
rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+'
|
||||
'realm=(["\']?)([^"\']*)\\2', re.I)
|
||||
rx = re.compile('(?:^|,)' # start of the string or ','
|
||||
'[ \t]*' # optional whitespaces
|
||||
'([^ \t]+)' # scheme like "Basic"
|
||||
'[ \t]+' # mandatory whitespaces
|
||||
# realm=xxx
|
||||
# realm='xxx'
|
||||
# realm="xxx"
|
||||
'realm=(["\']?)([^"\']*)\\2',
|
||||
re.I)
|
||||
|
||||
# XXX could pre-emptively send auth info already accepted (RFC 2617,
|
||||
# end of section 2, and section 1.2 immediately after "credentials"
|
||||
@@ -958,27 +965,51 @@ class AbstractBasicAuthHandler:
|
||||
self.passwd = password_mgr
|
||||
self.add_password = self.passwd.add_password
|
||||
|
||||
def _parse_realm(self, header):
    """Yield (scheme, realm) for each challenge found in one header value.

    Every match of AbstractBasicAuthHandler.rx in *header* produces one
    pair; a UserWarning is issued when the realm value was not quoted.
    If nothing matches, a single (scheme, None) pair is yielded, where
    scheme is the first whitespace-separated token of the header, or ''
    for an empty header.
    """
    # parse WWW-Authenticate header: accept multiple challenges per header
    matched_any = False
    for match in AbstractBasicAuthHandler.rx.finditer(header):
        scheme, quote, realm = match.groups()
        if quote not in ['"', "'"]:
            warnings.warn("Basic Auth Realm was unquoted",
                          UserWarning, 3)

        yield (scheme, realm)

        matched_any = True

    if matched_any:
        return

    # No challenge matched: report the bare scheme without a realm.
    scheme = header.split()[0] if header else ''
    yield (scheme, None)
|
||||
|
||||
def http_error_auth_reqed(self, authreq, host, req, headers):
    """Answer an authentication challenge by retrying with Basic credentials.

    authreq is the name of the challenge header to read from *headers*;
    every occurrence of that header is inspected, and each may carry
    several challenges.

    Returns the result of self.retry_http_basic_auth(host, req, realm) for
    the first Basic challenge that carries a realm.  Returns None when the
    header is absent, or when no usable Basic challenge was found and no
    other scheme was offered.

    Raises ValueError when no Basic challenge with a realm was found and at
    least one non-Basic scheme was offered; the message names that scheme.
    """
    # host may be an authority (without userinfo) or a URL with an
    # authority
    headers = headers.get_all(authreq)
    if not headers:
        # no header found
        return

    unsupported = None
    for header in headers:
        for scheme, realm in self._parse_realm(header):
            if scheme.lower() != 'basic':
                unsupported = scheme
                continue

            if realm is not None:
                # Use the first matching Basic challenge.
                # Ignore following challenges even if they use the Basic
                # scheme.
                return self.retry_http_basic_auth(host, req, realm)

    if unsupported is not None:
        # Report the scheme that was actually rejected.  The loop variable
        # `scheme` may by now hold a later challenge (possibly a realm-less
        # Basic one), which would make the message misleading.
        raise ValueError("AbstractBasicAuthHandler does not "
                         "support the following scheme: %r"
                         % (unsupported,))
|
||||
|
||||
def retry_http_basic_auth(self, host, req, realm):
|
||||
user, pw = self.passwd.find_user_password(realm, host)
|
||||
@@ -1144,7 +1175,11 @@ class AbstractDigestAuthHandler:
|
||||
A2 = "%s:%s" % (req.get_method(),
|
||||
# XXX selector: what about proxies and full urls
|
||||
req.selector)
|
||||
if qop == 'auth':
|
||||
# NOTE: As per RFC 2617, when the server sends "auth,auth-int", the client may use either `auth`
|
||||
# or `auth-int` for its response. We use `auth` to send the response back.
|
||||
if qop is None:
|
||||
respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
|
||||
elif 'auth' in qop.split(','):
|
||||
if nonce == self.last_nonce:
|
||||
self.nonce_count += 1
|
||||
else:
|
||||
@@ -1152,10 +1187,8 @@ class AbstractDigestAuthHandler:
|
||||
self.last_nonce = nonce
|
||||
ncvalue = '%08x' % self.nonce_count
|
||||
cnonce = self.get_cnonce(nonce)
|
||||
noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
|
||||
noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, 'auth', H(A2))
|
||||
respdig = KD(H(A1), noncebit)
|
||||
elif qop is None:
|
||||
respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
|
||||
else:
|
||||
# XXX handle auth-int.
|
||||
raise URLError("qop '%s' is not supported." % qop)
|
||||
@@ -1262,8 +1295,8 @@ class AbstractHTTPHandler(BaseHandler):
|
||||
|
||||
sel_host = host
|
||||
if request.has_proxy():
|
||||
scheme, sel = splittype(request.selector)
|
||||
sel_host, sel_path = splithost(sel)
|
||||
scheme, sel = _splittype(request.selector)
|
||||
sel_host, sel_path = _splithost(sel)
|
||||
if not request.has_header('Host'):
|
||||
request.add_unredirected_header('Host', sel_host)
|
||||
for name, value in self.parent.addheaders:
|
||||
@@ -1287,8 +1320,8 @@ class AbstractHTTPHandler(BaseHandler):
|
||||
h.set_debuglevel(self._debuglevel)
|
||||
|
||||
headers = dict(req.unredirected_hdrs)
|
||||
headers.update(dict((k, v) for k, v in req.headers.items()
|
||||
if k not in headers))
|
||||
headers.update({k: v for k, v in req.headers.items()
|
||||
if k not in headers})
|
||||
|
||||
# TODO(jhylton): Should this be redesigned to handle
|
||||
# persistent connections?
|
||||
@@ -1300,7 +1333,7 @@ class AbstractHTTPHandler(BaseHandler):
|
||||
# So make sure the connection gets closed after the (only)
|
||||
# request.
|
||||
headers["Connection"] = "close"
|
||||
headers = dict((name.title(), val) for name, val in headers.items())
|
||||
headers = {name.title(): val for name, val in headers.items()}
|
||||
|
||||
if req._tunnel_host:
|
||||
tunnel_headers = {}
|
||||
@@ -1479,7 +1512,7 @@ class FileHandler(BaseHandler):
|
||||
'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
|
||||
(mtype or 'text/plain', size, modified))
|
||||
if host:
|
||||
host, port = splitport(host)
|
||||
host, port = _splitport(host)
|
||||
if not host or \
|
||||
(not port and _safe_gethostbyname(host) in self.get_names()):
|
||||
if host:
|
||||
@@ -1488,7 +1521,6 @@ class FileHandler(BaseHandler):
|
||||
origurl = 'file://' + filename
|
||||
return addinfourl(open(localfile, 'rb'), headers, origurl)
|
||||
except OSError as exp:
|
||||
# users shouldn't expect OSErrors coming from urlopen()
|
||||
raise URLError(exp)
|
||||
raise URLError('file not on local host')
|
||||
|
||||
@@ -1505,16 +1537,16 @@ class FTPHandler(BaseHandler):
|
||||
host = req.host
|
||||
if not host:
|
||||
raise URLError('ftp error: no host given')
|
||||
host, port = splitport(host)
|
||||
host, port = _splitport(host)
|
||||
if port is None:
|
||||
port = ftplib.FTP_PORT
|
||||
else:
|
||||
port = int(port)
|
||||
|
||||
# username/password handling
|
||||
user, host = splituser(host)
|
||||
user, host = _splituser(host)
|
||||
if user:
|
||||
user, passwd = splitpasswd(user)
|
||||
user, passwd = _splitpasswd(user)
|
||||
else:
|
||||
passwd = None
|
||||
host = unquote(host)
|
||||
@@ -1525,7 +1557,7 @@ class FTPHandler(BaseHandler):
|
||||
host = socket.gethostbyname(host)
|
||||
except OSError as msg:
|
||||
raise URLError(msg)
|
||||
path, attrs = splitattr(req.selector)
|
||||
path, attrs = _splitattr(req.selector)
|
||||
dirs = path.split('/')
|
||||
dirs = list(map(unquote, dirs))
|
||||
dirs, file = dirs[:-1], dirs[-1]
|
||||
@@ -1535,7 +1567,7 @@ class FTPHandler(BaseHandler):
|
||||
fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
|
||||
type = file and 'I' or 'D'
|
||||
for attr in attrs:
|
||||
attr, value = splitvalue(attr)
|
||||
attr, value = _splitvalue(attr)
|
||||
if attr.lower() == 'type' and \
|
||||
value in ('a', 'A', 'i', 'I', 'd', 'D'):
|
||||
type = value.upper()
|
||||
@@ -1658,14 +1690,10 @@ else:
|
||||
of the 'file' scheme; not recommended for general use."""
|
||||
return quote(pathname)
|
||||
|
||||
# This really consists of two pieces:
|
||||
# (1) a class which handles opening of all sorts of URLs
|
||||
# (plus assorted utilities etc.)
|
||||
# (2) a set of functions for parsing URLs
|
||||
# XXX Should these be separated out into different modules?
|
||||
|
||||
|
||||
ftpcache = {}
|
||||
|
||||
|
||||
class URLopener:
|
||||
"""Class to open URLs.
|
||||
This is a class rather than just a subroutine because we may need
|
||||
@@ -1733,26 +1761,26 @@ class URLopener:
|
||||
# External interface
|
||||
def open(self, fullurl, data=None):
|
||||
"""Use URLopener().open(file) instead of open(file, 'r')."""
|
||||
fullurl = unwrap(to_bytes(fullurl))
|
||||
fullurl = unwrap(_to_bytes(fullurl))
|
||||
fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|")
|
||||
if self.tempcache and fullurl in self.tempcache:
|
||||
filename, headers = self.tempcache[fullurl]
|
||||
fp = open(filename, 'rb')
|
||||
return addinfourl(fp, headers, fullurl)
|
||||
urltype, url = splittype(fullurl)
|
||||
urltype, url = _splittype(fullurl)
|
||||
if not urltype:
|
||||
urltype = 'file'
|
||||
if urltype in self.proxies:
|
||||
proxy = self.proxies[urltype]
|
||||
urltype, proxyhost = splittype(proxy)
|
||||
host, selector = splithost(proxyhost)
|
||||
urltype, proxyhost = _splittype(proxy)
|
||||
host, selector = _splithost(proxyhost)
|
||||
url = (host, fullurl) # Signal special case to open_*()
|
||||
else:
|
||||
proxy = None
|
||||
name = 'open_' + urltype
|
||||
self.type = urltype
|
||||
name = name.replace('-', '_')
|
||||
if not hasattr(self, name):
|
||||
if not hasattr(self, name) or name == 'open_local_file':
|
||||
if proxy:
|
||||
return self.open_unknown_proxy(proxy, fullurl, data)
|
||||
else:
|
||||
@@ -1769,28 +1797,28 @@ class URLopener:
|
||||
|
||||
def open_unknown(self, fullurl, data=None):
|
||||
"""Overridable interface to open unknown URL type."""
|
||||
type, url = splittype(fullurl)
|
||||
type, url = _splittype(fullurl)
|
||||
raise OSError('url error', 'unknown url type', type)
|
||||
|
||||
def open_unknown_proxy(self, proxy, fullurl, data=None):
|
||||
"""Overridable interface to open unknown URL type."""
|
||||
type, url = splittype(fullurl)
|
||||
type, url = _splittype(fullurl)
|
||||
raise OSError('url error', 'invalid proxy for %s' % type, proxy)
|
||||
|
||||
# External interface
|
||||
def retrieve(self, url, filename=None, reporthook=None, data=None):
|
||||
"""retrieve(url) returns (filename, headers) for a local object
|
||||
or (tempfilename, headers) for a remote object."""
|
||||
url = unwrap(to_bytes(url))
|
||||
url = unwrap(_to_bytes(url))
|
||||
if self.tempcache and url in self.tempcache:
|
||||
return self.tempcache[url]
|
||||
type, url1 = splittype(url)
|
||||
type, url1 = _splittype(url)
|
||||
if filename is None and (not type or type == 'file'):
|
||||
try:
|
||||
fp = self.open_local_file(url1)
|
||||
hdrs = fp.info()
|
||||
fp.close()
|
||||
return url2pathname(splithost(url1)[1]), hdrs
|
||||
return url2pathname(_splithost(url1)[1]), hdrs
|
||||
except OSError as msg:
|
||||
pass
|
||||
fp = self.open(url, data)
|
||||
@@ -1799,11 +1827,10 @@ class URLopener:
|
||||
if filename:
|
||||
tfp = open(filename, 'wb')
|
||||
else:
|
||||
import tempfile
|
||||
garbage, path = splittype(url)
|
||||
garbage, path = splithost(path or "")
|
||||
path, garbage = splitquery(path or "")
|
||||
path, garbage = splitattr(path or "")
|
||||
garbage, path = _splittype(url)
|
||||
garbage, path = _splithost(path or "")
|
||||
path, garbage = _splitquery(path or "")
|
||||
path, garbage = _splitattr(path or "")
|
||||
suffix = os.path.splitext(path)[1]
|
||||
(fd, filename) = tempfile.mkstemp(suffix)
|
||||
self.__tempfiles.append(filename)
|
||||
@@ -1860,25 +1887,25 @@ class URLopener:
|
||||
user_passwd = None
|
||||
proxy_passwd= None
|
||||
if isinstance(url, str):
|
||||
host, selector = splithost(url)
|
||||
host, selector = _splithost(url)
|
||||
if host:
|
||||
user_passwd, host = splituser(host)
|
||||
user_passwd, host = _splituser(host)
|
||||
host = unquote(host)
|
||||
realhost = host
|
||||
else:
|
||||
host, selector = url
|
||||
# check whether the proxy contains authorization information
|
||||
proxy_passwd, host = splituser(host)
|
||||
proxy_passwd, host = _splituser(host)
|
||||
# now we proceed with the url we want to obtain
|
||||
urltype, rest = splittype(selector)
|
||||
urltype, rest = _splittype(selector)
|
||||
url = rest
|
||||
user_passwd = None
|
||||
if urltype.lower() != 'http':
|
||||
realhost = None
|
||||
else:
|
||||
realhost, rest = splithost(rest)
|
||||
realhost, rest = _splithost(rest)
|
||||
if realhost:
|
||||
user_passwd, realhost = splituser(realhost)
|
||||
user_passwd, realhost = _splituser(realhost)
|
||||
if user_passwd:
|
||||
selector = "%s://%s%s" % (urltype, realhost, rest)
|
||||
if proxy_bypass(realhost):
|
||||
@@ -1984,7 +2011,7 @@ class URLopener:
|
||||
"""Use local file."""
|
||||
import email.utils
|
||||
import mimetypes
|
||||
host, file = splithost(url)
|
||||
host, file = _splithost(url)
|
||||
localname = url2pathname(file)
|
||||
try:
|
||||
stats = os.stat(localname)
|
||||
@@ -2001,7 +2028,7 @@ class URLopener:
|
||||
if file[:1] == '/':
|
||||
urlfile = 'file://' + file
|
||||
return addinfourl(open(localname, 'rb'), headers, urlfile)
|
||||
host, port = splitport(host)
|
||||
host, port = _splitport(host)
|
||||
if (not port
|
||||
and socket.gethostbyname(host) in ((localhost(),) + thishost())):
|
||||
urlfile = file
|
||||
@@ -2017,11 +2044,11 @@ class URLopener:
|
||||
if not isinstance(url, str):
|
||||
raise URLError('ftp error: proxy support for ftp protocol currently not implemented')
|
||||
import mimetypes
|
||||
host, path = splithost(url)
|
||||
host, path = _splithost(url)
|
||||
if not host: raise URLError('ftp error: no host given')
|
||||
host, port = splitport(host)
|
||||
user, host = splituser(host)
|
||||
if user: user, passwd = splitpasswd(user)
|
||||
host, port = _splitport(host)
|
||||
user, host = _splituser(host)
|
||||
if user: user, passwd = _splitpasswd(user)
|
||||
else: passwd = None
|
||||
host = unquote(host)
|
||||
user = unquote(user or '')
|
||||
@@ -2032,7 +2059,7 @@ class URLopener:
|
||||
port = ftplib.FTP_PORT
|
||||
else:
|
||||
port = int(port)
|
||||
path, attrs = splitattr(path)
|
||||
path, attrs = _splitattr(path)
|
||||
path = unquote(path)
|
||||
dirs = path.split('/')
|
||||
dirs, file = dirs[:-1], dirs[-1]
|
||||
@@ -2054,7 +2081,7 @@ class URLopener:
|
||||
if not file: type = 'D'
|
||||
else: type = 'I'
|
||||
for attr in attrs:
|
||||
attr, value = splitvalue(attr)
|
||||
attr, value = _splitvalue(attr)
|
||||
if attr.lower() == 'type' and \
|
||||
value in ('a', 'A', 'i', 'I', 'd', 'D'):
|
||||
type = value.upper()
|
||||
@@ -2237,11 +2264,11 @@ class FancyURLopener(URLopener):
|
||||
return getattr(self,name)(url, realm, data)
|
||||
|
||||
def retry_proxy_http_basic_auth(self, url, realm, data=None):
|
||||
host, selector = splithost(url)
|
||||
host, selector = _splithost(url)
|
||||
newurl = 'http://' + host + selector
|
||||
proxy = self.proxies['http']
|
||||
urltype, proxyhost = splittype(proxy)
|
||||
proxyhost, proxyselector = splithost(proxyhost)
|
||||
urltype, proxyhost = _splittype(proxy)
|
||||
proxyhost, proxyselector = _splithost(proxyhost)
|
||||
i = proxyhost.find('@') + 1
|
||||
proxyhost = proxyhost[i:]
|
||||
user, passwd = self.get_user_passwd(proxyhost, realm, i)
|
||||
@@ -2255,11 +2282,11 @@ class FancyURLopener(URLopener):
|
||||
return self.open(newurl, data)
|
||||
|
||||
def retry_proxy_https_basic_auth(self, url, realm, data=None):
|
||||
host, selector = splithost(url)
|
||||
host, selector = _splithost(url)
|
||||
newurl = 'https://' + host + selector
|
||||
proxy = self.proxies['https']
|
||||
urltype, proxyhost = splittype(proxy)
|
||||
proxyhost, proxyselector = splithost(proxyhost)
|
||||
urltype, proxyhost = _splittype(proxy)
|
||||
proxyhost, proxyselector = _splithost(proxyhost)
|
||||
i = proxyhost.find('@') + 1
|
||||
proxyhost = proxyhost[i:]
|
||||
user, passwd = self.get_user_passwd(proxyhost, realm, i)
|
||||
@@ -2273,7 +2300,7 @@ class FancyURLopener(URLopener):
|
||||
return self.open(newurl, data)
|
||||
|
||||
def retry_http_basic_auth(self, url, realm, data=None):
|
||||
host, selector = splithost(url)
|
||||
host, selector = _splithost(url)
|
||||
i = host.find('@') + 1
|
||||
host = host[i:]
|
||||
user, passwd = self.get_user_passwd(host, realm, i)
|
||||
@@ -2287,7 +2314,7 @@ class FancyURLopener(URLopener):
|
||||
return self.open(newurl, data)
|
||||
|
||||
def retry_https_basic_auth(self, url, realm, data=None):
|
||||
host, selector = splithost(url)
|
||||
host, selector = _splithost(url)
|
||||
i = host.find('@') + 1
|
||||
host = host[i:]
|
||||
user, passwd = self.get_user_passwd(host, realm, i)
|
||||
@@ -2504,23 +2531,26 @@ def proxy_bypass_environment(host, proxies=None):
|
||||
try:
|
||||
no_proxy = proxies['no']
|
||||
except KeyError:
|
||||
return 0
|
||||
return False
|
||||
# '*' is special case for always bypass
|
||||
if no_proxy == '*':
|
||||
return 1
|
||||
return True
|
||||
host = host.lower()
|
||||
# strip port off host
|
||||
hostonly, port = splitport(host)
|
||||
hostonly, port = _splitport(host)
|
||||
# check if the host ends with any of the DNS suffixes
|
||||
no_proxy_list = [proxy.strip() for proxy in no_proxy.split(',')]
|
||||
for name in no_proxy_list:
|
||||
for name in no_proxy.split(','):
|
||||
name = name.strip()
|
||||
if name:
|
||||
name = re.escape(name)
|
||||
pattern = r'(.+\.)?%s$' % name
|
||||
if (re.match(pattern, hostonly, re.I)
|
||||
or re.match(pattern, host, re.I)):
|
||||
return 1
|
||||
name = name.lstrip('.') # ignore leading dots
|
||||
name = name.lower()
|
||||
if hostonly == name or host == name:
|
||||
return True
|
||||
name = '.' + name
|
||||
if hostonly.endswith(name) or host.endswith(name):
|
||||
return True
|
||||
# otherwise, don't bypass
|
||||
return 0
|
||||
return False
|
||||
|
||||
|
||||
# This code tests an OSX specific data structure but is testable on all
|
||||
@@ -2539,7 +2569,7 @@ def _proxy_bypass_macosx_sysconf(host, proxy_settings):
|
||||
"""
|
||||
from fnmatch import fnmatch
|
||||
|
||||
hostonly, port = splitport(host)
|
||||
hostonly, port = _splitport(host)
|
||||
|
||||
def ip2num(ipAddr):
|
||||
parts = ipAddr.split('.')
|
||||
@@ -2646,7 +2676,7 @@ elif os.name == 'nt':
|
||||
for p in proxyServer.split(';'):
|
||||
protocol, address = p.split('=', 1)
|
||||
# See if address has a type:// prefix
|
||||
if not re.match('^([^/:]+)://', address):
|
||||
if not re.match('(?:[^/:]+)://', address):
|
||||
address = '%s://%s' % (protocol, address)
|
||||
proxies[protocol] = address
|
||||
else:
|
||||
@@ -2693,7 +2723,7 @@ elif os.name == 'nt':
|
||||
if not proxyEnable or not proxyOverride:
|
||||
return 0
|
||||
# try to make a host list from name and IP address.
|
||||
rawHost, port = splitport(host)
|
||||
rawHost, port = _splitport(host)
|
||||
host = [rawHost]
|
||||
try:
|
||||
addr = socket.gethostbyname(rawHost)
|
||||
|
||||
@@ -16,6 +16,9 @@ import urllib.request
|
||||
|
||||
__all__ = ["RobotFileParser"]
|
||||
|
||||
RequestRate = collections.namedtuple("RequestRate", "requests seconds")
|
||||
|
||||
|
||||
class RobotFileParser:
|
||||
""" This class provides a set of methods to read, parse and answer
|
||||
questions about a single robots.txt file.
|
||||
@@ -24,6 +27,7 @@ class RobotFileParser:
|
||||
|
||||
def __init__(self, url=''):
|
||||
self.entries = []
|
||||
self.sitemaps = []
|
||||
self.default_entry = None
|
||||
self.disallow_all = False
|
||||
self.allow_all = False
|
||||
@@ -136,12 +140,14 @@ class RobotFileParser:
|
||||
# check if all values are sane
|
||||
if (len(numbers) == 2 and numbers[0].strip().isdigit()
|
||||
and numbers[1].strip().isdigit()):
|
||||
req_rate = collections.namedtuple('req_rate',
|
||||
'requests seconds')
|
||||
entry.req_rate = req_rate
|
||||
entry.req_rate.requests = int(numbers[0])
|
||||
entry.req_rate.seconds = int(numbers[1])
|
||||
entry.req_rate = RequestRate(int(numbers[0]), int(numbers[1]))
|
||||
state = 2
|
||||
elif line[0] == "sitemap":
|
||||
# According to http://www.sitemaps.org/protocol.html
|
||||
# "This directive is independent of the user-agent line,
|
||||
# so it doesn't matter where you place it in your file."
|
||||
# Therefore we do not change the state of the parser.
|
||||
self.sitemaps.append(line[1])
|
||||
if state == 2:
|
||||
self._add_entry(entry)
|
||||
|
||||
@@ -180,7 +186,9 @@ class RobotFileParser:
|
||||
for entry in self.entries:
|
||||
if entry.applies_to(useragent):
|
||||
return entry.delay
|
||||
return self.default_entry.delay
|
||||
if self.default_entry:
|
||||
return self.default_entry.delay
|
||||
return None
|
||||
|
||||
def request_rate(self, useragent):
|
||||
if not self.mtime():
|
||||
@@ -188,10 +196,20 @@ class RobotFileParser:
|
||||
for entry in self.entries:
|
||||
if entry.applies_to(useragent):
|
||||
return entry.req_rate
|
||||
return self.default_entry.req_rate
|
||||
if self.default_entry:
|
||||
return self.default_entry.req_rate
|
||||
return None
|
||||
|
||||
def site_maps(self):
|
||||
if not self.sitemaps:
|
||||
return None
|
||||
return self.sitemaps
|
||||
|
||||
def __str__(self):
|
||||
return ''.join([str(entry) + "\n" for entry in self.entries])
|
||||
entries = self.entries
|
||||
if self.default_entry is not None:
|
||||
entries = entries + [self.default_entry]
|
||||
return '\n\n'.join(map(str, entries))
|
||||
|
||||
|
||||
class RuleLine:
|
||||
@@ -223,10 +241,14 @@ class Entry:
|
||||
def __str__(self):
|
||||
ret = []
|
||||
for agent in self.useragents:
|
||||
ret.extend(["User-agent: ", agent, "\n"])
|
||||
for line in self.rulelines:
|
||||
ret.extend([str(line), "\n"])
|
||||
return ''.join(ret)
|
||||
ret.append(f"User-agent: {agent}")
|
||||
if self.delay is not None:
|
||||
ret.append(f"Crawl-delay: {self.delay}")
|
||||
if self.req_rate is not None:
|
||||
rate = self.req_rate
|
||||
ret.append(f"Request-rate: {rate.requests}/{rate.seconds}")
|
||||
ret.extend(map(str, self.rulelines))
|
||||
return '\n'.join(ret)
|
||||
|
||||
def applies_to(self, useragent):
|
||||
"""check if this entry applies to the specified agent"""
|
||||
|
||||
Reference in New Issue
Block a user