Merge pull request #4009 from youknowone/htmlurllib

Update html/urllib and their tests
This commit is contained in:
Jeong YunWon
2022-08-07 17:23:14 +09:00
committed by GitHub
10 changed files with 121 additions and 118 deletions

View File

@@ -4,6 +4,7 @@ __all__ = ['html5', 'name2codepoint', 'codepoint2name', 'entitydefs']
# maps the HTML entity name to the Unicode code point
# from https://html.spec.whatwg.org/multipage/named-characters.html
name2codepoint = {
'AElig': 0x00c6, # latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1
'Aacute': 0x00c1, # latin capital letter A with acute, U+00C1 ISOlat1

14
Lib/html/parser.py vendored
View File

@@ -9,7 +9,6 @@
import re
import warnings
import _markupbase
from html import unescape
@@ -47,7 +46,7 @@ locatestarttagend_tolerant = re.compile(r"""
|"[^"]*" # LIT-enclosed value
|(?!['"])[^>\s]* # bare value
)
(?:\s*,)* # possibly followed by a comma
\s* # possibly followed by a space
)?(?:\s|/(?!>))*
)*
)?
@@ -406,7 +405,7 @@ class HTMLParser(_markupbase.ParserBase):
tagname = namematch.group(1).lower()
# consume and ignore other stuff between the name and the >
# Note: this is not 100% correct, since we might have things like
# </tag attr=">">, but looking for > after tha name should cover
# </tag attr=">">, but looking for > after the name should cover
# most of the cases and is much simpler
gtpos = rawdata.find('>', namematch.end())
self.handle_endtag(tagname)
@@ -418,7 +417,7 @@ class HTMLParser(_markupbase.ParserBase):
self.handle_data(rawdata[i:gtpos])
return gtpos
self.handle_endtag(elem.lower())
self.handle_endtag(elem)
self.clear_cdata_mode()
return gtpos
@@ -461,10 +460,3 @@ class HTMLParser(_markupbase.ParserBase):
def unknown_decl(self, data):
pass
# Internal -- helper to remove special character quoting
def unescape(self, s):
warnings.warn('The unescape method is deprecated and will be removed '
'in 3.5, use html.unescape() instead.',
DeprecationWarning, stacklevel=2)
return unescape(s)

View File

@@ -537,13 +537,6 @@ text
for html, expected in data:
self._run_check(html, expected)
def test_unescape_method(self):
from html import unescape
p = self.get_collector()
with self.assertWarns(DeprecationWarning):
s = '&quot;&#34;&#x22;&quot&#34&#x22&#bad;'
self.assertEqual(p.unescape(s), unescape(s))
def test_broken_comments(self):
html = ('<! not really a comment >'
'<! not a comment either -->'
@@ -761,8 +754,6 @@ class AttributesTestCase(TestCaseBase):
]
self._run_check(html, expected)
# TODO: RUSTPYTHON
@unittest.expectedFailure
def test_comma_between_attributes(self):
# see bpo 41478
# HTMLParser preserves duplicate attributes, leaving the task of

View File

@@ -9,7 +9,8 @@ import io
import unittest
from unittest.mock import patch
from test import support
from test.support import os_helper, warnings_helper
from test.support import os_helper
from test.support import warnings_helper
import os
try:
import ssl

View File

@@ -141,8 +141,6 @@ class RequestHdrsTests(unittest.TestCase):
req.remove_header("Unredirected-spam")
self.assertFalse(req.has_header("Unredirected-spam"))
# TODO: RUSTPYTHON, AssertionError: Tuples differ: ('foo', 'ni') != (None, None)
@unittest.expectedFailure
def test_password_manager(self):
mgr = urllib.request.HTTPPasswordMgr()
add = mgr.add_password

View File

@@ -613,8 +613,8 @@ class UrlParseTestCase(unittest.TestCase):
p.port
def test_urlsplit_remove_unsafe_bytes(self):
# Remove ASCII tabs and newlines from input, for http common case scenario.
url = "h\nttp://www.python\n.org\t/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment"
# Remove ASCII tabs and newlines from input
url = "http\t://www.python\n.org\t/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment"
p = urllib.parse.urlsplit(url)
self.assertEqual(p.scheme, "http")
self.assertEqual(p.netloc, "www.python.org")
@@ -627,8 +627,8 @@ class UrlParseTestCase(unittest.TestCase):
self.assertEqual(p.port, None)
self.assertEqual(p.geturl(), "http://www.python.org/javascript:alert('msg')/?query=something#fragment")
# Remove ASCII tabs and newlines from input as bytes, for http common case scenario.
url = b"h\nttp://www.python\n.org\t/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment"
# Remove ASCII tabs and newlines from input as bytes.
url = b"http\t://www.python\n.org\t/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment"
p = urllib.parse.urlsplit(url)
self.assertEqual(p.scheme, b"http")
self.assertEqual(p.netloc, b"www.python.org")
@@ -641,24 +641,13 @@ class UrlParseTestCase(unittest.TestCase):
self.assertEqual(p.port, None)
self.assertEqual(p.geturl(), b"http://www.python.org/javascript:alert('msg')/?query=something#fragment")
# any scheme
url = "x-new-scheme\t://www.python\n.org\t/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment"
p = urllib.parse.urlsplit(url)
self.assertEqual(p.geturl(), "x-new-scheme://www.python.org/javascript:alert('msg')/?query=something#fragment")
# Remove ASCII tabs and newlines from input as bytes, any scheme.
url = b"x-new-scheme\t://www.python\n.org\t/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment"
p = urllib.parse.urlsplit(url)
self.assertEqual(p.geturl(), b"x-new-scheme://www.python.org/javascript:alert('msg')/?query=something#fragment")
# Unsafe bytes is not returned from urlparse cache.
# scheme is stored after parsing, sending an scheme with unsafe bytes *will not* return an unsafe scheme
url = "https://www.python\n.org\t/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment"
scheme = "htt\nps"
# with scheme as cache-key
url = "http://www.python.org/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment"
scheme = "ht\ntp"
for _ in range(2):
p = urllib.parse.urlsplit(url, scheme=scheme)
self.assertEqual(p.scheme, "https")
self.assertEqual(p.geturl(), "https://www.python.org/javascript:alert('msg')/?query=something#fragment")
self.assertEqual(p.scheme, "http")
self.assertEqual(p.geturl(), "http://www.python.org/javascript:alert('msg')/?query=something#fragment")
def test_attributes_bad_port(self):
"""Check handling of invalid ports."""
@@ -745,15 +734,17 @@ class UrlParseTestCase(unittest.TestCase):
def test_portseparator(self):
# Issue 754016 makes changes for port separator ':' from scheme separator
self.assertEqual(urllib.parse.urlparse("path:80"),
('','','path:80','','',''))
self.assertEqual(urllib.parse.urlparse("http:80"), ('http','','80','','',''))
self.assertEqual(urllib.parse.urlparse("https:80"), ('https','','80','','',''))
self.assertEqual(urllib.parse.urlparse("path:80"), ('path','','80','','',''))
self.assertEqual(urllib.parse.urlparse("http:"),('http','','','','',''))
self.assertEqual(urllib.parse.urlparse("https:"),('https','','','','',''))
self.assertEqual(urllib.parse.urlparse("http://www.python.org:80"),
('http','www.python.org:80','','','',''))
# As usual, need to check bytes input as well
self.assertEqual(urllib.parse.urlparse(b"path:80"),
(b'',b'',b'path:80',b'',b'',b''))
self.assertEqual(urllib.parse.urlparse(b"http:80"), (b'http',b'',b'80',b'',b'',b''))
self.assertEqual(urllib.parse.urlparse(b"https:80"), (b'https',b'',b'80',b'',b'',b''))
self.assertEqual(urllib.parse.urlparse(b"path:80"), (b'path',b'',b'80',b'',b'',b''))
self.assertEqual(urllib.parse.urlparse(b"http:"),(b'http',b'',b'',b'',b'',b''))
self.assertEqual(urllib.parse.urlparse(b"https:"),(b'https',b'',b'',b'',b'',b''))
self.assertEqual(urllib.parse.urlparse(b"http://www.python.org:80"),

83
Lib/urllib/parse.py vendored
View File

@@ -29,6 +29,7 @@ test_urlparse.py provides a good indicator of parsing behavior.
import re
import sys
import types
import collections
import warnings
@@ -179,6 +180,8 @@ class _NetlocResultMixinBase(object):
raise ValueError("Port out of range 0-65535")
return port
__class_getitem__ = classmethod(types.GenericAlias)
class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
__slots__ = ()
@@ -369,9 +372,23 @@ del _fix_result_transcoding
def urlparse(url, scheme='', allow_fragments=True):
"""Parse a URL into 6 components:
<scheme>://<netloc>/<path>;<params>?<query>#<fragment>
Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
Note that we don't break the components up in smaller bits
(e.g. netloc is a single string) and we don't expand % escapes."""
The result is a named 6-tuple with fields corresponding to the
above. It is either a ParseResult or ParseResultBytes object,
depending on the type of the url parameter.
The username, password, hostname, and port sub-components of netloc
can also be accessed as attributes of the returned object.
The scheme argument provides the default value of the scheme
component when no scheme is found in url.
If allow_fragments is False, no attempt is made to separate the
fragment component from the previous component, which can be either
path or query.
Note that % escapes are not expanded.
"""
url, scheme, _coerce_result = _coerce_args(url, scheme)
splitresult = urlsplit(url, scheme, allow_fragments)
scheme, netloc, url, query, fragment = splitresult
@@ -417,20 +434,33 @@ def _checknetloc(netloc):
raise ValueError("netloc '" + netloc + "' contains invalid " +
"characters under NFKC normalization")
def _remove_unsafe_bytes_from_url(url):
for b in _UNSAFE_URL_BYTES_TO_REMOVE:
url = url.replace(b, "")
return url
def urlsplit(url, scheme='', allow_fragments=True):
"""Parse a URL into 5 components:
<scheme>://<netloc>/<path>?<query>#<fragment>
Return a 5-tuple: (scheme, netloc, path, query, fragment).
Note that we don't break the components up in smaller bits
(e.g. netloc is a single string) and we don't expand % escapes."""
The result is a named 5-tuple with fields corresponding to the
above. It is either a SplitResult or SplitResultBytes object,
depending on the type of the url parameter.
The username, password, hostname, and port sub-components of netloc
can also be accessed as attributes of the returned object.
The scheme argument provides the default value of the scheme
component when no scheme is found in url.
If allow_fragments is False, no attempt is made to separate the
fragment component from the previous component, which can be either
path or query.
Note that % escapes are not expanded.
"""
url, scheme, _coerce_result = _coerce_args(url, scheme)
url = _remove_unsafe_bytes_from_url(url)
scheme = _remove_unsafe_bytes_from_url(scheme)
for b in _UNSAFE_URL_BYTES_TO_REMOVE:
url = url.replace(b, "")
scheme = scheme.replace(b, "")
allow_fragments = bool(allow_fragments)
key = url, scheme, allow_fragments, type(url), type(scheme)
cached = _parse_cache.get(key, None)
@@ -441,31 +471,11 @@ def urlsplit(url, scheme='', allow_fragments=True):
netloc = query = fragment = ''
i = url.find(':')
if i > 0:
if url[:i] == 'http': # optimize the common case
url = url[i+1:]
if url[:2] == '//':
netloc, url = _splitnetloc(url, 2)
if (('[' in netloc and ']' not in netloc) or
(']' in netloc and '[' not in netloc)):
raise ValueError("Invalid IPv6 URL")
if allow_fragments and '#' in url:
url, fragment = url.split('#', 1)
if '?' in url:
url, query = url.split('?', 1)
_checknetloc(netloc)
v = SplitResult('http', netloc, url, query, fragment)
_parse_cache[key] = v
return _coerce_result(v)
for c in url[:i]:
if c not in scheme_chars:
break
else:
# make sure "url" is not actually a port number (in which case
# "scheme" is really part of the path)
rest = url[i+1:]
if not rest or any(c not in '0123456789' for c in rest):
# not a port number
scheme, url = url[:i].lower(), rest
scheme, url = url[:i].lower(), url[i+1:]
if url[:2] == '//':
netloc, url = _splitnetloc(url, 2)
@@ -642,7 +652,7 @@ def unquote(string, encoding='utf-8', errors='replace'):
unquote('abc%20def') -> 'abc def'.
"""
if isinstance(string, bytes):
raise TypeError('Expected str, got bytes')
return unquote_to_bytes(string).decode(encoding, errors)
if '%' not in string:
string.split
return string
@@ -744,9 +754,8 @@ def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
if max_num_fields < num_fields:
raise ValueError('Max number of fields exceeded')
pairs = [s1 for s1 in qs.split(separator)]
r = []
for name_value in pairs:
for name_value in qs.split(separator):
if not name_value and not strict_parsing:
continue
nv = name_value.split('=', 1)

64
Lib/urllib/request.py vendored
View File

@@ -64,7 +64,7 @@ opener = urllib.request.build_opener(proxy_support, authinfo,
# install it
urllib.request.install_opener(opener)
f = urllib.request.urlopen('http://www.python.org/')
f = urllib.request.urlopen('https://www.python.org/')
"""
# XXX issues:
@@ -163,18 +163,10 @@ def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
The *cadefault* parameter is ignored.
This function always returns an object which can work as a context
manager and has methods such as
* geturl() - return the URL of the resource retrieved, commonly used to
determine if a redirect was followed
* info() - return the meta-information of the page, such as headers, in the
form of an email.message_from_string() instance (see Quick Reference to
HTTP Headers)
* getcode() - return the HTTP status code of the response. Raises URLError
on errors.
This function always returns an object which can work as a
context manager and has the properties url, headers, and status.
See urllib.response.addinfourl for more detail on these properties.
For HTTP and HTTPS URLs, this function returns a http.client.HTTPResponse
object slightly modified. In addition to the three new methods above, the
@@ -210,6 +202,8 @@ def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH,
cafile=cafile,
capath=capath)
# send ALPN extension to indicate HTTP/1.1 protocol
context.set_alpn_protocols(['http/1.1'])
https_handler = HTTPSHandler(context=context)
opener = build_opener(https_handler)
elif context:
@@ -895,10 +889,10 @@ class HTTPPasswordMgr:
return True
if base[0] != test[0]:
return False
common = posixpath.commonprefix((base[1], test[1]))
if len(common) == len(base[1]):
return True
return False
prefix = base[1]
if prefix[-1:] != '/':
prefix += '/'
return test[1].startswith(prefix)
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
@@ -1823,7 +1817,7 @@ class URLopener:
hdrs = fp.info()
fp.close()
return url2pathname(_splithost(url1)[1]), hdrs
except OSError as msg:
except OSError:
pass
fp = self.open(url, data)
try:
@@ -2680,22 +2674,26 @@ elif os.name == 'nt':
# Returned as Unicode but problems if not converted to ASCII
proxyServer = str(winreg.QueryValueEx(internetSettings,
'ProxyServer')[0])
if '=' in proxyServer:
# Per-protocol settings
for p in proxyServer.split(';'):
protocol, address = p.split('=', 1)
# See if address has a type:// prefix
if not re.match('(?:[^/:]+)://', address):
address = '%s://%s' % (protocol, address)
proxies[protocol] = address
else:
# Use one setting for all protocols
if proxyServer[:5] == 'http:':
proxies['http'] = proxyServer
else:
proxies['http'] = 'http://%s' % proxyServer
proxies['https'] = 'https://%s' % proxyServer
proxies['ftp'] = 'ftp://%s' % proxyServer
if '=' not in proxyServer and ';' not in proxyServer:
# Use one setting for all protocols.
proxyServer = 'http={0};https={0};ftp={0}'.format(proxyServer)
for p in proxyServer.split(';'):
protocol, address = p.split('=', 1)
# See if address has a type:// prefix
if not re.match('(?:[^/:]+)://', address):
# Add type:// prefix to address without specifying type
if protocol in ('http', 'https', 'ftp'):
# The default proxy type of Windows is HTTP
address = 'http://' + address
elif protocol == 'socks':
address = 'socks://' + address
proxies[protocol] = address
# Use SOCKS proxy for HTTP(S) protocols
if proxies.get('socks'):
# The default SOCKS proxy type of Windows is SOCKS4
address = re.sub(r'^socks://', 'socks4://', proxies['socks'])
proxies['http'] = proxies.get('http') or address
proxies['https'] = proxies.get('https') or address
internetSettings.Close()
except (OSError, ValueError, TypeError):
# Either registry key not found etc, or the value in an

View File

@@ -73,6 +73,10 @@ class addinfourl(addinfo):
self.url = url
self.code = code
@property
def status(self):
# Property view of ``self.code``; yields the same value that getcode() returns.
return self.code
def getcode(self):
return self.code

View File

@@ -26,7 +26,7 @@ mod _ssl {
use crate::{
common::{
ascii,
lock::{PyRwLock, PyRwLockWriteGuard},
lock::{PyMutex, PyRwLock, PyRwLockWriteGuard},
},
socket::{self, PySocket},
vm::{
@@ -423,6 +423,7 @@ mod _ssl {
ctx: PyRwLock<SslContextBuilder>,
check_hostname: AtomicCell<bool>,
protocol: SslVersion,
post_handshake_auth: PyMutex<bool>,
}
impl fmt::Debug for PySslContext {
@@ -491,6 +492,7 @@ mod _ssl {
ctx: PyRwLock::new(builder),
check_hostname: AtomicCell::new(check_hostname),
protocol: proto,
post_handshake_auth: PyMutex::new(false),
}
.into_ref_with_type(vm, cls)
.map(Into::into)
@@ -510,6 +512,22 @@ mod _ssl {
func(builder_as_ctx(&c))
}
/// Getter for the `post_handshake_auth` property.
///
/// Locks the `post_handshake_auth` mutex on this context and returns a copy
/// of the boolean it currently holds.
#[pyproperty]
fn post_handshake_auth(&self) -> bool {
    let flag = self.post_handshake_auth.lock();
    *flag
}
/// Setter for the `post_handshake_auth` property.
///
/// A `None` value is rejected with an attribute error carrying the message
/// "cannot delete attribute". Any other object is converted to a boolean via
/// its truthiness and stored as the new flag.
///
/// # Errors
///
/// Returns the attribute error described above, or propagates whatever
/// error the truthiness check (`is_true`) produces.
#[pyproperty(setter)]
fn set_post_handshake_auth(
    &self,
    value: Option<PyObjectRef>,
    vm: &VirtualMachine,
) -> PyResult<()> {
    match value {
        None => Err(vm.new_attribute_error("cannot delete attribute".to_owned())),
        Some(obj) => {
            // Resolve truthiness first; the mutex is only taken for the store itself.
            let enabled = obj.is_true(vm)?;
            *self.post_handshake_auth.lock() = enabled;
            Ok(())
        }
    }
}
#[pymethod]
fn set_ciphers(&self, cipherlist: PyStrRef, vm: &VirtualMachine) -> PyResult<()> {
let ciphers = cipherlist.as_str();