forked from Rust-related/RustPython
Edit _codecs to work w/ Python 3
This commit is contained in:
174
Lib/_codecs.py
vendored
174
Lib/_codecs.py
vendored
@@ -37,6 +37,8 @@ Written by Marc-Andre Lemburg (mal@lemburg.com).
|
||||
|
||||
Copyright (c) Corporation for National Research Initiatives.
|
||||
|
||||
From PyPy v1.0.0
|
||||
|
||||
"""
|
||||
#from unicodecodec import *
|
||||
|
||||
@@ -47,7 +49,7 @@ codec_search_cache = {}
|
||||
codec_error_registry = {}
|
||||
codec_need_encodings = [True]
|
||||
|
||||
def codec_register( search_function ):
|
||||
def register( search_function ):
|
||||
"""register(search_function)
|
||||
|
||||
Register a codec search function. Search functions are expected to take
|
||||
@@ -55,12 +57,12 @@ def codec_register( search_function ):
|
||||
a tuple of functions (encoder, decoder, stream_reader, stream_writer).
|
||||
"""
|
||||
|
||||
if callable(search_function):
|
||||
codec_search_path.append(search_function)
|
||||
if not callable(search_function):
|
||||
raise TypeError("argument must be callable")
|
||||
codec_search_path.append(search_function)
|
||||
|
||||
register = codec_register
|
||||
|
||||
def codec_lookup(encoding):
|
||||
def lookup(encoding):
|
||||
"""lookup(encoding) -> (encoder, decoder, stream_reader, stream_writer)
|
||||
Looks up a codec tuple in the Python codec registry and returns
|
||||
a tuple of functions.
|
||||
@@ -78,17 +80,13 @@ def codec_lookup(encoding):
|
||||
for search in codec_search_path:
|
||||
result = search(normalized_encoding)
|
||||
if result:
|
||||
if not (type(result) == tuple and len(result) == 4):
|
||||
raise TypeError("codec search functions must return 4-tuples")
|
||||
else:
|
||||
codec_search_cache[normalized_encoding] = result
|
||||
return result
|
||||
codec_search_cache[normalized_encoding] = result
|
||||
return result
|
||||
if not result:
|
||||
raise LookupError("unknown encoding: %s" % encoding)
|
||||
return result
|
||||
|
||||
|
||||
lookup = codec_lookup
|
||||
|
||||
def encode(v, encoding=None, errors='strict'):
|
||||
"""encode(obj, [encoding[,errors]]) -> object
|
||||
@@ -102,15 +100,15 @@ def encode(v, encoding=None, errors='strict'):
|
||||
"""
|
||||
if encoding == None:
|
||||
encoding = sys.getdefaultencoding()
|
||||
if isinstance(encoding, str):
|
||||
encoder = lookup(encoding)[0]
|
||||
if encoder and isinstance(errors, str):
|
||||
res = encoder(v, errors)
|
||||
return res[0]
|
||||
else:
|
||||
raise TypeError("Errors must be a string")
|
||||
else:
|
||||
if not isinstance(encoding, str):
|
||||
raise TypeError("Encoding must be a string")
|
||||
if not isinstance(errors, str):
|
||||
raise TypeError("Errors must be a string")
|
||||
codec = lookup(encoding)
|
||||
res = codec.encode(v, errors)
|
||||
if not isinstance(res, tuple) or len(res) != 2:
|
||||
raise TypeError("encoder must return a tuple (object, integer)")
|
||||
return res[0]
|
||||
|
||||
def decode(obj, encoding=None, errors='strict'):
|
||||
"""decode(obj, [encoding[,errors]]) -> object
|
||||
@@ -124,23 +122,22 @@ def decode(obj, encoding=None, errors='strict'):
|
||||
"""
|
||||
if encoding == None:
|
||||
encoding = sys.getdefaultencoding()
|
||||
if isinstance(encoding, str):
|
||||
decoder = lookup(encoding)[1]
|
||||
if decoder and isinstance(errors, str):
|
||||
res = decoder(obj, errors)
|
||||
if not isinstance(res, tuple) or len(res) != 2:
|
||||
raise TypeError("encoder must return a tuple (object, integer)")
|
||||
return res[0]
|
||||
else:
|
||||
raise TypeError("Errors must be a string")
|
||||
else:
|
||||
if not isinstance(encoding, str):
|
||||
raise TypeError("Encoding must be a string")
|
||||
if not isinstance(errors, str):
|
||||
raise TypeError("Errors must be a string")
|
||||
codec = lookup(encoding)
|
||||
res = codec.decode(obj, errors)
|
||||
if not isinstance(res, tuple) or len(res) != 2:
|
||||
raise TypeError("encoder must return a tuple (object, integer)")
|
||||
return res[0]
|
||||
|
||||
|
||||
def latin_1_encode( obj, errors='strict'):
|
||||
"""None
|
||||
"""
|
||||
res = PyUnicode_EncodeLatin1(obj, len(obj), errors)
|
||||
res = ''.join(res)
|
||||
res = bytes(res)
|
||||
return res, len(res)
|
||||
# XXX MBCS codec might involve ctypes ?
|
||||
def mbcs_decode():
|
||||
@@ -189,7 +186,7 @@ def unicode_escape_encode( obj, errors='strict'):
|
||||
"""None
|
||||
"""
|
||||
res = unicodeescape_string(obj, len(obj), 0)
|
||||
res = ''.join(res)
|
||||
res = bytes(res)
|
||||
return res, len(res)
|
||||
|
||||
def latin_1_decode( data, errors='strict'):
|
||||
@@ -229,7 +226,7 @@ def charmap_encode(obj, errors='strict', mapping='latin-1'):
|
||||
"""
|
||||
|
||||
res = PyUnicode_EncodeCharmap(obj, len(obj), mapping, errors)
|
||||
res = ''.join(res)
|
||||
res = bytes(res)
|
||||
return res, len(res)
|
||||
|
||||
if sys.maxunicode == 65535:
|
||||
@@ -241,17 +238,17 @@ def unicode_internal_encode( obj, errors='strict'):
|
||||
"""None
|
||||
"""
|
||||
if type(obj) == str:
|
||||
p = []
|
||||
p = bytearray()
|
||||
t = [ord(x) for x in obj]
|
||||
for i in t:
|
||||
bytes = []
|
||||
b = bytearray()
|
||||
for j in range(unicode_bytes):
|
||||
bytes += chr(i%256)
|
||||
b.append(i%256)
|
||||
i >>= 8
|
||||
if sys.byteorder == "big":
|
||||
bytes.reverse()
|
||||
p += bytes
|
||||
res = ''.join(p)
|
||||
b.reverse()
|
||||
p += b
|
||||
res = bytes(p)
|
||||
return res, len(res)
|
||||
else:
|
||||
res = "You can do better than this" # XXX make this right
|
||||
@@ -350,13 +347,6 @@ def escape_decode(data, errors='strict'):
|
||||
res = ''.join(res)
|
||||
return res, len(res)
|
||||
|
||||
def charbuffer_encode( obj, errors='strict'):
|
||||
"""None
|
||||
"""
|
||||
res = str(obj)
|
||||
res = ''.join(res)
|
||||
return res, len(res)
|
||||
|
||||
def charmap_decode( data, errors='strict', mapping=None):
|
||||
"""None
|
||||
"""
|
||||
@@ -369,7 +359,7 @@ def utf_7_encode( obj, errors='strict'):
|
||||
"""None
|
||||
"""
|
||||
res = PyUnicode_EncodeUTF7(obj, len(obj), 0, 0, errors)
|
||||
res = ''.join(res)
|
||||
res = bytes(res)
|
||||
return res, len(res)
|
||||
|
||||
def mbcs_encode( obj, errors='strict'):
|
||||
@@ -387,42 +377,42 @@ def ascii_encode( obj, errors='strict'):
|
||||
"""None
|
||||
"""
|
||||
res = PyUnicode_EncodeASCII(obj, len(obj), errors)
|
||||
res = ''.join(res)
|
||||
res = bytes(res)
|
||||
return res, len(res)
|
||||
|
||||
def utf_16_encode( obj, errors='strict'):
|
||||
"""None
|
||||
"""
|
||||
res = PyUnicode_EncodeUTF16(obj, len(obj), errors, 'native')
|
||||
res = ''.join(res)
|
||||
res = bytes(res)
|
||||
return res, len(res)
|
||||
|
||||
def raw_unicode_escape_encode( obj, errors='strict'):
|
||||
"""None
|
||||
"""
|
||||
res = PyUnicode_EncodeRawUnicodeEscape(obj, len(obj))
|
||||
res = ''.join(res)
|
||||
res = bytes(res)
|
||||
return res, len(res)
|
||||
|
||||
def utf_8_encode( obj, errors='strict'):
|
||||
"""None
|
||||
"""
|
||||
res = PyUnicode_EncodeUTF8(obj, len(obj), errors)
|
||||
res = ''.join(res)
|
||||
res = bytes(res)
|
||||
return res, len(res)
|
||||
|
||||
def utf_16_le_encode( obj, errors='strict'):
|
||||
"""None
|
||||
"""
|
||||
res = PyUnicode_EncodeUTF16(obj, len(obj), errors, 'little')
|
||||
res = ''.join(res)
|
||||
res = bytes(res)
|
||||
return res, len(res)
|
||||
|
||||
def utf_16_be_encode( obj, errors='strict'):
|
||||
"""None
|
||||
"""
|
||||
res = PyUnicode_EncodeUTF16(obj, len(obj), errors, 'big')
|
||||
res = ''.join(res)
|
||||
res = bytes(res)
|
||||
return res, len(res)
|
||||
|
||||
def utf_16_le_decode( data, errors='strict', byteorder=0, final = 0):
|
||||
@@ -842,15 +832,15 @@ def PyUnicode_DecodeASCII(s, size, errors):
|
||||
pos = 0
|
||||
while pos < len(s):
|
||||
c = s[pos]
|
||||
if ord(c) < 128:
|
||||
p += chr(ord(c))
|
||||
if c < 128:
|
||||
p += chr(c)
|
||||
pos += 1
|
||||
else:
|
||||
|
||||
res = unicode_call_errorhandler(
|
||||
errors, "ascii", "ordinal not in range(128)",
|
||||
s, pos, pos+1)
|
||||
p += [chr(ord(x)) for x in res[0]]
|
||||
p += res[0]
|
||||
pos = res[1]
|
||||
return p
|
||||
|
||||
@@ -1083,12 +1073,12 @@ def PyUnicode_DecodeUTF8Stateful(s, size, errors, final):
|
||||
pos = 0
|
||||
while pos < size:
|
||||
ch = s[pos]
|
||||
if ord(ch) < 0x80:
|
||||
p += ch
|
||||
if ch < 0x80:
|
||||
p += chr(ch)
|
||||
pos += 1
|
||||
continue
|
||||
|
||||
n = utf8_code_length[ord(ch)]
|
||||
n = utf8_code_length[ch]
|
||||
startinpos = pos
|
||||
if (startinpos + n > size):
|
||||
if not final:
|
||||
@@ -1118,7 +1108,7 @@ def PyUnicode_DecodeUTF8Stateful(s, size, errors, final):
|
||||
p += res[0]
|
||||
pos = res[1]
|
||||
elif n == 2:
|
||||
if ((ord(s[pos+1]) & 0xc0) != 0x80):
|
||||
if ((s[pos+1] & 0xc0) != 0x80):
|
||||
errmsg = "invalid data"
|
||||
endinpos = startinpos+2
|
||||
res = unicode_call_errorhandler(
|
||||
@@ -1127,7 +1117,7 @@ def PyUnicode_DecodeUTF8Stateful(s, size, errors, final):
|
||||
p += res[0]
|
||||
pos = res[1]
|
||||
else:
|
||||
c = ((ord(s[pos]) & 0x1f) << 6) + (ord(s[pos+1]) & 0x3f)
|
||||
c = ((s[pos] & 0x1f) << 6) + (s[pos+1] & 0x3f)
|
||||
if c < 0x80:
|
||||
errmsg = "illegal encoding"
|
||||
endinpos = startinpos+2
|
||||
@@ -1141,8 +1131,8 @@ def PyUnicode_DecodeUTF8Stateful(s, size, errors, final):
|
||||
pos += n
|
||||
#break
|
||||
elif n == 3:
|
||||
if ((ord(s[pos+1]) & 0xc0) != 0x80 or
|
||||
(ord(s[pos+2]) & 0xc0) != 0x80):
|
||||
if ((s[pos+1] & 0xc0) != 0x80 or
|
||||
(s[pos+2] & 0xc0) != 0x80):
|
||||
errmsg = "invalid data"
|
||||
endinpos = startinpos+3
|
||||
res = unicode_call_errorhandler(
|
||||
@@ -1151,9 +1141,9 @@ def PyUnicode_DecodeUTF8Stateful(s, size, errors, final):
|
||||
p += res[0]
|
||||
pos = res[1]
|
||||
else:
|
||||
c = ((ord(s[pos]) & 0x0f) << 12) + \
|
||||
((ord(s[pos+1]) & 0x3f) << 6) +\
|
||||
(ord(s[pos+2]) & 0x3f)
|
||||
c = ((s[pos] & 0x0f) << 12) + \
|
||||
((s[pos+1] & 0x3f) << 6) +\
|
||||
(s[pos+2] & 0x3f)
|
||||
|
||||
## /* Note: UTF-8 encodings of surrogates are considered
|
||||
## legal UTF-8 sequences;
|
||||
@@ -1175,9 +1165,9 @@ def PyUnicode_DecodeUTF8Stateful(s, size, errors, final):
|
||||
pos += n
|
||||
elif n == 4:
|
||||
## case 4:
|
||||
if ((ord(s[pos+1]) & 0xc0) != 0x80 or
|
||||
(ord(s[pos+2]) & 0xc0) != 0x80 or
|
||||
(ord(s[pos+3]) & 0xc0) != 0x80):
|
||||
if ((s[pos+1] & 0xc0) != 0x80 or
|
||||
(s[pos+2] & 0xc0) != 0x80 or
|
||||
(s[pos+3] & 0xc0) != 0x80):
|
||||
|
||||
errmsg = "invalid data"
|
||||
startinpos = pos
|
||||
@@ -1188,8 +1178,8 @@ def PyUnicode_DecodeUTF8Stateful(s, size, errors, final):
|
||||
p += res[0]
|
||||
pos = res[1]
|
||||
else:
|
||||
c = ((ord(s[pos+0]) & 0x7) << 18) + ((ord(s[pos+1]) & 0x3f) << 12) +\
|
||||
((ord(s[pos+2]) & 0x3f) << 6) + (ord(s[pos+3]) & 0x3f)
|
||||
c = ((s[pos+0] & 0x7) << 18) + ((s[pos+1] & 0x3f) << 12) +\
|
||||
((s[pos+2] & 0x3f) << 6) + (s[pos+3] & 0x3f)
|
||||
#/* validate and convert to UTF-16 */
|
||||
if ((c < 0x10000) or (c > 0x10ffff)):
|
||||
#/* minimum value allowed for 4 byte encoding */
|
||||
@@ -1239,18 +1229,18 @@ def PyUnicode_EncodeUTF8(s, size, errors):
|
||||
|
||||
#assert(s != None)
|
||||
assert(size >= 0)
|
||||
p = []
|
||||
p = bytearray()
|
||||
i = 0
|
||||
while i < size:
|
||||
ch = s[i]
|
||||
i += 1
|
||||
if (ord(ch) < 0x80):
|
||||
## /* Encode ASCII */
|
||||
p += chr(ord(ch))
|
||||
p.append(ord(ch))
|
||||
elif (ord(ch) < 0x0800) :
|
||||
## /* Encode Latin-1 */
|
||||
p += chr((0xc0 | (ord(ch) >> 6)))
|
||||
p += chr((0x80 | (ord(ch) & 0x3f)))
|
||||
p.append(0xc0 | (ord(ch) >> 6))
|
||||
p.append(0x80 | (ord(ch) & 0x3f))
|
||||
else:
|
||||
## /* Encode UCS2 Unicode ordinals */
|
||||
if (ord(ch) < 0x10000):
|
||||
@@ -1262,24 +1252,24 @@ def PyUnicode_EncodeUTF8(s, size, errors):
|
||||
if (0xDC00 <= ord(ch2) and ord(ch2) <= 0xDFFF) :
|
||||
ch3 = ((ord(ch) - 0xD800) << 10 | (ord(ch2) - 0xDC00)) + 0x10000
|
||||
i += 1
|
||||
p.extend(encodeUCS4(ch3))
|
||||
p += encodeUCS4(ch3)
|
||||
continue
|
||||
## /* Fall through: handles isolated high surrogates */
|
||||
p += (chr((0xe0 | (ord(ch) >> 12))))
|
||||
p += (chr((0x80 | ((ord(ch) >> 6) & 0x3f))))
|
||||
p += (chr((0x80 | (ord(ch) & 0x3f))))
|
||||
p.append(0xe0 | (ord(ch) >> 12))
|
||||
p.append(0x80 | ((ord(ch) >> 6) & 0x3f))
|
||||
p.append(0x80 | (ord(ch) & 0x3f))
|
||||
continue
|
||||
else:
|
||||
p.extend(encodeUCS4(ord(ch)))
|
||||
p += encodeUCS4(ord(ch))
|
||||
return p
|
||||
|
||||
def encodeUCS4(ch):
|
||||
## /* Encode UCS4 Unicode ordinals */
|
||||
p = []
|
||||
p += (chr((0xf0 | (ch >> 18))))
|
||||
p += (chr((0x80 | ((ch >> 12) & 0x3f))))
|
||||
p += (chr((0x80 | ((ch >> 6) & 0x3f))))
|
||||
p += (chr((0x80 | (ch & 0x3f))))
|
||||
p = bytearray()
|
||||
p.append(0xf0 | (ch >> 18))
|
||||
p.append(0x80 | ((ch >> 12) & 0x3f))
|
||||
p.append(0x80 | ((ch >> 6) & 0x3f))
|
||||
p.append(0x80 | (ch & 0x3f))
|
||||
return p
|
||||
|
||||
#/* --- Latin-1 Codec ------------------------------------------------------ */
|
||||
@@ -1291,7 +1281,7 @@ def PyUnicode_DecodeLatin1(s, size, errors):
|
||||
pos = 0
|
||||
p = []
|
||||
while (pos < size):
|
||||
p += chr(ord(s[pos]))
|
||||
p += chr(s[pos])
|
||||
pos += 1
|
||||
return p
|
||||
|
||||
@@ -1305,15 +1295,15 @@ def unicode_encode_ucs1(p, size, errors, limit):
|
||||
encoding = "ascii"
|
||||
|
||||
if (size == 0):
|
||||
return ['']
|
||||
res = []
|
||||
return []
|
||||
res = bytearray()
|
||||
pos = 0
|
||||
while pos < len(p):
|
||||
#for ch in p:
|
||||
ch = p[pos]
|
||||
|
||||
if ord(ch) < limit:
|
||||
res += chr(ord(ch))
|
||||
res.append(ord(ch))
|
||||
pos += 1
|
||||
else:
|
||||
#/* startpos for collecting unencodable chars */
|
||||
@@ -1481,7 +1471,7 @@ def PyUnicode_EncodeRawUnicodeEscape(s, size):
|
||||
if (size == 0):
|
||||
return ''
|
||||
|
||||
p = []
|
||||
p = bytearray()
|
||||
for ch in s:
|
||||
# /* Map 32-bit characters to '\Uxxxxxxxx' */
|
||||
if (ord(ch) >= 0x10000):
|
||||
@@ -1490,12 +1480,10 @@ def PyUnicode_EncodeRawUnicodeEscape(s, size):
|
||||
p += '%08x' % (ord(ch))
|
||||
elif (ord(ch) >= 256) :
|
||||
# /* Map 16-bit characters to '\uxxxx' */
|
||||
p += '\\'
|
||||
p += 'u'
|
||||
p += '%04x' % (ord(ch))
|
||||
p += b'\\u%04x' % (ord(ch))
|
||||
# /* Copy everything else as-is */
|
||||
else:
|
||||
p += chr(ord(ch))
|
||||
p.append(ord(ch))
|
||||
|
||||
#p += '\0'
|
||||
return p
|
||||
|
||||
Reference in New Issue
Block a user