mirror of
https://github.com/RustPython/RustPython.git
synced 2026-06-02 19:39:49 +09:00
1674 lines
52 KiB
Python
Vendored
1674 lines
52 KiB
Python
Vendored
# Note:
|
|
# This *is* now explicitly RPython.
|
|
# Please make sure not to break this.
|
|
|
|
"""
|
|
|
|
_codecs -- Provides access to the codec registry and the builtin
|
|
codecs.
|
|
|
|
This module should never be imported directly. The standard library
|
|
module "codecs" wraps this builtin module for use within Python.
|
|
|
|
The codec registry is accessible via:
|
|
|
|
register(search_function) -> None
|
|
|
|
lookup(encoding) -> (encoder, decoder, stream_reader, stream_writer)
|
|
|
|
The builtin Unicode codecs use the following interface:
|
|
|
|
<encoding>_encode(Unicode_object[,errors='strict']) ->
|
|
(string object, bytes consumed)
|
|
|
|
<encoding>_decode(char_buffer_obj[,errors='strict']) ->
|
|
(Unicode object, bytes consumed)
|
|
|
|
<encoding>_encode() interfaces also accept non-Unicode object as
|
|
input. The objects are then converted to Unicode using
|
|
PyUnicode_FromObject() prior to applying the conversion.
|
|
|
|
These <encoding>s are available: utf_8, unicode_escape,
|
|
raw_unicode_escape, unicode_internal, latin_1, ascii (7-bit),
|
|
mbcs (on win32).
|
|
|
|
|
|
Written by Marc-Andre Lemburg (mal@lemburg.com).
|
|
|
|
Copyright (c) Corporation for National Research Initiatives.
|
|
|
|
From PyPy v1.0.0
|
|
|
|
"""
|
|
#from unicodecodec import *
|
|
|
|
# Public names of this module, one per line.
__all__ = [
    'register',
    'lookup',
    'lookup_error',
    'register_error',
    'encode',
    'decode',
    'latin_1_encode',
    'mbcs_decode',
    'readbuffer_encode',
    'escape_encode',
    'utf_8_decode',
    'raw_unicode_escape_decode',
    'utf_7_decode',
    'unicode_escape_encode',
    'latin_1_decode',
    'utf_16_decode',
    'unicode_escape_decode',
    'ascii_decode',
    'charmap_encode',
    'unicode_internal_encode',
    'unicode_internal_decode',
    'utf_16_ex_decode',
    'escape_decode',
    'charmap_decode',
    'utf_7_encode',
    'mbcs_encode',
    'ascii_encode',
    'utf_16_encode',
    'raw_unicode_escape_encode',
    'utf_8_encode',
    'utf_16_le_encode',
    'utf_16_be_encode',
    'utf_16_le_decode',
    'utf_16_be_decode',
]
|
|
|
|
import sys
|
|
# --- Registry ---------------------------------------------------------------
# Search functions, in registration order (appended to by register()).
codec_search_path = list()
# Results of successful searches, keyed by normalized encoding name (see lookup()).
codec_search_cache = dict()
# Error handlers by name (see register_error() / lookup_error()).
codec_error_registry = dict()
# Non-empty until lookup() has imported the "encodings" package once.
codec_need_encodings = [True]
|
|
|
|
def register(search_function):
    """register(search_function)

    Add a codec search function to the registry.  A search function takes
    the encoding name in all lower case letters and returns a tuple of
    functions (encoder, decoder, stream_reader, stream_writer).

    Raises TypeError when search_function is not callable.
    """
    if callable(search_function):
        codec_search_path.append(search_function)
        return
    raise TypeError("argument must be callable")
|
|
|
|
|
|
def lookup(encoding):
    """lookup(encoding) -> (encoder, decoder, stream_reader, stream_writer)

    Return the codec entry for encoding.  The name is normalized (spaces
    become hyphens, letters are lower-cased), looked up in the cache and
    otherwise passed to each registered search function in turn; the first
    truthy result is cached and returned.

    Raises TypeError for a non-string name and LookupError when no search
    function is registered or none of them knows the encoding.
    """
    if not isinstance(encoding, str):
        raise TypeError("Encoding must be a string")
    key = encoding.replace(" ", "-")
    key = key.lower()

    cached = codec_search_cache.get(key, None)
    if cached:
        return cached

    if codec_need_encodings:
        # Importing the package is done for its side effects only.
        import encodings
        if not len(codec_search_path):
            raise LookupError("no codec search functions registered: can't find encoding")
        del codec_need_encodings[:]

    for search in codec_search_path:
        found = search(key)
        if found:
            codec_search_cache[key] = found
            return found

    raise LookupError("unknown encoding: %s" % encoding)
|
|
|
|
|
|
|
|
def encode(v, encoding=None, errors='strict'):
    """encode(obj, [encoding[,errors]]) -> object

    Encodes obj using the codec registered for encoding. encoding defaults
    to the default encoding. errors may be given to set a different error
    handling scheme. Default is 'strict' meaning that encoding errors raise
    a ValueError. Other possible values are 'ignore', 'replace' and
    'xmlcharrefreplace' as well as any other name registered with
    codecs.register_error that can handle ValueErrors.

    Raises TypeError when encoding or errors is not a string, or when the
    codec's encoder does not return a 2-tuple.
    """
    # Identity test: an object whose __eq__ claims equality with None must
    # not be silently replaced by the default encoding.
    if encoding is None:
        encoding = sys.getdefaultencoding()
    if not isinstance(encoding, str):
        raise TypeError("Encoding must be a string")
    if not isinstance(errors, str):
        raise TypeError("Errors must be a string")
    codec = lookup(encoding)
    res = codec.encode(v, errors)
    if not isinstance(res, tuple) or len(res) != 2:
        raise TypeError("encoder must return a tuple (object, integer)")
    return res[0]
|
|
|
|
def decode(obj, encoding=None, errors='strict'):
    """decode(obj, [encoding[,errors]]) -> object

    Decodes obj using the codec registered for encoding. encoding defaults
    to the default encoding. errors may be given to set a different error
    handling scheme. Default is 'strict' meaning that encoding errors raise
    a ValueError. Other possible values are 'ignore' and 'replace'
    as well as any other name registered with codecs.register_error that is
    able to handle ValueErrors.

    Raises TypeError when encoding or errors is not a string, or when the
    codec's decoder does not return a 2-tuple.
    """
    # Identity test: an object whose __eq__ claims equality with None must
    # not be silently replaced by the default encoding.
    if encoding is None:
        encoding = sys.getdefaultencoding()
    if not isinstance(encoding, str):
        raise TypeError("Encoding must be a string")
    if not isinstance(errors, str):
        raise TypeError("Errors must be a string")
    codec = lookup(encoding)
    res = codec.decode(obj, errors)
    if not isinstance(res, tuple) or len(res) != 2:
        # This is the decoder's result, so name the decoder in the message.
        raise TypeError("decoder must return a tuple (object, integer)")
    return res[0]
|
|
|
|
|
|
def latin_1_encode(obj, errors='strict'):
    """Encode obj through PyUnicode_EncodeLatin1.

    Returns (bytes, consumed) where consumed is the input length.
    """
    encoded = bytes(PyUnicode_EncodeLatin1(obj, len(obj), errors))
    # The codec interface reports how much *input* was consumed, which can
    # differ from the output size when an error handler is involved.
    return encoded, len(obj)
|
|
# XXX MBCS codec might involve ctypes ?
|
|
def mbcs_decode():
    """Placeholder for the MBCS decoder; does nothing and returns None."""
    return None
|
|
|
|
def readbuffer_encode(obj, errors='strict'):
    """Return the raw bytes of obj and their length.

    A str is encoded as UTF-8; any other argument must support the buffer
    protocol (bytes, bytearray, memoryview, ...), otherwise memoryview()
    raises TypeError.  errors is accepted for interface compatibility and
    is not used.
    """
    if isinstance(obj, str):
        data = obj.encode('utf-8')
    else:
        # str(obj) would produce the textual repr ("b'...'") of a bytes
        # object; copy the buffer contents instead.
        data = bytes(memoryview(obj))
    return data, len(data)
|
|
|
|
def escape_encode(obj, errors='strict'):
    """Backslash-escape obj.

    For bytes/bytearray input the result is (bytes, len(obj)): backslash
    and single quote are prefixed with a backslash, tab/newline/carriage
    return become \\t, \\n, \\r, and every other byte outside 0x20..0x7e
    becomes \\xhh.

    Any other input keeps the historical behaviour: repr(obj) with its
    first and last character stripped, paired with the length of that text.
    errors is not used.
    """
    if not isinstance(obj, (bytes, bytearray)):
        v = repr(obj)[1:-1]
        return v, len(v)

    # repr() of bytes has a leading "b" and picks its quote character
    # depending on the content, so the escaping is done explicitly here.
    out = bytearray()
    for byte in obj:
        if byte == 0x5C or byte == 0x27:      # backslash, single quote
            out.append(0x5C)
            out.append(byte)
        elif byte == 0x09:
            out += b'\\t'
        elif byte == 0x0A:
            out += b'\\n'
        elif byte == 0x0D:
            out += b'\\r'
        elif byte < 0x20 or byte >= 0x7F:
            out += ('\\x%02x' % byte).encode('ascii')
        else:
            out.append(byte)
    return bytes(out), len(obj)
|
|
|
|
def utf_8_decode(data, errors='strict', final=False):
    """Decode data through PyUnicode_DecodeUTF8Stateful.

    Returns (text, consumed) with consumed exactly as reported by the helper.
    """
    chars, consumed = PyUnicode_DecodeUTF8Stateful(data, len(data), errors, final)
    return ''.join(chars), consumed
|
|
|
|
def raw_unicode_escape_decode(data, errors='strict'):
    """Decode data through PyUnicode_DecodeRawUnicodeEscape.

    Returns (text, consumed) where consumed is len(data).
    """
    text = ''.join(PyUnicode_DecodeRawUnicodeEscape(data, len(data), errors))
    # Report the input consumed, not the number of characters produced;
    # stream readers use this value to advance through their buffer.
    return text, len(data)
|
|
|
|
def utf_7_decode(data, errors='strict'):
    """Decode data through PyUnicode_DecodeUTF7.

    Returns (text, consumed) where consumed is len(data).
    """
    text = ''.join(PyUnicode_DecodeUTF7(data, len(data), errors))
    # Report the input consumed, not the number of characters produced;
    # stream readers use this value to advance through their buffer.
    return text, len(data)
|
|
|
|
def unicode_escape_encode(obj, errors='strict'):
    """Encode obj through unicodeescape_string (without quotes).

    Returns (bytes, consumed) where consumed is len(obj).
    errors is not used.
    """
    chars = unicodeescape_string(obj, len(obj), 0)
    # unicodeescape_string yields 1-character strings, which bytes() cannot
    # consume directly; everything it emits is ASCII.
    encoded = ''.join(chars).encode('ascii')
    return encoded, len(obj)
|
|
|
|
def latin_1_decode(data, errors='strict'):
    """Decode data through PyUnicode_DecodeLatin1.

    Returns (text, consumed) where consumed is len(data).
    """
    text = ''.join(PyUnicode_DecodeLatin1(data, len(data), errors))
    # Report the input consumed, not the number of characters produced.
    return text, len(data)
|
|
|
|
def utf_16_decode(data, errors='strict', final=False):
    """Decode data as UTF-16 in 'native' mode via PyUnicode_DecodeUTF16Stateful.

    Returns (text, consumed); the byte order reported by the helper is
    discarded.
    """
    chars, consumed, _order = PyUnicode_DecodeUTF16Stateful(
        data, len(data), errors, 'native', final)
    return ''.join(chars), consumed
|
|
|
|
def unicode_escape_decode(data, errors='strict'):
    """Decode data through PyUnicode_DecodeUnicodeEscape.

    Returns (text, consumed) where consumed is len(data).
    """
    text = ''.join(PyUnicode_DecodeUnicodeEscape(data, len(data), errors))
    # Report the input consumed, not the number of characters produced.
    return text, len(data)
|
|
|
|
|
|
def ascii_decode(data, errors='strict'):
    """Decode data through PyUnicode_DecodeASCII.

    Returns (text, consumed) where consumed is len(data).
    """
    text = ''.join(PyUnicode_DecodeASCII(data, len(data), errors))
    # With handlers such as 'ignore' the text is shorter than the input;
    # the codec interface wants the amount of input consumed.
    return text, len(data)
|
|
|
|
def charmap_encode(obj, errors='strict', mapping='latin-1'):
    """Encode obj through PyUnicode_EncodeCharmap using mapping.

    Returns (bytes, consumed) where consumed is len(obj).
    """
    encoded = bytes(PyUnicode_EncodeCharmap(obj, len(obj), mapping, errors))
    # The codec interface reports the input consumed, not the output size.
    return encoded, len(obj)
|
|
|
|
# Bytes per code point used by the unicode_internal codec below.
unicode_bytes = 2 if sys.maxunicode == 65535 else 4
|
|
|
|
def unicode_internal_encode(obj, errors='strict'):
    """Return the fixed-width, native-byte-order encoding of obj.

    For a str every code point is written as unicode_bytes bytes in
    sys.byteorder order and the result is (bytes, len(obj)).  Any other
    argument must support the buffer protocol; its bytes are returned
    unchanged together with their length (memoryview() raises TypeError
    otherwise).  errors is not used.
    """
    if isinstance(obj, str):
        encoded = b''.join(
            ord(ch).to_bytes(unicode_bytes, sys.byteorder) for ch in obj)
        return encoded, len(obj)
    # Previously a placeholder string was returned here.
    raw = bytes(memoryview(obj))
    return raw, len(raw)
|
|
|
|
def unicode_internal_decode(unistr, errors='strict'):
    """Decode fixed-width, native-byte-order data into text.

    A str is returned unchanged together with its length.  Any other
    argument must support the buffer protocol: each complete group of
    unicode_bytes bytes (in sys.byteorder order) becomes one character and
    a trailing incomplete group is left undecoded.  The second item of the
    result is then the number of bytes that were decoded.  errors is not used.
    """
    if isinstance(unistr, str):
        return unistr, len(unistr)

    raw = bytes(memoryview(unistr))
    usable = len(raw) - len(raw) % unicode_bytes
    # Indexing bytes yields ints, so the groups are converted with
    # int.from_bytes rather than ord().
    chars = [chr(int.from_bytes(raw[i:i + unicode_bytes], sys.byteorder))
             for i in range(0, usable, unicode_bytes)]
    return ''.join(chars), usable
|
|
|
|
def utf_16_ex_decode(data, errors='strict', byteorder=0, final=0):
    """Decode data as UTF-16 with a selectable byte order.

    byteorder 0 selects 'native', -1 selects 'little' and any other value
    selects 'big'.  Returns (text, consumed, byteorder) with the last two
    items as reported by PyUnicode_DecodeUTF16Stateful.
    """
    order = 'native' if byteorder == 0 else ('little' if byteorder == -1 else 'big')
    chars, consumed, byteorder = PyUnicode_DecodeUTF16Stateful(
        data, len(data), errors, order, final)
    return ''.join(chars), consumed, byteorder
|
|
|
|
def escape_decode(data, errors='strict'):
    """Interpret backslash escapes in data.

    Recognised escapes: \\\\ \\' \\" \\n \\t \\r \\b \\f \\a \\v, backslash
    followed by a newline (removed), one to three octal digits (value
    wrapped to 8 bits, so \\400 equals \\000) and \\x followed by exactly
    two hex digits.  Any other escape is copied through unchanged,
    backslash included.

    str input gives a str result; bytes-like input gives bytes.  The second
    item of the result is len(data).

    Raises ValueError for a trailing backslash, and for a malformed \\x
    escape when errors is 'strict' (with 'replace' a '?' is emitted, with
    'ignore' nothing is emitted).
    """
    simple = {
        '\\': '\\', '\'': '\'', '"': '"',
        'n': '\n', 't': '\t', 'r': '\r', 'b': '\b',
        'f': '\f', 'a': '\a', 'v': '\v',
        '\n': '',                       # line continuation
    }
    octdigits = '01234567'
    hexdigits = '0123456789abcdefABCDEF'

    as_bytes = not isinstance(data, str)
    # latin-1 maps bytes 0..255 onto code points 0..255 one-to-one, so the
    # same character-based loop serves both input types.
    text = bytes(data).decode('latin-1') if as_bytes else data

    end = len(text)
    out = []
    i = 0
    while i < end:
        ch = text[i]
        i += 1
        if ch != '\\':
            out.append(ch)
            continue
        if i >= end:
            raise ValueError("Trailing \\ in string")
        esc = text[i]
        i += 1
        if esc in simple:
            out.append(simple[esc])
        elif esc in octdigits:
            # Up to two more octal digits may follow the first one.
            j = i
            while j < end and j < i + 2 and text[j] in octdigits:
                j += 1
            out.append(chr(int(text[i - 1:j], 8) & 0xFF))
            i = j
        elif esc == 'x':
            digits = text[i:i + 2]
            if len(digits) == 2 and digits[0] in hexdigits and digits[1] in hexdigits:
                out.append(chr(int(digits, 16)))
                i += 2
            elif errors == 'strict':
                raise ValueError("invalid \\x escape at position %d" % (i - 2))
            elif errors in ('replace', 'ignore'):
                if errors == 'replace':
                    out.append('?')
                # Skip a single stray hex digit, if there is one.
                if i < end and text[i] in hexdigits:
                    i += 1
            else:
                raise ValueError(
                    "decoding error; unknown error handling code: %.400s" % errors)
        else:
            # Unknown escape: keep it verbatim instead of dropping it.
            out.append('\\')
            out.append(esc)

    result = ''.join(out)
    if as_bytes:
        result = result.encode('latin-1')
    return result, len(data)
|
|
|
|
def charmap_decode(data, errors='strict', mapping=None):
    """Decode data through PyUnicode_DecodeCharmap using mapping.

    Returns (text, consumed) where consumed is len(data).
    """
    text = ''.join(PyUnicode_DecodeCharmap(data, len(data), mapping, errors))
    # Report the input consumed, not the number of characters produced.
    return text, len(data)
|
|
|
|
|
|
def utf_7_encode(obj, errors='strict'):
    """Encode obj through PyUnicode_EncodeUTF7.

    Both optional-character flags of the helper are passed as 0.
    Returns (bytes, consumed) where consumed is len(obj).
    """
    chars = PyUnicode_EncodeUTF7(obj, len(obj), 0, 0, errors)
    # The helper builds a list of 1-character strings, which bytes() cannot
    # consume directly; UTF-7 output is pure ASCII.
    encoded = ''.join(chars).encode('ascii')
    return encoded, len(obj)
|
|
|
|
def mbcs_encode(obj, errors='strict'):
    """Placeholder for the MBCS encoder; does nothing and returns None.

    Both arguments are ignored.
    """
    return None
|
|
|
|
|
|
def ascii_encode(obj, errors='strict'):
    """Encode obj through PyUnicode_EncodeASCII.

    Returns (bytes, consumed) where consumed is len(obj).
    """
    encoded = bytes(PyUnicode_EncodeASCII(obj, len(obj), errors))
    # With handlers such as 'ignore' or 'xmlcharrefreplace' the output size
    # differs from the input; the interface wants the input consumed.
    return encoded, len(obj)
|
|
|
|
def utf_16_encode(obj, errors='strict'):
    """Encode obj as UTF-16 in 'native' mode via PyUnicode_EncodeUTF16.

    Returns (bytes, consumed) where consumed is len(obj).
    """
    chars = PyUnicode_EncodeUTF16(obj, len(obj), errors, 'native')
    # The helper yields 1-character strings with code points 0..255;
    # latin-1 turns each of them into the byte of the same value.
    encoded = ''.join(chars).encode('latin-1')
    return encoded, len(obj)
|
|
|
|
def raw_unicode_escape_encode(obj, errors='strict'):
    """Encode obj through PyUnicode_EncodeRawUnicodeEscape.

    errors is not passed on to the helper.
    Returns (bytes, consumed) where consumed is len(obj).
    """
    encoded = bytes(PyUnicode_EncodeRawUnicodeEscape(obj, len(obj)))
    # The codec interface reports the input consumed, not the output size.
    return encoded, len(obj)
|
|
|
|
def utf_8_encode(obj, errors='strict'):
    """Encode obj through PyUnicode_EncodeUTF8.

    Returns (bytes, consumed) where consumed is len(obj).
    """
    encoded = bytes(PyUnicode_EncodeUTF8(obj, len(obj), errors))
    # Non-ASCII characters take several bytes, so the output size is not
    # the amount of input consumed.
    return encoded, len(obj)
|
|
|
|
def utf_16_le_encode(obj, errors='strict'):
    """Encode obj as little-endian UTF-16 via PyUnicode_EncodeUTF16.

    Returns (bytes, consumed) where consumed is len(obj).
    """
    chars = PyUnicode_EncodeUTF16(obj, len(obj), errors, 'little')
    # The helper yields 1-character strings with code points 0..255;
    # latin-1 turns each of them into the byte of the same value.
    encoded = ''.join(chars).encode('latin-1')
    return encoded, len(obj)
|
|
|
|
def utf_16_be_encode(obj, errors='strict'):
    """Encode obj as big-endian UTF-16 via PyUnicode_EncodeUTF16.

    Returns (bytes, consumed) where consumed is len(obj).
    """
    chars = PyUnicode_EncodeUTF16(obj, len(obj), errors, 'big')
    # The helper yields 1-character strings with code points 0..255;
    # latin-1 turns each of them into the byte of the same value.
    encoded = ''.join(chars).encode('latin-1')
    return encoded, len(obj)
|
|
|
|
def utf_16_le_decode(data, errors='strict', byteorder=0, final = 0):
    """Decode data as little-endian UTF-16 via PyUnicode_DecodeUTF16Stateful.

    The byteorder argument is ignored.  Returns (text, consumed).
    """
    chars, consumed, _order = PyUnicode_DecodeUTF16Stateful(
        data, len(data), errors, 'little', final)
    return ''.join(chars), consumed
|
|
|
|
def utf_16_be_decode(data, errors='strict', byteorder=0, final = 0):
    """Decode data as big-endian UTF-16 via PyUnicode_DecodeUTF16Stateful.

    The byteorder argument is ignored.  Returns (text, consumed).
    """
    chars, consumed, _order = PyUnicode_DecodeUTF16Stateful(
        data, len(data), errors, 'big', final)
    return ''.join(chars), consumed
|
|
|
|
|
|
class _PretendCFunction:
    """Callable wrapper around a function.

    Calls, repr() and str() are forwarded to the wrapped function, which is
    reachable through the read-only ``_f`` property.
    """

    def __init__(self, f):
        # Store under the literal key '__f'.  Writing ``self.__f = f`` inside
        # a class body is name-mangled to ``_PretendCFunction__f``, and the
        # ``_f`` property below, which reads ``__dict__['__f']``, would then
        # raise KeyError.
        self.__dict__['__f'] = f

    @property
    def _f(self):
        return self.__dict__['__f']

    def __repr__(self):
        return repr(self._f)

    def __str__(self):
        return str(self._f)

    def __call__(self, *args, **kwargs):
        return self._f(*args, **kwargs)
|
|
|
|
# encodings expect most of the *_encode / *_decode functions to be builtin
# functions, so each of them is rebound to a _PretendCFunction wrapper.
_namespace = globals()
for _export in __all__:
    if _export.endswith(('_encode', '_decode')):
        _namespace[_export] = _PretendCFunction(_namespace[_export])
del _namespace, _export
|
|
|
|
|
|
def strict_errors(exc):
    """'strict' error handler: re-raise exc.

    Raises TypeError when exc is not an Exception instance.
    """
    if not isinstance(exc, Exception):
        raise TypeError("codec must pass exception instance")
    raise exc
|
|
|
|
def ignore_errors(exc):
    """'ignore' error handler: drop the offending input.

    Returns ('', exc.end) for UnicodeEncodeError, UnicodeDecodeError and
    UnicodeTranslateError; raises TypeError for anything else.
    """
    if isinstance(exc, (UnicodeEncodeError, UnicodeDecodeError, UnicodeTranslateError)):
        return '', exc.end
    # Format the type, as the xmlcharrefreplace/backslashreplace handlers do:
    # formatting exc itself breaks when exc happens to be a tuple.
    raise TypeError("don't know how to handle %.400s in error callback" % type(exc))
|
|
|
|
Py_UNICODE_REPLACEMENT_CHARACTER = "\ufffd"


def replace_errors(exc):
    """'replace' error handler.

    For UnicodeEncodeError returns one '?' per offending character; for
    UnicodeDecodeError and UnicodeTranslateError returns one U+FFFD per
    offending item.  The second item of the result is exc.end.
    Raises TypeError for anything else.
    """
    if isinstance(exc, UnicodeEncodeError):
        return '?' * (exc.end - exc.start), exc.end
    if isinstance(exc, (UnicodeTranslateError, UnicodeDecodeError)):
        return Py_UNICODE_REPLACEMENT_CHARACTER * (exc.end - exc.start), exc.end
    # Format the type, as the xmlcharrefreplace/backslashreplace handlers do:
    # formatting exc itself breaks when exc happens to be a tuple.
    raise TypeError("don't know how to handle %.400s in error callback" % type(exc))
|
|
|
|
def xmlcharrefreplace_errors(exc):
    """'xmlcharrefreplace' error handler.

    For a UnicodeEncodeError returns the offending characters rewritten as
    decimal XML character references, together with exc.end.
    Raises TypeError for anything else.
    """
    if not isinstance(exc, UnicodeEncodeError):
        raise TypeError("don't know how to handle %.400s in error callback"%type(exc))
    offending = exc.object[exc.start:exc.end]
    references = ''.join('&#%d;' % ord(ch) for ch in offending)
    return references, exc.end
|
|
|
|
def backslashreplace_errors(exc):
    """'backslashreplace' error handler.

    For a UnicodeEncodeError returns the offending characters rewritten as
    \\xhh, \\uhhhh or \\Uhhhhhhhh escapes (the shortest form that fits the
    code point), together with exc.end.  Raises TypeError for anything else.
    """
    if not isinstance(exc, UnicodeEncodeError):
        raise TypeError("don't know how to handle %.400s in error callback"%type(exc))
    pieces = []
    for ch in exc.object[exc.start:exc.end]:
        code = ord(ch)
        if code >= 0x00010000:
            template = "\\U%.8x"
        elif code >= 0x100:
            template = "\\u%.4x"
        else:
            template = "\\x%.2x"
        pieces.append(template % code)
    return ''.join(pieces), exc.end
|
|
|
|
|
|
# ----------------------------------------------------------------------
|
|
|
|
##import sys
|
|
##""" Python implementation of CPythons builtin unicode codecs.
|
|
##
|
|
## Generally the functions in this module take a list of characters and return
|
|
## a list of characters.
|
|
##
|
|
## For use in the PyPy project"""
|
|
|
|
|
|
## indicate whether a UTF-7 character is special i.e. cannot be directly
|
|
## encoded:
|
|
## 0 - not special
|
|
## 1 - special
|
|
## 2 - whitespace (optional)
|
|
## 3 - RFC2152 Set O (optional)
|
|
|
|
# Classification of the 128 ASCII code points for UTF-7, indexed by ordinal
# (0 = not special, 1 = special, 2 = whitespace, 3 = RFC2152 Set O).
utf7_special = [
    # 0x00 - 0x0f
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    # 0x10 - 0x1f
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    # 0x20 - 0x2f
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    # 0x30 - 0x3f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    # 0x40 - 0x4f
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    # 0x50 - 0x5f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    # 0x60 - 0x6f
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    # 0x70 - 0x7f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
]
unicode_latin1 = [None] * 256
|
|
|
|
|
|
def lookup_error(errors):
    """lookup_error(errors) -> handler

    Return the handler registered under the name errors.
    Raises LookupError when no handler is registered under that name.
    """
    try:
        handler = codec_error_registry[errors]
    except KeyError:
        raise LookupError("unknown error handler name %s"%errors)
    else:
        return handler
|
|
|
|
def register_error(errors, handler):
    """register_error(errors, handler)

    Store handler in the error-handler registry under the name errors,
    replacing any handler already stored under that name.  handler is
    called with an exception instance describing the encoding/decoding
    error and must return a (replacement, new position) tuple.

    Raises TypeError when handler is not callable.
    """
    if not callable(handler):
        raise TypeError("handler must be callable")
    codec_error_registry[errors] = handler
|
|
|
|
# Install the handlers defined above under their standard names.
for _name, _handler in (
        ("strict", strict_errors),
        ("ignore", ignore_errors),
        ("replace", replace_errors),
        ("xmlcharrefreplace", xmlcharrefreplace_errors),
        ("backslashreplace", backslashreplace_errors),
):
    register_error(_name, _handler)
del _name, _handler
|
|
|
|
def SPECIAL(c, encodeO, encodeWS):
    """Tell whether character c needs base64 encoding in UTF-7.

    True for code points above 127 and for table class 1.  Class 2
    (whitespace) counts only when encodeWS is set and class 3 (Set O) only
    when encodeO is set.
    """
    code = ord(c)
    if code > 127 or utf7_special[code] == 1:
        return True
    return (encodeWS and (utf7_special[code] == 2)) or \
        (encodeO and (utf7_special[code] == 3))
|
|
def B64(n):
    """Return the base64 alphabet character for the low six bits of n."""
    alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
    index = n & 0x3f
    return alphabet[index]
|
|
def B64CHAR(c):
    """Tell whether the single character c belongs to the base64 alphabet."""
    # isalnum() alone also accepts non-ASCII letters and digits, which are
    # not base64 characters, so the ordinal is checked as well.
    return (c.isalnum() and ord(c) < 128) or c == '+' or c == '/'
|
|
def UB64(c):
    """Return the 6-bit value of base64 character c.

    No validation is done: callers are expected to pass a character from
    the base64 alphabet.
    """
    if c == '+':
        return 62
    if c == '/':
        return 63
    if c >= 'a':
        offset = -71
    elif c >= 'A':
        offset = -65
    else:
        offset = 4
    return ord(c) + offset
|
|
|
|
def ENCODE(ch, bits):
    """Emit base64 characters for the buffered bits of ch, six at a time.

    Returns (characters, remaining) where remaining (always below 6) is
    the number of low bits of ch that have not been emitted yet.
    """
    chars = []
    remaining = bits
    while remaining >= 6:
        chars += B64(ch >> (remaining - 6))
        remaining -= 6
    return chars, remaining
|
|
|
|
def PyUnicode_DecodeUTF7(s, size, errors):
    """Decode the UTF-7 sequence s of length size.

    Returns str('') when size is 0, otherwise the list p of decoded
    characters.  Outside a shift sequence characters are copied through,
    "+-" yields a literal '+', and a lone '+' opens a base64 shift
    sequence whose bits are accumulated in charsleft/bitsleft and emitted
    16 at a time when the sequence ends.

    NOTE(review): every element s[i] is compared with 1-character strings,
    so s is presumably expected to yield characters, not ints -- confirm
    what the callers pass.
    NOTE(review): UnicodeDecodeError is raised below with a single
    argument; the builtin exception takes five, so these raise statements
    look like they would fail with TypeError -- confirm.
    """

    # Several of these locals are assigned but never read below.
    starts = s
    errmsg = ""
    inShift = 0
    bitsleft = 0
    charsleft = 0
    surrogate = 0
    p = []
    errorHandler = None
    exc = None

    if (size == 0):
        return str('')
    i = 0
    while i < size:

        ch = s[i]
        if (inShift):
            if ((ch == '-') or not B64CHAR(ch)):
                # End of the shift sequence: flush the complete 16-bit units.
                inShift = 0
                i += 1

                while (bitsleft >= 16):
                    outCh = ((charsleft) >> (bitsleft-16)) & 0xffff
                    bitsleft -= 16

                    if (surrogate):
                        # We have already generated an error for the high surrogate
                        # so let's not bother seeing if the low surrogate is correct or not
                        surrogate = 0
                    elif (0xDC00 <= (outCh) and (outCh) <= 0xDFFF):
                        # This is a surrogate pair. Unfortunately we can't represent
                        # it in a 16-bit character
                        surrogate = 1
                        msg = "code pairs are not supported"
                        out, x = unicode_call_errorhandler(errors, 'utf-7', msg, s, i-1, i)
                        p += out
                        bitsleft = 0
                        break
                    else:
                        p += chr(outCh )
                        #p += out
                if (bitsleft >= 6):
                    # The shift sequence has a partial character in it. If
                    # bitsleft < 6 then we could just classify it as padding
                    # but that is not the case here.
                    # The handler's result (out, x) is not used afterwards.
                    msg = "partial character in shift sequence"
                    out, x = unicode_call_errorhandler(errors, 'utf-7', msg, s, i-1, i)

                # According to RFC2152 the remaining bits should be zero.
                # A check for non-zero padding bits existed in the C original
                # but is not performed here.
                if (ch == '-') :
                    if ((i < size) and (s[i] == '-')) :
                        p += '-'
                        inShift = 1

                elif SPECIAL(ch, 0, 0) :
                    raise UnicodeDecodeError("unexpected special character")

                else:
                    p += ch
            else:
                # Another base64 character: append its six bits.
                charsleft = (charsleft << 6) | UB64(ch)
                bitsleft += 6
                i += 1
        elif ( ch == '+' ):
            startinpos = i
            i += 1
            if (i<size and s[i] == '-'):
                # "+-" encodes a literal '+'.
                i += 1
                p += '+'
            else:
                inShift = 1
                bitsleft = 0

        elif (SPECIAL(ch, 0, 0)):
            i += 1
            raise UnicodeDecodeError("unexpected special character")
        else:
            p += ch
            i += 1

    if (inShift) :
        #XXX This aint right
        endinpos = size
        raise UnicodeDecodeError("unterminated shift sequence")

    return p
|
|
|
|
def PyUnicode_EncodeUTF7(s, size, encodeSetO, encodeWhiteSpace, errors):
    """Encode the characters of s as UTF-7.

    Returns the list out of output characters.  encodeSetO and
    encodeWhiteSpace are forwarded to SPECIAL() to decide which characters
    must go through a base64 shift sequence.  errors is not used in this
    function.

    NOTE(review): i is taken to be incremented once per loop iteration so
    that s[i+1] is the character after ch; the nesting of that statement
    was inferred while restoring the indentation -- confirm.
    """

    # It might be possible to tighten this worst case
    inShift = False
    i = 0
    bitsleft = 0
    charsleft = 0
    out = []
    for ch in s:
        if (not inShift) :
            if (ch == '+'):
                # A literal '+' is written as "+-".
                out += '+'
                out += '-'
            elif (SPECIAL(ch, encodeSetO, encodeWhiteSpace)):
                # Open a shift sequence with this character's 16 bits.
                charsleft = ord(ch)
                bitsleft = 16
                out += '+'
                p, bitsleft = ENCODE( charsleft, bitsleft)
                out += p
                inShift = bitsleft > 0
            else:
                out += chr(ord(ch))
        else:
            if (not SPECIAL(ch, encodeSetO, encodeWhiteSpace)):
                # Flush the pending bits, padded on the right.
                out += B64((charsleft) << (6-bitsleft))
                charsleft = 0
                bitsleft = 0
                # Characters not in the BASE64 set implicitly unshift the sequence
                # so no '-' is required, except if the character is itself a '-'
                if (B64CHAR(ch) or ch == '-'):
                    out += '-'
                inShift = False
                out += chr(ord(ch))
            else:
                bitsleft += 16
                charsleft = (((charsleft) << 16) | ord(ch))
                p, bitsleft = ENCODE(charsleft, bitsleft)
                out += p
                # If the next character is special then we don't need to terminate
                # the shift sequence. If the next character is not a BASE64 character
                # or '-' then the shift sequence will be terminated implicitly and we
                # don't have to insert a '-'.

                if (bitsleft == 0):
                    if (i + 1 < size):
                        ch2 = s[i+1]

                        if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)):
                            pass
                        elif (B64CHAR(ch2) or ch2 == '-'):
                            out += '-'
                            inShift = False
                        else:
                            inShift = False
                    else:
                        out += '-'
                        inShift = False
        i += 1

    if (bitsleft):
        # Flush the bits still pending at the end of the input.
        out += B64(charsleft << (6-bitsleft) )
        out += '-'

    return out
|
|
|
|
unicode_empty = ''


def unicodeescape_string(s, size, quotes):
    """Return the unicode-escape form of the first size characters of s.

    The result is a list of 1-character strings containing only ASCII:
    backslashes are doubled, tab/newline/carriage return become \\t, \\n,
    \\r, other characters outside 0x20..0x7e become \\xhh, \\uhhhh or
    \\Uhhhhhhhh, and a high surrogate followed by a low surrogate is
    combined into a single \\Uhhhhhhhh escape.

    When quotes is true the result is wrapped as u'...' (or u"..." when s
    contains a single quote but no double quote) and the chosen quote
    character is escaped inside.
    """
    p = []
    quote = None
    if quotes:
        p += 'u'
        quote = '"' if (s.find('\'') != -1 and s.find('"') == -1) else '\''
        p += quote

    pos = 0
    while pos < size:
        ch = s[pos]
        code = ord(ch)
        pos += 1

        if ch == '\\' or (quotes and ch == quote):
            # Backslashes are escaped whether or not quotes are requested.
            p += '\\'
            p += ch
        elif code >= 0x10000:
            p += '\\U%08x' % code
        elif (0xD800 <= code < 0xDC00 and pos < size
                and 0xDC00 <= ord(s[pos]) <= 0xDFFF):
            # Surrogate pair; the bounds check keeps a trailing lone high
            # surrogate from reading past the end.
            ucs = (((code & 0x03FF) << 10) | (ord(s[pos]) & 0x03FF)) + 0x00010000
            p += '\\U%08x' % ucs
            pos += 1
        elif code >= 256:
            # Also reached by isolated surrogates.
            p += '\\u%04x' % code
        elif ch == '\t':
            p += '\\t'
        elif ch == '\n':
            p += '\\n'
        elif ch == '\r':
            p += '\\r'
        elif code < 0x20 or code >= 0x7F:
            # Compare ordinals: a str cannot be ordered against an int.
            p += '\\x%02x' % code
        else:
            p += ch

    if quotes:
        p += quote
    return p
|
|
|
|
def PyUnicode_DecodeASCII(s, size, errors):
    """Decode the byte sequence s as ASCII.

    Returns a list of strings: [''] for empty input, otherwise one
    character per byte below 128.  Every other byte is reported to the
    errors handler through unicode_call_errorhandler; its replacement is
    added to the result and decoding resumes at the position it returns.
    """
    if size == 0:
        return ['']
    # ASCII is equivalent to the first 128 ordinals in Unicode.
    if size == 1 and ord(s) < 128:
        return [chr(ord(s))]

    chars = []
    index = 0
    while index < len(s):
        byte = s[index]
        if byte < 128:
            chars.append(chr(byte))
            index += 1
            continue
        outcome = unicode_call_errorhandler(
            errors, "ascii", "ordinal not in range(128)",
            s, index, index + 1)
        chars += outcome[0]
        index = outcome[1]
    return chars
|
|
|
|
def PyUnicode_EncodeASCII(p, size, errors):
    """Encode p through unicode_encode_ucs1 with an upper limit of 128."""
    ascii_limit = 128
    return unicode_encode_ucs1(p, size, errors, ascii_limit)
|
|
|
|
def PyUnicode_AsASCIIString(unistr):
    """Encode the str unistr through PyUnicode_EncodeASCII.

    Raises TypeError when unistr is not a str (subclasses are accepted).
    """
    if not isinstance(unistr, str):
        raise TypeError
    text = str(unistr)
    # The size argument is the length of the text; the previous code took
    # len() of the str type itself, which raises TypeError.
    # NOTE(review): errors is passed as None as before -- confirm that the
    # encoder treats None like 'strict'.
    return PyUnicode_EncodeASCII(text, len(text), None)
|
|
|
|
def _utf16_unit(s, pos, ihi, ilo):
    """Return the 16-bit code unit at s[pos:pos+2] given the byte offsets."""
    hi = s[pos + ihi]
    lo = s[pos + ilo]
    if isinstance(hi, str):
        # Character input: each item is a 1-character string.
        hi, lo = ord(hi), ord(lo)
    return (hi << 8) | lo


def PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder='native', final=True):
    """Decode UTF-16 data.

    byteorder is 'native', 'little' or anything else for big-endian.  In
    'native' mode a leading BOM is skipped and selects the byte order;
    without a BOM the machine's byte order is used.

    Returns (chars, consumed, bo): the list of decoded characters ([''] for
    empty input), the number of input items consumed, and the byte order
    in effect (-1 little, 1 big, 0 native without BOM).

    When final is false, a trailing odd byte or a high surrogate whose
    partner has not arrived yet is left unconsumed.  Otherwise malformed
    input is reported to the errors handler via unicode_call_errorhandler;
    its replacement is added to the output and decoding resumes at the
    position it returns.
    """
    bo = 0  # assume native ordering by default

    # Offsets of the high and low byte inside a 2-byte unit.
    if sys.byteorder == 'little':
        ihi, ilo = 1, 0
    else:
        ihi, ilo = 0, 1

    q = 0
    p = []
    if byteorder == 'native':
        if size >= 2:
            # Read in native order: 0xFEFF means the data is in native
            # order, 0xFFFE means it is byte-swapped.
            native = -1 if sys.byteorder == 'little' else 1
            bom = _utf16_unit(s, 0, ihi, ilo)
            if bom == 0xFEFF:
                q = 2
                bo = native
            elif bom == 0xFFFE:
                q = 2
                bo = -native
    elif byteorder == 'little':
        bo = -1
    else:
        bo = 1

    if size == 0:
        return [''], 0, bo

    if bo == -1:
        ihi, ilo = 1, 0     # force LE
    elif bo == 1:
        ihi, ilo = 0, 1     # force BE

    end = len(s)
    while q < end:
        # Remaining bytes at the end? (size should be even)
        if end - q < 2:
            if not final:
                break
            repl, q = unicode_call_errorhandler(
                errors, 'utf-16', "truncated data", s, q, end, True)
            p += repl
            continue

        ch = _utf16_unit(s, q, ihi, ilo)
        if ch < 0xD800 or ch > 0xDFFF:
            p += chr(ch)
            q += 2
            continue

        if ch <= 0xDBFF:
            # High surrogate: a low surrogate must follow.
            if end - q < 4:
                if not final:
                    break
                repl, q = unicode_call_errorhandler(
                    errors, 'utf-16', "unexpected end of data", s, q, end, True)
                p += repl
                continue
            ch2 = _utf16_unit(s, q + 2, ihi, ilo)
            if 0xDC00 <= ch2 <= 0xDFFF:
                if sys.maxunicode < 65536:
                    p += chr(ch)
                    p += chr(ch2)
                else:
                    p += chr((((ch & 0x3FF) << 10) | (ch2 & 0x3FF)) + 0x10000)
                q += 4
                continue
            repl, q = unicode_call_errorhandler(
                errors, 'utf-16', "illegal UTF-16 surrogate", s, q, q + 2, True)
            p += repl
            continue

        # Low surrogate without a preceding high surrogate.
        repl, q = unicode_call_errorhandler(
            errors, 'utf-16', "illegal encoding", s, q, q + 2, True)
        p += repl

    return p, q, bo
|
|
|
|
# moved out of local scope, especially because it didn't
|
|
# have any nested variables.
|
|
|
|
def STORECHAR(CH, byteorder):
    """Split the 16-bit value CH into two 1-character strings.

    The low byte comes first when byteorder is 'little'; for any other
    value the high byte comes first.
    """
    high = chr((CH >> 8) & 0xff)
    low = chr(CH & 0xff)
    return [low, high] if byteorder == 'little' else [high, low]
|
|
|
|
def PyUnicode_EncodeUTF16(s, size, errors, byteorder='little'):
    """Encode the characters of s as UTF-16.

    Returns a list of 1-character strings, one per output byte, built with
    STORECHAR.  With byteorder 'native' the output starts with a BOM and
    uses sys.byteorder; 'little' and 'big' select that order without a
    BOM; any other value uses sys.byteorder without a BOM.  Code points
    from 0x10000 up are written as a surrogate pair.  errors is not used.
    """
    p = []
    order = byteorder if byteorder in ('little', 'big') else sys.byteorder
    if byteorder == 'native':
        p += STORECHAR(0xFEFF, order)

    if size == 0:
        # Keep the list shape (and the BOM, in native mode) for empty
        # input; a bare "" was returned here before.
        return p

    for c in s:
        ch = ord(c)
        ch2 = 0
        if ch >= 0x10000:
            ch2 = 0xDC00 | ((ch - 0x10000) & 0x3FF)
            ch = 0xD800 | ((ch - 0x10000) >> 10)

        p += STORECHAR(ch, order)
        if ch2:
            p += STORECHAR(ch2, order)

    return p
|
|
|
|
|
|
def PyUnicode_DecodeMBCS(s, size, errors):
    """Placeholder; ignores its arguments and returns None."""
    return None
|
|
|
|
def PyUnicode_EncodeMBCS(p, size, errors):
    """Placeholder; ignores its arguments and returns None."""
    return None
|
|
|
|
def unicode_call_errorhandler(errors, encoding,
                reason, input, startinpos, endinpos, decode=True):
    """Invoke the error handler registered under errors.

    A UnicodeDecodeError (or UnicodeEncodeError when decode is false) is
    built from the remaining arguments and passed to the handler.

    Returns (replacement, newpos).  A negative position from the handler is
    taken relative to the end of input.

    Raises LookupError (from lookup_error) for an unknown handler name,
    IndexError when the resulting position lies outside input, TypeError
    when the handler does not return a (str, int) 2-tuple, and whatever
    the handler itself raises.
    """
    errorHandler = lookup_error(errors)
    if decode:
        exceptionObject = UnicodeDecodeError(encoding, input, startinpos, endinpos, reason)
    else:
        exceptionObject = UnicodeEncodeError(encoding, input, startinpos, endinpos, reason)
    res = errorHandler(exceptionObject)
    # Check the length first so that a short tuple is reported as a
    # TypeError below rather than failing with IndexError on res[0]/res[1].
    if (isinstance(res, tuple) and len(res) == 2
            and isinstance(res[0], str) and isinstance(res[1], int)):
        newpos = res[1]
        if newpos < 0:
            newpos = len(input) + newpos
        if newpos < 0 or newpos > len(input):
            raise IndexError( "position %d from error handler out of bounds" % newpos)
        return res[0], newpos
    raise TypeError("encoding error handler must return (unicode, int) tuple, not %s" % repr(res))
|
|
|
|
def PyUnicode_DecodeUTF8(s, size, errors):
    """Call PyUnicode_DecodeUTF8Stateful with final=False and return its result."""
    final = False
    return PyUnicode_DecodeUTF8Stateful(s, size, errors, final)
|
|
|
|
## /* Map UTF-8 encoded prefix byte to sequence length. zero means
|
|
## illegal prefix. see RFC 2279 for details */
|
|
# Sequence length for each possible UTF-8 lead byte; 0 marks an illegal one.
utf8_code_length = (
    [1] * 128       # 0x00 - 0x7f
    + [0] * 64      # 0x80 - 0xbf
    + [2] * 32      # 0xc0 - 0xdf
    + [3] * 16      # 0xe0 - 0xef
    + [4] * 8       # 0xf0 - 0xf7
    + [5] * 4       # 0xf8 - 0xfb
    + [6] * 2       # 0xfc - 0xfd
    + [0] * 2       # 0xfe - 0xff
)
|
|
|
|
def PyUnicode_DecodeUTF8Stateful(s, size, errors, final):
    """Decode the first *size* items of *s* as UTF-8.

    Parameters:
        s      -- indexable sequence of byte values (ints)
        size   -- number of leading items of *s* to decode
        errors -- error-handler name forwarded to unicode_call_errorhandler
        final  -- when false, a multi-byte sequence that is cut off by the
                  end of the input is left undecoded instead of being
                  reported as an error

    Returns a pair (chars, pos).  *chars* is a list of decoded characters
    ('' when size is 0) and *pos* is the index at which decoding stopped.

    Three-byte encodings of surrogate code points are passed through
    unchanged; only over-long encodings are rejected.
    """
    if size == 0:
        return '', 0

    p = []
    pos = 0
    while pos < size:
        ch = s[pos]
        if ch < 0x80:
            # Plain ASCII byte.
            p.append(chr(ch))
            pos += 1
            continue

        # Expected sequence length, looked up by lead byte.
        n = utf8_code_length[ch]
        startinpos = pos
        endinpos = startinpos + n
        errmsg = None

        if startinpos + n > size:
            if not final:
                # Incomplete trailing sequence: leave it for the next call.
                break
            # The sequence is reported as a whole; the continuation bytes
            # must not be inspected because they lie beyond the input.
            errmsg = "unexpected end of data"
            endinpos = size
        elif n == 0:
            errmsg = "unexpected code byte"
            endinpos = startinpos + 1
        elif n == 1:
            errmsg = "internal error"
            endinpos = startinpos + 1
        elif n == 2:
            if (s[pos+1] & 0xc0) != 0x80:
                errmsg = "invalid data"
            else:
                c = ((s[pos] & 0x1f) << 6) + (s[pos+1] & 0x3f)
                if c < 0x80:
                    # Over-long form of an ASCII character.
                    errmsg = "illegal encoding"
                else:
                    p.append(chr(c))
        elif n == 3:
            if ((s[pos+1] & 0xc0) != 0x80 or
                    (s[pos+2] & 0xc0) != 0x80):
                errmsg = "invalid data"
            else:
                c = (((s[pos] & 0x0f) << 12) +
                     ((s[pos+1] & 0x3f) << 6) +
                     (s[pos+2] & 0x3f))
                if c < 0x0800:
                    errmsg = "illegal encoding"
                else:
                    p.append(chr(c))
        elif n == 4:
            if ((s[pos+1] & 0xc0) != 0x80 or
                    (s[pos+2] & 0xc0) != 0x80 or
                    (s[pos+3] & 0xc0) != 0x80):
                errmsg = "invalid data"
            else:
                c = (((s[pos] & 0x7) << 18) + ((s[pos+1] & 0x3f) << 12) +
                     ((s[pos+2] & 0x3f) << 6) + (s[pos+3] & 0x3f))
                if c < 0x10000 or c > 0x10ffff:
                    # Below the 4-byte minimum or above the Unicode range.
                    errmsg = "illegal encoding"
                elif c <= sys.maxunicode:
                    # Inclusive bound: sys.maxunicode itself is a valid
                    # code point and must not be split into surrogates.
                    p.append(chr(c))
                else:
                    # Narrow build: emit a surrogate pair.
                    c -= 0x10000
                    p.append(chr(0xD800 + (c >> 10)))
                    p.append(chr(0xDC00 + (c & 0x03FF)))
        else:
            # Lengths above 4 are not valid Unicode encodings.
            errmsg = "unsupported Unicode code range"

        if errmsg is None:
            pos += n
        else:
            res = unicode_call_errorhandler(
                errors, "utf8", errmsg,
                s, startinpos, endinpos)
            p += res[0]
            pos = res[1]

    return p, pos
|
|
|
|
def PyUnicode_EncodeUTF8(s, size, errors):
    """Encode the first *size* characters of *s* as UTF-8.

    Returns a bytearray.  A high surrogate immediately followed by a low
    surrogate is combined into one four-byte sequence; an isolated
    surrogate is written as an ordinary three-byte sequence.  *errors* is
    accepted but not consulted.
    """
    assert size >= 0
    out = bytearray()
    idx = 0
    while idx < size:
        code = ord(s[idx])
        idx += 1

        if code < 0x80:
            # One byte: ASCII.
            out.append(code)
            continue

        if code < 0x0800:
            # Two bytes.
            out.append(0xc0 | (code >> 6))
            out.append(0x80 | (code & 0x3f))
            continue

        if code >= 0x10000:
            # Four bytes: outside the BMP.
            out += encodeUCS4(code)
            continue

        # BMP character: first check for a surrogate pair to recombine.
        if 0xD800 <= code <= 0xDBFF and idx != size:
            low = ord(s[idx])
            if 0xDC00 <= low <= 0xDFFF:
                idx += 1
                combined = ((code - 0xD800) << 10 | (low - 0xDC00)) + 0x10000
                out += encodeUCS4(combined)
                continue

        # Three bytes (also covers isolated surrogates).
        out.append(0xe0 | (code >> 12))
        out.append(0x80 | ((code >> 6) & 0x3f))
        out.append(0x80 | (code & 0x3f))
    return out
|
|
|
|
def encodeUCS4(ch):
    """Return the four-byte UTF-8 sequence for ordinal *ch* as a bytearray."""
    lead = 0xf0 | (ch >> 18)
    second = 0x80 | ((ch >> 12) & 0x3f)
    third = 0x80 | ((ch >> 6) & 0x3f)
    last = 0x80 | (ch & 0x3f)
    return bytearray((lead, second, third, last))
|
|
|
|
#/* --- Latin-1 Codec ------------------------------------------------------ */
|
|
|
|
def PyUnicode_DecodeLatin1(s, size, errors):
    """Decode the first *size* byte values of *s* as Latin-1.

    Latin-1 maps each byte value to the Unicode ordinal of the same
    number, so every item is converted with chr().  Returns a list of
    characters; *errors* is accepted but not consulted.
    """
    return [chr(s[index]) for index in range(size)]
|
|
|
|
def unicode_encode_ucs1(p, size, errors, limit):
    """Encode *p* into single bytes, accepting ordinals below *limit*.

    Parameters:
        p      -- string to encode
        size   -- length of *p*; only used to short-circuit empty input
        errors -- error-handler name forwarded to unicode_call_errorhandler
        limit  -- 256 selects Latin-1; any other value selects ASCII
                  error reporting

    Returns a bytearray, or an empty list when *size* is 0.

    Raises UnicodeEncodeError when an error handler supplies a string
    replacement that itself contains a character outside the range.
    """
    if limit == 256:
        reason = "ordinal not in range(256)"
        encoding = "latin-1"
    else:
        reason = "ordinal not in range(128)"
        encoding = "ascii"

    if size == 0:
        # Historical return value for empty input, kept for callers.
        return []

    res = bytearray()
    pos = 0
    while pos < len(p):
        code = ord(p[pos])
        if code < limit:
            res.append(code)
            pos += 1
            continue

        # Collect the whole run of unencodable characters so that the
        # handler is invoked once per run.
        collstart = pos
        collend = pos + 1
        while collend < len(p) and ord(p[collend]) >= limit:
            collend += 1
        x = unicode_call_errorhandler(
            errors, encoding, reason, p, collstart, collend, False)

        replacement = x[0]
        if isinstance(replacement, (bytes, bytearray)):
            # Byte replacements are copied to the output untouched.
            res += replacement
        else:
            # A str cannot be added to a bytearray; encode it character
            # by character and reject anything still out of range.
            for rch in replacement:
                rcode = ord(rch)
                if rcode >= limit:
                    raise UnicodeEncodeError(
                        encoding, p, collstart, collend, reason)
                res.append(rcode)
        pos = x[1]

    return res
|
|
|
|
def PyUnicode_EncodeLatin1(p, size, errors):
    """Encode *p* as Latin-1 by delegating to unicode_encode_ucs1 with limit 256."""
    return unicode_encode_ucs1(p, size, errors, 256)
|
|
|
|
# Characters accepted as hexadecimal digits: 0-9, a-f, then A-F.
hexdigits = list("0123456789abcdef") + list("ABCDEF")
|
|
|
|
def hexescape(s, pos, digits, message, errors):
    """Decode the *digits* hexadecimal characters of *s* starting at *pos*.

    Parameters:
        s       -- input being decoded
        pos     -- index of the first hex digit (just after the escape
                   letter)
        digits  -- number of hex digits the escape requires
        message -- error text used when the digits are malformed
        errors  -- error-handler name forwarded to
                   unicode_call_errorhandler

    Returns (chars, newpos): the list of characters produced and the
    index at which the caller should continue.
    """
    p = []

    if pos + digits > len(s):
        message = "end of string in escape sequence"
        x = unicode_call_errorhandler(
            errors, "unicodeescape", message, s, pos-2, len(s))
        p += x[0]
        return p, x[1]

    try:
        # The value is kept in `code`; naming it `chr` would shadow the
        # builtin that is needed below to build the character.
        code = int(s[pos:pos+digits], 16)
    except ValueError:
        # Report everything up to and including the first non-hex char.
        endinpos = pos
        while s[endinpos] in hexdigits:
            endinpos += 1
        x = unicode_call_errorhandler(
            errors, "unicodeescape", message, s, pos-2, endinpos+1)
        p += x[0]
        return p, x[1]

    if code <= sys.maxunicode:
        p.append(chr(code))
        pos += digits
    elif code <= 0x10ffff:
        # Narrow build: represent the code point as a surrogate pair.
        code -= 0x10000
        p.append(chr(0xD800 + (code >> 10)))
        p.append(chr(0xDC00 + (code & 0x03FF)))
        pos += digits
    else:
        message = "illegal Unicode character"
        x = unicode_call_errorhandler(
            errors, "unicodeescape", message, s, pos-2, pos+1)
        p += x[0]
        pos = x[1]
    return p, pos
|
|
|
|
def PyUnicode_DecodeUnicodeEscape(s, size, errors):
    """Decode Python-style backslash escapes in *s*.

    Parameters:
        s      -- input whose items compare equal to one-character
                  strings such as '\\\\'
        size   -- number of items of *s* to decode
        errors -- error-handler name forwarded to
                  unicode_call_errorhandler

    Returns a list of decoded characters, or '' when *size* is 0.

    Handles the single-letter escapes, up to three octal digits,
    \\xXX, \\uXXXX, \\UXXXXXXXX (via hexescape) and \\N{name} (via
    unicodedata).  An unrecognised escape is copied through verbatim,
    backslash included.
    """
    if size == 0:
        return ''

    # Escape letter -> resulting character.
    simple_escapes = {
        '\\': '\\',
        '\'': '\'',
        '\"': '\"',
        'b': '\b',
        'f': '\014',   # FF
        't': '\t',
        'n': '\n',
        'r': '\r',
        'v': '\013',   # VT
        'a': '\007',   # BEL, not classic C
    }
    # Escape letter -> (number of hex digits, error message).
    hex_escapes = {
        'x': (2, "truncated \\xXX escape"),
        'u': (4, "truncated \\uXXXX escape"),
        'U': (8, "truncated \\UXXXXXXXX escape"),
    }

    p = []
    pos = 0
    while pos < size:
        # Non-escape characters are interpreted as Unicode ordinals.
        if s[pos] != '\\':
            p.append(chr(ord(s[pos])))
            pos += 1
            continue

        # Backslash escapes.
        pos += 1
        if pos >= size:
            # A lone trailing backslash: there is no escape letter to
            # read, so apply the handler's result and carry on from the
            # position it returns.
            errmessage = "\\ at end of string"
            x = unicode_call_errorhandler(
                errors, "unicodeescape", errmessage, s, pos-1, size)
            p += x[0]
            pos = x[1]
            continue

        ch = s[pos]
        pos += 1

        if ch in simple_escapes:
            p.append(simple_escapes[ch])
        elif '0' <= ch <= '7':
            # Octal escape: one to three digits.
            x = ord(ch) - ord('0')
            for _ in range(2):
                if pos < size and '0' <= s[pos] <= '7':
                    x = (x << 3) + ord(s[pos]) - ord('0')
                    pos += 1
                else:
                    break
            p.append(chr(x))
        elif ch in hex_escapes:
            digits, message = hex_escapes[ch]
            x = hexescape(s, pos, digits, message, errors)
            p += x[0]
            pos = x[1]
        elif ch == 'N':
            # \N{name}
            message = "malformed \\N character escape"
            look = pos
            try:
                import unicodedata
            except ImportError:
                message = "\\N escapes not supported (can't load unicodedata module)"
                x = unicode_call_errorhandler(
                    errors, "unicodeescape", message, s, pos-1, size)
                p += x[0]
                pos = x[1]
                continue

            x = None
            if look < size and s[look] == '{':
                # Look for the closing brace.
                while look < size and s[look] != '}':
                    look += 1
                if look > pos+1 and look < size and s[look] == '}':
                    # Found a name: look it up in the Unicode database.
                    message = "unknown Unicode character name"
                    st = s[pos+1:look]
                    try:
                        # Not stored in a variable called `chr`: that
                        # would make the builtin a function-local name.
                        x = unicodedata.lookup("%s" % st), look + 1
                    except KeyError:
                        x = None
            if x is None:
                x = unicode_call_errorhandler(
                    errors, "unicodeescape", message, s, pos-1, look+1)
            p += x[0]
            pos = x[1]
        else:
            # Unknown escape: keep the backslash and the character.
            p.append('\\')
            p += ch
    return p
|
|
|
|
def PyUnicode_EncodeRawUnicodeEscape(s, size):
    """Encode *s* with the raw-unicode-escape scheme.

    Ordinals below 256 are copied as single bytes, ordinals below
    0x10000 become \\uxxxx and everything else becomes \\Uxxxxxxxx.

    Returns a bytearray, or '' when *size* is 0 (historical return value
    for empty input, kept for callers).
    """
    if size == 0:
        return ''

    p = bytearray()
    for ch in s:
        code = ord(ch)
        if code >= 0x10000:
            # Map 32-bit characters to '\Uxxxxxxxx'.  The escape has to
            # be built as bytes: a str cannot be added to a bytearray.
            p += b'\\U%08x' % code
        elif code >= 256:
            # Map 16-bit characters to '\uxxxx'.
            p += b'\\u%04x' % code
        else:
            # Copy everything else as-is.
            p.append(code)
    return p
|
|
|
|
def charmapencode_output(c, mapping):
    """Return the encoded form of ordinal *c* according to *mapping*.

    *mapping* is subscripted with *c*.  An int result below 256 is
    returned as the character with that ordinal; a str result is
    returned unchanged.

    Raises:
        KeyError  -- when *c* is missing from the mapping or maps to None
        TypeError -- when the mapped int is 256 or larger, or the mapped
                     value is neither int, str nor None
    """
    rep = mapping[c]
    if isinstance(rep, int):
        if rep < 256:
            return chr(rep)
        raise TypeError("character mapping must be in range(256)")
    if isinstance(rep, str):
        return rep
    if rep is None:
        raise KeyError("character maps to <undefined>")
    raise TypeError("character mapping must return integer, None or str")
|
|
|
|
def PyUnicode_EncodeCharmap(p, size, mapping='latin-1', errors='strict'):
    """Encode the first *size* characters of *p* through *mapping*.

    With the default mapping 'latin-1' the work is delegated to
    PyUnicode_EncodeLatin1.  Otherwise each character's ordinal is
    translated by charmapencode_output; when that raises KeyError the
    error handler is consulted and its replacement text is translated
    through the mapping in turn.

    Returns a list of translated items, or '' when *size* is 0.
    Raises UnicodeEncodeError when the replacement cannot be mapped.
    """
    # Default to Latin-1.
    if mapping == 'latin-1':
        return PyUnicode_EncodeLatin1(p, size, errors)
    if size == 0:
        return ''

    encoded = []
    for index in range(size):
        try:
            encoded.append(charmapencode_output(ord(p[index]), mapping))
        except KeyError:
            handled = unicode_call_errorhandler(
                errors, "charmap",
                "character maps to <undefined>", p, index, index+1, False)
            try:
                # Built as a complete list first so that nothing is
                # added when one replacement character is unmappable.
                replacement = [charmapencode_output(ord(y), mapping)
                               for y in handled[0]]
            except KeyError:
                raise UnicodeEncodeError("charmap", p, index, index+1,
                                         "character maps to <undefined>")
            encoded += replacement
    return encoded
|
|
|
|
def PyUnicode_DecodeCharmap(s, size, mapping, errors):
    """Decode *s* by translating each item through *mapping*.

    Parameters:
        s       -- input; items may be ints (bytes-like input) or
                   one-character strings
        size    -- only used to short-circuit empty input; the loop
                   itself runs over the whole of *s*
        mapping -- subscriptable by ordinal, yielding an int below 65536,
                   a str, or None; None as the mapping itself selects
                   Latin-1 decoding
        errors  -- error-handler name forwarded to
                   unicode_call_errorhandler

    Returns a list of decoded characters, or '' when *size* is 0.
    Raises TypeError for mapped values of an unsupported type or range.
    """
    # Default to Latin-1.
    if mapping is None:
        return PyUnicode_DecodeLatin1(s, size, errors)

    if size == 0:
        return ''

    p = []
    inpos = 0
    while inpos < len(s):
        # Get mapping (char ordinal -> integer, Unicode char or None).
        ch = s[inpos]
        # Indexing bytes-like input already yields the ordinal; ord() is
        # only valid for one-character strings.
        key = ch if isinstance(ch, int) else ord(ch)
        try:
            x = mapping[key]
            if isinstance(x, int):
                if x < 65536:
                    p.append(chr(x))
                else:
                    raise TypeError("character mapping must be in range(65536)")
            elif isinstance(x, str):
                p += x
            elif not x:
                raise KeyError
            else:
                raise TypeError
        except LookupError:
            # KeyError (dict mappings) and IndexError (sequence mappings
            # that are too short) both mean "no mapping for this byte".
            x = unicode_call_errorhandler(
                errors, "charmap",
                "character maps to <undefined>", s, inpos, inpos+1)
            p += x[0]
        inpos += 1
    return p
|
|
|
|
def PyUnicode_DecodeRawUnicodeEscape(s, size, errors):
    """Decode raw-unicode-escape text.

    Only \\uXXXX and \\Uxxxxxxxx are interpreted, and only when preceded
    by an odd number of backslashes; every other character, backslashes
    included, is copied through unchanged.

    Parameters:
        s      -- input whose items are one-character strings
        size   -- length of the input
        errors -- error-handler name forwarded to
                  unicode_call_errorhandler

    Returns a list of decoded characters, or '' when *size* is 0.
    """
    if size == 0:
        return ''

    pos = 0
    p = []
    while pos < len(s):
        ch = s[pos]
        # Non-escape characters are interpreted as Unicode ordinals.
        if ch != '\\':
            p.append(chr(ord(ch)))
            pos += 1
            continue

        # \u-escapes are only interpreted iff the number of leading
        # backslashes is odd.  Copy the whole run of backslashes first.
        bs = pos
        while pos < size:
            if s[pos] != '\\':
                break
            p.append(chr(ord(s[pos])))
            pos += 1

        if (((pos - bs) & 1) == 0 or
                pos >= size or
                (s[pos] != 'u' and s[pos] != 'U')):
            # Not an escape.  The input may have ended with the
            # backslash run, in which case nothing is left to copy.
            if pos < len(s):
                p.append(chr(ord(s[pos])))
                pos += 1
            continue

        # Drop the backslash that introduces the escape.
        p.pop(-1)
        count = 4 if s[pos] == 'u' else 8
        pos += 1

        # \uXXXX with 4 hex digits, \Uxxxxxxxx with 8.  A slice shorter
        # than `count` means the escape was cut off by the end of input
        # and must not be parsed as if it were complete.
        chunk = s[pos:pos+count]
        x = None
        if len(chunk) == count:
            try:
                x = int(chunk, 16)
            except ValueError:
                x = None

        if x is None:
            res = unicode_call_errorhandler(
                errors, "rawunicodeescape", "truncated \\uXXXX",
                s, pos, min(pos+count, len(s)))
            p += res[0]
            pos = res[1]
            continue

        # Largest value accepted: the full range on wide builds,
        # 0x10000 otherwise.
        limit = sys.maxunicode if sys.maxunicode > 0xffff else 0x10000
        if x > limit:
            res = unicode_call_errorhandler(
                errors, "rawunicodeescape", "\\Uxxxxxxxx out of range",
                s, pos, pos+1)
            pos = res[1]
            p += res[0]
        else:
            p.append(chr(x))
            pos += count

    return p
|