From 704c2d1240885603718e6cb73c8b9ee261bbeaf6 Mon Sep 17 00:00:00 2001
From: Padraic Fanning
Date: Mon, 30 Aug 2021 22:08:40 -0400
Subject: [PATCH] Add surrogateescape_errors to codecs module

Inelegant translation of the CPython original, with PyPy as a reference.
---
 vm/src/codecs.rs | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)

diff --git a/vm/src/codecs.rs b/vm/src/codecs.rs
index e6ad84e10..e609b81b5 100644
--- a/vm/src/codecs.rs
+++ b/vm/src/codecs.rs
@@ -167,6 +167,10 @@ impl CodecsRegistry {
                 "surrogatepass",
                 ctx.new_function("surrogatepass_errors", surrogatepass_errors),
             ),
+            (
+                "surrogateescape",
+                ctx.new_function("surrogateescape_errors", surrogateescape_errors),
+            ),
         ];
         let errors = std::array::IntoIter::new(errors)
             .map(|(name, f)| (name.to_owned(), f))
@@ -644,3 +648,80 @@ fn surrogatepass_errors(err: PyObjectRef, vm: &VirtualMachine) -> PyResult<(Stri
         Err(bad_err_type(err, vm))
     }
 }
+
+/// Error handler registered under the name `"surrogateescape"`.
+///
+/// Translated from CPython's `PyCodec_SurrogateEscapeErrors`. Dispatches on the
+/// type of `err`:
+///
+/// * `UnicodeEncodeError`: every character in the error range must lie in
+///   `U+DC80..=U+DCFF`; the replacement is built from `c - 0xdc00` and the
+///   returned position is `range.end`. Any other character makes the handler
+///   fail with `err` itself.
+/// * decode errors (per `is_decode_err`): at most four bytes are consumed from
+///   the start of the range, stopping at the first ASCII byte; the replacement
+///   is built from `0xdc00 + byte` and the returned position is
+///   `range.start + consumed`. If nothing was consumed the handler fails with
+///   `err` itself.
+/// * anything else: the error produced by `bad_err_type`.
+///
+/// NOTE(review): both branches emit the value as `#<decimal>` text because the
+/// return type is `String`; CPython returns the raw byte / lone surrogate
+/// instead. Confirm this placeholder form is what callers expect.
+fn surrogateescape_errors(err: PyObjectRef, vm: &VirtualMachine) -> PyResult<(String, usize)> {
+    use std::fmt::Write;
+
+    if err.isinstance(&vm.ctx.exceptions.unicode_encode_error) {
+        let range = extract_unicode_error_range(&err, vm)?;
+        let s = PyStrRef::try_from_object(vm, vm.get_attribute(err.clone(), "object")?)?;
+        let s_after_start =
+            crate::common::str::try_get_chars(s.as_str(), range.start..).unwrap_or("");
+        let num_chars = range.len();
+        // Each escape is `#` plus at most three digits (128..=255). Bound the
+        // reservation by the text actually available, not the raw range width.
+        let capacity = num_chars.min(s_after_start.len()).saturating_mul(4);
+        let mut out = String::with_capacity(capacity);
+        for c in s_after_start.chars().take(num_chars).map(u32::from) {
+            // Same bounds as CPython; they also guarantee that `c - 0xdc00`
+            // cannot underflow.
+            // NOTE(review): `str::chars` never yields surrogate code points, so
+            // this test rejects every character it sees -- confirm how lone
+            // surrogates are meant to reach this handler.
+            if !(0xdc80..=0xdcff).contains(&c) {
+                // Not a UTF-8b surrogate, fail with original exception
+                return Err(err.downcast().unwrap());
+            }
+            write!(out, "#{}", c - 0xdc00).unwrap();
+        }
+        Ok((out, range.end))
+    } else if is_decode_err(&err, vm) {
+        let range = extract_unicode_error_range(&err, vm)?;
+        // NOTE(review): CPython's decode errors carry a bytes-like `object`;
+        // this reads it as `str` -- confirm that is intended.
+        let s = PyStrRef::try_from_object(vm, vm.get_attribute(err.clone(), "object")?)?;
+        let s_after_start = crate::common::str::try_get_chars(s.as_str(), range.start..)
+            .unwrap_or("")
+            .as_bytes();
+        // Never look at more than four bytes per call, same as CPython.
+        let limit = range.len().min(4);
+        let mut consumed = 0;
+        // Each escape is `#` plus five digits (0xdc80..=0xdcff in decimal).
+        let mut replace = String::with_capacity(6 * limit);
+        // Iterating (rather than indexing) stops cleanly when fewer bytes are
+        // available than the range claims, instead of panicking.
+        for &byte in s_after_start.iter().take(limit) {
+            if byte < 128 {
+                // Refuse to escape ASCII bytes
+                break;
+            }
+            write!(replace, "#{}", 0xdc00 + u32::from(byte)).unwrap();
+            consumed += 1;
+        }
+        if consumed == 0 {
+            return Err(err.downcast().unwrap());
+        }
+        Ok((replace, range.start + consumed))
+    } else {
+        Err(bad_err_type(err, vm))
+    }
+}