diff --git a/common/src/bytes.rs b/common/src/bytes.rs deleted file mode 100644 index 2ed17ad953..0000000000 --- a/common/src/bytes.rs +++ /dev/null @@ -1,72 +0,0 @@ -use crate::escape::Quote; -use crate::str::ReprOverflowError; - -pub fn repr(b: &[u8]) -> Result { - repr_with(b, &[], "", Quote::Single) -} - -pub fn repr_with_quote(b: &[u8], quote: Quote) -> Result { - repr_with(b, &[], "", quote) -} - -pub fn repr_with( - b: &[u8], - prefixes: &[&str], - suffix: &str, - quote: Quote, -) -> Result { - use std::fmt::Write; - - let mut out_len = 0usize; - let mut squote = 0; - let mut dquote = 0; - - for &ch in b { - let incr = match ch { - b'\'' => { - squote += 1; - 1 - } - b'"' => { - dquote += 1; - 1 - } - b'\\' | b'\t' | b'\r' | b'\n' => 2, - 0x20..=0x7e => 1, - _ => 4, // \xHH - }; - out_len = out_len.checked_add(incr).ok_or(ReprOverflowError)?; - } - - let (quote, num_escaped_quotes) = crate::escape::choose_quote(squote, dquote, quote); - // we'll be adding backslashes in front of the existing inner quotes - out_len += num_escaped_quotes; - - // 3 is for b prefix + outer quotes - out_len += 3 + prefixes.iter().map(|s| s.len()).sum::() + suffix.len(); - - let mut res = String::with_capacity(out_len); - res.extend(prefixes.iter().copied()); - res.push('b'); - res.push(quote.to_char()); - for &ch in b { - match ch { - b'\t' => res.push_str("\\t"), - b'\n' => res.push_str("\\n"), - b'\r' => res.push_str("\\r"), - // printable ascii range - 0x20..=0x7e => { - let ch = ch as char; - if ch == quote.to_char() || ch == '\\' { - res.push('\\'); - } - res.push(ch); - } - _ => write!(res, "\\x{ch:02x}").unwrap(), - } - } - res.push(quote.to_char()); - res.push_str(suffix); - - Ok(res) -} diff --git a/common/src/escape.rs b/common/src/escape.rs index d4c288d101..de7a51433e 100644 --- a/common/src/escape.rs +++ b/common/src/escape.rs @@ -36,29 +36,12 @@ pub struct EscapeLayout { } pub trait Escape { - type Source: ?Sized; - fn source_len(&self) -> usize; fn layout(&self) -> &EscapeLayout; fn changed(&self) -> bool { self.layout().len != Some(self.source_len()) } - fn output_layout_with_checker( - source: &Self::Source, - preferred_quote: Quote, - reserved_len: usize, - length_add: impl Fn(usize, usize) -> Option, - ) -> EscapeLayout; - // fn output_layout(source: &Self::Source, preferred_quote: Quote) -> EscapeLayout { - // Self::output_layout_with_checker(source, preferred_quote, 2, |a, b| a.checked_add(b)) - // } - fn output_layout(source: &Self::Source, preferred_quote: Quote) -> EscapeLayout { - Self::output_layout_with_checker(source, preferred_quote, 2, |a, b| { - Some((a as isize).checked_add(b as isize)? as usize) - }) - } - fn write_source(&self, formatter: &mut impl std::fmt::Write) -> std::fmt::Result; fn write_body_slow(&self, formatter: &mut impl std::fmt::Write) -> std::fmt::Result; fn write_body(&self, formatter: &mut impl std::fmt::Write) -> std::fmt::Result { @@ -68,18 +51,6 @@ pub trait Escape { self.write_source(formatter) } } - fn write_quoted(&self, formatter: &mut impl std::fmt::Write) -> std::fmt::Result { - let quote = self.layout().quote.to_char(); - formatter.write_char(quote)?; - self.write_body(formatter)?; - formatter.write_char(quote) - } - fn to_quoted_string(&self) -> Option { - let len = self.layout().len?.checked_add(2)?; - let mut s = String::with_capacity(len); - self.write_quoted(&mut s).unwrap(); - Some(s) - } } /// Returns the outer quotes to use and the number of quotes that need to be @@ -114,23 +85,90 @@ impl<'a> UnicodeEscape<'a> { Self { source, layout } } pub fn new_repr(source: &'a str) -> Self { - let layout = Self::output_layout(source, Quote::Single); + let layout = Self::repr_layout(source, Quote::Single); Self { source, layout } } - pub fn repr<'r>(&'a self) -> UnicodeRepr<'r, 'a> { - UnicodeRepr(self) + + pub fn str_repr<'r>(&'a self) -> StrRepr<'r, 'a> { + StrRepr(self) } } -pub struct UnicodeRepr<'r, 'a>(&'r UnicodeEscape<'a>); +pub struct StrRepr<'r, 'a>(&'r UnicodeEscape<'a>); -impl std::fmt::Display for UnicodeRepr<'_, '_> { +impl StrRepr<'_, '_> { + pub fn write(&self, formatter: &mut impl std::fmt::Write) -> std::fmt::Result { + let quote = self.0.layout().quote.to_char(); + formatter.write_char(quote)?; + self.0.write_body(formatter)?; + formatter.write_char(quote) + } + + pub fn to_string(&self) -> Option { + let mut s = String::with_capacity(self.0.layout().len?); + self.write(&mut s).unwrap(); + Some(s) + } +} + +impl std::fmt::Display for StrRepr<'_, '_> { fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - self.0.write_quoted(formatter) + self.write(formatter) } } impl UnicodeEscape<'_> { + const REPR_RESERVED_LEN: usize = 2; // for quotes + + pub fn repr_layout(source: &str, preferred_quote: Quote) -> EscapeLayout { + Self::output_layout_with_checker(source, preferred_quote, |a, b| { + Some((a as isize).checked_add(b as isize)? as usize) + }) + } + + fn output_layout_with_checker( + source: &str, + preferred_quote: Quote, + length_add: impl Fn(usize, usize) -> Option, + ) -> EscapeLayout { + let mut out_len = Self::REPR_RESERVED_LEN; + let mut single_count = 0; + let mut double_count = 0; + + for ch in source.chars() { + let incr = match ch { + '\'' => { + single_count += 1; + 1 + } + '"' => { + double_count += 1; + 1 + } + c => Self::escaped_char_len(c), + }; + let Some(new_len) = length_add(out_len, incr) else { + #[cold] + fn stop(single_count: usize, double_count: usize, preferred_quote: Quote) -> EscapeLayout { + EscapeLayout { quote: choose_quote(single_count, double_count, preferred_quote).0, len: None } + } + return stop(single_count, double_count, preferred_quote); + }; + out_len = new_len; + } + + let (quote, num_escaped_quotes) = choose_quote(single_count, double_count, preferred_quote); + // we'll be adding backslashes in front of the existing inner quotes + let Some(out_len) = length_add(out_len, num_escaped_quotes) else { + return EscapeLayout { quote, len: None }; + }; + + EscapeLayout { + quote, + len: Some(out_len - Self::REPR_RESERVED_LEN), + } + } + fn escaped_char_len(ch: char) -> usize { match ch { '\\' | '\t' | '\r' | '\n' => 2, @@ -182,8 +220,6 @@ impl UnicodeEscape<'_> { } impl<'a> Escape for UnicodeEscape<'a> { - type Source = str; - fn source_len(&self) -> usize { self.source.len() } @@ -192,50 +228,6 @@ impl<'a> Escape for UnicodeEscape<'a> { &self.layout } - fn output_layout_with_checker( - source: &str, - preferred_quote: Quote, - reserved_len: usize, - length_add: impl Fn(usize, usize) -> Option, - ) -> EscapeLayout { - let mut out_len = reserved_len; - let mut single_count = 0; - let mut double_count = 0; - - for ch in source.chars() { - let incr = match ch { - '\'' => { - single_count += 1; - 1 - } - '"' => { - double_count += 1; - 1 - } - c => Self::escaped_char_len(c), - }; - let Some(new_len) = length_add(out_len, incr) else { - #[cold] - fn stop(single_count: usize, double_count: usize, preferred_quote: Quote) -> EscapeLayout { - EscapeLayout { quote: choose_quote(single_count, double_count, preferred_quote).0, len: None } - } - return stop(single_count, double_count, preferred_quote); - }; - out_len = new_len; - } - - let (quote, num_escaped_quotes) = choose_quote(single_count, double_count, preferred_quote); - // we'll be adding backslashes in front of the existing inner quotes - let Some(out_len) = length_add(out_len, num_escaped_quotes) else { - return EscapeLayout { quote, len: None }; - }; - - EscapeLayout { - quote, - len: Some(out_len - reserved_len), - } - } - fn write_source(&self, formatter: &mut impl std::fmt::Write) -> std::fmt::Result { formatter.write_str(self.source) } @@ -266,3 +258,157 @@ mod unicode_escapse_tests { assert!(test("hello\n")); } } + +pub struct AsciiEscape<'a> { + source: &'a [u8], + layout: EscapeLayout, +} + +impl<'a> AsciiEscape<'a> { + pub fn new(source: &'a [u8], layout: EscapeLayout) -> Self { + Self { source, layout } + } + pub fn with_forced_quote(source: &'a [u8], quote: Quote) -> Self { + let layout = EscapeLayout { quote, len: None }; + Self { source, layout } + } + pub fn new_repr(source: &'a [u8]) -> Self { + let layout = Self::repr_layout(source, Quote::Single); + Self { source, layout } + } + + pub fn bytes_repr<'r>(&'a self) -> BytesRepr<'r, 'a> { + BytesRepr(self) + } +} + +impl AsciiEscape<'_> { + pub fn repr_layout(source: &[u8], preferred_quote: Quote) -> EscapeLayout { + Self::output_layout_with_checker(source, preferred_quote, 3, |a, b| { + Some((a as isize).checked_add(b as isize)? as usize) + }) + } + + pub fn named_repr_layout(source: &[u8], name: &str) -> EscapeLayout { + Self::output_layout_with_checker(source, Quote::Single, name.len() + 2 + 3, |a, b| { + Some((a as isize).checked_add(b as isize)? as usize) + }) + } + + fn output_layout_with_checker( + source: &[u8], + preferred_quote: Quote, + reserved_len: usize, + length_add: impl Fn(usize, usize) -> Option, + ) -> EscapeLayout { + let mut out_len = reserved_len; + let mut single_count = 0; + let mut double_count = 0; + + for ch in source.iter() { + let incr = match ch { + b'\'' => { + single_count += 1; + 1 + } + b'"' => { + double_count += 1; + 1 + } + c => Self::escaped_char_len(*c), + }; + let Some(new_len) = length_add(out_len, incr) else { + #[cold] + fn stop(single_count: usize, double_count: usize, preferred_quote: Quote) -> EscapeLayout { + EscapeLayout { quote: choose_quote(single_count, double_count, preferred_quote).0, len: None } + } + return stop(single_count, double_count, preferred_quote); + }; + out_len = new_len; + } + + let (quote, num_escaped_quotes) = choose_quote(single_count, double_count, preferred_quote); + // we'll be adding backslashes in front of the existing inner quotes + let Some(out_len) = length_add(out_len, num_escaped_quotes) else { + return EscapeLayout { quote, len: None }; + }; + + EscapeLayout { + quote, + len: Some(out_len - reserved_len), + } + } + + fn escaped_char_len(ch: u8) -> usize { + match ch { + b'\\' | b'\t' | b'\r' | b'\n' => 2, + 0x20..=0x7e => 1, + _ => 4, // \xHH + } + } + + fn write_char(ch: u8, quote: Quote, formatter: &mut impl std::fmt::Write) -> std::fmt::Result { + match ch { + b'\t' => formatter.write_str("\\t"), + b'\n' => formatter.write_str("\\n"), + b'\r' => formatter.write_str("\\r"), + 0x20..=0x7e => { + // printable ascii range + if ch == quote.to_byte() || ch == b'\\' { + formatter.write_char('\\')?; + } + formatter.write_char(ch as char) + } + ch => write!(formatter, "\\x{ch:02x}"), + } + } +} + +impl<'a> Escape for AsciiEscape<'a> { + fn source_len(&self) -> usize { + self.source.len() + } + + fn layout(&self) -> &EscapeLayout { + &self.layout + } + + fn write_source(&self, formatter: &mut impl std::fmt::Write) -> std::fmt::Result { + formatter.write_str(unsafe { + // SAFETY: this function must be called only when source is printable ascii characters + std::str::from_utf8_unchecked(self.source) + }) + } + + #[cold] + fn write_body_slow(&self, formatter: &mut impl std::fmt::Write) -> std::fmt::Result { + for ch in self.source.iter() { + Self::write_char(*ch, self.layout().quote, formatter)?; + } + Ok(()) + } +} + +pub struct BytesRepr<'r, 'a>(&'r AsciiEscape<'a>); + +impl BytesRepr<'_, '_> { + pub fn write(&self, formatter: &mut impl std::fmt::Write) -> std::fmt::Result { + let quote = self.0.layout().quote.to_char(); + formatter.write_char('b')?; + formatter.write_char(quote)?; + self.0.write_body(formatter)?; + formatter.write_char(quote) + } + + pub fn to_string(&self) -> Option { + let mut s = String::with_capacity(self.0.layout().len?); + self.write(&mut s).unwrap(); + Some(s) + } +} + +impl std::fmt::Display for BytesRepr<'_, '_> { + fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.write(formatter) + } +} diff --git a/common/src/lib.rs b/common/src/lib.rs index 9dc3612667..ae6e55a793 100644 --- a/common/src/lib.rs +++ b/common/src/lib.rs @@ -7,7 +7,6 @@ pub use macros::*; pub mod atomic; pub mod borrow; pub mod boxvec; -pub mod bytes; pub mod cformat; pub mod char; pub mod cmp; diff --git a/common/src/str.rs b/common/src/str.rs index b9a4fae9c4..3c01b755bf 100644 --- a/common/src/str.rs +++ b/common/src/str.rs @@ -3,10 +3,7 @@ use crate::{ hash::PyHash, }; use ascii::AsciiString; -use std::{ - fmt, - ops::{Bound, RangeBounds}, -}; +use std::ops::{Bound, RangeBounds}; #[cfg(not(target_arch = "wasm32"))] #[allow(non_camel_case_types)] @@ -341,15 +338,6 @@ macro_rules! ascii { }}; } -#[derive(Debug, Copy, Clone)] -#[non_exhaustive] -pub struct ReprOverflowError; -impl fmt::Display for ReprOverflowError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - f.write_str("string is too long to generate repr") - } -} - #[cfg(test)] mod tests { use super::*; diff --git a/compiler/ast/src/constant.rs b/compiler/ast/src/constant.rs index 7b40fb1f39..d81d7fbede 100644 --- a/compiler/ast/src/constant.rs +++ b/compiler/ast/src/constant.rs @@ -41,12 +41,13 @@ impl std::fmt::Display for Constant { match self { Constant::None => f.pad("None"), Constant::Bool(b) => f.pad(if *b { "True" } else { "False" }), - Constant::Str(s) => { - use rustpython_common::escape::Escape; - rustpython_common::escape::UnicodeEscape::new_repr(s.as_str()).write_quoted(f) - } + Constant::Str(s) => rustpython_common::escape::UnicodeEscape::new_repr(s.as_str()) + .str_repr() + .write(f), Constant::Bytes(b) => { - f.pad(&rustpython_common::bytes::repr(b).map_err(|_err| std::fmt::Error)?) + let escape = rustpython_common::escape::AsciiEscape::new_repr(b); + let repr = escape.bytes_repr().to_string().unwrap(); + f.pad(&repr) } Constant::Int(i) => i.fmt(f), Constant::Tuple(tup) => { diff --git a/compiler/ast/src/unparse.rs b/compiler/ast/src/unparse.rs index 1af7bf2e98..e7e1bced18 100644 --- a/compiler/ast/src/unparse.rs +++ b/compiler/ast/src/unparse.rs @@ -509,10 +509,11 @@ impl<'a> Unparser<'a> { if is_spec { self.unparse_fstring_body(values, is_spec) } else { - use rustpython_common::escape::Escape; self.p("f")?; let body = to_string_fmt(|f| Unparser::new(f).unparse_fstring_body(values, is_spec)); - rustpython_common::escape::UnicodeEscape::new_repr(&body).write_quoted(&mut self.f) + rustpython_common::escape::UnicodeEscape::new_repr(&body) + .str_repr() + .write(&mut self.f) } } } diff --git a/stdlib/src/array.rs b/stdlib/src/array.rs index e94a37eb45..2ea4c2d5a2 100644 --- a/stdlib/src/array.rs +++ b/stdlib/src/array.rs @@ -1294,7 +1294,7 @@ mod array { } let to_unicode = zelf.tounicode(vm)?; let escape = crate::common::escape::UnicodeEscape::new_repr(&to_unicode); - return Ok(format!("{}('u', {})", class_name, escape.repr(),)); + return Ok(format!("{}('u', {})", class_name, escape.str_repr())); } zelf.read().repr(&class_name, vm) } diff --git a/vm/src/builtins/bytearray.rs b/vm/src/builtins/bytearray.rs index 87757fae36..0f792c24e2 100644 --- a/vm/src/builtins/bytearray.rs +++ b/vm/src/builtins/bytearray.rs @@ -871,7 +871,7 @@ impl Representable for PyByteArray { fn repr_str(zelf: &Py, vm: &VirtualMachine) -> PyResult { let class = zelf.class(); let class_name = class.name(); - zelf.inner().repr(Some(&class_name), vm) + zelf.inner().repr_with_name(&class_name, vm) } } diff --git a/vm/src/builtins/bytes.rs b/vm/src/builtins/bytes.rs index 18ea0d01ca..4e40301e23 100644 --- a/vm/src/builtins/bytes.rs +++ b/vm/src/builtins/bytes.rs @@ -672,7 +672,7 @@ impl Iterable for PyBytes { impl Representable for PyBytes { #[inline] fn repr_str(zelf: &Py, vm: &VirtualMachine) -> PyResult { - zelf.inner.repr(None, vm) + zelf.inner.repr_bytes(vm) } } diff --git a/vm/src/builtins/str.rs b/vm/src/builtins/str.rs index 16458f1af7..a1f7e99f17 100644 --- a/vm/src/builtins/str.rs +++ b/vm/src/builtins/str.rs @@ -500,10 +500,11 @@ impl PyStr { #[inline] pub(crate) fn repr(&self, vm: &VirtualMachine) -> PyResult { - use rustpython_common::escape::{Escape, UnicodeEscape}; + use rustpython_common::escape::UnicodeEscape; let escape = UnicodeEscape::new_repr(self.as_str()); escape - .to_quoted_string() + .str_repr() + .to_string() .ok_or_else(|| vm.new_overflow_error("string is too long to generate repr".to_owned())) } diff --git a/vm/src/bytesinner.rs b/vm/src/bytesinner.rs index 425863307f..2e0bf1ae2e 100644 --- a/vm/src/bytesinner.rs +++ b/vm/src/bytesinner.rs @@ -1,10 +1,12 @@ use crate::{ anystr::{self, AnyStr, AnyStrContainer, AnyStrWrapper}, builtins::{ - pystr, PyByteArray, PyBytes, PyBytesRef, PyInt, PyIntRef, PyStr, PyStrRef, PyTypeRef, + pystr, PyBaseExceptionRef, PyByteArray, PyBytes, PyBytesRef, PyInt, PyIntRef, PyStr, + PyStrRef, PyTypeRef, }, byte::bytes_from_object, cformat::cformat_bytes, + common::{escape::Escape, hash}, function::{ArgIterable, Either, OptionalArg, OptionalOption, PyComparisonValue}, identifier, protocol::PyBuffer, @@ -16,7 +18,6 @@ use bstr::ByteSlice; use itertools::Itertools; use num_bigint::BigInt; use num_traits::ToPrimitive; -use rustpython_common::hash; #[derive(Debug, Default, Clone)] pub struct PyBytesInner { @@ -246,18 +247,36 @@ impl PyBytesInner { &self.elements } - pub fn repr(&self, class_name: Option<&str>, vm: &VirtualMachine) -> PyResult { - let repr = if let Some(class_name) = class_name { - rustpython_common::bytes::repr_with( - &self.elements, - &[class_name, "("], - ")", - rustpython_common::escape::Quote::Single, - ) - } else { - rustpython_common::bytes::repr(&self.elements) - }; - repr.map_err(|_| vm.new_overflow_error("bytes object is too large to make repr".to_owned())) + fn new_repr_overflow_error(vm: &VirtualMachine) -> PyBaseExceptionRef { + vm.new_overflow_error("bytes object is too large to make repr".to_owned()) + } + + pub fn repr_with_name(&self, class_name: &str, vm: &VirtualMachine) -> PyResult { + let escape = rustpython_common::escape::AsciiEscape::new_repr(&self.elements); + let len = escape + .layout() + .len + .and_then(|len| (len as isize).checked_add(2 + class_name.len() as isize)) + .ok_or_else(|| Self::new_repr_overflow_error(vm))? as usize; + let mut buf = String::with_capacity(len); + buf.push_str(class_name); + buf.push('('); + escape.bytes_repr().write(&mut buf).unwrap(); + buf.push(')'); + debug_assert_eq!(buf.len(), len); + Ok(buf) + } + + pub fn repr_bytes(&self, vm: &VirtualMachine) -> PyResult { + let escape = rustpython_common::escape::AsciiEscape::new_repr(&self.elements); + let len = escape + .layout() + .len + .ok_or_else(|| Self::new_repr_overflow_error(vm))?; + let mut buf = String::with_capacity(len); + escape.bytes_repr().write(&mut buf).unwrap(); + debug_assert_eq!(buf.len(), len); + Ok(buf) } #[inline]