diff --git a/Cargo.lock b/Cargo.lock index 241d2a595..70d58bc44 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2319,6 +2319,7 @@ dependencies = [ "itertools 0.14.0", "log", "malachite-bigint", + "memchr", "num-complex", "num-traits", "ruff_python_ast", @@ -2330,6 +2331,7 @@ dependencies = [ "rustpython-compiler-core", "rustpython-compiler-source", "thiserror 2.0.11", + "unicode_names2", ] [[package]] @@ -2387,6 +2389,7 @@ dependencies = [ "ruff_python_ast", "ruff_python_parser", "ruff_source_file", + "rustpython-common", "serde", ] diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py index 09a6d883f..bd1dbcd62 100644 --- a/Lib/test/test_codeccallbacks.py +++ b/Lib/test/test_codeccallbacks.py @@ -536,8 +536,6 @@ class CodecCallbackTest(unittest.TestCase): ("".join("&#%d;" % c for c in cs), 1 + len(s)) ) - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_badandgoodbackslashreplaceexceptions(self): # "backslashreplace" complains about a non-exception passed in self.assertRaises( @@ -596,8 +594,6 @@ class CodecCallbackTest(unittest.TestCase): (r, 2) ) - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_badandgoodnamereplaceexceptions(self): # "namereplace" complains about a non-exception passed in self.assertRaises( @@ -644,8 +640,6 @@ class CodecCallbackTest(unittest.TestCase): (r, 1 + len(s)) ) - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_badandgoodsurrogateescapeexceptions(self): surrogateescape_errors = codecs.lookup_error('surrogateescape') # "surrogateescape" complains about a non-exception passed in diff --git a/common/src/encodings.rs b/common/src/encodings.rs index 097dae17b..c444e27a5 100644 --- a/common/src/encodings.rs +++ b/common/src/encodings.rs @@ -401,7 +401,7 @@ pub mod errors { let mut out = String::with_capacity(num_chars * 4); for c in err_str.code_points() { let c_u32 = c.to_u32(); - if let Some(c_name) = unicode_names2::name(c.to_char_lossy()) { + if let Some(c_name) = 
c.to_char().and_then(unicode_names2::name) { write!(out, "\\N{{{c_name}}}").unwrap(); } else if c_u32 >= 0x10000 { write!(out, "\\U{c_u32:08x}").unwrap(); diff --git a/common/src/wtf8/mod.rs b/common/src/wtf8/mod.rs index f6ae628ba..21c5de28b 100644 --- a/common/src/wtf8/mod.rs +++ b/common/src/wtf8/mod.rs @@ -574,6 +574,12 @@ impl> FromIterator for Wtf8Buf { } } +impl Hash for Wtf8Buf { + fn hash(&self, state: &mut H) { + Wtf8::hash(self, state) + } +} + impl AsRef for Wtf8Buf { fn as_ref(&self) -> &Wtf8 { self @@ -692,6 +698,13 @@ impl Default for &Wtf8 { } } +impl Hash for Wtf8 { + fn hash(&self, state: &mut H) { + state.write(self.as_bytes()); + state.write_u8(0xff); + } +} + impl Wtf8 { /// Creates a WTF-8 slice from a UTF-8 `&str` slice. /// @@ -722,6 +735,32 @@ impl Wtf8 { unsafe { &mut *(value as *mut [u8] as *mut Wtf8) } } + /// Create a WTF-8 slice from a WTF-8 byte slice. + // + // whooops! using WTF-8 for interchange! + #[inline] + pub fn from_bytes(b: &[u8]) -> Option<&Self> { + let mut rest = b; + while let Err(e) = std::str::from_utf8(rest) { + rest = &rest[e.valid_up_to()..]; + Self::decode_surrogate(rest)?; + rest = &rest[3..]; + } + Some(unsafe { Wtf8::from_bytes_unchecked(b) }) + } + + fn decode_surrogate(b: &[u8]) -> Option { + let [a, b, c, ..] = *b else { return None }; + if (a & 0xf0) == 0xe0 && (b & 0xc0) == 0x80 && (c & 0xc0) == 0x80 { + // it's a three-byte code + let c = ((a as u32 & 0x0f) << 12) + ((b as u32 & 0x3f) << 6) + (c as u32 & 0x3f); + let 0xD800..=0xDFFF = c else { return None }; + Some(CodePoint { value: c }) + } else { + None + } + } + /// Returns the length, in WTF-8 bytes. 
#[inline] pub fn len(&self) -> usize { @@ -875,6 +914,14 @@ impl Wtf8 { } } + #[inline] + fn final_lead_surrogate(&self) -> Option { + match self.bytes { + [.., 0xED, b2 @ 0xA0..=0xAF, b3] => Some(decode_surrogate(b2, b3)), + _ => None, + } + } + pub fn is_code_point_boundary(&self, index: usize) -> bool { is_code_point_boundary(self, index) } @@ -1481,6 +1528,12 @@ impl From for Box { } } +impl From> for Wtf8Buf { + fn from(w: Box) -> Self { + Wtf8Buf::from_box(w) + } +} + impl From for Box { fn from(s: String) -> Self { s.into_boxed_str().into() diff --git a/compiler/codegen/Cargo.toml b/compiler/codegen/Cargo.toml index 9cf93bc22..c7ff439f7 100644 --- a/compiler/codegen/Cargo.toml +++ b/compiler/codegen/Cargo.toml @@ -30,6 +30,8 @@ num-complex = { workspace = true } num-traits = { workspace = true } thiserror = { workspace = true } malachite-bigint = { workspace = true } +memchr = { workspace = true } +unicode_names2 = { workspace = true } [dev-dependencies] # rustpython-parser = { workspace = true } diff --git a/compiler/codegen/src/compile.rs b/compiler/codegen/src/compile.rs index a6eb216e2..a03a1fdb5 100644 --- a/compiler/codegen/src/compile.rs +++ b/compiler/codegen/src/compile.rs @@ -21,13 +21,14 @@ use ruff_python_ast::{ Alias, Arguments, BoolOp, CmpOp, Comprehension, ConversionFlag, DebugText, Decorator, DictItem, ExceptHandler, ExceptHandlerExceptHandler, Expr, ExprAttribute, ExprBoolOp, ExprFString, ExprList, ExprName, ExprStarred, ExprSubscript, ExprTuple, ExprUnaryOp, FString, - FStringElement, FStringElements, FStringPart, Int, Keyword, MatchCase, ModExpression, - ModModule, Operator, Parameters, Pattern, PatternMatchAs, PatternMatchValue, Stmt, StmtExpr, - TypeParam, TypeParamParamSpec, TypeParamTypeVar, TypeParamTypeVarTuple, TypeParams, UnaryOp, - WithItem, + FStringElement, FStringElements, FStringFlags, FStringPart, Int, Keyword, MatchCase, + ModExpression, ModModule, Operator, Parameters, Pattern, PatternMatchAs, PatternMatchValue, + Stmt, 
StmtExpr, TypeParam, TypeParamParamSpec, TypeParamTypeVar, TypeParamTypeVarTuple, + TypeParams, UnaryOp, WithItem, }; use ruff_source_file::OneIndexed; use ruff_text_size::{Ranged, TextRange}; +use rustpython_common::wtf8::Wtf8Buf; // use rustpython_ast::located::{self as located_ast, Located}; use rustpython_compiler_core::{ Mode, @@ -375,7 +376,9 @@ impl Compiler<'_> { let (doc, statements) = split_doc(&body.body, &self.opts); if let Some(value) = doc { - self.emit_load_const(ConstantData::Str { value }); + self.emit_load_const(ConstantData::Str { + value: value.into(), + }); let doc = self.name("__doc__"); emit!(self, Instruction::StoreGlobal(doc)) } @@ -636,14 +639,12 @@ impl Compiler<'_> { statement.range(), )); } - vec![ConstantData::Str { - value: "*".to_owned(), - }] + vec![ConstantData::Str { value: "*".into() }] } else { names .iter() .map(|n| ConstantData::Str { - value: n.name.to_string(), + value: n.name.as_str().into(), }) .collect() }; @@ -954,7 +955,7 @@ impl Compiler<'_> { self.pop_symbol_table(); } self.emit_load_const(ConstantData::Str { - value: name_string.clone(), + value: name_string.clone().into(), }); emit!(self, Instruction::TypeAlias); self.store_name(&name_string)?; @@ -1028,7 +1029,7 @@ impl Compiler<'_> { let default_kw_count = kw_with_defaults.len(); for (arg, default) in kw_with_defaults.iter() { self.emit_load_const(ConstantData::Str { - value: arg.name.to_string(), + value: arg.name.as_str().into(), }); self.compile_expression(default)?; } @@ -1101,7 +1102,7 @@ impl Compiler<'_> { if let Some(expr) = &bound { self.compile_expression(expr)?; self.emit_load_const(ConstantData::Str { - value: name.to_string(), + value: name.as_str().into(), }); emit!(self, Instruction::TypeVarWithBound); emit!(self, Instruction::Duplicate); @@ -1109,7 +1110,7 @@ impl Compiler<'_> { } else { // self.store_name(type_name.as_str())?; self.emit_load_const(ConstantData::Str { - value: name.to_string(), + value: name.as_str().into(), }); emit!(self, 
Instruction::TypeVar); emit!(self, Instruction::Duplicate); @@ -1118,7 +1119,7 @@ impl Compiler<'_> { } TypeParam::ParamSpec(TypeParamParamSpec { name, .. }) => { self.emit_load_const(ConstantData::Str { - value: name.to_string(), + value: name.as_str().into(), }); emit!(self, Instruction::ParamSpec); emit!(self, Instruction::Duplicate); @@ -1126,7 +1127,7 @@ impl Compiler<'_> { } TypeParam::TypeVarTuple(TypeParamTypeVarTuple { name, .. }) => { self.emit_load_const(ConstantData::Str { - value: name.to_string(), + value: name.as_str().into(), }); emit!(self, Instruction::TypeVarTuple); emit!(self, Instruction::Duplicate); @@ -1363,7 +1364,7 @@ impl Compiler<'_> { if let Some(annotation) = returns { // key: self.emit_load_const(ConstantData::Str { - value: "return".to_owned(), + value: "return".into(), }); // value: self.compile_annotation(annotation)?; @@ -1380,7 +1381,7 @@ impl Compiler<'_> { for param in parameters_iter { if let Some(annotation) = ¶m.annotation { self.emit_load_const(ConstantData::Str { - value: self.mangle(param.name.as_str()).into_owned(), + value: self.mangle(param.name.as_str()).into_owned().into(), }); self.compile_annotation(annotation)?; num_annotations += 1; @@ -1410,7 +1411,7 @@ impl Compiler<'_> { code: Box::new(code), }); self.emit_load_const(ConstantData::Str { - value: qualified_name, + value: qualified_name.into(), }); // Turn code object into function object: @@ -1418,7 +1419,9 @@ impl Compiler<'_> { if let Some(value) = doc_str { emit!(self, Instruction::Duplicate); - self.emit_load_const(ConstantData::Str { value }); + self.emit_load_const(ConstantData::Str { + value: value.into(), + }); emit!(self, Instruction::Rotate2); let doc = self.name("__doc__"); emit!(self, Instruction::StoreAttr { idx: doc }); @@ -1547,7 +1550,7 @@ impl Compiler<'_> { let dunder_module = self.name("__module__"); emit!(self, Instruction::StoreLocal(dunder_module)); self.emit_load_const(ConstantData::Str { - value: qualified_name, + value: 
qualified_name.into(), }); let qualname = self.name("__qualname__"); emit!(self, Instruction::StoreLocal(qualname)); @@ -1608,16 +1611,12 @@ impl Compiler<'_> { self.emit_load_const(ConstantData::Code { code: Box::new(code), }); - self.emit_load_const(ConstantData::Str { - value: name.to_owned(), - }); + self.emit_load_const(ConstantData::Str { value: name.into() }); // Turn code object into function object: emit!(self, Instruction::MakeFunction(func_flags)); - self.emit_load_const(ConstantData::Str { - value: name.to_owned(), - }); + self.emit_load_const(ConstantData::Str { value: name.into() }); // Call the __build_class__ builtin let call = if let Some(arguments) = arguments { @@ -1638,7 +1637,7 @@ impl Compiler<'_> { // Doc string value: self.emit_load_const(match doc_str { - Some(doc) => ConstantData::Str { value: doc }, + Some(doc) => ConstantData::Str { value: doc.into() }, None => ConstantData::None, // set docstring None if not declared }); } @@ -2031,7 +2030,7 @@ impl Compiler<'_> { let ident = Default::default(); let codegen = ruff_python_codegen::Generator::new(&ident, Default::default()); self.emit_load_const(ConstantData::Str { - value: codegen.expr(annotation), + value: codegen.expr(annotation).into(), }); } else { self.compile_expression(annotation)?; @@ -2063,7 +2062,7 @@ impl Compiler<'_> { let annotations = self.name("__annotations__"); emit!(self, Instruction::LoadNameAny(annotations)); self.emit_load_const(ConstantData::Str { - value: self.mangle(id.as_str()).into_owned(), + value: self.mangle(id.as_str()).into_owned().into(), }); emit!(self, Instruction::StoreSubscript); } else { @@ -2538,7 +2537,7 @@ impl Compiler<'_> { self.emit_load_const(ConstantData::Code { code: Box::new(code), }); - self.emit_load_const(ConstantData::Str { value: name }); + self.emit_load_const(ConstantData::Str { value: name.into() }); // Turn code object into function object: emit!(self, Instruction::MakeFunction(func_flags)); @@ -2679,9 +2678,23 @@ impl Compiler<'_> 
{ self.compile_expr_fstring(fstring)?; } Expr::StringLiteral(string) => { - self.emit_load_const(ConstantData::Str { - value: string.value.to_str().to_owned(), - }); + let value = string.value.to_str(); + if value.contains(char::REPLACEMENT_CHARACTER) { + let value = string + .value + .iter() + .map(|lit| { + let source = self.source_code.get_range(lit.range); + crate::string_parser::parse_string_literal(source, lit.flags.into()) + }) + .collect(); + // might have a surrogate literal; should reparse to be sure + self.emit_load_const(ConstantData::Str { value }); + } else { + self.emit_load_const(ConstantData::Str { + value: value.into(), + }); + } } Expr::BytesLiteral(bytes) => { let iter = bytes.value.iter().flat_map(|x| x.iter().copied()); @@ -2732,7 +2745,7 @@ impl Compiler<'_> { for keyword in sub_keywords { if let Some(name) = &keyword.arg { self.emit_load_const(ConstantData::Str { - value: name.to_string(), + value: name.as_str().into(), }); self.compile_expression(&keyword.value)?; sub_size += 1; @@ -2822,7 +2835,7 @@ impl Compiler<'_> { for keyword in &arguments.keywords { if let Some(name) = &keyword.arg { kwarg_names.push(ConstantData::Str { - value: name.to_string(), + value: name.as_str().into(), }); } else { // This means **kwargs! 
@@ -3058,9 +3071,7 @@ impl Compiler<'_> { }); // List comprehension function name: - self.emit_load_const(ConstantData::Str { - value: name.to_owned(), - }); + self.emit_load_const(ConstantData::Str { value: name.into() }); // Turn code object into function object: emit!(self, Instruction::MakeFunction(func_flags)); @@ -3358,9 +3369,19 @@ impl Compiler<'_> { fn compile_fstring_part(&mut self, part: &FStringPart) -> CompileResult<()> { match part { FStringPart::Literal(string) => { - self.emit_load_const(ConstantData::Str { - value: string.value.to_string(), - }); + if string.value.contains(char::REPLACEMENT_CHARACTER) { + // might have a surrogate literal; should reparse to be sure + let source = self.source_code.get_range(string.range); + let value = + crate::string_parser::parse_string_literal(source, string.flags.into()); + self.emit_load_const(ConstantData::Str { + value: value.into(), + }); + } else { + self.emit_load_const(ConstantData::Str { + value: string.value.to_string().into(), + }); + } Ok(()) } FStringPart::FString(fstring) => self.compile_fstring(fstring), @@ -3368,19 +3389,32 @@ impl Compiler<'_> { } fn compile_fstring(&mut self, fstring: &FString) -> CompileResult<()> { - self.compile_fstring_elements(&fstring.elements) + self.compile_fstring_elements(fstring.flags, &fstring.elements) } fn compile_fstring_elements( &mut self, + flags: FStringFlags, fstring_elements: &FStringElements, ) -> CompileResult<()> { for element in fstring_elements { match element { FStringElement::Literal(string) => { - self.emit_load_const(ConstantData::Str { - value: string.value.to_string(), - }); + if string.value.contains(char::REPLACEMENT_CHARACTER) { + // might have a surrogate literal; should reparse to be sure + let source = self.source_code.get_range(string.range); + let value = crate::string_parser::parse_fstring_literal_element( + source.into(), + flags.into(), + ); + self.emit_load_const(ConstantData::Str { + value: value.into(), + }); + } else { + 
self.emit_load_const(ConstantData::Str { + value: string.value.to_string().into(), + }); + } } FStringElement::Expression(fstring_expr) => { let mut conversion = fstring_expr.conversion; @@ -3393,11 +3427,13 @@ impl Compiler<'_> { let source = source.to_string(); self.emit_load_const(ConstantData::Str { - value: leading.to_string(), + value: leading.to_string().into(), }); - self.emit_load_const(ConstantData::Str { value: source }); self.emit_load_const(ConstantData::Str { - value: trailing.to_string(), + value: source.into(), + }); + self.emit_load_const(ConstantData::Str { + value: trailing.to_string().into(), }); 3 @@ -3407,7 +3443,7 @@ impl Compiler<'_> { match &fstring_expr.format_spec { None => { self.emit_load_const(ConstantData::Str { - value: String::new(), + value: Wtf8Buf::new(), }); // Match CPython behavior: If debug text is present, apply repr conversion. // See: https://github.com/python/cpython/blob/f61afca262d3a0aa6a8a501db0b1936c60858e35/Parser/action_helpers.c#L1456 @@ -3416,7 +3452,7 @@ impl Compiler<'_> { } } Some(format_spec) => { - self.compile_fstring_elements(&format_spec.elements)?; + self.compile_fstring_elements(flags, &format_spec.elements)?; } } @@ -3449,7 +3485,7 @@ impl Compiler<'_> { if element_count == 0 { // ensure to put an empty string on the stack if there aren't any fstring elements self.emit_load_const(ConstantData::Str { - value: String::new(), + value: Wtf8Buf::new(), }); } else if element_count > 1 { emit!( diff --git a/compiler/codegen/src/lib.rs b/compiler/codegen/src/lib.rs index ceadb3c36..d44844543 100644 --- a/compiler/codegen/src/lib.rs +++ b/compiler/codegen/src/lib.rs @@ -11,6 +11,7 @@ type IndexSet = indexmap::IndexSet; pub mod compile; pub mod error; pub mod ir; +mod string_parser; pub mod symboltable; pub use compile::CompileOpts; diff --git a/compiler/codegen/src/string_parser.rs b/compiler/codegen/src/string_parser.rs new file mode 100644 index 000000000..7bdb86aa5 --- /dev/null +++ 
b/compiler/codegen/src/string_parser.rs @@ -0,0 +1,287 @@ +//! A stripped-down version of ruff's string literal parser, modified to +//! handle surrogates in string literals and output WTF-8. +//! +//! Any `unreachable!()` statements in this file are because we only get here +//! after ruff has already successfully parsed the string literal, meaning +//! we don't need to do any validation or error handling. + +use std::convert::Infallible; + +use ruff_python_ast::{AnyStringFlags, StringFlags}; +use rustpython_common::wtf8::{CodePoint, Wtf8, Wtf8Buf}; + +// use ruff_python_parser::{LexicalError, LexicalErrorType}; +type LexicalError = Infallible; + +enum EscapedChar { + Literal(CodePoint), + Escape(char), +} + +struct StringParser { + /// The raw content of the string e.g., the `foo` part in `"foo"`. + source: Box, + /// Current position of the parser in the source. + cursor: usize, + /// Flags that can be used to query information about the string. + flags: AnyStringFlags, +} + +impl StringParser { + fn new(source: Box, flags: AnyStringFlags) -> Self { + Self { + source, + cursor: 0, + flags, + } + } + + #[inline] + fn skip_bytes(&mut self, bytes: usize) -> &str { + let skipped_str = &self.source[self.cursor..self.cursor + bytes]; + self.cursor += bytes; + skipped_str + } + + /// Returns the next byte in the string, if there is one. + /// + /// # Panics + /// + /// When the next byte is a part of a multi-byte character. 
+ #[inline] + fn next_byte(&mut self) -> Option { + self.source[self.cursor..].as_bytes().first().map(|&byte| { + self.cursor += 1; + byte + }) + } + + #[inline] + fn next_char(&mut self) -> Option { + self.source[self.cursor..].chars().next().inspect(|c| { + self.cursor += c.len_utf8(); + }) + } + + #[inline] + fn peek_byte(&self) -> Option { + self.source[self.cursor..].as_bytes().first().copied() + } + + fn parse_unicode_literal(&mut self, literal_number: usize) -> Result { + let mut p: u32 = 0u32; + for i in 1..=literal_number { + match self.next_char() { + Some(c) => match c.to_digit(16) { + Some(d) => p += d << ((literal_number - i) * 4), + None => unreachable!(), + }, + None => unreachable!(), + } + } + Ok(CodePoint::from_u32(p).unwrap()) + } + + fn parse_octet(&mut self, o: u8) -> char { + let mut radix_bytes = [o, 0, 0]; + let mut len = 1; + + while len < 3 { + let Some(b'0'..=b'7') = self.peek_byte() else { + break; + }; + + radix_bytes[len] = self.next_byte().unwrap(); + len += 1; + } + + // OK because radix_bytes is always going to be in the ASCII range. + let radix_str = std::str::from_utf8(&radix_bytes[..len]).expect("ASCII bytes"); + let value = u32::from_str_radix(radix_str, 8).unwrap(); + char::from_u32(value).unwrap() + } + + fn parse_unicode_name(&mut self) -> Result { + let Some('{') = self.next_char() else { + unreachable!() + }; + + let Some(close_idx) = self.source[self.cursor..].find('}') else { + unreachable!() + }; + + let name_and_ending = self.skip_bytes(close_idx + 1); + let name = &name_and_ending[..name_and_ending.len() - 1]; + + unicode_names2::character(name).ok_or_else(|| unreachable!()) + } + + /// Parse an escaped character, returning the new character. 
+ fn parse_escaped_char(&mut self) -> Result, LexicalError> { + let Some(first_char) = self.next_char() else { + unreachable!() + }; + + let new_char = match first_char { + '\\' => '\\'.into(), + '\'' => '\''.into(), + '\"' => '"'.into(), + 'a' => '\x07'.into(), + 'b' => '\x08'.into(), + 'f' => '\x0c'.into(), + 'n' => '\n'.into(), + 'r' => '\r'.into(), + 't' => '\t'.into(), + 'v' => '\x0b'.into(), + o @ '0'..='7' => self.parse_octet(o as u8).into(), + 'x' => self.parse_unicode_literal(2)?, + 'u' if !self.flags.is_byte_string() => self.parse_unicode_literal(4)?, + 'U' if !self.flags.is_byte_string() => self.parse_unicode_literal(8)?, + 'N' if !self.flags.is_byte_string() => self.parse_unicode_name()?.into(), + // Special cases where the escape sequence is not a single character + '\n' => return Ok(None), + '\r' => { + if self.peek_byte() == Some(b'\n') { + self.next_byte(); + } + + return Ok(None); + } + _ => return Ok(Some(EscapedChar::Escape(first_char))), + }; + + Ok(Some(EscapedChar::Literal(new_char))) + } + + fn parse_fstring_middle(mut self) -> Result, LexicalError> { + // Fast-path: if the f-string doesn't contain any escape sequences, return the literal. + let Some(mut index) = memchr::memchr3(b'{', b'}', b'\\', self.source.as_bytes()) else { + return Ok(self.source.into()); + }; + + let mut value = Wtf8Buf::with_capacity(self.source.len()); + loop { + // Add the characters before the escape sequence (or curly brace) to the string. + let before_with_slash_or_brace = self.skip_bytes(index + 1); + let before = &before_with_slash_or_brace[..before_with_slash_or_brace.len() - 1]; + value.push_str(before); + + // Add the escaped character to the string. + match &self.source.as_bytes()[self.cursor - 1] { + // If there are any curly braces inside a `FStringMiddle` token, + // then they were escaped (i.e. `{{` or `}}`). This means that + // we need to increase the location by 2 instead of 1. 
+ b'{' => value.push_char('{'), + b'}' => value.push_char('}'), + // We can encounter a `\` as the last character in a `FStringMiddle` + // token which is valid in this context. For example, + // + // ```python + // f"\{foo} \{bar:\}" + // # ^ ^^ ^ + // ``` + // + // Here, the `FStringMiddle` token content will be "\" and " \" + // which is invalid if we look at the content in isolation: + // + // ```python + // "\" + // ``` + // + // However, the content is syntactically valid in the context of + // the f-string because it's a substring of the entire f-string. + // This is still an invalid escape sequence, but we don't want to + // raise a syntax error as is done by the CPython parser. It might + // be supported in the future, refer to point 3: https://peps.python.org/pep-0701/#rejected-ideas + b'\\' => { + if !self.flags.is_raw_string() && self.peek_byte().is_some() { + match self.parse_escaped_char()? { + None => {} + Some(EscapedChar::Literal(c)) => value.push(c), + Some(EscapedChar::Escape(c)) => { + value.push_char('\\'); + value.push_char(c); + } + } + } else { + value.push_char('\\'); + } + } + ch => { + unreachable!("Expected '{{', '}}', or '\\' but got {:?}", ch); + } + } + + let Some(next_index) = + memchr::memchr3(b'{', b'}', b'\\', self.source[self.cursor..].as_bytes()) + else { + // Add the rest of the string to the value. + let rest = &self.source[self.cursor..]; + value.push_str(rest); + break; + }; + + index = next_index; + } + + Ok(value.into()) + } + + fn parse_string(mut self) -> Result, LexicalError> { + if self.flags.is_raw_string() { + // For raw strings, no escaping is necessary. + return Ok(self.source.into()); + } + + let Some(mut escape) = memchr::memchr(b'\\', self.source.as_bytes()) else { + // If the string doesn't contain any escape sequences, return the owned string. + return Ok(self.source.into()); + }; + + // If the string contains escape sequences, we need to parse them. 
+ let mut value = Wtf8Buf::with_capacity(self.source.len()); + + loop { + // Add the characters before the escape sequence to the string. + let before_with_slash = self.skip_bytes(escape + 1); + let before = &before_with_slash[..before_with_slash.len() - 1]; + value.push_str(before); + + // Add the escaped character to the string. + match self.parse_escaped_char()? { + None => {} + Some(EscapedChar::Literal(c)) => value.push(c), + Some(EscapedChar::Escape(c)) => { + value.push_char('\\'); + value.push_char(c); + } + } + + let Some(next_escape) = self.source[self.cursor..].find('\\') else { + // Add the rest of the string to the value. + let rest = &self.source[self.cursor..]; + value.push_str(rest); + break; + }; + + // Update the position of the next escape sequence. + escape = next_escape; + } + + Ok(value.into()) + } +} + +pub(crate) fn parse_string_literal(source: &str, flags: AnyStringFlags) -> Box { + let source = &source[flags.opener_len().to_usize()..]; + let source = &source[..source.len() - flags.quote_len().to_usize()]; + StringParser::new(source.into(), flags) + .parse_string() + .unwrap_or_else(|x| match x {}) +} + +pub(crate) fn parse_fstring_literal_element(source: Box, flags: AnyStringFlags) -> Box { + StringParser::new(source, flags) + .parse_fstring_middle() + .unwrap_or_else(|x| match x {}) +} diff --git a/compiler/core/Cargo.toml b/compiler/core/Cargo.toml index 7621c643d..8ff0cd020 100644 --- a/compiler/core/Cargo.toml +++ b/compiler/core/Cargo.toml @@ -13,6 +13,7 @@ license.workspace = true ruff_python_ast = { workspace = true } ruff_python_parser = { workspace = true } ruff_source_file = { workspace = true } +rustpython-common = { workspace = true } bitflags = { workspace = true } itertools = { workspace = true } diff --git a/compiler/core/src/bytecode.rs b/compiler/core/src/bytecode.rs index 4cb80020e..7b018d1df 100644 --- a/compiler/core/src/bytecode.rs +++ b/compiler/core/src/bytecode.rs @@ -8,6 +8,7 @@ use num_complex::Complex64; pub use 
ruff_python_ast::ConversionFlag; // use rustpython_parser_core::source_code::{OneIndexed, SourceLocation}; use ruff_source_file::{OneIndexed, SourceLocation}; +use rustpython_common::wtf8::{Wtf8, Wtf8Buf}; use std::marker::PhantomData; use std::{collections::BTreeSet, fmt, hash, mem}; @@ -678,7 +679,7 @@ pub enum ConstantData { Float { value: f64 }, Complex { value: Complex64 }, Boolean { value: bool }, - Str { value: String }, + Str { value: Wtf8Buf }, Bytes { value: Vec }, Code { code: Box }, None, @@ -738,7 +739,7 @@ pub enum BorrowedConstant<'a, C: Constant> { Float { value: f64 }, Complex { value: Complex64 }, Boolean { value: bool }, - Str { value: &'a str }, + Str { value: &'a Wtf8 }, Bytes { value: &'a [u8] }, Code { code: &'a CodeObject }, Tuple { elements: &'a [C] }, diff --git a/compiler/core/src/marshal.rs b/compiler/core/src/marshal.rs index 1e47a6cac..0c8da17ff 100644 --- a/compiler/core/src/marshal.rs +++ b/compiler/core/src/marshal.rs @@ -2,6 +2,7 @@ use crate::bytecode::*; use malachite_bigint::{BigInt, Sign}; use num_complex::Complex64; use ruff_source_file::{OneIndexed, SourceLocation}; +use rustpython_common::wtf8::Wtf8; use std::convert::Infallible; pub const FORMAT_VERSION: u32 = 4; @@ -117,6 +118,9 @@ pub trait Read { fn read_str(&mut self, len: u32) -> Result<&str> { Ok(std::str::from_utf8(self.read_slice(len)?)?) 
} + fn read_wtf8(&mut self, len: u32) -> Result<&Wtf8> { + Wtf8::from_bytes(self.read_slice(len)?).ok_or(MarshalError::InvalidUtf8) + } fn read_u8(&mut self) -> Result { Ok(u8::from_le_bytes(*self.read_array()?)) } @@ -262,7 +266,7 @@ pub trait MarshalBag: Copy { fn make_ellipsis(&self) -> Self::Value; fn make_float(&self, value: f64) -> Self::Value; fn make_complex(&self, value: Complex64) -> Self::Value; - fn make_str(&self, value: &str) -> Self::Value; + fn make_str(&self, value: &Wtf8) -> Self::Value; fn make_bytes(&self, value: &[u8]) -> Self::Value; fn make_int(&self, value: BigInt) -> Self::Value; fn make_tuple(&self, elements: impl Iterator) -> Self::Value; @@ -299,7 +303,7 @@ impl MarshalBag for Bag { fn make_complex(&self, value: Complex64) -> Self::Value { self.make_constant::(BorrowedConstant::Complex { value }) } - fn make_str(&self, value: &str) -> Self::Value { + fn make_str(&self, value: &Wtf8) -> Self::Value { self.make_constant::(BorrowedConstant::Str { value }) } fn make_bytes(&self, value: &[u8]) -> Self::Value { @@ -368,7 +372,7 @@ pub fn deserialize_value(rdr: &mut R, bag: Bag) -> Res } Type::Ascii | Type::Unicode => { let len = rdr.read_u32()?; - let value = rdr.read_str(len)?; + let value = rdr.read_wtf8(len)?; bag.make_str(value) } Type::Tuple => { @@ -422,7 +426,7 @@ pub enum DumpableValue<'a, D: Dumpable> { Float(f64), Complex(Complex64), Boolean(bool), - Str(&'a str), + Str(&'a Wtf8), Bytes(&'a [u8]), Code(&'a CodeObject), Tuple(&'a [D]), diff --git a/jit/tests/common.rs b/jit/tests/common.rs index a2d4fc3bc..a4ac8a796 100644 --- a/jit/tests/common.rs +++ b/jit/tests/common.rs @@ -53,7 +53,9 @@ enum StackValue { impl From for StackValue { fn from(value: ConstantData) -> Self { match value { - ConstantData::Str { value } => StackValue::String(value), + ConstantData::Str { value } => { + StackValue::String(value.into_string().expect("surrogate in test code")) + } ConstantData::None => StackValue::None, ConstantData::Code { code } => 
StackValue::Code(code), c => unimplemented!("constant {:?} isn't yet supported in py_function!", c), diff --git a/vm/src/builtins/code.rs b/vm/src/builtins/code.rs index ba2d2dd5c..4bb209f6d 100644 --- a/vm/src/builtins/code.rs +++ b/vm/src/builtins/code.rs @@ -74,7 +74,7 @@ fn borrow_obj_constant(obj: &PyObject) -> BorrowedConstant<'_, Literal> { ref c @ super::complex::PyComplex => BorrowedConstant::Complex { value: c.to_complex() }, - ref s @ super::pystr::PyStr => BorrowedConstant::Str { value: s.as_str() }, + ref s @ super::pystr::PyStr => BorrowedConstant::Str { value: s.as_wtf8() }, ref b @ super::bytes::PyBytes => BorrowedConstant::Bytes { value: b.as_bytes() }, diff --git a/vm/src/builtins/str.rs b/vm/src/builtins/str.rs index 55cefae4f..8fe390494 100644 --- a/vm/src/builtins/str.rs +++ b/vm/src/builtins/str.rs @@ -1815,6 +1815,18 @@ impl AsRef for PyExact { } } +impl AsRef for PyRefExact { + fn as_ref(&self) -> &Wtf8 { + self.as_wtf8() + } +} + +impl AsRef for PyExact { + fn as_ref(&self) -> &Wtf8 { + self.as_wtf8() + } +} + impl AnyStrWrapper for PyStrRef { fn as_ref(&self) -> Option<&Wtf8> { Some(self.as_wtf8()) diff --git a/vm/src/intern.rs b/vm/src/intern.rs index 10aaa5345..bb9220d06 100644 --- a/vm/src/intern.rs +++ b/vm/src/intern.rs @@ -1,3 +1,5 @@ +use rustpython_common::wtf8::{Wtf8, Wtf8Buf}; + use crate::{ AsObject, Py, PyExact, PyObject, PyObjectRef, PyPayload, PyRef, PyRefExact, VirtualMachine, builtins::{PyStr, PyStrInterned, PyTypeRef}, @@ -86,29 +88,29 @@ pub struct CachedPyStrRef { impl std::hash::Hash for CachedPyStrRef { fn hash(&self, state: &mut H) { - self.inner.as_str().hash(state) + self.inner.as_wtf8().hash(state) } } impl PartialEq for CachedPyStrRef { fn eq(&self, other: &Self) -> bool { - self.inner.as_str() == other.inner.as_str() + self.inner.as_wtf8() == other.inner.as_wtf8() } } impl Eq for CachedPyStrRef {} -impl std::borrow::Borrow for CachedPyStrRef { +impl std::borrow::Borrow for CachedPyStrRef { #[inline] - fn 
borrow(&self) -> &str { - self.inner.as_str() + fn borrow(&self) -> &Wtf8 { + self.as_wtf8() } } -impl AsRef for CachedPyStrRef { +impl AsRef for CachedPyStrRef { #[inline] - fn as_ref(&self) -> &str { - self.as_str() + fn as_ref(&self) -> &Wtf8 { + self.as_wtf8() } } @@ -121,8 +123,8 @@ impl CachedPyStrRef { } #[inline] - fn as_str(&self) -> &str { - self.inner.as_str() + fn as_wtf8(&self) -> &Wtf8 { + self.inner.as_wtf8() } } @@ -209,6 +211,8 @@ impl ToPyObject for &'static PyInterned { } mod sealed { + use rustpython_common::wtf8::{Wtf8, Wtf8Buf}; + use crate::{ builtins::PyStr, object::{Py, PyExact, PyRefExact}, @@ -218,11 +222,14 @@ mod sealed { impl SealedInternable for String {} impl SealedInternable for &str {} + impl SealedInternable for Wtf8Buf {} + impl SealedInternable for &Wtf8 {} impl SealedInternable for PyRefExact {} pub trait SealedMaybeInterned {} impl SealedMaybeInterned for str {} + impl SealedMaybeInterned for Wtf8 {} impl SealedMaybeInterned for PyExact {} impl SealedMaybeInterned for Py {} } @@ -250,6 +257,21 @@ impl InternableString for &str { } } +impl InternableString for Wtf8Buf { + type Interned = Wtf8; + fn into_pyref_exact(self, str_type: PyTypeRef) -> PyRefExact { + let obj = PyRef::new_ref(PyStr::from(self), str_type, None); + unsafe { PyRefExact::new_unchecked(obj) } + } +} + +impl InternableString for &Wtf8 { + type Interned = Wtf8; + fn into_pyref_exact(self, str_type: PyTypeRef) -> PyRefExact { + self.to_owned().into_pyref_exact(str_type) + } +} + impl InternableString for PyRefExact { type Interned = Py; #[inline] @@ -259,7 +281,7 @@ impl InternableString for PyRefExact { } pub trait MaybeInternedString: - AsRef + crate::dictdatatype::DictKey + sealed::SealedMaybeInterned + AsRef + crate::dictdatatype::DictKey + sealed::SealedMaybeInterned { fn as_interned(&self) -> Option<&'static PyStrInterned>; } @@ -271,6 +293,13 @@ impl MaybeInternedString for str { } } +impl MaybeInternedString for Wtf8 { + #[inline(always)] + fn 
as_interned(&self) -> Option<&'static PyStrInterned> { + None + } +} + impl MaybeInternedString for PyExact { #[inline(always)] fn as_interned(&self) -> Option<&'static PyStrInterned> { @@ -296,7 +325,7 @@ impl PyObject { if self.is_interned() { s.unwrap().as_interned() } else if let Some(s) = s { - vm.ctx.interned_str(s.as_str()) + vm.ctx.interned_str(s.as_wtf8()) } else { None } diff --git a/vm/src/stdlib/marshal.rs b/vm/src/stdlib/marshal.rs index fd7332e7c..564ee5bf6 100644 --- a/vm/src/stdlib/marshal.rs +++ b/vm/src/stdlib/marshal.rs @@ -10,6 +10,7 @@ mod decl { PyBool, PyByteArray, PyBytes, PyCode, PyComplex, PyDict, PyEllipsis, PyFloat, PyFrozenSet, PyInt, PyList, PyNone, PySet, PyStopIteration, PyStr, PyTuple, }, + common::wtf8::Wtf8, convert::ToPyObject, function::{ArgBytesLike, OptionalArg}, object::AsObject, @@ -53,7 +54,7 @@ mod decl { f(Complex(pycomplex.to_complex64())) } ref pystr @ PyStr => { - f(Str(pystr.as_str())) + f(Str(pystr.as_wtf8())) } ref pylist @ PyList => { f(List(&pylist.borrow_vec())) @@ -139,7 +140,7 @@ mod decl { fn make_complex(&self, value: Complex64) -> Self::Value { self.0.ctx.new_complex(value).into() } - fn make_str(&self, value: &str) -> Self::Value { + fn make_str(&self, value: &Wtf8) -> Self::Value { self.0.ctx.new_str(value).into() } fn make_bytes(&self, value: &[u8]) -> Self::Value {