Parse surrogates in string literals properly

This commit is contained in:
Noa
2025-03-26 20:35:59 -05:00
parent 2ab8716c95
commit c6cab4c43a
16 changed files with 506 additions and 80 deletions

3
Cargo.lock generated
View File

@@ -2319,6 +2319,7 @@ dependencies = [
"itertools 0.14.0",
"log",
"malachite-bigint",
"memchr",
"num-complex",
"num-traits",
"ruff_python_ast",
@@ -2330,6 +2331,7 @@ dependencies = [
"rustpython-compiler-core",
"rustpython-compiler-source",
"thiserror 2.0.11",
"unicode_names2",
]
[[package]]
@@ -2387,6 +2389,7 @@ dependencies = [
"ruff_python_ast",
"ruff_python_parser",
"ruff_source_file",
"rustpython-common",
"serde",
]

View File

@@ -536,8 +536,6 @@ class CodecCallbackTest(unittest.TestCase):
("".join("&#%d;" % c for c in cs), 1 + len(s))
)
# TODO: RUSTPYTHON
@unittest.expectedFailure
def test_badandgoodbackslashreplaceexceptions(self):
# "backslashreplace" complains about a non-exception passed in
self.assertRaises(
@@ -596,8 +594,6 @@ class CodecCallbackTest(unittest.TestCase):
(r, 2)
)
# TODO: RUSTPYTHON
@unittest.expectedFailure
def test_badandgoodnamereplaceexceptions(self):
# "namereplace" complains about a non-exception passed in
self.assertRaises(
@@ -644,8 +640,6 @@ class CodecCallbackTest(unittest.TestCase):
(r, 1 + len(s))
)
# TODO: RUSTPYTHON
@unittest.expectedFailure
def test_badandgoodsurrogateescapeexceptions(self):
surrogateescape_errors = codecs.lookup_error('surrogateescape')
# "surrogateescape" complains about a non-exception passed in

View File

@@ -401,7 +401,7 @@ pub mod errors {
let mut out = String::with_capacity(num_chars * 4);
for c in err_str.code_points() {
let c_u32 = c.to_u32();
if let Some(c_name) = unicode_names2::name(c.to_char_lossy()) {
if let Some(c_name) = c.to_char().and_then(unicode_names2::name) {
write!(out, "\\N{{{c_name}}}").unwrap();
} else if c_u32 >= 0x10000 {
write!(out, "\\U{c_u32:08x}").unwrap();

View File

@@ -574,6 +574,12 @@ impl<W: AsRef<Wtf8>> FromIterator<W> for Wtf8Buf {
}
}
impl Hash for Wtf8Buf {
fn hash<H: Hasher>(&self, state: &mut H) {
Wtf8::hash(self, state)
}
}
impl AsRef<Wtf8> for Wtf8Buf {
fn as_ref(&self) -> &Wtf8 {
self
@@ -692,6 +698,13 @@ impl Default for &Wtf8 {
}
}
/// Feeds the raw WTF-8 bytes to the hasher, followed by a single `0xff` byte.
impl Hash for Wtf8 {
    fn hash<H: Hasher>(&self, state: &mut H) {
        let bytes = self.as_bytes();
        state.write(bytes);
        // Trailing marker byte written after the contents.
        // NOTE(review): presumably mirrors how `str` terminates its hash input — confirm.
        state.write_u8(0xff);
    }
}
impl Wtf8 {
/// Creates a WTF-8 slice from a UTF-8 `&str` slice.
///
@@ -722,6 +735,32 @@ impl Wtf8 {
unsafe { &mut *(value as *mut [u8] as *mut Wtf8) }
}
/// Create a WTF-8 slice from a WTF-8 byte slice.
///
/// Returns `None` when `b` contains a sequence that is neither valid UTF-8
/// nor a three-byte encoded surrogate (U+D800..=U+DFFF).
//
// Note: this means WTF-8 is being accepted as an interchange format here.
//
// NOTE(review): each surrogate is validated on its own, so a lead surrogate
// immediately followed by a trail surrogate is accepted as well — confirm
// that callers are fine with that.
#[inline]
pub fn from_bytes(b: &[u8]) -> Option<&Self> {
    let mut remaining = b;
    loop {
        match std::str::from_utf8(remaining) {
            // Everything left is plain UTF-8: validation is complete.
            Ok(_) => break,
            Err(err) => {
                // Jump to the first offending byte; the only thing allowed
                // there is a three-byte surrogate encoding.
                let tail = &remaining[err.valid_up_to()..];
                Self::decode_surrogate(tail)?;
                remaining = &tail[3..];
            }
        }
    }
    // The loop above has checked every byte of `b`.
    Some(unsafe { Wtf8::from_bytes_unchecked(b) })
}
/// Decodes a surrogate code point from the first three bytes of `b`.
///
/// Returns `None` if `b` is shorter than three bytes, does not start with a
/// three-byte sequence, or that sequence decodes to a value outside
/// `0xD800..=0xDFFF`.
fn decode_surrogate(b: &[u8]) -> Option<CodePoint> {
    let [lead, mid, tail, ..] = *b else {
        return None;
    };

    // `1110xxxx 10xxxxxx 10xxxxxx` is the shape of a three-byte sequence.
    let is_three_byte = (lead & 0xf0) == 0xe0 && (mid & 0xc0) == 0x80 && (tail & 0xc0) == 0x80;
    if !is_three_byte {
        return None;
    }

    let value = ((lead as u32 & 0x0f) << 12) + ((mid as u32 & 0x3f) << 6) + (tail as u32 & 0x3f);
    matches!(value, 0xD800..=0xDFFF).then_some(CodePoint { value })
}
/// Returns the length, in WTF-8 bytes.
#[inline]
pub fn len(&self) -> usize {
@@ -875,6 +914,14 @@ impl Wtf8 {
}
}
/// Returns the surrogate decoded from the last three bytes when they match
/// `0xED, 0xA0..=0xAF, _`; otherwise returns `None`.
#[inline]
fn final_lead_surrogate(&self) -> Option<u16> {
    if let [.., 0xED, second @ 0xA0..=0xAF, third] = self.bytes {
        Some(decode_surrogate(second, third))
    } else {
        None
    }
}
pub fn is_code_point_boundary(&self, index: usize) -> bool {
is_code_point_boundary(self, index)
}
@@ -1481,6 +1528,12 @@ impl From<Wtf8Buf> for Box<Wtf8> {
}
}
impl From<Box<Wtf8>> for Wtf8Buf {
fn from(w: Box<Wtf8>) -> Self {
Wtf8Buf::from_box(w)
}
}
impl From<String> for Box<Wtf8> {
fn from(s: String) -> Self {
s.into_boxed_str().into()

View File

@@ -30,6 +30,8 @@ num-complex = { workspace = true }
num-traits = { workspace = true }
thiserror = { workspace = true }
malachite-bigint = { workspace = true }
memchr = { workspace = true }
unicode_names2 = { workspace = true }
[dev-dependencies]
# rustpython-parser = { workspace = true }

View File

@@ -21,13 +21,14 @@ use ruff_python_ast::{
Alias, Arguments, BoolOp, CmpOp, Comprehension, ConversionFlag, DebugText, Decorator, DictItem,
ExceptHandler, ExceptHandlerExceptHandler, Expr, ExprAttribute, ExprBoolOp, ExprFString,
ExprList, ExprName, ExprStarred, ExprSubscript, ExprTuple, ExprUnaryOp, FString,
FStringElement, FStringElements, FStringPart, Int, Keyword, MatchCase, ModExpression,
ModModule, Operator, Parameters, Pattern, PatternMatchAs, PatternMatchValue, Stmt, StmtExpr,
TypeParam, TypeParamParamSpec, TypeParamTypeVar, TypeParamTypeVarTuple, TypeParams, UnaryOp,
WithItem,
FStringElement, FStringElements, FStringFlags, FStringPart, Int, Keyword, MatchCase,
ModExpression, ModModule, Operator, Parameters, Pattern, PatternMatchAs, PatternMatchValue,
Stmt, StmtExpr, TypeParam, TypeParamParamSpec, TypeParamTypeVar, TypeParamTypeVarTuple,
TypeParams, UnaryOp, WithItem,
};
use ruff_source_file::OneIndexed;
use ruff_text_size::{Ranged, TextRange};
use rustpython_common::wtf8::Wtf8Buf;
// use rustpython_ast::located::{self as located_ast, Located};
use rustpython_compiler_core::{
Mode,
@@ -375,7 +376,9 @@ impl Compiler<'_> {
let (doc, statements) = split_doc(&body.body, &self.opts);
if let Some(value) = doc {
self.emit_load_const(ConstantData::Str { value });
self.emit_load_const(ConstantData::Str {
value: value.into(),
});
let doc = self.name("__doc__");
emit!(self, Instruction::StoreGlobal(doc))
}
@@ -636,14 +639,12 @@ impl Compiler<'_> {
statement.range(),
));
}
vec![ConstantData::Str {
value: "*".to_owned(),
}]
vec![ConstantData::Str { value: "*".into() }]
} else {
names
.iter()
.map(|n| ConstantData::Str {
value: n.name.to_string(),
value: n.name.as_str().into(),
})
.collect()
};
@@ -954,7 +955,7 @@ impl Compiler<'_> {
self.pop_symbol_table();
}
self.emit_load_const(ConstantData::Str {
value: name_string.clone(),
value: name_string.clone().into(),
});
emit!(self, Instruction::TypeAlias);
self.store_name(&name_string)?;
@@ -1028,7 +1029,7 @@ impl Compiler<'_> {
let default_kw_count = kw_with_defaults.len();
for (arg, default) in kw_with_defaults.iter() {
self.emit_load_const(ConstantData::Str {
value: arg.name.to_string(),
value: arg.name.as_str().into(),
});
self.compile_expression(default)?;
}
@@ -1101,7 +1102,7 @@ impl Compiler<'_> {
if let Some(expr) = &bound {
self.compile_expression(expr)?;
self.emit_load_const(ConstantData::Str {
value: name.to_string(),
value: name.as_str().into(),
});
emit!(self, Instruction::TypeVarWithBound);
emit!(self, Instruction::Duplicate);
@@ -1109,7 +1110,7 @@ impl Compiler<'_> {
} else {
// self.store_name(type_name.as_str())?;
self.emit_load_const(ConstantData::Str {
value: name.to_string(),
value: name.as_str().into(),
});
emit!(self, Instruction::TypeVar);
emit!(self, Instruction::Duplicate);
@@ -1118,7 +1119,7 @@ impl Compiler<'_> {
}
TypeParam::ParamSpec(TypeParamParamSpec { name, .. }) => {
self.emit_load_const(ConstantData::Str {
value: name.to_string(),
value: name.as_str().into(),
});
emit!(self, Instruction::ParamSpec);
emit!(self, Instruction::Duplicate);
@@ -1126,7 +1127,7 @@ impl Compiler<'_> {
}
TypeParam::TypeVarTuple(TypeParamTypeVarTuple { name, .. }) => {
self.emit_load_const(ConstantData::Str {
value: name.to_string(),
value: name.as_str().into(),
});
emit!(self, Instruction::TypeVarTuple);
emit!(self, Instruction::Duplicate);
@@ -1363,7 +1364,7 @@ impl Compiler<'_> {
if let Some(annotation) = returns {
// key:
self.emit_load_const(ConstantData::Str {
value: "return".to_owned(),
value: "return".into(),
});
// value:
self.compile_annotation(annotation)?;
@@ -1380,7 +1381,7 @@ impl Compiler<'_> {
for param in parameters_iter {
if let Some(annotation) = &param.annotation {
self.emit_load_const(ConstantData::Str {
value: self.mangle(param.name.as_str()).into_owned(),
value: self.mangle(param.name.as_str()).into_owned().into(),
});
self.compile_annotation(annotation)?;
num_annotations += 1;
@@ -1410,7 +1411,7 @@ impl Compiler<'_> {
code: Box::new(code),
});
self.emit_load_const(ConstantData::Str {
value: qualified_name,
value: qualified_name.into(),
});
// Turn code object into function object:
@@ -1418,7 +1419,9 @@ impl Compiler<'_> {
if let Some(value) = doc_str {
emit!(self, Instruction::Duplicate);
self.emit_load_const(ConstantData::Str { value });
self.emit_load_const(ConstantData::Str {
value: value.into(),
});
emit!(self, Instruction::Rotate2);
let doc = self.name("__doc__");
emit!(self, Instruction::StoreAttr { idx: doc });
@@ -1547,7 +1550,7 @@ impl Compiler<'_> {
let dunder_module = self.name("__module__");
emit!(self, Instruction::StoreLocal(dunder_module));
self.emit_load_const(ConstantData::Str {
value: qualified_name,
value: qualified_name.into(),
});
let qualname = self.name("__qualname__");
emit!(self, Instruction::StoreLocal(qualname));
@@ -1608,16 +1611,12 @@ impl Compiler<'_> {
self.emit_load_const(ConstantData::Code {
code: Box::new(code),
});
self.emit_load_const(ConstantData::Str {
value: name.to_owned(),
});
self.emit_load_const(ConstantData::Str { value: name.into() });
// Turn code object into function object:
emit!(self, Instruction::MakeFunction(func_flags));
self.emit_load_const(ConstantData::Str {
value: name.to_owned(),
});
self.emit_load_const(ConstantData::Str { value: name.into() });
// Call the __build_class__ builtin
let call = if let Some(arguments) = arguments {
@@ -1638,7 +1637,7 @@ impl Compiler<'_> {
// Doc string value:
self.emit_load_const(match doc_str {
Some(doc) => ConstantData::Str { value: doc },
Some(doc) => ConstantData::Str { value: doc.into() },
None => ConstantData::None, // set docstring None if not declared
});
}
@@ -2031,7 +2030,7 @@ impl Compiler<'_> {
let ident = Default::default();
let codegen = ruff_python_codegen::Generator::new(&ident, Default::default());
self.emit_load_const(ConstantData::Str {
value: codegen.expr(annotation),
value: codegen.expr(annotation).into(),
});
} else {
self.compile_expression(annotation)?;
@@ -2063,7 +2062,7 @@ impl Compiler<'_> {
let annotations = self.name("__annotations__");
emit!(self, Instruction::LoadNameAny(annotations));
self.emit_load_const(ConstantData::Str {
value: self.mangle(id.as_str()).into_owned(),
value: self.mangle(id.as_str()).into_owned().into(),
});
emit!(self, Instruction::StoreSubscript);
} else {
@@ -2538,7 +2537,7 @@ impl Compiler<'_> {
self.emit_load_const(ConstantData::Code {
code: Box::new(code),
});
self.emit_load_const(ConstantData::Str { value: name });
self.emit_load_const(ConstantData::Str { value: name.into() });
// Turn code object into function object:
emit!(self, Instruction::MakeFunction(func_flags));
@@ -2679,9 +2678,23 @@ impl Compiler<'_> {
self.compile_expr_fstring(fstring)?;
}
Expr::StringLiteral(string) => {
self.emit_load_const(ConstantData::Str {
value: string.value.to_str().to_owned(),
});
let value = string.value.to_str();
if value.contains(char::REPLACEMENT_CHARACTER) {
let value = string
.value
.iter()
.map(|lit| {
let source = self.source_code.get_range(lit.range);
crate::string_parser::parse_string_literal(source, lit.flags.into())
})
.collect();
// might have a surrogate literal; should reparse to be sure
self.emit_load_const(ConstantData::Str { value });
} else {
self.emit_load_const(ConstantData::Str {
value: value.into(),
});
}
}
Expr::BytesLiteral(bytes) => {
let iter = bytes.value.iter().flat_map(|x| x.iter().copied());
@@ -2732,7 +2745,7 @@ impl Compiler<'_> {
for keyword in sub_keywords {
if let Some(name) = &keyword.arg {
self.emit_load_const(ConstantData::Str {
value: name.to_string(),
value: name.as_str().into(),
});
self.compile_expression(&keyword.value)?;
sub_size += 1;
@@ -2822,7 +2835,7 @@ impl Compiler<'_> {
for keyword in &arguments.keywords {
if let Some(name) = &keyword.arg {
kwarg_names.push(ConstantData::Str {
value: name.to_string(),
value: name.as_str().into(),
});
} else {
// This means **kwargs!
@@ -3058,9 +3071,7 @@ impl Compiler<'_> {
});
// List comprehension function name:
self.emit_load_const(ConstantData::Str {
value: name.to_owned(),
});
self.emit_load_const(ConstantData::Str { value: name.into() });
// Turn code object into function object:
emit!(self, Instruction::MakeFunction(func_flags));
@@ -3358,9 +3369,19 @@ impl Compiler<'_> {
fn compile_fstring_part(&mut self, part: &FStringPart) -> CompileResult<()> {
match part {
FStringPart::Literal(string) => {
self.emit_load_const(ConstantData::Str {
value: string.value.to_string(),
});
if string.value.contains(char::REPLACEMENT_CHARACTER) {
// might have a surrogate literal; should reparse to be sure
let source = self.source_code.get_range(string.range);
let value =
crate::string_parser::parse_string_literal(source, string.flags.into());
self.emit_load_const(ConstantData::Str {
value: value.into(),
});
} else {
self.emit_load_const(ConstantData::Str {
value: string.value.to_string().into(),
});
}
Ok(())
}
FStringPart::FString(fstring) => self.compile_fstring(fstring),
@@ -3368,19 +3389,32 @@ impl Compiler<'_> {
}
fn compile_fstring(&mut self, fstring: &FString) -> CompileResult<()> {
self.compile_fstring_elements(&fstring.elements)
self.compile_fstring_elements(fstring.flags, &fstring.elements)
}
fn compile_fstring_elements(
&mut self,
flags: FStringFlags,
fstring_elements: &FStringElements,
) -> CompileResult<()> {
for element in fstring_elements {
match element {
FStringElement::Literal(string) => {
self.emit_load_const(ConstantData::Str {
value: string.value.to_string(),
});
if string.value.contains(char::REPLACEMENT_CHARACTER) {
// might have a surrogate literal; should reparse to be sure
let source = self.source_code.get_range(string.range);
let value = crate::string_parser::parse_fstring_literal_element(
source.into(),
flags.into(),
);
self.emit_load_const(ConstantData::Str {
value: value.into(),
});
} else {
self.emit_load_const(ConstantData::Str {
value: string.value.to_string().into(),
});
}
}
FStringElement::Expression(fstring_expr) => {
let mut conversion = fstring_expr.conversion;
@@ -3393,11 +3427,13 @@ impl Compiler<'_> {
let source = source.to_string();
self.emit_load_const(ConstantData::Str {
value: leading.to_string(),
value: leading.to_string().into(),
});
self.emit_load_const(ConstantData::Str { value: source });
self.emit_load_const(ConstantData::Str {
value: trailing.to_string(),
value: source.into(),
});
self.emit_load_const(ConstantData::Str {
value: trailing.to_string().into(),
});
3
@@ -3407,7 +3443,7 @@ impl Compiler<'_> {
match &fstring_expr.format_spec {
None => {
self.emit_load_const(ConstantData::Str {
value: String::new(),
value: Wtf8Buf::new(),
});
// Match CPython behavior: If debug text is present, apply repr conversion.
// See: https://github.com/python/cpython/blob/f61afca262d3a0aa6a8a501db0b1936c60858e35/Parser/action_helpers.c#L1456
@@ -3416,7 +3452,7 @@ impl Compiler<'_> {
}
}
Some(format_spec) => {
self.compile_fstring_elements(&format_spec.elements)?;
self.compile_fstring_elements(flags, &format_spec.elements)?;
}
}
@@ -3449,7 +3485,7 @@ impl Compiler<'_> {
if element_count == 0 {
// ensure to put an empty string on the stack if there aren't any fstring elements
self.emit_load_const(ConstantData::Str {
value: String::new(),
value: Wtf8Buf::new(),
});
} else if element_count > 1 {
emit!(

View File

@@ -11,6 +11,7 @@ type IndexSet<T> = indexmap::IndexSet<T, ahash::RandomState>;
pub mod compile;
pub mod error;
pub mod ir;
mod string_parser;
pub mod symboltable;
pub use compile::CompileOpts;

View File

@@ -0,0 +1,287 @@
//! A stripped-down version of ruff's string literal parser, modified to
//! handle surrogates in string literals and output WTF-8.
//!
//! Any `unreachable!()` statements in this file are because we only get here
//! after ruff has already successfully parsed the string literal, meaning
//! we don't need to do any validation or error handling.
use std::convert::Infallible;
use ruff_python_ast::{AnyStringFlags, StringFlags};
use rustpython_common::wtf8::{CodePoint, Wtf8, Wtf8Buf};
// use ruff_python_parser::{LexicalError, LexicalErrorType};
type LexicalError = Infallible;
/// Outcome of decoding the character(s) that follow a backslash.
enum EscapedChar {
/// A recognized escape sequence, resolved to the code point it denotes.
Literal(CodePoint),
/// An unrecognized escape; callers emit a backslash followed by this
/// character unchanged.
Escape(char),
}
/// Cursor-based decoder for the contents of one string literal; it walks
/// `source` and resolves escape sequences into WTF-8 output.
struct StringParser {
/// The raw content of the string e.g., the `foo` part in `"foo"`.
source: Box<str>,
/// Current position of the parser in the source.
cursor: usize,
/// Flags that can be used to query information about the string.
flags: AnyStringFlags,
}
impl StringParser {
/// Creates a parser over `source`, positioned at its first byte.
fn new(source: Box<str>, flags: AnyStringFlags) -> Self {
    let cursor = 0;
    Self {
        source,
        cursor,
        flags,
    }
}
/// Advances the cursor by `bytes` and returns the text that was skipped.
///
/// # Panics
///
/// When the skipped range is out of bounds or does not fall on `char`
/// boundaries.
#[inline]
fn skip_bytes(&mut self, bytes: usize) -> &str {
    let start = self.cursor;
    let end = start + bytes;
    let skipped = &self.source[start..end];
    self.cursor = end;
    skipped
}
/// Consumes and returns the next byte in the string, if there is one.
///
/// # Panics
///
/// When the cursor is not on a `char` boundary. Consuming a single byte of a
/// multi-byte character leaves the cursor in that state, so this should only
/// be used when the next byte is known to be ASCII.
#[inline]
fn next_byte(&mut self) -> Option<u8> {
    let byte = *self.source[self.cursor..].as_bytes().first()?;
    self.cursor += 1;
    Some(byte)
}
/// Consumes and returns the next character, advancing the cursor by its
/// UTF-8 length. Returns `None` at the end of the source.
#[inline]
fn next_char(&mut self) -> Option<char> {
    let c = self.source[self.cursor..].chars().next()?;
    self.cursor += c.len_utf8();
    Some(c)
}
/// Returns the next byte without consuming it, or `None` at the end of the
/// source.
#[inline]
fn peek_byte(&self) -> Option<u8> {
    let rest = &self.source[self.cursor..];
    rest.bytes().next()
}
/// Reads exactly `literal_number` hexadecimal digits and returns the code
/// point they spell, most significant digit first.
///
/// A missing or non-hex digit is treated as unreachable, and the final
/// value is unwrapped into a [`CodePoint`]; both rely on the literal having
/// been validated before this parser runs.
fn parse_unicode_literal(&mut self, literal_number: usize) -> Result<CodePoint, LexicalError> {
    let mut p = 0u32;
    // `remaining` counts the digits still to come after the current one,
    // which is also this digit's nibble position.
    for remaining in (0..literal_number).rev() {
        let Some(digit) = self.next_char().and_then(|c| c.to_digit(16)) else {
            unreachable!()
        };
        p += digit << (remaining * 4);
    }
    Ok(CodePoint::from_u32(p).unwrap())
}
/// Parses an octal escape whose first digit `o` has already been consumed,
/// taking up to two further octal digits from the source.
fn parse_octet(&mut self, o: u8) -> char {
    let mut radix_bytes = [o, 0, 0];
    let mut len = 1;
    for slot in &mut radix_bytes[1..] {
        match self.peek_byte() {
            Some(b'0'..=b'7') => {
                *slot = self.next_byte().unwrap();
                len += 1;
            }
            _ => break,
        }
    }

    // OK because radix_bytes is always going to be in the ASCII range.
    let digits = std::str::from_utf8(&radix_bytes[..len]).expect("ASCII bytes");
    let value = u32::from_str_radix(digits, 8).unwrap();
    char::from_u32(value).unwrap()
}
/// Parses the `{NAME}` part of a `\N{NAME}` escape and looks the name up
/// with `unicode_names2`.
///
/// A missing brace or an unknown name is treated as unreachable.
fn parse_unicode_name(&mut self) -> Result<char, LexicalError> {
    match self.next_char() {
        Some('{') => {}
        _ => unreachable!(),
    }
    let close_idx = match self.source[self.cursor..].find('}') {
        Some(idx) => idx,
        None => unreachable!(),
    };

    // Consume the name together with the closing brace, then drop the brace.
    let name_and_ending = self.skip_bytes(close_idx + 1);
    let name = &name_and_ending[..name_and_ending.len() - 1];

    match unicode_names2::character(name) {
        Some(ch) => Ok(ch),
        None => unreachable!(),
    }
}
/// Parse an escaped character, returning the new character.
fn parse_escaped_char(&mut self) -> Result<Option<EscapedChar>, LexicalError> {
let Some(first_char) = self.next_char() else {
unreachable!()
};
let new_char = match first_char {
'\\' => '\\'.into(),
'\'' => '\''.into(),
'\"' => '"'.into(),
'a' => '\x07'.into(),
'b' => '\x08'.into(),
'f' => '\x0c'.into(),
'n' => '\n'.into(),
'r' => '\r'.into(),
't' => '\t'.into(),
'v' => '\x0b'.into(),
o @ '0'..='7' => self.parse_octet(o as u8).into(),
'x' => self.parse_unicode_literal(2)?,
'u' if !self.flags.is_byte_string() => self.parse_unicode_literal(4)?,
'U' if !self.flags.is_byte_string() => self.parse_unicode_literal(8)?,
'N' if !self.flags.is_byte_string() => self.parse_unicode_name()?.into(),
// Special cases where the escape sequence is not a single character
'\n' => return Ok(None),
'\r' => {
if self.peek_byte() == Some(b'\n') {
self.next_byte();
}
return Ok(None);
}
_ => return Ok(Some(EscapedChar::Escape(first_char))),
};
Ok(Some(EscapedChar::Literal(new_char)))
}
/// Decodes the raw source text of an f-string literal element into WTF-8.
///
/// `self.source` is the slice of the source file covered by the element, so
/// escaped braces are still doubled (`{{` / `}}`). Each pair is collapsed
/// into a single brace, and backslash escapes are resolved unless the
/// f-string is raw.
fn parse_fstring_middle(mut self) -> Result<Box<Wtf8>, LexicalError> {
    // Fast-path: if the f-string doesn't contain any escape sequences, return the literal.
    let Some(mut index) = memchr::memchr3(b'{', b'}', b'\\', self.source.as_bytes()) else {
        return Ok(self.source.into());
    };

    let mut value = Wtf8Buf::with_capacity(self.source.len());
    loop {
        // Add the characters before the escape sequence (or curly brace) to the string.
        let before_with_slash_or_brace = self.skip_bytes(index + 1);
        let before = &before_with_slash_or_brace[..before_with_slash_or_brace.len() - 1];
        value.push_str(before);

        // Add the escaped character to the string.
        match self.source.as_bytes()[self.cursor - 1] {
            // Any curly brace inside a literal element was escaped by
            // doubling it (`{{` or `}}`). Emit one brace and consume the
            // second half of the pair so it is not emitted again.
            brace @ (b'{' | b'}') => {
                value.push_char(char::from(brace));
                if self.peek_byte() == Some(brace) {
                    self.next_byte();
                }
            }
            // We can encounter a `\` as the last character in a literal
            // element, which is valid in this context. For example,
            //
            // ```python
            // f"\{foo} \{bar:\}"
            // # ^ ^^ ^
            // ```
            //
            // Here, the element content will be "\" and " \", which is
            // invalid if we look at the content in isolation:
            //
            // ```python
            // "\"
            // ```
            //
            // However, the content is syntactically valid in the context of
            // the f-string because it's a substring of the entire f-string.
            // This is still an invalid escape sequence, but we don't want to
            // raise a syntax error as is done by the CPython parser. It might
            // be supported in the future, refer to point 3: https://peps.python.org/pep-0701/#rejected-ideas
            b'\\' => {
                if !self.flags.is_raw_string() && self.peek_byte().is_some() {
                    match self.parse_escaped_char()? {
                        None => {}
                        Some(EscapedChar::Literal(c)) => value.push(c),
                        Some(EscapedChar::Escape(c)) => {
                            value.push_char('\\');
                            value.push_char(c);
                        }
                    }
                } else {
                    value.push_char('\\');
                }
            }
            ch => {
                unreachable!("Expected '{{', '}}', or '\\' but got {:?}", ch);
            }
        }

        let Some(next_index) =
            memchr::memchr3(b'{', b'}', b'\\', self.source[self.cursor..].as_bytes())
        else {
            // Add the rest of the string to the value.
            let rest = &self.source[self.cursor..];
            value.push_str(rest);
            break;
        };

        index = next_index;
    }

    Ok(value.into())
}
/// Decodes the contents of a regular string literal into WTF-8.
///
/// Raw strings and strings without any backslash are returned unchanged;
/// otherwise every escape sequence is resolved in order.
fn parse_string(mut self) -> Result<Box<Wtf8>, LexicalError> {
    // For raw strings, no escaping is necessary.
    if self.flags.is_raw_string() {
        return Ok(self.source.into());
    }

    // No backslash means there is nothing to decode either.
    let Some(first_escape) = memchr::memchr(b'\\', self.source.as_bytes()) else {
        return Ok(self.source.into());
    };

    let mut value = Wtf8Buf::with_capacity(self.source.len());
    // Offset of the next backslash, relative to the cursor.
    let mut next_escape = Some(first_escape);

    while let Some(offset) = next_escape {
        // Copy the text in front of the backslash; the backslash itself is
        // consumed but not copied.
        let chunk = self.skip_bytes(offset + 1);
        value.push_str(&chunk[..chunk.len() - 1]);

        // Decode whatever follows the backslash.
        match self.parse_escaped_char()? {
            None => {}
            Some(EscapedChar::Literal(c)) => value.push(c),
            Some(EscapedChar::Escape(c)) => {
                value.push_char('\\');
                value.push_char(c);
            }
        }

        next_escape = self.source[self.cursor..].find('\\');
    }

    // Whatever is left contains no further escapes.
    value.push_str(&self.source[self.cursor..]);
    Ok(value.into())
}
}
/// Decodes a complete string literal (prefix and quotes included) into
/// WTF-8, preserving surrogates written as escapes.
///
/// `source` is the literal's source text; the opener and the closing quote
/// are stripped according to `flags` before the contents are parsed.
pub(crate) fn parse_string_literal(source: &str, flags: AnyStringFlags) -> Box<Wtf8> {
    let start = flags.opener_len().to_usize();
    let inner = &source[start..];
    let end = inner.len() - flags.quote_len().to_usize();

    let parser = StringParser::new(inner[..end].into(), flags);
    match parser.parse_string() {
        Ok(value) => value,
        // The error type is uninhabited.
        Err(never) => match never {},
    }
}
/// Decodes the source text of one f-string literal element into WTF-8.
pub(crate) fn parse_fstring_literal_element(source: Box<str>, flags: AnyStringFlags) -> Box<Wtf8> {
    let parser = StringParser::new(source, flags);
    match parser.parse_fstring_middle() {
        Ok(value) => value,
        // The error type is uninhabited.
        Err(never) => match never {},
    }
}

View File

@@ -13,6 +13,7 @@ license.workspace = true
ruff_python_ast = { workspace = true }
ruff_python_parser = { workspace = true }
ruff_source_file = { workspace = true }
rustpython-common = { workspace = true }
bitflags = { workspace = true }
itertools = { workspace = true }

View File

@@ -8,6 +8,7 @@ use num_complex::Complex64;
pub use ruff_python_ast::ConversionFlag;
// use rustpython_parser_core::source_code::{OneIndexed, SourceLocation};
use ruff_source_file::{OneIndexed, SourceLocation};
use rustpython_common::wtf8::{Wtf8, Wtf8Buf};
use std::marker::PhantomData;
use std::{collections::BTreeSet, fmt, hash, mem};
@@ -678,7 +679,7 @@ pub enum ConstantData {
Float { value: f64 },
Complex { value: Complex64 },
Boolean { value: bool },
Str { value: String },
Str { value: Wtf8Buf },
Bytes { value: Vec<u8> },
Code { code: Box<CodeObject> },
None,
@@ -738,7 +739,7 @@ pub enum BorrowedConstant<'a, C: Constant> {
Float { value: f64 },
Complex { value: Complex64 },
Boolean { value: bool },
Str { value: &'a str },
Str { value: &'a Wtf8 },
Bytes { value: &'a [u8] },
Code { code: &'a CodeObject<C> },
Tuple { elements: &'a [C] },

View File

@@ -2,6 +2,7 @@ use crate::bytecode::*;
use malachite_bigint::{BigInt, Sign};
use num_complex::Complex64;
use ruff_source_file::{OneIndexed, SourceLocation};
use rustpython_common::wtf8::Wtf8;
use std::convert::Infallible;
pub const FORMAT_VERSION: u32 = 4;
@@ -117,6 +118,9 @@ pub trait Read {
fn read_str(&mut self, len: u32) -> Result<&str> {
Ok(std::str::from_utf8(self.read_slice(len)?)?)
}
/// Reads `len` bytes and validates them as WTF-8.
///
/// Fails with `MarshalError::InvalidUtf8` when the bytes are rejected by
/// `Wtf8::from_bytes`.
fn read_wtf8(&mut self, len: u32) -> Result<&Wtf8> {
    let bytes = self.read_slice(len)?;
    match Wtf8::from_bytes(bytes) {
        Some(wtf8) => Ok(wtf8),
        None => Err(MarshalError::InvalidUtf8),
    }
}
fn read_u8(&mut self) -> Result<u8> {
Ok(u8::from_le_bytes(*self.read_array()?))
}
@@ -262,7 +266,7 @@ pub trait MarshalBag: Copy {
fn make_ellipsis(&self) -> Self::Value;
fn make_float(&self, value: f64) -> Self::Value;
fn make_complex(&self, value: Complex64) -> Self::Value;
fn make_str(&self, value: &str) -> Self::Value;
fn make_str(&self, value: &Wtf8) -> Self::Value;
fn make_bytes(&self, value: &[u8]) -> Self::Value;
fn make_int(&self, value: BigInt) -> Self::Value;
fn make_tuple(&self, elements: impl Iterator<Item = Self::Value>) -> Self::Value;
@@ -299,7 +303,7 @@ impl<Bag: ConstantBag> MarshalBag for Bag {
fn make_complex(&self, value: Complex64) -> Self::Value {
self.make_constant::<Bag::Constant>(BorrowedConstant::Complex { value })
}
fn make_str(&self, value: &str) -> Self::Value {
fn make_str(&self, value: &Wtf8) -> Self::Value {
self.make_constant::<Bag::Constant>(BorrowedConstant::Str { value })
}
fn make_bytes(&self, value: &[u8]) -> Self::Value {
@@ -368,7 +372,7 @@ pub fn deserialize_value<R: Read, Bag: MarshalBag>(rdr: &mut R, bag: Bag) -> Res
}
Type::Ascii | Type::Unicode => {
let len = rdr.read_u32()?;
let value = rdr.read_str(len)?;
let value = rdr.read_wtf8(len)?;
bag.make_str(value)
}
Type::Tuple => {
@@ -422,7 +426,7 @@ pub enum DumpableValue<'a, D: Dumpable> {
Float(f64),
Complex(Complex64),
Boolean(bool),
Str(&'a str),
Str(&'a Wtf8),
Bytes(&'a [u8]),
Code(&'a CodeObject<D::Constant>),
Tuple(&'a [D]),

View File

@@ -53,7 +53,9 @@ enum StackValue {
impl From<ConstantData> for StackValue {
fn from(value: ConstantData) -> Self {
match value {
ConstantData::Str { value } => StackValue::String(value),
ConstantData::Str { value } => {
StackValue::String(value.into_string().expect("surrogate in test code"))
}
ConstantData::None => StackValue::None,
ConstantData::Code { code } => StackValue::Code(code),
c => unimplemented!("constant {:?} isn't yet supported in py_function!", c),

View File

@@ -74,7 +74,7 @@ fn borrow_obj_constant(obj: &PyObject) -> BorrowedConstant<'_, Literal> {
ref c @ super::complex::PyComplex => BorrowedConstant::Complex {
value: c.to_complex()
},
ref s @ super::pystr::PyStr => BorrowedConstant::Str { value: s.as_str() },
ref s @ super::pystr::PyStr => BorrowedConstant::Str { value: s.as_wtf8() },
ref b @ super::bytes::PyBytes => BorrowedConstant::Bytes {
value: b.as_bytes()
},

View File

@@ -1815,6 +1815,18 @@ impl AsRef<str> for PyExact<PyStr> {
}
}
// Lets an exact `str` reference be borrowed as WTF-8 by delegating to
// `as_wtf8`.
impl AsRef<Wtf8> for PyRefExact<PyStr> {
fn as_ref(&self) -> &Wtf8 {
self.as_wtf8()
}
}
// Lets an exact `str` object be borrowed as WTF-8 by delegating to
// `as_wtf8`.
impl AsRef<Wtf8> for PyExact<PyStr> {
fn as_ref(&self) -> &Wtf8 {
self.as_wtf8()
}
}
impl AnyStrWrapper<Wtf8> for PyStrRef {
fn as_ref(&self) -> Option<&Wtf8> {
Some(self.as_wtf8())

View File

@@ -1,3 +1,5 @@
use rustpython_common::wtf8::{Wtf8, Wtf8Buf};
use crate::{
AsObject, Py, PyExact, PyObject, PyObjectRef, PyPayload, PyRef, PyRefExact, VirtualMachine,
builtins::{PyStr, PyStrInterned, PyTypeRef},
@@ -86,29 +88,29 @@ pub struct CachedPyStrRef {
impl std::hash::Hash for CachedPyStrRef {
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
self.inner.as_str().hash(state)
self.inner.as_wtf8().hash(state)
}
}
impl PartialEq for CachedPyStrRef {
fn eq(&self, other: &Self) -> bool {
self.inner.as_str() == other.inner.as_str()
self.inner.as_wtf8() == other.inner.as_wtf8()
}
}
impl Eq for CachedPyStrRef {}
impl std::borrow::Borrow<str> for CachedPyStrRef {
impl std::borrow::Borrow<Wtf8> for CachedPyStrRef {
#[inline]
fn borrow(&self) -> &str {
self.inner.as_str()
fn borrow(&self) -> &Wtf8 {
self.as_wtf8()
}
}
impl AsRef<str> for CachedPyStrRef {
impl AsRef<Wtf8> for CachedPyStrRef {
#[inline]
fn as_ref(&self) -> &str {
self.as_str()
fn as_ref(&self) -> &Wtf8 {
self.as_wtf8()
}
}
@@ -121,8 +123,8 @@ impl CachedPyStrRef {
}
#[inline]
fn as_str(&self) -> &str {
self.inner.as_str()
fn as_wtf8(&self) -> &Wtf8 {
self.inner.as_wtf8()
}
}
@@ -209,6 +211,8 @@ impl<T: PyPayload> ToPyObject for &'static PyInterned<T> {
}
mod sealed {
use rustpython_common::wtf8::{Wtf8, Wtf8Buf};
use crate::{
builtins::PyStr,
object::{Py, PyExact, PyRefExact},
@@ -218,11 +222,14 @@ mod sealed {
impl SealedInternable for String {}
impl SealedInternable for &str {}
impl SealedInternable for Wtf8Buf {}
impl SealedInternable for &Wtf8 {}
impl SealedInternable for PyRefExact<PyStr> {}
pub trait SealedMaybeInterned {}
impl SealedMaybeInterned for str {}
impl SealedMaybeInterned for Wtf8 {}
impl SealedMaybeInterned for PyExact<PyStr> {}
impl SealedMaybeInterned for Py<PyStr> {}
}
@@ -250,6 +257,21 @@ impl InternableString for &str {
}
}
// Allows an owned WTF-8 buffer to be interned; its borrowed lookup form is
// `Wtf8`.
impl InternableString for Wtf8Buf {
type Interned = Wtf8;
// Builds a `PyStr` from the buffer and wraps it as an exact-typed reference.
fn into_pyref_exact(self, str_type: PyTypeRef) -> PyRefExact<PyStr> {
let obj = PyRef::new_ref(PyStr::from(self), str_type, None);
// NOTE(review): presumably sound only when `str_type` is the exact `str`
// type — confirm against callers.
unsafe { PyRefExact::new_unchecked(obj) }
}
}
// Allows a borrowed WTF-8 slice to be interned.
impl InternableString for &Wtf8 {
type Interned = Wtf8;
// Copies the slice into an owned value, then defers to that value's
// `into_pyref_exact`.
fn into_pyref_exact(self, str_type: PyTypeRef) -> PyRefExact<PyStr> {
self.to_owned().into_pyref_exact(str_type)
}
}
impl InternableString for PyRefExact<PyStr> {
type Interned = Py<PyStr>;
#[inline]
@@ -259,7 +281,7 @@ impl InternableString for PyRefExact<PyStr> {
}
pub trait MaybeInternedString:
AsRef<str> + crate::dictdatatype::DictKey + sealed::SealedMaybeInterned
AsRef<Wtf8> + crate::dictdatatype::DictKey + sealed::SealedMaybeInterned
{
fn as_interned(&self) -> Option<&'static PyStrInterned>;
}
@@ -271,6 +293,13 @@ impl MaybeInternedString for str {
}
}
// A bare WTF-8 slice never reports itself as already interned.
impl MaybeInternedString for Wtf8 {
#[inline(always)]
fn as_interned(&self) -> Option<&'static PyStrInterned> {
// Always `None` for a plain slice.
None
}
}
impl MaybeInternedString for PyExact<PyStr> {
#[inline(always)]
fn as_interned(&self) -> Option<&'static PyStrInterned> {
@@ -296,7 +325,7 @@ impl PyObject {
if self.is_interned() {
s.unwrap().as_interned()
} else if let Some(s) = s {
vm.ctx.interned_str(s.as_str())
vm.ctx.interned_str(s.as_wtf8())
} else {
None
}

View File

@@ -10,6 +10,7 @@ mod decl {
PyBool, PyByteArray, PyBytes, PyCode, PyComplex, PyDict, PyEllipsis, PyFloat,
PyFrozenSet, PyInt, PyList, PyNone, PySet, PyStopIteration, PyStr, PyTuple,
},
common::wtf8::Wtf8,
convert::ToPyObject,
function::{ArgBytesLike, OptionalArg},
object::AsObject,
@@ -53,7 +54,7 @@ mod decl {
f(Complex(pycomplex.to_complex64()))
}
ref pystr @ PyStr => {
f(Str(pystr.as_str()))
f(Str(pystr.as_wtf8()))
}
ref pylist @ PyList => {
f(List(&pylist.borrow_vec()))
@@ -139,7 +140,7 @@ mod decl {
fn make_complex(&self, value: Complex64) -> Self::Value {
self.0.ctx.new_complex(value).into()
}
fn make_str(&self, value: &str) -> Self::Value {
fn make_str(&self, value: &Wtf8) -> Self::Value {
self.0.ctx.new_str(value).into()
}
fn make_bytes(&self, value: &[u8]) -> Self::Value {