diff --git a/Cargo.lock b/Cargo.lock
index 20bfc4578..16941b826 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3337,6 +3337,7 @@ version = "0.5.0"
 dependencies = [
  "bitflags 2.11.0",
  "criterion",
+ "icu_properties",
  "num_enum",
  "optional",
  "rustpython-wtf8",
diff --git a/crates/sre_engine/Cargo.toml b/crates/sre_engine/Cargo.toml
index 4f899e6b3..8400a34b5 100644
--- a/crates/sre_engine/Cargo.toml
+++ b/crates/sre_engine/Cargo.toml
@@ -19,6 +19,7 @@ rustpython-wtf8 = { workspace = true }
 num_enum = { workspace = true }
 bitflags = { workspace = true }
 optional = { workspace = true }
+icu_properties = { workspace = true }
 
 [dev-dependencies]
 criterion = { workspace = true }
diff --git a/crates/sre_engine/src/string.rs b/crates/sre_engine/src/string.rs
index 489819bfb..b4b3a6092 100644
--- a/crates/sre_engine/src/string.rs
+++ b/crates/sre_engine/src/string.rs
@@ -1,3 +1,4 @@
+use icu_properties::props::{EnumeratedProperty, GeneralCategory, GeneralCategoryGroup};
 use rustpython_wtf8::Wtf8;
 
 #[derive(Debug, Clone, Copy)]
@@ -443,7 +444,13 @@ pub(crate) const fn is_uni_linebreak(ch: u32) -> bool {
 pub(crate) fn is_uni_alnum(ch: u32) -> bool {
-    // TODO: check with cpython
+    // CPython accepts letters (general category L*) and numeric characters, approximated
+    // here by N*. `char::is_alphanumeric` follows the `Alphabetic` property instead, which
+    // also covers combining marks and symbols such as U+0345, U+0903 and U+24B6.
     char::try_from(ch)
-        .map(|x| x.is_alphanumeric())
+        .map(|x| {
+            let category = GeneralCategory::for_char(x);
+            GeneralCategoryGroup::Letter.contains(category)
+                || GeneralCategoryGroup::Number.contains(category)
+        })
         .unwrap_or(false)
 }
 #[inline]
diff --git a/crates/vm/src/builtins/str.rs b/crates/vm/src/builtins/str.rs
index 870d3b72a..d74259b84 100644
--- a/crates/vm/src/builtins/str.rs
+++ b/crates/vm/src/builtins/str.rs
@@ -45,7 +45,8 @@ use rustpython_common::{
 };
 
 use icu_properties::props::{
-    BidiClass, BinaryProperty, EnumeratedProperty, GeneralCategory, XidContinue, XidStart,
+    BidiClass, BinaryProperty, EnumeratedProperty, GeneralCategory, GeneralCategoryGroup,
+    XidContinue, XidStart,
 };
 use unicode_casing::CharExt;
 
@@ -946,7 +947,15 @@ impl PyStr {
 
     #[pymethod]
     fn isalnum(&self) -> bool {
-        !self.data.is_empty() && self.char_all(char::is_alphanumeric)
+        // CPython accepts letters (general category L*) and numeric characters, approximated
+        // here by N*. `char::is_alphanumeric` follows the `Alphabetic` property instead and
+        // would also accept combining marks and symbols such as U+0345, U+0903 and U+24B6.
+        !self.data.is_empty()
+            && self.char_all(|c| {
+                let category = GeneralCategory::for_char(c);
+                GeneralCategoryGroup::Letter.contains(category)
+                    || GeneralCategoryGroup::Number.contains(category)
+            })
     }
 
     #[pymethod]
diff --git a/extra_tests/snippets/builtin_str.py b/extra_tests/snippets/builtin_str.py
index 3d54643b3..61cbf63ea 100644
--- a/extra_tests/snippets/builtin_str.py
+++ b/extra_tests/snippets/builtin_str.py
@@ -73,6 +73,20 @@ assert "\u1c89".istitle()
 # assert "DZ".title() == "Dz"
 assert a.isalpha()
 
+# CPython's isalnum() accepts letters (general category L*) and numeric characters;
+# Rust's char::is_alphanumeric() also accepts combining marks, so they are checked here.
+assert "\u006e".isalnum()
+assert not "\u0303".isalnum()
+assert not "\u006e\u0303".isalnum()
+assert "\u00f1".isalnum()
+assert not "\u0345".isalnum()
+# U+0363..U+036F inclusive (combining Latin small letters)
+for raw in range(0x0363, 0x0370):
+    assert not chr(raw).isalnum()
+# Other_Alphabetic characters whose canonical combining class is 0
+assert not "\u0903".isalnum()
+assert not "\u24b6".isalnum()
+
 s = "1 2 3"
 assert s.split(" ", 1) == ["1", "2 3"]
 assert s.rsplit(" ", 1) == ["1 2", "3"]
diff --git a/extra_tests/snippets/stdlib_re.py b/extra_tests/snippets/stdlib_re.py
index 53f21f917..8613ddd30 100644
--- a/extra_tests/snippets/stdlib_re.py
+++ b/extra_tests/snippets/stdlib_re.py
@@ -79,3 +79,7 @@ assert re.compile("(?:(1)?)*").match("111").group() == "111"
 # Test of fix re.fullmatch POSSESSIVE_REPEAT, issue #7183
 assert re.fullmatch(r"([0-9]++(?:\.[0-9]+)*+)", "1.25.38")
 assert re.fullmatch(r"([0-9]++(?:\.[0-9]+)*+)", "1.25.38").group(0) == "1.25.38"
+
+# Combining characters; issue #7518
+assert not re.match(r"\w", "\u0345"), r"\w should not match U+0345 (category Mn)"
+assert not re.match(r"\w", "\u0903"), r"\w should not match U+0903 (category Mc)"