From aac207003fd36c19bf3c9ef2c647e79c3f640db6 Mon Sep 17 00:00:00 2001 From: Joshua Megnauth <48846352+joshuamegnauth54@users.noreply.github.com> Date: Fri, 17 Apr 2026 05:45:43 -0400 Subject: [PATCH] fix: Python-Rust combining char diff in isalnum (#7612) * fix: Python-Rust combining char diff in isalnum Related to: #7518 Rust and Python differ on which characters are alphanumeric. Rust follows the Unicode standard more closely than Python. This means that is_alphanumeric (a char method in Rust) behaves differently from isalnum (Python). To fix the discrepancy, RustPython needs to mimic Python by rejecting certain characters. Some classes of combining characters count as alphanumeric in Rust but not in Python. Combining characters are accent marks that are combined with other characters to create a single grapheme. With this change, a character counts as alphanumeric only if its canonical combining class is Not_Reordered (0). It's possible that this PR is not exhaustive. I fixed the combining character issue, but I don't know the full range of discrepancies. * fix: Ignore combining characters in SRE Closes: #7518 --- Cargo.lock | 1 + crates/sre_engine/Cargo.toml | 1 + crates/sre_engine/src/string.rs | 6 +++++- crates/vm/src/builtins/str.rs | 9 +++++++-- extra_tests/snippets/builtin_str.py | 9 +++++++++ extra_tests/snippets/stdlib_re.py | 3 +++ 6 files changed, 26 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 20bfc4578..16941b826 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3337,6 +3337,7 @@ version = "0.5.0" dependencies = [ "bitflags 2.11.0", "criterion", + "icu_properties", "num_enum", "optional", "rustpython-wtf8", diff --git a/crates/sre_engine/Cargo.toml b/crates/sre_engine/Cargo.toml index 4f899e6b3..8400a34b5 100644 --- a/crates/sre_engine/Cargo.toml +++ b/crates/sre_engine/Cargo.toml @@ -19,6 +19,7 @@ rustpython-wtf8 = { workspace = true } num_enum = { workspace = true } bitflags = { workspace = true } optional = { workspace = true } +icu_properties = { workspace = true } [dev-dependencies] criterion = { workspace = true } diff --git a/crates/sre_engine/src/string.rs 
b/crates/sre_engine/src/string.rs index 489819bfb..b4b3a6092 100644 --- a/crates/sre_engine/src/string.rs +++ b/crates/sre_engine/src/string.rs @@ -1,3 +1,4 @@ +use icu_properties::props::{CanonicalCombiningClass, EnumeratedProperty}; use rustpython_wtf8::Wtf8; #[derive(Debug, Clone, Copy)] @@ -443,7 +444,10 @@ pub(crate) const fn is_uni_linebreak(ch: u32) -> bool { pub(crate) fn is_uni_alnum(ch: u32) -> bool { // TODO: check with cpython char::try_from(ch) - .map(|x| x.is_alphanumeric()) + .map(|x| { + x.is_alphanumeric() + && CanonicalCombiningClass::for_char(x) == CanonicalCombiningClass::NotReordered + }) .unwrap_or(false) } #[inline] diff --git a/crates/vm/src/builtins/str.rs b/crates/vm/src/builtins/str.rs index 870d3b72a..d74259b84 100644 --- a/crates/vm/src/builtins/str.rs +++ b/crates/vm/src/builtins/str.rs @@ -45,7 +45,8 @@ use rustpython_common::{ }; use icu_properties::props::{ - BidiClass, BinaryProperty, EnumeratedProperty, GeneralCategory, XidContinue, XidStart, + BidiClass, BinaryProperty, CanonicalCombiningClass, EnumeratedProperty, GeneralCategory, + XidContinue, XidStart, }; use unicode_casing::CharExt; @@ -946,7 +947,11 @@ impl PyStr { #[pymethod] fn isalnum(&self) -> bool { - !self.data.is_empty() && self.char_all(char::is_alphanumeric) + !self.data.is_empty() + && self.char_all(|c| { + c.is_alphanumeric() + && CanonicalCombiningClass::for_char(c) == CanonicalCombiningClass::NotReordered + }) } #[pymethod] diff --git a/extra_tests/snippets/builtin_str.py b/extra_tests/snippets/builtin_str.py index 3d54643b3..61cbf63ea 100644 --- a/extra_tests/snippets/builtin_str.py +++ b/extra_tests/snippets/builtin_str.py @@ -73,6 +73,15 @@ assert "\u1c89".istitle() # assert "DZ".title() == "Dz" assert a.isalpha() +# Combining characters differ slightly between Rust and Python +assert "\u006e".isalnum() +assert not "\u0303".isalnum() +assert not "\u006e\u0303".isalnum() +assert "\u00f1".isalnum() +assert not "\u0345".isalnum() +for raw in range(0x0363, 
0x036F): + assert not chr(raw).isalnum() + s = "1 2 3" assert s.split(" ", 1) == ["1", "2 3"] assert s.rsplit(" ", 1) == ["1 2", "3"] diff --git a/extra_tests/snippets/stdlib_re.py b/extra_tests/snippets/stdlib_re.py index 53f21f917..8613ddd30 100644 --- a/extra_tests/snippets/stdlib_re.py +++ b/extra_tests/snippets/stdlib_re.py @@ -79,3 +79,6 @@ assert re.compile("(?:(1)?)*").match("111").group() == "111" # Test of fix re.fullmatch POSSESSIVE_REPEAT, issue #7183 assert re.fullmatch(r"([0-9]++(?:\.[0-9]+)*+)", "1.25.38") assert re.fullmatch(r"([0-9]++(?:\.[0-9]+)*+)", "1.25.38").group(0) == "1.25.38" + +# Combining characters; issue #7518 +assert not re.match(r"\w", "\u0345"), r"\w should not match U+0345 (category Mn)"