From aac207003fd36c19bf3c9ef2c647e79c3f640db6 Mon Sep 17 00:00:00 2001
From: Joshua Megnauth <48846352+joshuamegnauth54@users.noreply.github.com>
Date: Fri, 17 Apr 2026 05:45:43 -0400
Subject: [PATCH] fix: Python-Rust combining char diff in isalnum (#7612)

* fix: Python-Rust combining char diff in isalnum

Related to: #7518

Rust and Python differ on which characters are alphanumeric. Rust
follows the Unicode standard more closely than Python does, so
char::is_alphanumeric (Rust) disagrees with str.isalnum (Python) for
some inputs. To fix the discrepancy, RustPython needs to mimic Python by
rejecting certain characters. Some classes of combining characters count
as alphanumeric in Rust but not in Python. Combining characters are
marks, such as accents, that are combined with other characters to
create a single grapheme.

This PR may not be exhaustive: it fixes the combining character issue,
but I don't know the full range of discrepancies.

* fix: Ignore combining characters in SRE

Closes: #7518
---
 Cargo.lock                          | 1 +
 crates/sre_engine/Cargo.toml        | 1 +
 crates/sre_engine/src/string.rs     | 6 +++++-
 crates/vm/src/builtins/str.rs       | 9 +++++++--
 extra_tests/snippets/builtin_str.py | 9 +++++++++
 extra_tests/snippets/stdlib_re.py   | 3 +++
 6 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 20bfc4578..16941b826 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3337,6 +3337,7 @@ version = "0.5.0"
 dependencies = [
  "bitflags 2.11.0",
  "criterion",
+ "icu_properties",
  "num_enum",
  "optional",
  "rustpython-wtf8",
diff --git a/crates/sre_engine/Cargo.toml b/crates/sre_engine/Cargo.toml
index 4f899e6b3..8400a34b5 100644
--- a/crates/sre_engine/Cargo.toml
+++ b/crates/sre_engine/Cargo.toml
@@ -19,6 +19,7 @@ rustpython-wtf8 = { workspace = true }
 num_enum = { workspace = true }
 bitflags = { workspace = true }
 optional = { workspace = true }
+icu_properties = { workspace = true }
 
 [dev-dependencies]
 criterion = { workspace = true }
diff --git a/crates/sre_engine/src/string.rs b/crates/sre_engine/src/string.rs
index 489819bfb..b4b3a6092 100644
--- a/crates/sre_engine/src/string.rs
+++ b/crates/sre_engine/src/string.rs
@@ -1,3 +1,4 @@
+use icu_properties::props::{CanonicalCombiningClass, EnumeratedProperty};
 use rustpython_wtf8::Wtf8;
 
 #[derive(Debug, Clone, Copy)]
@@ -443,7 +444,10 @@ pub(crate) const fn is_uni_linebreak(ch: u32) -> bool {
 pub(crate) fn is_uni_alnum(ch: u32) -> bool {
     // TODO: check with cpython
     char::try_from(ch)
-        .map(|x| x.is_alphanumeric())
+        .map(|x| {
+            x.is_alphanumeric()
+                && CanonicalCombiningClass::for_char(x) == CanonicalCombiningClass::NotReordered
+        })
         .unwrap_or(false)
 }
 #[inline]
diff --git a/crates/vm/src/builtins/str.rs b/crates/vm/src/builtins/str.rs
index 870d3b72a..d74259b84 100644
--- a/crates/vm/src/builtins/str.rs
+++ b/crates/vm/src/builtins/str.rs
@@ -45,7 +45,8 @@ use rustpython_common::{
 };
 
 use icu_properties::props::{
-    BidiClass, BinaryProperty, EnumeratedProperty, GeneralCategory, XidContinue, XidStart,
+    BidiClass, BinaryProperty, CanonicalCombiningClass, EnumeratedProperty, GeneralCategory,
+    XidContinue, XidStart,
 };
 use unicode_casing::CharExt;
 
@@ -946,7 +947,11 @@ impl PyStr {
 
     #[pymethod]
     fn isalnum(&self) -> bool {
-        !self.data.is_empty() && self.char_all(char::is_alphanumeric)
+        !self.data.is_empty()
+            && self.char_all(|c| {
+                c.is_alphanumeric()
+                    && CanonicalCombiningClass::for_char(c) == CanonicalCombiningClass::NotReordered
+            })
     }
 
     #[pymethod]
diff --git a/extra_tests/snippets/builtin_str.py b/extra_tests/snippets/builtin_str.py
index 3d54643b3..61cbf63ea 100644
--- a/extra_tests/snippets/builtin_str.py
+++ b/extra_tests/snippets/builtin_str.py
@@ -73,6 +73,15 @@ assert "\u1c89".istitle()
 # assert "Ǳ".title() == "ǲ"
 assert a.isalpha()
 
+# Some combining characters are alphanumeric in Rust but not in Python
+assert "\u006e".isalnum()
+assert not "\u0303".isalnum()
+assert not "\u006e\u0303".isalnum()
+assert "\u00f1".isalnum()
+assert not "\u0345".isalnum()
+for raw in range(0x0363, 0x036F):
+    assert not chr(raw).isalnum()
+
 s = "1 2 3"
 assert s.split(" ", 1) == ["1", "2 3"]
 assert s.rsplit(" ", 1) == ["1 2", "3"]
diff --git a/extra_tests/snippets/stdlib_re.py b/extra_tests/snippets/stdlib_re.py
index 53f21f917..8613ddd30 100644
--- a/extra_tests/snippets/stdlib_re.py
+++ b/extra_tests/snippets/stdlib_re.py
@@ -79,3 +79,6 @@ assert re.compile("(?:(1)?)*").match("111").group() == "111"
 # Test of fix re.fullmatch POSSESSIVE_REPEAT, issue #7183
 assert re.fullmatch(r"([0-9]++(?:\.[0-9]+)*+)", "1.25.38")
 assert re.fullmatch(r"([0-9]++(?:\.[0-9]+)*+)", "1.25.38").group(0) == "1.25.38"
+
+# Combining characters; issue #7518
+assert not re.match(r"\w", "\u0345"), r"\w should not match U+0345 (category Mn)"