fix: Python-Rust combining char diff in isalnum (#7612)

* fix: Python-Rust combining char diff in isalnum

Related to: #7518

Rust and Python disagree on which characters are alphanumeric. Rust follows
the Unicode standard more closely than Python does, so `is_alphanumeric`
(a `char` method in Rust) and `isalnum` (Python) return different results for
some characters. To remove the discrepancy, RustPython needs to mimic Python
by rejecting those characters. Some classes of combining characters count as
alphanumeric in Rust but not in Python. Combining characters are marks, such
as accents, that attach to another character to form a single grapheme.

This PR may not be exhaustive: it fixes the combining-character case, but I
don't know the full range of discrepancies between Rust and Python.

* fix: Ignore combining characters in SRE

Closes: #7518
This commit is contained in:
Joshua Megnauth
2026-04-17 05:45:43 -04:00
committed by GitHub
parent f82b8d8eb7
commit aac207003f
6 changed files with 26 additions and 3 deletions

1
Cargo.lock generated
View File

@@ -3337,6 +3337,7 @@ version = "0.5.0"
dependencies = [
"bitflags 2.11.0",
"criterion",
"icu_properties",
"num_enum",
"optional",
"rustpython-wtf8",

View File

@@ -19,6 +19,7 @@ rustpython-wtf8 = { workspace = true }
num_enum = { workspace = true }
bitflags = { workspace = true }
optional = { workspace = true }
icu_properties = { workspace = true }
[dev-dependencies]
criterion = { workspace = true }

View File

@@ -1,3 +1,4 @@
use icu_properties::props::{CanonicalCombiningClass, EnumeratedProperty};
use rustpython_wtf8::Wtf8;
#[derive(Debug, Clone, Copy)]
@@ -443,7 +444,10 @@ pub(crate) const fn is_uni_linebreak(ch: u32) -> bool {
/// Returns `true` when `ch` converts to a valid `char` that is alphanumeric
/// and whose canonical combining class is `NotReordered`; returns `false`
/// otherwise, including when `ch` is not a valid `char`.
// NOTE(review): this span appears to be a diff hunk rendered without +/-
// markers; the one-line `.map(|x| x.is_alphanumeric())` looks like the removed
// version and the multi-line closure after it the replacement. Confirm
// against the repository before relying on this text.
pub(crate) fn is_uni_alnum(ch: u32) -> bool {
// TODO: check with cpython
char::try_from(ch)
.map(|x| x.is_alphanumeric())
.map(|x| {
x.is_alphanumeric()
&& CanonicalCombiningClass::for_char(x) == CanonicalCombiningClass::NotReordered
})
.unwrap_or(false)
}
#[inline]

View File

@@ -45,7 +45,8 @@ use rustpython_common::{
};
use icu_properties::props::{
BidiClass, BinaryProperty, EnumeratedProperty, GeneralCategory, XidContinue, XidStart,
BidiClass, BinaryProperty, CanonicalCombiningClass, EnumeratedProperty, GeneralCategory,
XidContinue, XidStart,
};
use unicode_casing::CharExt;
@@ -946,7 +947,11 @@ impl PyStr {
/// Reports whether the string is non-empty and `char_all` accepts the
/// predicate "alphanumeric and canonical combining class `NotReordered`".
// NOTE(review): this span appears to be a diff hunk rendered without +/-
// markers; the one-line body using `char::is_alphanumeric` looks like the
// removed version and the closure-based expression after it the replacement.
// `char_all` is defined outside this view; presumably it applies the
// predicate to every character. Confirm.
#[pymethod]
fn isalnum(&self) -> bool {
!self.data.is_empty() && self.char_all(char::is_alphanumeric)
!self.data.is_empty()
&& self.char_all(|c| {
c.is_alphanumeric()
&& CanonicalCombiningClass::for_char(c) == CanonicalCombiningClass::NotReordered
})
}
#[pymethod]

View File

@@ -73,6 +73,15 @@ assert "\u1c89".istitle()
# assert "DZ".title() == "Dz"
assert a.isalpha()
# Combining characters differ slightly between Rust and Python: Rust's
# char::is_alphanumeric accepts some combining marks that str.isalnum rejects.
assert "\u006e".isalnum()  # LATIN SMALL LETTER N
assert not "\u0303".isalnum()  # COMBINING TILDE on its own
assert not "\u006e\u0303".isalnum()  # decomposed n + combining tilde
assert "\u00f1".isalnum()  # precomposed LATIN SMALL LETTER N WITH TILDE
assert not "\u0345".isalnum()  # COMBINING GREEK YPOGEGRAMMENI
# The combining Latin small letters run from U+0363 through U+036F inclusive.
# range() excludes its stop value, so stop at 0x0370 to cover U+036F as well.
for raw in range(0x0363, 0x0370):
    assert not chr(raw).isalnum()
s = "1 2 3"
assert s.split(" ", 1) == ["1", "2 3"]
assert s.rsplit(" ", 1) == ["1 2", "3"]

View File

@@ -79,3 +79,6 @@ assert re.compile("(?:(1)?)*").match("111").group() == "111"
# Test of fix re.fullmatch POSSESSIVE_REPEAT, issue #7183
assert re.fullmatch(r"([0-9]++(?:\.[0-9]+)*+)", "1.25.38")
assert re.fullmatch(r"([0-9]++(?:\.[0-9]+)*+)", "1.25.38").group(0) == "1.25.38"
# Issue #7518: \w must not match a lone combining mark such as U+0345.
assert not re.match(r"\w", "\u0345"), r"\w should not match U+0345 (category Mn)"