mirror of
https://github.com/RustPython/RustPython.git
synced 2026-06-02 19:39:49 +09:00
fix: Python-Rust combining char diff in isalnum (#7612)
* fix: Python-Rust combining char diff in isalnum

  Related to: #7518

  Rust and Python differ on which characters are alphanumeric. Rust follows the Unicode standard more closely than Python, so `is_alphanumeric` (the `char` method in Rust) is not equivalent to `isalnum` (Python). To fix the discrepancy, RustPython needs to mimic Python by rejecting certain characters.

  Some classes of combining characters count as alphanumeric in Rust but not in Python. Combining characters are marks, such as accents, that are combined with other characters to create a single grapheme.

  It's possible that this PR is not exhaustive. I fixed the combining-character issue, but I don't know the full range of discrepancies.

* fix: Ignore combining characters in SRE

  Closes: #7518
This commit is contained in:
1
Cargo.lock
generated
1
Cargo.lock
generated
@@ -3337,6 +3337,7 @@ version = "0.5.0"
|
||||
dependencies = [
|
||||
"bitflags 2.11.0",
|
||||
"criterion",
|
||||
"icu_properties",
|
||||
"num_enum",
|
||||
"optional",
|
||||
"rustpython-wtf8",
|
||||
|
||||
@@ -19,6 +19,7 @@ rustpython-wtf8 = { workspace = true }
|
||||
num_enum = { workspace = true }
|
||||
bitflags = { workspace = true }
|
||||
optional = { workspace = true }
|
||||
icu_properties = { workspace = true }
|
||||
|
||||
[dev-dependencies]
|
||||
criterion = { workspace = true }
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
use icu_properties::props::{CanonicalCombiningClass, EnumeratedProperty};
|
||||
use rustpython_wtf8::Wtf8;
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
@@ -443,7 +444,10 @@ pub(crate) const fn is_uni_linebreak(ch: u32) -> bool {
|
||||
pub(crate) fn is_uni_alnum(ch: u32) -> bool {
|
||||
// TODO: check with cpython
|
||||
char::try_from(ch)
|
||||
.map(|x| x.is_alphanumeric())
|
||||
.map(|x| {
|
||||
x.is_alphanumeric()
|
||||
&& CanonicalCombiningClass::for_char(x) == CanonicalCombiningClass::NotReordered
|
||||
})
|
||||
.unwrap_or(false)
|
||||
}
|
||||
#[inline]
|
||||
|
||||
@@ -45,7 +45,8 @@ use rustpython_common::{
|
||||
};
|
||||
|
||||
use icu_properties::props::{
|
||||
BidiClass, BinaryProperty, EnumeratedProperty, GeneralCategory, XidContinue, XidStart,
|
||||
BidiClass, BinaryProperty, CanonicalCombiningClass, EnumeratedProperty, GeneralCategory,
|
||||
XidContinue, XidStart,
|
||||
};
|
||||
use unicode_casing::CharExt;
|
||||
|
||||
@@ -946,7 +947,11 @@ impl PyStr {
|
||||
|
||||
#[pymethod]
|
||||
fn isalnum(&self) -> bool {
|
||||
!self.data.is_empty() && self.char_all(char::is_alphanumeric)
|
||||
!self.data.is_empty()
|
||||
&& self.char_all(|c| {
|
||||
c.is_alphanumeric()
|
||||
&& CanonicalCombiningClass::for_char(c) == CanonicalCombiningClass::NotReordered
|
||||
})
|
||||
}
|
||||
|
||||
#[pymethod]
|
||||
|
||||
@@ -73,6 +73,15 @@ assert "\u1c89".istitle()
|
||||
# assert "DZ".title() == "Dz"
|
||||
assert a.isalpha()
|
||||
|
||||
# Rust's is_alphanumeric and Python's isalnum classify some combining characters differently
|
||||
assert "\u006e".isalnum()
|
||||
assert not "\u0303".isalnum()
|
||||
assert not "\u006e\u0303".isalnum()
|
||||
assert "\u00f1".isalnum()
|
||||
assert not "\u0345".isalnum()
|
||||
for raw in range(0x0363, 0x036F):
|
||||
assert not chr(raw).isalnum()
|
||||
|
||||
s = "1 2 3"
|
||||
assert s.split(" ", 1) == ["1", "2 3"]
|
||||
assert s.rsplit(" ", 1) == ["1 2", "3"]
|
||||
|
||||
@@ -79,3 +79,6 @@ assert re.compile("(?:(1)?)*").match("111").group() == "111"
|
||||
# Test of fix re.fullmatch POSSESSIVE_REPEAT, issue #7183
|
||||
assert re.fullmatch(r"([0-9]++(?:\.[0-9]+)*+)", "1.25.38")
|
||||
assert re.fullmatch(r"([0-9]++(?:\.[0-9]+)*+)", "1.25.38").group(0) == "1.25.38"
|
||||
|
||||
# Combining characters; issue #7518
|
||||
assert not re.match(r"\w", "\u0345"), r"\w should not match U+0345 (category Mn)"
|
||||
|
||||
Reference in New Issue
Block a user