Merge main: adopt icu_properties approach, drop unic-ucd-category fallback

Fix isalpha/isalnum fallback for chars unassigned in Unicode 10; add regression tests
Agent-Logs-Url: https://github.com/RustPython/RustPython/sessions/43f9f7dc-f5af-48b8-b93b-6363bcda334c Co-authored-by: youknowone <69878+youknowone@users.noreply.github.com>
2026-06-02 19:39:49 +09:00 · 2026-06-01 08:50:31 +00:00 · 2026-03-29 04:53:10 +00:00 · 2026-03-27 23:47:11 +00:00 · 2026-03-27 16:57:44 +00:00 · 2026-03-27 16:36:31 +00:00
4 changed files with 19 additions and 1 deletion
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3739,6 +3739,7 @@ dependencies = [
 "num_enum",
 "optional",
 "rustpython-wtf8",
 "unic-ucd-category",
 ]
 [[package]]
--- a/crates/sre_engine/src/string.rs
+++ b/crates/sre_engine/src/string.rs
@@ -428,7 +428,6 @@ pub(crate) const fn is_uni_linebreak(ch: u32) -> bool {
 }
 #[inline]
 pub(crate) fn is_uni_alnum(ch: u32) -> bool {
    // TODO: check with cpython
    char::try_from(ch).is_ok_and(|c| {
        GeneralCategoryGroup::Letter
            .union(GeneralCategoryGroup::Number)
--- a/extra_tests/snippets/builtin_str.py
+++ b/extra_tests/snippets/builtin_str.py
@@ -891,3 +891,13 @@ assert id(b) != id(b * 0)
 assert id(b) != id(b * 1)
 assert id(b) != id(1 * b)
 assert id(b) != id(b * 2)
 # Regression tests for isalpha/isalnum Unicode General Category correctness.
 # These code points belong to letter categories (Ll/Lo), so isalpha() and
 # isalnum() should return True for them. They were unassigned in the older
 # (Unicode 10) tables used by unic-ucd-category and were therefore missed.
 # See: https://github.com/RustPython/RustPython/pull/7520#issuecomment-4148322294
 for _cp in [1376, 1416, 1519, 2160, 2161, 2162, 2163, 2164, 2165, 2166]:
    _c = chr(_cp)
    assert _c.isalpha(), f"U+{_cp:04X} should be isalpha"
    assert _c.isalnum(), f"U+{_cp:04X} should be isalnum"
--- a/extra_tests/snippets/builtin_str_unicode.py
+++ b/extra_tests/snippets/builtin_str_unicode.py
@@ -11,6 +11,7 @@ c = ᚴ * 3
 assert c == "👋👋👋"
 import re
 import unicodedata
 assert unicodedata.category("a") == "Ll"
@@ -38,3 +39,10 @@ assert b"xn--pythn-mua.org.".decode("idna") == "pyth\xf6n.org."
 # TODO: add east_asian_width and mirrored
 # assert unicodedata.ucd_3_2_0.east_asian_width('\u231a') == 'N'
 # assert not unicodedata.ucd_3_2_0.mirrored("\u0f3a")
 # U+0345 COMBINING GREEK YPOGEGRAMMENI (category Mn) should not be alphanumeric.
 # CPython's isalpha/isalnum use Unicode letter categories (Lu/Ll/Lt/Lm/Lo),
 # not the broader Unicode Alphabetic derived property.
 assert not "\u0345".isalpha(), "isalpha should not match Mn category characters"
 assert not "\u0345".isalnum(), "isalnum should not match Mn category characters"
 assert not re.match(r"\w", "\u0345"), r"\w should not match U+0345 (category Mn)"
Author	SHA1	Message	Date
copilot-swe-agent[bot]	23813ddbcd	Merge main: adopt icu_properties approach, drop unic-ucd-category fallback	2026-06-01 08:50:31 +00:00
copilot-swe-agent[bot]	b9cbd5133b	Fix isalpha/isalnum fallback for chars unassigned in Unicode 10; add regression tests Agent-Logs-Url: https://github.com/RustPython/RustPython/sessions/43f9f7dc-f5af-48b8-b93b-6363bcda334c Co-authored-by: youknowone <69878+youknowone@users.noreply.github.com>	2026-03-29 04:53:10 +00:00
copilot-swe-agent[bot]	fab1c0cc01	Fix import ordering: move import re to be sorted with import unicodedata Agent-Logs-Url: https://github.com/RustPython/RustPython/sessions/18cc7216-15cf-40d1-8726-23e21aa4c368 Co-authored-by: youknowone <69878+youknowone@users.noreply.github.com>	2026-03-27 23:47:11 +00:00
copilot-swe-agent[bot]	5dd88ee5ae	Fix isalnum/isalpha to use Unicode general category checks; fix regex \\w for Mn characters Agent-Logs-Url: https://github.com/RustPython/RustPython/sessions/6f573a91-8811-486c-933d-7ba9a9067643 Co-authored-by: youknowone <69878+youknowone@users.noreply.github.com>	2026-03-27 16:57:44 +00:00
copilot-swe-agent[bot]	2722bc06de	Initial plan	2026-03-27 16:36:31 +00:00