Compare commits

...

5 Commits

4 changed files with 19 additions and 1 deletion

1
Cargo.lock generated
View File

@@ -3739,6 +3739,7 @@ dependencies = [
"num_enum", "num_enum",
"optional", "optional",
"rustpython-wtf8", "rustpython-wtf8",
"unic-ucd-category",
] ]
[[package]] [[package]]

View File

@@ -428,7 +428,6 @@ pub(crate) const fn is_uni_linebreak(ch: u32) -> bool {
} }
#[inline] #[inline]
pub(crate) fn is_uni_alnum(ch: u32) -> bool { pub(crate) fn is_uni_alnum(ch: u32) -> bool {
// TODO: check with cpython
char::try_from(ch).is_ok_and(|c| { char::try_from(ch).is_ok_and(|c| {
GeneralCategoryGroup::Letter GeneralCategoryGroup::Letter
.union(GeneralCategoryGroup::Number) .union(GeneralCategoryGroup::Number)

View File

@@ -891,3 +891,13 @@ assert id(b) != id(b * 0)
assert id(b) != id(b * 1) assert id(b) != id(b * 1)
assert id(b) != id(1 * b) assert id(b) != id(1 * b)
assert id(b) != id(b * 2) assert id(b) != id(b * 2)
# Regression tests for isalpha/isalnum Unicode General Category correctness.
# These characters are in letter categories (Ll/Lo) and should return True,
# but were missed in older Unicode tables used by unic-ucd-category.
# See: https://github.com/RustPython/RustPython/pull/7520#issuecomment-4148322294
for _cp in [1376, 1416, 1519, 2160, 2161, 2162, 2163, 2164, 2165, 2166]:
    _c = chr(_cp)
    assert _c.isalpha(), f"U+{_cp:04X} should be isalpha"
    assert _c.isalnum(), f"U+{_cp:04X} should be isalnum"

View File

@@ -11,6 +11,7 @@ c = ᚴ * 3
assert c == "👋👋👋" assert c == "👋👋👋"
import re
import unicodedata import unicodedata
assert unicodedata.category("a") == "Ll" assert unicodedata.category("a") == "Ll"
@@ -38,3 +39,10 @@ assert b"xn--pythn-mua.org.".decode("idna") == "pyth\xf6n.org."
# TODO: add east_asian_width and mirrored # TODO: add east_asian_width and mirrored
# assert unicodedata.ucd_3_2_0.east_asian_width('\u231a') == 'N' # assert unicodedata.ucd_3_2_0.east_asian_width('\u231a') == 'N'
# assert not unicodedata.ucd_3_2_0.mirrored("\u0f3a") # assert not unicodedata.ucd_3_2_0.mirrored("\u0f3a")
# U+0345 COMBINING GREEK YPOGEGRAMMENI (category Mn) should not be alphanumeric.
# CPython's isalpha/isalnum use Unicode letter categories (Lu/Ll/Lt/Lm/Lo),
# not the broader Unicode Alphabetic derived property.
assert not "\u0345".isalpha(), "isalpha should not match Mn category characters"
assert not "\u0345".isalnum(), "isalnum should not match Mn category characters"
assert not re.match(r"\w", "\u0345"), r"\w should not match U+0345 (category Mn)"