diff --git a/Cargo.lock b/Cargo.lock index 0c36cd785..a2bb745cd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2174,6 +2174,7 @@ dependencies = [ "socket2", "system-configuration", "termios", + "ucd", "unic-char-property", "unic-normal", "unic-ucd-age", @@ -2753,6 +2754,12 @@ version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" +[[package]] +name = "ucd" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe4fa6e588762366f1eb4991ce59ad1b93651d0b769dfb4e4d1c5c4b943d1159" + [[package]] name = "uname" version = "0.1.1" diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py index e6cf0c367..dbf1349a3 100644 --- a/Lib/test/test_unicodedata.py +++ b/Lib/test/test_unicodedata.py @@ -243,8 +243,6 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest): # For tests of unicodedata.is_normalized / self.db.is_normalized , # see test_normalization.py . - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_east_asian_width(self): eaw = self.db.east_asian_width self.assertRaises(TypeError, eaw, b'a') diff --git a/extra_tests/snippets/builtin_str_unicode.py b/extra_tests/snippets/builtin_str_unicode.py index f270ee114..8858cf9bf 100644 --- a/extra_tests/snippets/builtin_str_unicode.py +++ b/extra_tests/snippets/builtin_str_unicode.py @@ -18,6 +18,7 @@ assert unicodedata.category('A') == 'Lu' assert unicodedata.name('a') == 'LATIN SMALL LETTER A' assert unicodedata.lookup('LATIN SMALL LETTER A') == 'a' assert unicodedata.bidirectional('a') == 'L' +assert unicodedata.east_asian_width('\u231a') == 'W' assert unicodedata.normalize('NFC', 'bla') == 'bla' # testing unicodedata.ucd_3_2_0 for idna diff --git a/stdlib/Cargo.toml b/stdlib/Cargo.toml index 88b896c60..db6565eff 100644 --- a/stdlib/Cargo.toml +++ b/stdlib/Cargo.toml @@ -67,6 +67,7 @@ unic-ucd-bidi = "0.9.0" unic-ucd-category = "0.9.0" unic-ucd-age = "0.9.0" unic-ucd-ident = "0.9.0" +ucd = "0.1.1" # compression adler32 = "1.2.0" diff --git a/stdlib/src/unicodedata.rs b/stdlib/src/unicodedata.rs index 15ca35ed5..b43db0dd1 100644 --- a/stdlib/src/unicodedata.rs +++ b/stdlib/src/unicodedata.rs @@ -10,9 +10,15 @@ pub fn make_module(vm: &VirtualMachine) -> PyObjectRef { .into_ref(vm) .into(); - for attr in ["category", "lookup", "name", "bidirectional", "normalize"] - .iter() - .copied() + for attr in [ + "category", + "lookup", + "name", + "bidirectional", + "east_asian_width", + "normalize", + ] + .into_iter() { crate::vm::extend_module!(vm, &module, { attr => ucd.get_attr(attr, vm).unwrap(), @@ -29,6 +35,7 @@ mod unicodedata { VirtualMachine, }; use itertools::Itertools; + use ucd::{Codepoint, EastAsianWidth}; use unic_char_property::EnumeratedCharProperty; use unic_normal::StrNormalForm; use unic_ucd_age::{Age, UnicodeVersion, UNICODE_VERSION}; @@ -113,6 +120,16 @@ mod unicodedata { Ok(bidi.to_owned()) } + /// NOTE: This function uses 9.0.0 database instead of 3.2.0 + #[pymethod] + fn east_asian_width(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult { + Ok(self + .extract_char(character, vm)? + .map_or(EastAsianWidth::Neutral, |c| c.east_asian_width()) + .abbr_name() + .to_owned()) + } + #[pymethod] fn normalize( &self, @@ -138,6 +155,23 @@ mod unicodedata { } } + trait EastAsianWidthAbbrName { + fn abbr_name(&self) -> &'static str; + } + + impl EastAsianWidthAbbrName for EastAsianWidth { + fn abbr_name(&self) -> &'static str { + match self { + EastAsianWidth::Narrow => "Na", + EastAsianWidth::Wide => "W", + EastAsianWidth::Neutral => "N", + EastAsianWidth::Ambiguous => "A", + EastAsianWidth::FullWidth => "F", + EastAsianWidth::HalfWidth => "H", + } + } + } + #[pyattr] fn ucd_3_2_0(vm: &VirtualMachine) -> PyRef { Ucd {