Add unicodedata.east_asian_width (#4523)

2026-06-02 19:39:49 +09:00 · 2023-02-20 01:38:02 +09:00
parent 8b7158f169
commit 3b8d670c81
5 changed files with 46 additions and 5 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2174,6 +2174,7 @@ dependencies = [
 "socket2",
 "system-configuration",
 "termios",
+ "ucd",
 "unic-char-property",
 "unic-normal",
 "unic-ucd-age",
@@ -2753,6 +2754,12 @@ version = "1.16.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba"

+[[package]]
+name = "ucd"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fe4fa6e588762366f1eb4991ce59ad1b93651d0b769dfb4e4d1c5c4b943d1159"
+
 [[package]]
 name = "uname"
 version = "0.1.1"
--- a/Lib/test/test_unicodedata.py
+++ b/Lib/test/test_unicodedata.py
@@ -243,8 +243,6 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest):
    # For tests of unicodedata.is_normalized / self.db.is_normalized ,
    # see test_normalization.py .

-    # TODO: RUSTPYTHON
-    @unittest.expectedFailure
    def test_east_asian_width(self):
        eaw = self.db.east_asian_width
        self.assertRaises(TypeError, eaw, b'a')
--- a/extra_tests/snippets/builtin_str_unicode.py
+++ b/extra_tests/snippets/builtin_str_unicode.py
@@ -18,6 +18,7 @@ assert unicodedata.category('A') == 'Lu'
 assert unicodedata.name('a') == 'LATIN SMALL LETTER A'
 assert unicodedata.lookup('LATIN SMALL LETTER A') == 'a'
 assert unicodedata.bidirectional('a') == 'L'
+assert unicodedata.east_asian_width('\u231a') == 'W'
 assert unicodedata.normalize('NFC', 'bla') == 'bla'

 # testing unicodedata.ucd_3_2_0 for idna
--- a/stdlib/Cargo.toml
+++ b/stdlib/Cargo.toml
@@ -67,6 +67,7 @@ unic-ucd-bidi      = "0.9.0"
 unic-ucd-category  = "0.9.0"
 unic-ucd-age       = "0.9.0"
 unic-ucd-ident     = "0.9.0"
+ucd = "0.1.1"

 # compression
 adler32 = "1.2.0"
--- a/stdlib/src/unicodedata.rs
+++ b/stdlib/src/unicodedata.rs
@@ -10,9 +10,15 @@ pub fn make_module(vm: &VirtualMachine) -> PyObjectRef {
        .into_ref(vm)
        .into();

-    for attr in ["category", "lookup", "name", "bidirectional", "normalize"]
-        .iter()
-        .copied()
+    for attr in [
+        "category",
+        "lookup",
+        "name",
+        "bidirectional",
+        "east_asian_width",
+        "normalize",
+    ]
+    .into_iter()
    {
        crate::vm::extend_module!(vm, &module, {
            attr => ucd.get_attr(attr, vm).unwrap(),
@@ -29,6 +35,7 @@ mod unicodedata {
        VirtualMachine,
    };
    use itertools::Itertools;
+    use ucd::{Codepoint, EastAsianWidth};
    use unic_char_property::EnumeratedCharProperty;
    use unic_normal::StrNormalForm;
    use unic_ucd_age::{Age, UnicodeVersion, UNICODE_VERSION};
@@ -113,6 +120,16 @@ mod unicodedata {
            Ok(bidi.to_owned())
        }

+        /// Returns the abbreviated East Asian Width name of `character`, using
+        /// Neutral when `extract_char` yields no character.
+        /// NOTE: This lookup uses the 9.0.0 database instead of 3.2.0.
+        #[pymethod]
+        fn east_asian_width(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<String> {
+            let ch = self.extract_char(character, vm)?;
+            let width = ch.map_or(EastAsianWidth::Neutral, |c| c.east_asian_width());
+            Ok(width.abbr_name().to_owned())
+        }
+
        #[pymethod]
        fn normalize(
            &self,
@@ -138,6 +155,23 @@ mod unicodedata {
        }
    }

+    /// Short property-value aliases ("Na", "W", ...) for [`EastAsianWidth`].
+    trait EastAsianWidthAbbrName {
+        fn abbr_name(&self) -> &'static str;
+    }
+    impl EastAsianWidthAbbrName for EastAsianWidth {
+        fn abbr_name(&self) -> &'static str {
+            match *self {
+                Self::Ambiguous => "A",
+                Self::FullWidth => "F",
+                Self::HalfWidth => "H",
+                Self::Narrow => "Na",
+                Self::Neutral => "N",
+                Self::Wide => "W",
+            }
+        }
+    }
+
    #[pyattr]
    fn ucd_3_2_0(vm: &VirtualMachine) -> PyRef<Ucd> {
        Ucd {