From 8494979a02dae027a11d94798fb195cbc7ebc7ac Mon Sep 17 00:00:00 2001
From: coolreader18 <33094578+coolreader18@users.noreply.github.com>
Date: Thu, 26 Dec 2019 13:19:28 -0600
Subject: [PATCH 1/3] Reorganize unicodedata and add unicodedata.ucd_3_2_0

---
 Cargo.lock                   |   8 +-
 vm/Cargo.toml                |   8 +-
 vm/src/pyobject.rs           |   4 +-
 vm/src/stdlib/unicodedata.rs | 199 ++++++++++++++++++++++++-----------
 4 files changed, 146 insertions(+), 73 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index b3b4e38a3..57059c632 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1352,9 +1352,9 @@ dependencies = [
  "statrs 0.12.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "subprocess 0.1.18 (registry+https://github.com/rust-lang/crates.io-index)",
  "unic 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)",
+ "unic-common 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "unicode-casing 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "unicode-xid 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
- "unicode_categories 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
  "unicode_names2 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "wasm-bindgen 0.2.51 (registry+https://github.com/rust-lang/crates.io-index)",
  "winapi 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -1971,11 +1971,6 @@ name = "unicode-xid"
 version = "0.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 
-[[package]]
-name = "unicode_categories"
-version = "0.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-
 [[package]]
 name = "unicode_names2"
 version = "0.3.0"
@@ -2366,7 +2361,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 "checksum unicode-width 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "7007dbd421b92cc6e28410fe7362e2e0a2503394908f417b68ec8d1c364c4e20"
 "checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc"
 "checksum unicode-xid 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "826e7639553986605ec5979c7dd957c7895e93eabed50ab2ffa7f6128a75097c"
-"checksum unicode_categories 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e"
 "checksum unicode_names2 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a7a928b876ff873d4a0ac966acce72423879dd86afcf190017aa700207188078"
 "checksum utf8parse 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "8772a4ccbb4e89959023bc5b7cb8623a795caa7092d99f3aa9501b9484d4557d"
 "checksum vcpkg 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "33dd455d0f96e90a75803cfeb7f948768c08d70a6de9a8d2362461935698bf95"
diff --git a/vm/Cargo.toml b/vm/Cargo.toml
index e9748b844..62f87ca8e 100644
--- a/vm/Cargo.toml
+++ b/vm/Cargo.toml
@@ -54,10 +54,10 @@ hex = "0.4.0"
 hexf-parse = "0.1.0"
 indexmap = "1.0.2"
 crc = "^1.0.0"
-unicode_categories = "0.1.1"
-unicode_names2 = "0.3.0"
-unicode-casing = "0.1.0"
-unic = "0.9.0"
+unicode_names2 = "0.3"
+unicode-casing = "0.1"
+unic = "0.9"
+unic-common = "0.9"
 maplit = "1.0"
 proc-macro-hack = { version = "0.5", optional = true }
 bitflags = "1.1"
diff --git a/vm/src/pyobject.rs b/vm/src/pyobject.rs
index 0fe1cdd17..c346bb28e 100644
--- a/vm/src/pyobject.rs
+++ b/vm/src/pyobject.rs
@@ -1098,8 +1098,8 @@ pub trait PyValue: fmt::Debug + Sized + 'static {
             };
             PyRef::new_ref(PyObject::new(self, cls, dict), vm)
         } else {
-            let subtype = vm.to_pystr(&cls.obj)?;
-            let basetype = vm.to_pystr(&class.obj)?;
+            let subtype = vm.to_str(&cls.obj)?;
+            let basetype = vm.to_str(&class.obj)?;
             Err(vm.new_type_error(format!("{} is not a subtype of {}", subtype, basetype)))
         }
     }
diff --git a/vm/src/stdlib/unicodedata.rs b/vm/src/stdlib/unicodedata.rs
index b1e75730b..077b687a7 100644
--- a/vm/src/stdlib/unicodedata.rs
+++ b/vm/src/stdlib/unicodedata.rs
@@ -4,54 +4,132 @@
 
 use crate::function::OptionalArg;
 use crate::obj::objstr::PyStringRef;
-use crate::pyobject::{PyObjectRef, PyResult};
+use crate::obj::objtype::PyClassRef;
+use crate::pyobject::{PyClassImpl, PyObject, PyObjectRef, PyResult, PyValue};
 use crate::vm::VirtualMachine;
 
+use itertools::Itertools;
+use unic::bidi::BidiClass;
 use unic::char::property::EnumeratedCharProperty;
+use unic::normal::StrNormalForm;
 use unic::ucd::category::GeneralCategory;
-use unic::ucd::Name;
-use unicode_names2;
+use unic::ucd::{Age, Name};
+use unic_common::version::UnicodeVersion;
 
 pub fn make_module(vm: &VirtualMachine) -> PyObjectRef {
     let ctx = &vm.ctx;
 
-    let unidata_version = unic::UNICODE_VERSION.to_string();
+    let ucd_class = PyUCD::make_class(ctx);
 
-    py_module!(vm, "unicodedata", {
-        "bidirectional" => ctx.new_rustfunc(bidirectional),
-        "category" => ctx.new_rustfunc(category),
-        "name" => ctx.new_rustfunc(name),
-        "lookup" => ctx.new_rustfunc(lookup),
-        "normalize" => ctx.new_rustfunc(normalize),
-        "unidata_version" => ctx.new_str(unidata_version),
-    })
+    let ucd = PyObject::new(PyUCD::default(), ucd_class.clone(), None);
+
+    let ucd_3_2_0 = PyObject::new(
+        PyUCD {
+            unic_version: UnicodeVersion {
+                major: 3,
+                minor: 2,
+                micro: 0,
+            },
+        },
+        ucd_class.clone(),
+        None,
+    );
+
+    let module = py_module!(vm, "unicodedata", {
+        "UCD" => ucd_class.into_object(),
+        "ucd_3_2_0" => ucd_3_2_0,
+        // we do unidata_version here because the getter tries to do PyUCD::class() before
+        // the module is in the VM
+        "unidata_version" => ctx.new_str(PyUCD::default().unic_version.to_string()),
+    });
+
+    for attr in ["category", "lookup", "name", "bidirectional", "normalize"]
+        .iter()
+        .copied()
+    {
+        extend_module!(vm, &module, {
+            attr => vm.get_attribute(ucd.clone(), attr).unwrap(),
+        });
+    }
+
+    module
 }
 
-fn category(character: PyStringRef, vm: &VirtualMachine) -> PyResult {
-    let my_char = extract_char(character, vm)?;
-    let category = GeneralCategory::of(my_char);
-    Ok(vm.new_str(category.abbr_name().to_string()))
+#[pyclass]
+#[derive(Debug)]
+struct PyUCD {
+    unic_version: UnicodeVersion,
 }
 
-fn lookup(name: PyStringRef, vm: &VirtualMachine) -> PyResult {
-    // TODO: we might want to use unic_ucd instead of unicode_names2 for this too, if possible:
-    if let Some(character) = unicode_names2::character(name.as_str()) {
-        Ok(vm.new_str(character.to_string()))
-    } else {
-        Err(vm.new_key_error(vm.new_str(format!("undefined character name '{}'", name))))
+impl PyValue for PyUCD {
+    fn class(vm: &VirtualMachine) -> PyClassRef {
+        vm.class("unicodedata", "UCD")
     }
 }
 
-fn name(
-    character: PyStringRef,
-    default: OptionalArg<PyObjectRef>,
-    vm: &VirtualMachine,
-) -> PyResult {
-    let my_char = extract_char(character, vm)?;
+impl Default for PyUCD {
+    /// Builds the database for `unic::UNICODE_VERSION`.
+    #[inline(always)]
+    fn default() -> Self {
+        let unic_version = unic::UNICODE_VERSION;
+        PyUCD { unic_version }
+    }
+}
 
-    if let Some(name) = Name::of(my_char) {
-        Ok(vm.new_str(name.to_string()))
-    } else {
+#[pyimpl]
+impl PyUCD {
+    fn check_age(&self, c: char) -> bool {
+        Age::of(c).map_or(false, |age| age.actual() <= self.unic_version)
+    }
+
+    /// Returns the only character of `character`, or a TypeError if there is not exactly one.
+    ///
+    /// The result is `Ok(None)` when that character does not pass `check_age`.
+    fn extract_char(&self, character: PyStringRef, vm: &VirtualMachine) -> PyResult<Option<char>> {
+        let text = character.as_str();
+        let single = text.chars().exactly_one().map_err(|_| {
+            vm.new_type_error("argument must be an unicode character, not str".to_string())
+        })?;
+
+        Ok(Some(single).filter(|&c| self.check_age(c)))
+    }
+
+    /// Abbreviated general category of `character`.
+    /// A character that fails the age check is reported as `GeneralCategory::Unassigned`.
+    #[pymethod]
+    fn category(&self, character: PyStringRef, vm: &VirtualMachine) -> PyResult<String> {
+        let found = self.extract_char(character, vm)?;
+        let category = found.map_or(GeneralCategory::Unassigned, GeneralCategory::of);
+        Ok(category.abbr_name().to_owned())
+    }
+
+    /// Returns the character called `name` if it passes `check_age`, else raises KeyError.
+    #[pymethod]
+    fn lookup(&self, name: PyStringRef, vm: &VirtualMachine) -> PyResult<String> {
+        // TODO: we might want to use unic_ucd instead of unicode_names2 for this too, if possible:
+        let found = unicode_names2::character(name.as_str()).filter(|&c| self.check_age(c));
+        // CPython raises KeyError for unknown names, so `except KeyError` must keep working.
+        found.map(|c| c.to_string()).ok_or_else(|| {
+            vm.new_key_error(vm.new_str(format!("undefined character name '{}'", name)))
+        })
+    }
+
+    #[pymethod]
+    fn name(
+        &self,
+        character: PyStringRef,
+        default: OptionalArg<PyObjectRef>,
+        vm: &VirtualMachine,
+    ) -> PyResult {
+        let c = self.extract_char(character, vm)?;
+
+        if let Some(c) = c {
+            if self.check_age(c) {
+                if let Some(name) = Name::of(c) {
+                    return Ok(vm.new_str(name.to_string()));
+                }
+            }
+        }
         match default {
             OptionalArg::Present(obj) => Ok(obj),
             OptionalArg::Missing => {
@@ -59,36 +137,37 @@ fn name(
             }
         }
     }
-}
 
-fn bidirectional(character: PyStringRef, vm: &VirtualMachine) -> PyResult {
-    use unic::bidi::BidiClass;
-    let my_char = extract_char(character, vm)?;
-    let cls = BidiClass::of(my_char);
-    Ok(vm.new_str(cls.abbr_name().to_string()))
-}
-
-fn normalize(form: PyStringRef, unistr: PyStringRef, vm: &VirtualMachine) -> PyResult {
-    use unic::normal::StrNormalForm;
-    let text = unistr.as_str();
-    let normalized_text = match form.as_str() {
-        "NFC" => text.nfc().collect::<String>(),
-        "NFKC" => text.nfkc().collect::<String>(),
-        "NFD" => text.nfd().collect::<String>(),
-        "NFKD" => text.nfkd().collect::<String>(),
-        _ => {
-            return Err(vm.new_value_error("unistr must be one of NFC, NFD".to_string()));
-        }
-    };
-
-    Ok(vm.new_str(normalized_text))
-}
-
-fn extract_char(character: PyStringRef, vm: &VirtualMachine) -> PyResult<char> {
-    if character.as_str().len() != 1 {
-        return Err(vm.new_type_error("argument must be an unicode character, not str".to_string()));
+    /// Bidirectional class abbreviation of `character`, or "" if it fails the age check.
+    #[pymethod]
+    fn bidirectional(&self, character: PyStringRef, vm: &VirtualMachine) -> PyResult<String> {
+        let found = self.extract_char(character, vm)?;
+        // Characters rejected by the age check report an empty class.
+        let abbr = found.map_or("", |c| BidiClass::of(c).abbr_name());
+        Ok(abbr.to_owned())
     }
 
-    let my_char: char = character.as_str().chars().next().unwrap();
-    Ok(my_char)
+    /// Normalizes `unistr` to `form` (NFC, NFKC, NFD or NFKD); other forms raise ValueError.
+    #[pymethod]
+    fn normalize(
+        &self,
+        form: PyStringRef,
+        unistr: PyStringRef,
+        vm: &VirtualMachine,
+    ) -> PyResult<String> {
+        // NOTE: `self.unic_version` is not consulted here.
+        let input = unistr.as_str();
+        match form.as_str() {
+            "NFC" => Ok(input.nfc().collect()),
+            "NFKC" => Ok(input.nfkc().collect()),
+            "NFD" => Ok(input.nfd().collect()),
+            "NFKD" => Ok(input.nfkd().collect()),
+            _ => Err(vm.new_value_error("invalid normalization form".to_string())),
+        }
+    }
+
+    #[pyproperty]
+    fn unidata_version(&self, _vm: &VirtualMachine) -> String {
+        self.unic_version.to_string()
+    }
 }

From 98211a34dee7aa9eb837752f4a5ada20053e9440 Mon Sep 17 00:00:00 2001
From: coolreader18 <33094578+coolreader18@users.noreply.github.com>
Date: Thu, 26 Dec 2019 13:29:20 -0600
Subject: [PATCH 2/3] Try to cut down on the # of unicode crates we use

---
 Cargo.lock           |  3 +--
 parser/Cargo.toml    |  4 ++--
 parser/src/lexer.rs  | 13 +++---------
 vm/Cargo.toml        |  3 ++-
 vm/src/obj/objstr.rs | 47 +++++++++++++++++++++-----------------------
 5 files changed, 30 insertions(+), 40 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 57059c632..6b6278406 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1286,7 +1286,7 @@ dependencies = [
  "num-traits 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)",
  "regex 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
  "unic-emoji-char 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)",
- "unicode-xid 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
+ "unic-ucd-ident 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "wtf8 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 
@@ -1354,7 +1354,6 @@ dependencies = [
  "unic 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "unic-common 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "unicode-casing 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
- "unicode-xid 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "unicode_names2 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "wasm-bindgen 0.2.51 (registry+https://github.com/rust-lang/crates.io-index)",
  "winapi 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
diff --git a/parser/Cargo.toml b/parser/Cargo.toml
index 047714c9d..0d6da2b66 100644
--- a/parser/Cargo.toml
+++ b/parser/Cargo.toml
@@ -17,6 +17,6 @@ log="0.4.1"
 regex = "1"
 num-bigint = "0.2"
 num-traits = "0.2"
-unicode-xid = "0.2.0"
-unic-emoji-char = "0.9.0"
+unic-emoji-char = "0.9"
+unic-ucd-ident = "0.9"
 wtf8 = "0.0.3"
diff --git a/parser/src/lexer.rs b/parser/src/lexer.rs
index a51bc3cf9..2a7011967 100644
--- a/parser/src/lexer.rs
+++ b/parser/src/lexer.rs
@@ -2,9 +2,6 @@
 //!
 //! This means source code is translated into separate tokens.
 
-extern crate unic_emoji_char;
-extern crate unicode_xid;
-
 pub use super::token::Tok;
 use crate::error::{LexicalError, LexicalErrorType};
 use crate::location::Location;
@@ -15,8 +12,7 @@ use std::cmp::Ordering;
 use std::collections::HashMap;
 use std::str::FromStr;
 use unic_emoji_char::is_emoji_presentation;
-use unicode_xid::UnicodeXID;
-use wtf8;
+use unic_ucd_ident::{is_xid_continue, is_xid_start};
 
 #[derive(Clone, Copy, PartialEq, Debug, Default)]
 struct IndentationLevel {
@@ -658,17 +654,14 @@ where
     }
 
     fn is_identifier_start(&self, c: char) -> bool {
-        match c {
-            '_' => true,
-            c => UnicodeXID::is_xid_start(c),
-        }
+        c == '_' || is_xid_start(c)
     }
 
     fn is_identifier_continuation(&self) -> bool {
         if let Some(c) = self.chr0 {
             match c {
                 '_' | '0'..='9' => true,
-                c => UnicodeXID::is_xid_continue(c),
+                c => is_xid_continue(c),
             }
         } else {
             false
diff --git a/vm/Cargo.toml b/vm/Cargo.toml
index 62f87ca8e..068cf0f0e 100644
--- a/vm/Cargo.toml
+++ b/vm/Cargo.toml
@@ -46,7 +46,6 @@ rustc_version_runtime = "0.1.*"
 statrs = "0.12.0"
 caseless = "0.2.1"
 chrono = { version = "=0.4.9", features = ["wasmbind"] }
-unicode-xid = "0.2.0"
 lazy_static = "^1.0.1"
 lexical = "4"
 itertools = "0.8"
@@ -55,6 +54,8 @@ hexf-parse = "0.1.0"
 indexmap = "1.0.2"
 crc = "^1.0.0"
 unicode_names2 = "0.3"
+# TODO: use unic for this; needed for title case:
+# https://github.com/RustPython/RustPython/pull/832#discussion_r275428939
 unicode-casing = "0.1"
 unic = "0.9"
 unic-common = "0.9"
diff --git a/vm/src/obj/objstr.rs b/vm/src/obj/objstr.rs
index 0e0f3fb22..063e18152 100644
--- a/vm/src/obj/objstr.rs
+++ b/vm/src/obj/objstr.rs
@@ -1,6 +1,3 @@
-extern crate unicode_categories;
-extern crate unicode_xid;
-
 use std::cell::Cell;
 use std::char;
 use std::fmt;
@@ -10,10 +7,10 @@ use std::str::FromStr;
 use std::string::ToString;
 
 use num_traits::ToPrimitive;
+use unic::ucd::category::GeneralCategory;
+use unic::ucd::ident::{is_xid_continue, is_xid_start};
 use unic::ucd::is_cased;
 use unicode_casing::CharExt;
-use unicode_categories::UnicodeCategories;
-use unicode_xid::UnicodeXID;
 
 use super::objbytes::{PyBytes, PyBytesRef};
 use super::objdict::PyDict;
@@ -366,16 +363,7 @@ impl PyString {
                 formatted.push_str(&format!("\\x{:02x}", c as u32));
             } else if c.is_ascii() {
                 formatted.push(c);
-            } else if c.is_other() || c.is_separator() {
-                // According to python following categories aren't printable:
-                // * Cc (Other, Control)
-                // * Cf (Other, Format)
-                // * Cs (Other, Surrogate)
-                // * Co (Other, Private Use)
-                // * Cn (Other, Not Assigned)
-                // * Zl Separator, Line ('\u2028', LINE SEPARATOR)
-                // * Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR)
-                // * Zs (Separator, Space) other than ASCII space('\x20').
+            } else if !char_is_printable(c) {
                 let code = c as u32;
                 let escaped = if code < 0xff {
                     format!("\\U{:02x}", code)
@@ -742,10 +730,9 @@ impl PyString {
     ///   * Zs (Separator, Space) other than ASCII space('\x20').
     #[pymethod]
     fn isprintable(&self, _vm: &VirtualMachine) -> bool {
-        self.value.chars().all(|c| match c {
-            '\u{0020}' => true,
-            _ => !(c.is_other_control() | c.is_separator()),
-        })
+        self.value
+            .chars()
+            .all(|c| c == '\u{0020}' || char_is_printable(c))
     }
 
     // cpython's isspace ignores whitespace, including \t and \n, etc, unless the whole string is empty
@@ -1094,13 +1081,9 @@ impl PyString {
     #[pymethod]
     fn isidentifier(&self, _vm: &VirtualMachine) -> bool {
         let mut chars = self.value.chars();
-        let is_identifier_start = match chars.next() {
-            Some('_') => true,
-            Some(c) => UnicodeXID::is_xid_start(c),
-            None => false,
-        };
+        let is_identifier_start = chars.next().map_or(false, |c| c == '_' || is_xid_start(c));
         // a string is not an identifier if it has whitespace or starts with a number
-        is_identifier_start && chars.all(UnicodeXID::is_xid_continue)
+        is_identifier_start && chars.all(is_xid_continue)
     }
 
     // https://docs.python.org/3/library/stdtypes.html#str.translate
@@ -1706,6 +1689,20 @@ fn adjust_indices(
     }
 }
 
+/// Python treats a character as printable unless it falls in one of these categories:
+/// * Cc (Other, Control)
+/// * Cf (Other, Format)
+/// * Cs (Other, Surrogate)
+/// * Co (Other, Private Use)
+/// * Cn (Other, Not Assigned)
+/// * Zl (Separator, Line) ('\u2028', LINE SEPARATOR)
+/// * Zp (Separator, Paragraph) ('\u2029', PARAGRAPH SEPARATOR)
+/// * Zs (Separator, Space) other than ASCII space ('\x20'), which is accepted explicitly.
+fn char_is_printable(c: char) -> bool {
+    let cat = GeneralCategory::of(c);
+    c == ' ' || !(cat.is_other() || cat.is_separator())
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;

From 7d3f34144dade48c5f6f4f827c0140a4e3b4bc72 Mon Sep 17 00:00:00 2001
From: coolreader18 <33094578+coolreader18@users.noreply.github.com>
Date: Sat, 28 Dec 2019 21:38:28 -0600
Subject: [PATCH 3/3] Add tests

---
 tests/snippets/unicode_fu.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/tests/snippets/unicode_fu.py b/tests/snippets/unicode_fu.py
index bae97a372..82294697a 100644
--- a/tests/snippets/unicode_fu.py
+++ b/tests/snippets/unicode_fu.py
@@ -19,3 +19,21 @@ assert unicodedata.name('a') == 'LATIN SMALL LETTER A'
 assert unicodedata.lookup('LATIN SMALL LETTER A') == 'a'
 assert unicodedata.bidirectional('a') == 'L'
 assert unicodedata.normalize('NFC', 'bla') == 'bla'
+
+# testing unicodedata.ucd_3_2_0 for idna
+assert "abcСĤ".encode("idna") == b'xn--abc-7sa390b'
+# TODO: fix: assert "abc䄣Ĳ".encode("idna") == b'xn--abcij-zb5f'
+
+# from CPython tests
+assert "python.org".encode("idna") == b"python.org"
+assert "python.org.".encode("idna") == b"python.org."
+assert "pyth\xf6n.org".encode("idna") == b"xn--pythn-mua.org"
+assert "pyth\xf6n.org.".encode("idna") == b"xn--pythn-mua.org."
+assert b"python.org".decode("idna") == "python.org"
+assert b"python.org.".decode("idna") == "python.org."
+assert b"xn--pythn-mua.org".decode("idna") == "pyth\xf6n.org"
+assert b"xn--pythn-mua.org.".decode("idna") == "pyth\xf6n.org."
+
+# TODO: add east_asian_width and mirrored
+# assert unicodedata.ucd_3_2_0.east_asian_width('\u231a') == 'N'
+# assert not unicodedata.ucd_3_2_0.mirrored("\u0f3a")