From 8494979a02dae027a11d94798fb195cbc7ebc7ac Mon Sep 17 00:00:00 2001 From: coolreader18 <33094578+coolreader18@users.noreply.github.com> Date: Thu, 26 Dec 2019 13:19:28 -0600 Subject: [PATCH 1/3] Reorganize unicodedata and add unicodedata.ucd_3_2_0 --- Cargo.lock | 8 +- vm/Cargo.toml | 8 +- vm/src/pyobject.rs | 4 +- vm/src/stdlib/unicodedata.rs | 199 ++++++++++++++++++++++++----------- 4 files changed, 146 insertions(+), 73 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b3b4e38a3..57059c632 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1352,9 +1352,9 @@ dependencies = [ "statrs 0.12.0 (registry+https://github.com/rust-lang/crates.io-index)", "subprocess 0.1.18 (registry+https://github.com/rust-lang/crates.io-index)", "unic 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)", + "unic-common 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)", "unicode-casing 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", "unicode-xid 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)", - "unicode_categories 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", "unicode_names2 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)", "wasm-bindgen 0.2.51 (registry+https://github.com/rust-lang/crates.io-index)", "winapi 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)", @@ -1971,11 +1971,6 @@ name = "unicode-xid" version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -[[package]] -name = "unicode_categories" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" - [[package]] name = "unicode_names2" version = "0.3.0" @@ -2366,7 +2361,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum unicode-width 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "7007dbd421b92cc6e28410fe7362e2e0a2503394908f417b68ec8d1c364c4e20" "checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc" "checksum unicode-xid 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "826e7639553986605ec5979c7dd957c7895e93eabed50ab2ffa7f6128a75097c" -"checksum unicode_categories 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" "checksum unicode_names2 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a7a928b876ff873d4a0ac966acce72423879dd86afcf190017aa700207188078" "checksum utf8parse 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "8772a4ccbb4e89959023bc5b7cb8623a795caa7092d99f3aa9501b9484d4557d" "checksum vcpkg 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "33dd455d0f96e90a75803cfeb7f948768c08d70a6de9a8d2362461935698bf95" diff --git a/vm/Cargo.toml b/vm/Cargo.toml index e9748b844..62f87ca8e 100644 --- a/vm/Cargo.toml +++ b/vm/Cargo.toml @@ -54,10 +54,10 @@ hex = "0.4.0" hexf-parse = "0.1.0" indexmap = "1.0.2" crc = "^1.0.0" -unicode_categories = "0.1.1" -unicode_names2 = "0.3.0" -unicode-casing = "0.1.0" -unic = "0.9.0" +unicode_names2 = "0.3" +unicode-casing = "0.1" +unic = "0.9" +unic-common = "0.9" maplit = "1.0" proc-macro-hack = { version = "0.5", optional = true } bitflags = "1.1" diff --git a/vm/src/pyobject.rs b/vm/src/pyobject.rs index 0fe1cdd17..c346bb28e 100644 --- a/vm/src/pyobject.rs +++ b/vm/src/pyobject.rs @@ -1098,8 +1098,8 @@ pub trait PyValue: fmt::Debug + Sized + 'static { }; PyRef::new_ref(PyObject::new(self, cls, dict), vm) } else { - let subtype = vm.to_pystr(&cls.obj)?; - let basetype = vm.to_pystr(&class.obj)?; + let subtype = vm.to_str(&cls.obj)?; + let basetype = vm.to_str(&class.obj)?; Err(vm.new_type_error(format!("{} is not a subtype of {}", subtype, basetype))) } } diff --git a/vm/src/stdlib/unicodedata.rs b/vm/src/stdlib/unicodedata.rs index b1e75730b..077b687a7 100644 --- a/vm/src/stdlib/unicodedata.rs +++ b/vm/src/stdlib/unicodedata.rs @@ -4,54 +4,132 @@ use crate::function::OptionalArg; use crate::obj::objstr::PyStringRef; -use crate::pyobject::{PyObjectRef, PyResult}; +use crate::obj::objtype::PyClassRef; +use crate::pyobject::{PyClassImpl, PyObject, PyObjectRef, PyResult, PyValue}; use crate::vm::VirtualMachine; +use itertools::Itertools; +use unic::bidi::BidiClass; use unic::char::property::EnumeratedCharProperty; +use unic::normal::StrNormalForm; use unic::ucd::category::GeneralCategory; -use unic::ucd::Name; -use unicode_names2; +use unic::ucd::{Age, Name}; +use unic_common::version::UnicodeVersion; pub fn make_module(vm: &VirtualMachine) -> PyObjectRef { let ctx = &vm.ctx; - let unidata_version = unic::UNICODE_VERSION.to_string(); + let ucd_class = PyUCD::make_class(ctx); - py_module!(vm, "unicodedata", { - "bidirectional" => ctx.new_rustfunc(bidirectional), - "category" => ctx.new_rustfunc(category), - "name" => ctx.new_rustfunc(name), - "lookup" => ctx.new_rustfunc(lookup), - "normalize" => ctx.new_rustfunc(normalize), - "unidata_version" => ctx.new_str(unidata_version), - }) + let ucd = PyObject::new(PyUCD::default(), ucd_class.clone(), None); + + let ucd_3_2_0 = PyObject::new( + PyUCD { + unic_version: UnicodeVersion { + major: 3, + minor: 2, + micro: 0, + }, + }, + ucd_class.clone(), + None, + ); + + let module = py_module!(vm, "unicodedata", { + "UCD" => ucd_class.into_object(), + "ucd_3_2_0" => ucd_3_2_0, + // we do unidata_version here because the getter tries to do PyUCD::class() before + // the module is in the VM + "unidata_version" => ctx.new_str(PyUCD::default().unic_version.to_string()), + }); + + for attr in ["category", "lookup", "name", "bidirectional", "normalize"] + .iter() + .copied() + { + extend_module!(vm, &module, { + attr => vm.get_attribute(ucd.clone(), attr).unwrap(), + }); + } + + module } -fn category(character: PyStringRef, vm: &VirtualMachine) -> PyResult { - let my_char = extract_char(character, vm)?; - let category = GeneralCategory::of(my_char); - Ok(vm.new_str(category.abbr_name().to_string())) +#[pyclass] +#[derive(Debug)] +struct PyUCD { + unic_version: UnicodeVersion, } -fn lookup(name: PyStringRef, vm: &VirtualMachine) -> PyResult { - // TODO: we might want to use unic_ucd instead of unicode_names2 for this too, if possible: - if let Some(character) = unicode_names2::character(name.as_str()) { - Ok(vm.new_str(character.to_string())) - } else { - Err(vm.new_key_error(vm.new_str(format!("undefined character name '{}'", name)))) +impl PyValue for PyUCD { + fn class(vm: &VirtualMachine) -> PyClassRef { + vm.class("unicodedata", "UCD") } } -fn name( - character: PyStringRef, - default: OptionalArg, - vm: &VirtualMachine, -) -> PyResult { - let my_char = extract_char(character, vm)?; +impl Default for PyUCD { + #[inline(always)] + fn default() -> Self { + PyUCD { + unic_version: unic::UNICODE_VERSION, + } + } +} - if let Some(name) = Name::of(my_char) { - Ok(vm.new_str(name.to_string())) - } else { +#[pyimpl] +impl PyUCD { + fn check_age(&self, c: char) -> bool { + Age::of(c).map_or(false, |age| age.actual() <= self.unic_version) + } + + fn extract_char(&self, character: PyStringRef, vm: &VirtualMachine) -> PyResult> { + let c = character.as_str().chars().exactly_one().map_err(|_| { + vm.new_type_error("argument must be an unicode character, not str".to_string()) + })?; + + if self.check_age(c) { + Ok(Some(c)) + } else { + Ok(None) + } + } + + #[pymethod] + fn category(&self, character: PyStringRef, vm: &VirtualMachine) -> PyResult { + Ok(self + .extract_char(character, vm)? + .map_or(GeneralCategory::Unassigned, GeneralCategory::of) + .abbr_name() + .to_owned()) + } + + #[pymethod] + fn lookup(&self, name: PyStringRef, vm: &VirtualMachine) -> PyResult { + // TODO: we might want to use unic_ucd instead of unicode_names2 for this too, if possible: + if let Some(character) = unicode_names2::character(name.as_str()) { + if self.check_age(character) { + return Ok(character.to_string()); + } + } + Err(vm.new_lookup_error(format!("undefined character name '{}'", name))) + } + + #[pymethod] + fn name( + &self, + character: PyStringRef, + default: OptionalArg, + vm: &VirtualMachine, + ) -> PyResult { + let c = self.extract_char(character, vm)?; + + if let Some(c) = c { + if self.check_age(c) { + if let Some(name) = Name::of(c) { + return Ok(vm.new_str(name.to_string())); + } + } + } match default { OptionalArg::Present(obj) => Ok(obj), OptionalArg::Missing => { @@ -59,36 +137,37 @@ fn name( } } } -} -fn bidirectional(character: PyStringRef, vm: &VirtualMachine) -> PyResult { - use unic::bidi::BidiClass; - let my_char = extract_char(character, vm)?; - let cls = BidiClass::of(my_char); - Ok(vm.new_str(cls.abbr_name().to_string())) -} - -fn normalize(form: PyStringRef, unistr: PyStringRef, vm: &VirtualMachine) -> PyResult { - use unic::normal::StrNormalForm; - let text = unistr.as_str(); - let normalized_text = match form.as_str() { - "NFC" => text.nfc().collect::(), - "NFKC" => text.nfkc().collect::(), - "NFD" => text.nfd().collect::(), - "NFKD" => text.nfkd().collect::(), - _ => { - return Err(vm.new_value_error("unistr must be one of NFC, NFD".to_string())); - } - }; - - Ok(vm.new_str(normalized_text)) -} - -fn extract_char(character: PyStringRef, vm: &VirtualMachine) -> PyResult { - if character.as_str().len() != 1 { - return Err(vm.new_type_error("argument must be an unicode character, not str".to_string())); + #[pymethod] + fn bidirectional(&self, character: PyStringRef, vm: &VirtualMachine) -> PyResult { + let bidi = match self.extract_char(character, vm)? { + Some(c) => BidiClass::of(c).abbr_name(), + None => "", + }; + Ok(bidi.to_owned()) } - let my_char: char = character.as_str().chars().next().unwrap(); - Ok(my_char) + #[pymethod] + fn normalize( + &self, + form: PyStringRef, + unistr: PyStringRef, + vm: &VirtualMachine, + ) -> PyResult { + let text = unistr.as_str(); + let normalized_text = match form.as_str() { + "NFC" => text.nfc().collect::(), + "NFKC" => text.nfkc().collect::(), + "NFD" => text.nfd().collect::(), + "NFKD" => text.nfkd().collect::(), + _ => return Err(vm.new_value_error("invalid normalization form".to_string())), + }; + + Ok(normalized_text) + } + + #[pyproperty] + fn unidata_version(&self, _vm: &VirtualMachine) -> String { + self.unic_version.to_string() + } } From 98211a34dee7aa9eb837752f4a5ada20053e9440 Mon Sep 17 00:00:00 2001 From: coolreader18 <33094578+coolreader18@users.noreply.github.com> Date: Thu, 26 Dec 2019 13:29:20 -0600 Subject: [PATCH 2/3] Try to cut down on the # of unicode crates we use --- Cargo.lock | 3 +-- parser/Cargo.toml | 4 ++-- parser/src/lexer.rs | 13 +++--------- vm/Cargo.toml | 3 ++- vm/src/obj/objstr.rs | 47 +++++++++++++++++++++----------------------- 5 files changed, 30 insertions(+), 40 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 57059c632..6b6278406 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1286,7 +1286,7 @@ dependencies = [ "num-traits 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)", "regex 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)", "unic-emoji-char 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)", - "unicode-xid 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "unic-ucd-ident 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)", "wtf8 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)", ] @@ -1354,7 +1354,6 @@ dependencies = [ "unic 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)", "unic-common 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)", "unicode-casing 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", - "unicode-xid 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)", "unicode_names2 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)", "wasm-bindgen 0.2.51 (registry+https://github.com/rust-lang/crates.io-index)", "winapi 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)", diff --git a/parser/Cargo.toml b/parser/Cargo.toml index 047714c9d..0d6da2b66 100644 --- a/parser/Cargo.toml +++ b/parser/Cargo.toml @@ -17,6 +17,6 @@ log="0.4.1" regex = "1" num-bigint = "0.2" num-traits = "0.2" -unicode-xid = "0.2.0" -unic-emoji-char = "0.9.0" +unic-emoji-char = "0.9" +unic-ucd-ident = "0.9" wtf8 = "0.0.3" diff --git a/parser/src/lexer.rs b/parser/src/lexer.rs index a51bc3cf9..2a7011967 100644 --- a/parser/src/lexer.rs +++ b/parser/src/lexer.rs @@ -2,9 +2,6 @@ //! //! This means source code is translated into separate tokens. -extern crate unic_emoji_char; -extern crate unicode_xid; - pub use super::token::Tok; use crate::error::{LexicalError, LexicalErrorType}; use crate::location::Location; @@ -15,8 +12,7 @@ use std::cmp::Ordering; use std::collections::HashMap; use std::str::FromStr; use unic_emoji_char::is_emoji_presentation; -use unicode_xid::UnicodeXID; -use wtf8; +use unic_ucd_ident::{is_xid_continue, is_xid_start}; #[derive(Clone, Copy, PartialEq, Debug, Default)] struct IndentationLevel { @@ -658,17 +654,14 @@ where } fn is_identifier_start(&self, c: char) -> bool { - match c { - '_' => true, - c => UnicodeXID::is_xid_start(c), - } + c == '_' || is_xid_start(c) } fn is_identifier_continuation(&self) -> bool { if let Some(c) = self.chr0 { match c { '_' | '0'..='9' => true, - c => UnicodeXID::is_xid_continue(c), + c => is_xid_continue(c), } } else { false diff --git a/vm/Cargo.toml b/vm/Cargo.toml index 62f87ca8e..068cf0f0e 100644 --- a/vm/Cargo.toml +++ b/vm/Cargo.toml @@ -46,7 +46,6 @@ rustc_version_runtime = "0.1.*" statrs = "0.12.0" caseless = "0.2.1" chrono = { version = "=0.4.9", features = ["wasmbind"] } -unicode-xid = "0.2.0" lazy_static = "^1.0.1" lexical = "4" itertools = "0.8" @@ -55,6 +54,8 @@ hexf-parse = "0.1.0" indexmap = "1.0.2" crc = "^1.0.0" unicode_names2 = "0.3" +# TODO: use unic for this; needed for title case: +# https://github.com/RustPython/RustPython/pull/832#discussion_r275428939 unicode-casing = "0.1" unic = "0.9" unic-common = "0.9" diff --git a/vm/src/obj/objstr.rs b/vm/src/obj/objstr.rs index 0e0f3fb22..063e18152 100644 --- a/vm/src/obj/objstr.rs +++ b/vm/src/obj/objstr.rs @@ -1,6 +1,3 @@ -extern crate unicode_categories; -extern crate unicode_xid; - use std::cell::Cell; use std::char; use std::fmt; @@ -10,10 +7,10 @@ use std::str::FromStr; use std::string::ToString; use num_traits::ToPrimitive; +use unic::ucd::category::GeneralCategory; +use unic::ucd::ident::{is_xid_continue, is_xid_start}; use unic::ucd::is_cased; use unicode_casing::CharExt; -use unicode_categories::UnicodeCategories; -use unicode_xid::UnicodeXID; use super::objbytes::{PyBytes, PyBytesRef}; use super::objdict::PyDict; @@ -366,16 +363,7 @@ impl PyString { formatted.push_str(&format!("\\x{:02x}", c as u32)); } else if c.is_ascii() { formatted.push(c); - } else if c.is_other() || c.is_separator() { - // According to python following categories aren't printable: - // * Cc (Other, Control) - // * Cf (Other, Format) - // * Cs (Other, Surrogate) - // * Co (Other, Private Use) - // * Cn (Other, Not Assigned) - // * Zl Separator, Line ('\u2028', LINE SEPARATOR) - // * Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR) - // * Zs (Separator, Space) other than ASCII space('\x20'). + } else if !char_is_printable(c) { let code = c as u32; let escaped = if code < 0xff { format!("\\U{:02x}", code) @@ -742,10 +730,9 @@ impl PyString { /// * Zs (Separator, Space) other than ASCII space('\x20'). #[pymethod] fn isprintable(&self, _vm: &VirtualMachine) -> bool { - self.value.chars().all(|c| match c { - '\u{0020}' => true, - _ => !(c.is_other_control() | c.is_separator()), - }) + self.value + .chars() + .all(|c| c == '\u{0020}' || char_is_printable(c)) } // cpython's isspace ignores whitespace, including \t and \n, etc, unless the whole string is empty @@ -1094,13 +1081,9 @@ impl PyString { #[pymethod] fn isidentifier(&self, _vm: &VirtualMachine) -> bool { let mut chars = self.value.chars(); - let is_identifier_start = match chars.next() { - Some('_') => true, - Some(c) => UnicodeXID::is_xid_start(c), - None => false, - }; + let is_identifier_start = chars.next().map_or(false, |c| c == '_' || is_xid_start(c)); // a string is not an identifier if it has whitespace or starts with a number - is_identifier_start && chars.all(UnicodeXID::is_xid_continue) + is_identifier_start && chars.all(is_xid_continue) } // https://docs.python.org/3/library/stdtypes.html#str.translate @@ -1706,6 +1689,20 @@ fn adjust_indices( } } +// According to python following categories aren't printable: +// * Cc (Other, Control) +// * Cf (Other, Format) +// * Cs (Other, Surrogate) +// * Co (Other, Private Use) +// * Cn (Other, Not Assigned) +// * Zl Separator, Line ('\u2028', LINE SEPARATOR) +// * Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR) +// * Zs (Separator, Space) other than ASCII space('\x20'). +fn char_is_printable(c: char) -> bool { + let cat = GeneralCategory::of(c); + !(cat.is_other() || cat.is_separator()) +} + #[cfg(test)] mod tests { use super::*; From 7d3f34144dade48c5f6f4f827c0140a4e3b4bc72 Mon Sep 17 00:00:00 2001 From: coolreader18 <33094578+coolreader18@users.noreply.github.com> Date: Sat, 28 Dec 2019 21:38:28 -0600 Subject: [PATCH 3/3] Add tests --- tests/snippets/unicode_fu.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tests/snippets/unicode_fu.py b/tests/snippets/unicode_fu.py index bae97a372..82294697a 100644 --- a/tests/snippets/unicode_fu.py +++ b/tests/snippets/unicode_fu.py @@ -19,3 +19,21 @@ assert unicodedata.name('a') == 'LATIN SMALL LETTER A' assert unicodedata.lookup('LATIN SMALL LETTER A') == 'a' assert unicodedata.bidirectional('a') == 'L' assert unicodedata.normalize('NFC', 'bla') == 'bla' + +# testing unicodedata.ucd_3_2_0 for idna +assert "abcСĤ".encode("idna") == b'xn--abc-7sa390b' +# TODO: fix: assert "abc䄣IJ".encode("idna") == b'xn--abcij-zb5f' + +# from CPython tests +assert "python.org".encode("idna") == b"python.org" +assert "python.org.".encode("idna") == b"python.org." +assert "pyth\xf6n.org".encode("idna") == b"xn--pythn-mua.org" +assert "pyth\xf6n.org.".encode("idna") == b"xn--pythn-mua.org." +assert b"python.org".decode("idna") == "python.org" +assert b"python.org.".decode("idna") == "python.org." +assert b"xn--pythn-mua.org".decode("idna") == "pyth\xf6n.org" +assert b"xn--pythn-mua.org.".decode("idna") == "pyth\xf6n.org." + +# TODO: add east_asian_width and mirrored +# assert unicodedata.ucd_3_2_0.east_asian_width('\u231a') == 'N' +# assert not unicodedata.ucd_3_2_0.mirrored("\u0f3a")