mirror of
https://github.com/RustPython/RustPython.git
synced 2026-06-02 19:39:49 +09:00
Merge pull request #1658 from RustPython/coolreader18/unicode-stuff
Update unicodedata with ucd_3_2_0
This commit is contained in:
11
Cargo.lock
generated
11
Cargo.lock
generated
@@ -1286,7 +1286,7 @@ dependencies = [
|
||||
"num-traits 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"regex 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"unic-emoji-char 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"unicode-xid 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"unic-ucd-ident 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"wtf8 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
@@ -1352,9 +1352,8 @@ dependencies = [
|
||||
"statrs 0.12.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"subprocess 0.1.18 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"unic 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"unic-common 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"unicode-casing 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"unicode-xid 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"unicode_categories 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"unicode_names2 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"wasm-bindgen 0.2.51 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"winapi 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
@@ -1971,11 +1970,6 @@ name = "unicode-xid"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
name = "unicode_categories"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
name = "unicode_names2"
|
||||
version = "0.3.0"
|
||||
@@ -2366,7 +2360,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
"checksum unicode-width 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "7007dbd421b92cc6e28410fe7362e2e0a2503394908f417b68ec8d1c364c4e20"
|
||||
"checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc"
|
||||
"checksum unicode-xid 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "826e7639553986605ec5979c7dd957c7895e93eabed50ab2ffa7f6128a75097c"
|
||||
"checksum unicode_categories 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e"
|
||||
"checksum unicode_names2 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a7a928b876ff873d4a0ac966acce72423879dd86afcf190017aa700207188078"
|
||||
"checksum utf8parse 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "8772a4ccbb4e89959023bc5b7cb8623a795caa7092d99f3aa9501b9484d4557d"
|
||||
"checksum vcpkg 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "33dd455d0f96e90a75803cfeb7f948768c08d70a6de9a8d2362461935698bf95"
|
||||
|
||||
@@ -17,6 +17,6 @@ log="0.4.1"
|
||||
regex = "1"
|
||||
num-bigint = "0.2"
|
||||
num-traits = "0.2"
|
||||
unicode-xid = "0.2.0"
|
||||
unic-emoji-char = "0.9.0"
|
||||
unic-emoji-char = "0.9"
|
||||
unic-ucd-ident = "0.9"
|
||||
wtf8 = "0.0.3"
|
||||
|
||||
@@ -2,9 +2,6 @@
|
||||
//!
|
||||
//! This means source code is translated into separate tokens.
|
||||
|
||||
extern crate unic_emoji_char;
|
||||
extern crate unicode_xid;
|
||||
|
||||
pub use super::token::Tok;
|
||||
use crate::error::{LexicalError, LexicalErrorType};
|
||||
use crate::location::Location;
|
||||
@@ -15,8 +12,7 @@ use std::cmp::Ordering;
|
||||
use std::collections::HashMap;
|
||||
use std::str::FromStr;
|
||||
use unic_emoji_char::is_emoji_presentation;
|
||||
use unicode_xid::UnicodeXID;
|
||||
use wtf8;
|
||||
use unic_ucd_ident::{is_xid_continue, is_xid_start};
|
||||
|
||||
#[derive(Clone, Copy, PartialEq, Debug, Default)]
|
||||
struct IndentationLevel {
|
||||
@@ -658,17 +654,14 @@ where
|
||||
}
|
||||
|
||||
fn is_identifier_start(&self, c: char) -> bool {
|
||||
match c {
|
||||
'_' => true,
|
||||
c => UnicodeXID::is_xid_start(c),
|
||||
}
|
||||
c == '_' || is_xid_start(c)
|
||||
}
|
||||
|
||||
fn is_identifier_continuation(&self) -> bool {
|
||||
if let Some(c) = self.chr0 {
|
||||
match c {
|
||||
'_' | '0'..='9' => true,
|
||||
c => UnicodeXID::is_xid_continue(c),
|
||||
c => is_xid_continue(c),
|
||||
}
|
||||
} else {
|
||||
false
|
||||
|
||||
@@ -19,3 +19,21 @@ assert unicodedata.name('a') == 'LATIN SMALL LETTER A'
|
||||
assert unicodedata.lookup('LATIN SMALL LETTER A') == 'a'
|
||||
assert unicodedata.bidirectional('a') == 'L'
|
||||
assert unicodedata.normalize('NFC', 'bla') == 'bla'
|
||||
|
||||
# testing unicodedata.ucd_3_2_0 for idna
|
||||
assert "abcСĤ".encode("idna") == b'xn--abc-7sa390b'
|
||||
# TODO: fix: assert "abc䄣IJ".encode("idna") == b'xn--abcij-zb5f'
|
||||
|
||||
# from CPython tests
|
||||
assert "python.org".encode("idna") == b"python.org"
|
||||
assert "python.org.".encode("idna") == b"python.org."
|
||||
assert "pyth\xf6n.org".encode("idna") == b"xn--pythn-mua.org"
|
||||
assert "pyth\xf6n.org.".encode("idna") == b"xn--pythn-mua.org."
|
||||
assert b"python.org".decode("idna") == "python.org"
|
||||
assert b"python.org.".decode("idna") == "python.org."
|
||||
assert b"xn--pythn-mua.org".decode("idna") == "pyth\xf6n.org"
|
||||
assert b"xn--pythn-mua.org.".decode("idna") == "pyth\xf6n.org."
|
||||
|
||||
# TODO: add east_asian_width and mirrored
|
||||
# assert unicodedata.ucd_3_2_0.east_asian_width('\u231a') == 'N'
|
||||
# assert not unicodedata.ucd_3_2_0.mirrored("\u0f3a")
|
||||
|
||||
@@ -46,7 +46,6 @@ rustc_version_runtime = "0.1.*"
|
||||
statrs = "0.12.0"
|
||||
caseless = "0.2.1"
|
||||
chrono = { version = "=0.4.9", features = ["wasmbind"] }
|
||||
unicode-xid = "0.2.0"
|
||||
lazy_static = "^1.0.1"
|
||||
lexical = "4"
|
||||
itertools = "0.8"
|
||||
@@ -54,10 +53,12 @@ hex = "0.4.0"
|
||||
hexf-parse = "0.1.0"
|
||||
indexmap = "1.0.2"
|
||||
crc = "^1.0.0"
|
||||
unicode_categories = "0.1.1"
|
||||
unicode_names2 = "0.3.0"
|
||||
unicode-casing = "0.1.0"
|
||||
unic = "0.9.0"
|
||||
unicode_names2 = "0.3"
|
||||
# TODO: use unic for this; needed for title case:
|
||||
# https://github.com/RustPython/RustPython/pull/832#discussion_r275428939
|
||||
unicode-casing = "0.1"
|
||||
unic = "0.9"
|
||||
unic-common = "0.9"
|
||||
maplit = "1.0"
|
||||
proc-macro-hack = { version = "0.5", optional = true }
|
||||
bitflags = "1.1"
|
||||
|
||||
@@ -1,6 +1,3 @@
|
||||
extern crate unicode_categories;
|
||||
extern crate unicode_xid;
|
||||
|
||||
use std::cell::Cell;
|
||||
use std::char;
|
||||
use std::fmt;
|
||||
@@ -10,10 +7,10 @@ use std::str::FromStr;
|
||||
use std::string::ToString;
|
||||
|
||||
use num_traits::ToPrimitive;
|
||||
use unic::ucd::category::GeneralCategory;
|
||||
use unic::ucd::ident::{is_xid_continue, is_xid_start};
|
||||
use unic::ucd::is_cased;
|
||||
use unicode_casing::CharExt;
|
||||
use unicode_categories::UnicodeCategories;
|
||||
use unicode_xid::UnicodeXID;
|
||||
|
||||
use super::objbytes::{PyBytes, PyBytesRef};
|
||||
use super::objdict::PyDict;
|
||||
@@ -366,16 +363,7 @@ impl PyString {
|
||||
formatted.push_str(&format!("\\x{:02x}", c as u32));
|
||||
} else if c.is_ascii() {
|
||||
formatted.push(c);
|
||||
} else if c.is_other() || c.is_separator() {
|
||||
// According to python following categories aren't printable:
|
||||
// * Cc (Other, Control)
|
||||
// * Cf (Other, Format)
|
||||
// * Cs (Other, Surrogate)
|
||||
// * Co (Other, Private Use)
|
||||
// * Cn (Other, Not Assigned)
|
||||
// * Zl Separator, Line ('\u2028', LINE SEPARATOR)
|
||||
// * Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR)
|
||||
// * Zs (Separator, Space) other than ASCII space('\x20').
|
||||
} else if !char_is_printable(c) {
|
||||
let code = c as u32;
|
||||
let escaped = if code < 0xff {
|
||||
format!("\\U{:02x}", code)
|
||||
@@ -742,10 +730,9 @@ impl PyString {
|
||||
/// * Zs (Separator, Space) other than ASCII space('\x20').
|
||||
#[pymethod]
|
||||
fn isprintable(&self, _vm: &VirtualMachine) -> bool {
|
||||
self.value.chars().all(|c| match c {
|
||||
'\u{0020}' => true,
|
||||
_ => !(c.is_other_control() | c.is_separator()),
|
||||
})
|
||||
self.value
|
||||
.chars()
|
||||
.all(|c| c == '\u{0020}' || char_is_printable(c))
|
||||
}
|
||||
|
||||
// cpython's isspace ignores whitespace, including \t and \n, etc, unless the whole string is empty
|
||||
@@ -1094,13 +1081,9 @@ impl PyString {
|
||||
#[pymethod]
|
||||
fn isidentifier(&self, _vm: &VirtualMachine) -> bool {
|
||||
let mut chars = self.value.chars();
|
||||
let is_identifier_start = match chars.next() {
|
||||
Some('_') => true,
|
||||
Some(c) => UnicodeXID::is_xid_start(c),
|
||||
None => false,
|
||||
};
|
||||
let is_identifier_start = chars.next().map_or(false, |c| c == '_' || is_xid_start(c));
|
||||
// a string is not an identifier if it has whitespace or starts with a number
|
||||
is_identifier_start && chars.all(UnicodeXID::is_xid_continue)
|
||||
is_identifier_start && chars.all(is_xid_continue)
|
||||
}
|
||||
|
||||
// https://docs.python.org/3/library/stdtypes.html#str.translate
|
||||
@@ -1706,6 +1689,20 @@ fn adjust_indices(
|
||||
}
|
||||
}
|
||||
|
||||
// According to python following categories aren't printable:
|
||||
// * Cc (Other, Control)
|
||||
// * Cf (Other, Format)
|
||||
// * Cs (Other, Surrogate)
|
||||
// * Co (Other, Private Use)
|
||||
// * Cn (Other, Not Assigned)
|
||||
// * Zl Separator, Line ('\u2028', LINE SEPARATOR)
|
||||
// * Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR)
|
||||
// * Zs (Separator, Space) other than ASCII space('\x20').
|
||||
fn char_is_printable(c: char) -> bool {
|
||||
let cat = GeneralCategory::of(c);
|
||||
!(cat.is_other() || cat.is_separator())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
@@ -1098,8 +1098,8 @@ pub trait PyValue: fmt::Debug + Sized + 'static {
|
||||
};
|
||||
PyRef::new_ref(PyObject::new(self, cls, dict), vm)
|
||||
} else {
|
||||
let subtype = vm.to_pystr(&cls.obj)?;
|
||||
let basetype = vm.to_pystr(&class.obj)?;
|
||||
let subtype = vm.to_str(&cls.obj)?;
|
||||
let basetype = vm.to_str(&class.obj)?;
|
||||
Err(vm.new_type_error(format!("{} is not a subtype of {}", subtype, basetype)))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,54 +4,132 @@
|
||||
|
||||
use crate::function::OptionalArg;
|
||||
use crate::obj::objstr::PyStringRef;
|
||||
use crate::pyobject::{PyObjectRef, PyResult};
|
||||
use crate::obj::objtype::PyClassRef;
|
||||
use crate::pyobject::{PyClassImpl, PyObject, PyObjectRef, PyResult, PyValue};
|
||||
use crate::vm::VirtualMachine;
|
||||
|
||||
use itertools::Itertools;
|
||||
use unic::bidi::BidiClass;
|
||||
use unic::char::property::EnumeratedCharProperty;
|
||||
use unic::normal::StrNormalForm;
|
||||
use unic::ucd::category::GeneralCategory;
|
||||
use unic::ucd::Name;
|
||||
use unicode_names2;
|
||||
use unic::ucd::{Age, Name};
|
||||
use unic_common::version::UnicodeVersion;
|
||||
|
||||
pub fn make_module(vm: &VirtualMachine) -> PyObjectRef {
|
||||
let ctx = &vm.ctx;
|
||||
|
||||
let unidata_version = unic::UNICODE_VERSION.to_string();
|
||||
let ucd_class = PyUCD::make_class(ctx);
|
||||
|
||||
py_module!(vm, "unicodedata", {
|
||||
"bidirectional" => ctx.new_rustfunc(bidirectional),
|
||||
"category" => ctx.new_rustfunc(category),
|
||||
"name" => ctx.new_rustfunc(name),
|
||||
"lookup" => ctx.new_rustfunc(lookup),
|
||||
"normalize" => ctx.new_rustfunc(normalize),
|
||||
"unidata_version" => ctx.new_str(unidata_version),
|
||||
})
|
||||
let ucd = PyObject::new(PyUCD::default(), ucd_class.clone(), None);
|
||||
|
||||
let ucd_3_2_0 = PyObject::new(
|
||||
PyUCD {
|
||||
unic_version: UnicodeVersion {
|
||||
major: 3,
|
||||
minor: 2,
|
||||
micro: 0,
|
||||
},
|
||||
},
|
||||
ucd_class.clone(),
|
||||
None,
|
||||
);
|
||||
|
||||
let module = py_module!(vm, "unicodedata", {
|
||||
"UCD" => ucd_class.into_object(),
|
||||
"ucd_3_2_0" => ucd_3_2_0,
|
||||
// we do unidata_version here because the getter tries to do PyUCD::class() before
|
||||
// the module is in the VM
|
||||
"unidata_version" => ctx.new_str(PyUCD::default().unic_version.to_string()),
|
||||
});
|
||||
|
||||
for attr in ["category", "lookup", "name", "bidirectional", "normalize"]
|
||||
.iter()
|
||||
.copied()
|
||||
{
|
||||
extend_module!(vm, &module, {
|
||||
attr => vm.get_attribute(ucd.clone(), attr).unwrap(),
|
||||
});
|
||||
}
|
||||
|
||||
module
|
||||
}
|
||||
|
||||
fn category(character: PyStringRef, vm: &VirtualMachine) -> PyResult {
|
||||
let my_char = extract_char(character, vm)?;
|
||||
let category = GeneralCategory::of(my_char);
|
||||
Ok(vm.new_str(category.abbr_name().to_string()))
|
||||
// A Unicode character database pinned to one Unicode version.
// The methods on this type compare a character's `Age` against `unic_version`
// to decide whether the character is visible through this database.
#[pyclass]
|
||||
#[derive(Debug)]
|
||||
struct PyUCD {
|
||||
// Unicode version this database instance answers for.
unic_version: UnicodeVersion,
|
||||
}
|
||||
|
||||
fn lookup(name: PyStringRef, vm: &VirtualMachine) -> PyResult {
|
||||
// TODO: we might want to use unic_ucd instead of unicode_names2 for this too, if possible:
|
||||
if let Some(character) = unicode_names2::character(name.as_str()) {
|
||||
Ok(vm.new_str(character.to_string()))
|
||||
} else {
|
||||
Err(vm.new_key_error(vm.new_str(format!("undefined character name '{}'", name))))
|
||||
// Ties `PyUCD` values to their Python class object.
impl PyValue for PyUCD {
|
||||
// Looks the class up by module and attribute name: `UCD` in `unicodedata`.
fn class(vm: &VirtualMachine) -> PyClassRef {
|
||||
vm.class("unicodedata", "UCD")
|
||||
}
|
||||
}
|
||||
|
||||
fn name(
|
||||
character: PyStringRef,
|
||||
default: OptionalArg<PyObjectRef>,
|
||||
vm: &VirtualMachine,
|
||||
) -> PyResult {
|
||||
let my_char = extract_char(character, vm)?;
|
||||
impl Default for PyUCD {
|
||||
#[inline(always)]
|
||||
fn default() -> Self {
|
||||
PyUCD {
|
||||
unic_version: unic::UNICODE_VERSION,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(name) = Name::of(my_char) {
|
||||
Ok(vm.new_str(name.to_string()))
|
||||
} else {
|
||||
#[pyimpl]
|
||||
impl PyUCD {
|
||||
fn check_age(&self, c: char) -> bool {
|
||||
Age::of(c).map_or(false, |age| age.actual() <= self.unic_version)
|
||||
}
|
||||
|
||||
// Extracts the single character held by `character` and filters it through
// this database's version check.
//
// Returns:
// * `Ok(Some(c))` when the string is exactly one character and `check_age(c)` holds;
// * `Ok(None)` when the string is exactly one character that fails `check_age`;
// * `Err(TypeError)` when the string does not contain exactly one character.
fn extract_char(&self, character: PyStringRef, vm: &VirtualMachine) -> PyResult<Option<char>> {
    let c = character.as_str().chars().exactly_one().map_err(|_| {
        // Wording follows CPython ("a unicode character", not "an unicode character").
        vm.new_type_error("argument must be a unicode character, not str".to_string())
    })?;

    Ok(Some(c).filter(|&c| self.check_age(c)))
}
|
||||
|
||||
#[pymethod]
|
||||
fn category(&self, character: PyStringRef, vm: &VirtualMachine) -> PyResult<String> {
|
||||
Ok(self
|
||||
.extract_char(character, vm)?
|
||||
.map_or(GeneralCategory::Unassigned, GeneralCategory::of)
|
||||
.abbr_name()
|
||||
.to_owned())
|
||||
}
|
||||
|
||||
#[pymethod]
|
||||
fn lookup(&self, name: PyStringRef, vm: &VirtualMachine) -> PyResult<String> {
|
||||
// TODO: we might want to use unic_ucd instead of unicode_names2 for this too, if possible:
|
||||
if let Some(character) = unicode_names2::character(name.as_str()) {
|
||||
if self.check_age(character) {
|
||||
return Ok(character.to_string());
|
||||
}
|
||||
}
|
||||
Err(vm.new_lookup_error(format!("undefined character name '{}'", name)))
|
||||
}
|
||||
|
||||
#[pymethod]
|
||||
fn name(
|
||||
&self,
|
||||
character: PyStringRef,
|
||||
default: OptionalArg<PyObjectRef>,
|
||||
vm: &VirtualMachine,
|
||||
) -> PyResult {
|
||||
let c = self.extract_char(character, vm)?;
|
||||
|
||||
if let Some(c) = c {
|
||||
if self.check_age(c) {
|
||||
if let Some(name) = Name::of(c) {
|
||||
return Ok(vm.new_str(name.to_string()));
|
||||
}
|
||||
}
|
||||
}
|
||||
match default {
|
||||
OptionalArg::Present(obj) => Ok(obj),
|
||||
OptionalArg::Missing => {
|
||||
@@ -59,36 +137,37 @@ fn name(
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn bidirectional(character: PyStringRef, vm: &VirtualMachine) -> PyResult {
|
||||
use unic::bidi::BidiClass;
|
||||
let my_char = extract_char(character, vm)?;
|
||||
let cls = BidiClass::of(my_char);
|
||||
Ok(vm.new_str(cls.abbr_name().to_string()))
|
||||
}
|
||||
|
||||
fn normalize(form: PyStringRef, unistr: PyStringRef, vm: &VirtualMachine) -> PyResult {
|
||||
use unic::normal::StrNormalForm;
|
||||
let text = unistr.as_str();
|
||||
let normalized_text = match form.as_str() {
|
||||
"NFC" => text.nfc().collect::<String>(),
|
||||
"NFKC" => text.nfkc().collect::<String>(),
|
||||
"NFD" => text.nfd().collect::<String>(),
|
||||
"NFKD" => text.nfkd().collect::<String>(),
|
||||
_ => {
|
||||
return Err(vm.new_value_error("unistr must be one of NFC, NFD".to_string()));
|
||||
}
|
||||
};
|
||||
|
||||
Ok(vm.new_str(normalized_text))
|
||||
}
|
||||
|
||||
fn extract_char(character: PyStringRef, vm: &VirtualMachine) -> PyResult<char> {
|
||||
if character.as_str().len() != 1 {
|
||||
return Err(vm.new_type_error("argument must be an unicode character, not str".to_string()));
|
||||
#[pymethod]
|
||||
fn bidirectional(&self, character: PyStringRef, vm: &VirtualMachine) -> PyResult<String> {
|
||||
let bidi = match self.extract_char(character, vm)? {
|
||||
Some(c) => BidiClass::of(c).abbr_name(),
|
||||
None => "",
|
||||
};
|
||||
Ok(bidi.to_owned())
|
||||
}
|
||||
|
||||
let my_char: char = character.as_str().chars().next().unwrap();
|
||||
Ok(my_char)
|
||||
#[pymethod]
|
||||
fn normalize(
|
||||
&self,
|
||||
form: PyStringRef,
|
||||
unistr: PyStringRef,
|
||||
vm: &VirtualMachine,
|
||||
) -> PyResult<String> {
|
||||
let text = unistr.as_str();
|
||||
let normalized_text = match form.as_str() {
|
||||
"NFC" => text.nfc().collect::<String>(),
|
||||
"NFKC" => text.nfkc().collect::<String>(),
|
||||
"NFD" => text.nfd().collect::<String>(),
|
||||
"NFKD" => text.nfkd().collect::<String>(),
|
||||
_ => return Err(vm.new_value_error("invalid normalization form".to_string())),
|
||||
};
|
||||
|
||||
Ok(normalized_text)
|
||||
}
|
||||
|
||||
// Property getter: renders this database's `unic_version` as a string.
#[pyproperty]
|
||||
fn unidata_version(&self, _vm: &VirtualMachine) -> String {
|
||||
self.unic_version.to_string()
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user