mirror of
https://github.com/RustPython/RustPython.git
synced 2026-06-02 19:39:49 +09:00
Compare commits
7 Commits
main
...
copilot/ad
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4efa5da5f6 | ||
|
|
2934897035 | ||
|
|
0a340de9c3 | ||
|
|
5cf1bd6667 | ||
|
|
e968d83808 | ||
|
|
67485b5b77 | ||
|
|
d3af1c54ec |
31
Cargo.lock
generated
31
Cargo.lock
generated
@@ -3123,9 +3123,9 @@ dependencies = [
|
|||||||
"rustpython-ruff_python_ast",
|
"rustpython-ruff_python_ast",
|
||||||
"rustpython-ruff_python_parser",
|
"rustpython-ruff_python_parser",
|
||||||
"rustpython-ruff_text_size",
|
"rustpython-ruff_text_size",
|
||||||
|
"rustpython-unicode",
|
||||||
"rustpython-wtf8",
|
"rustpython-wtf8",
|
||||||
"thiserror 2.0.18",
|
"thiserror 2.0.18",
|
||||||
"unicode_names2 2.0.0",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -3148,9 +3148,9 @@ dependencies = [
|
|||||||
"parking_lot",
|
"parking_lot",
|
||||||
"radium",
|
"radium",
|
||||||
"rustpython-literal",
|
"rustpython-literal",
|
||||||
|
"rustpython-unicode",
|
||||||
"rustpython-wtf8",
|
"rustpython-wtf8",
|
||||||
"siphasher",
|
"siphasher",
|
||||||
"unicode_names2 2.0.0",
|
|
||||||
"widestring",
|
"widestring",
|
||||||
"windows-sys 0.61.2",
|
"windows-sys 0.61.2",
|
||||||
]
|
]
|
||||||
@@ -3242,11 +3242,11 @@ name = "rustpython-literal"
|
|||||||
version = "0.5.0"
|
version = "0.5.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"hexf-parse",
|
"hexf-parse",
|
||||||
"icu_properties",
|
|
||||||
"is-macro",
|
"is-macro",
|
||||||
"lexical-parse-float",
|
"lexical-parse-float",
|
||||||
"num-traits",
|
"num-traits",
|
||||||
"rand 0.9.2",
|
"rand 0.9.2",
|
||||||
|
"rustpython-unicode",
|
||||||
"rustpython-wtf8",
|
"rustpython-wtf8",
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -3338,6 +3338,7 @@ dependencies = [
|
|||||||
"criterion",
|
"criterion",
|
||||||
"num_enum",
|
"num_enum",
|
||||||
"optional",
|
"optional",
|
||||||
|
"rustpython-unicode",
|
||||||
"rustpython-wtf8",
|
"rustpython-wtf8",
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -3368,8 +3369,6 @@ dependencies = [
|
|||||||
"gethostname",
|
"gethostname",
|
||||||
"hex",
|
"hex",
|
||||||
"hmac",
|
"hmac",
|
||||||
"icu_normalizer",
|
|
||||||
"icu_properties",
|
|
||||||
"indexmap",
|
"indexmap",
|
||||||
"itertools 0.14.0",
|
"itertools 0.14.0",
|
||||||
"libc",
|
"libc",
|
||||||
@@ -3411,6 +3410,7 @@ dependencies = [
|
|||||||
"rustpython-ruff_python_parser",
|
"rustpython-ruff_python_parser",
|
||||||
"rustpython-ruff_source_file",
|
"rustpython-ruff_source_file",
|
||||||
"rustpython-ruff_text_size",
|
"rustpython-ruff_text_size",
|
||||||
|
"rustpython-unicode",
|
||||||
"rustpython-vm",
|
"rustpython-vm",
|
||||||
"schannel",
|
"schannel",
|
||||||
"sha-1",
|
"sha-1",
|
||||||
@@ -3421,9 +3421,6 @@ dependencies = [
|
|||||||
"tcl-sys",
|
"tcl-sys",
|
||||||
"termios",
|
"termios",
|
||||||
"tk-sys",
|
"tk-sys",
|
||||||
"ucd",
|
|
||||||
"unic-ucd-age",
|
|
||||||
"unicode_names2 2.0.0",
|
|
||||||
"uuid",
|
"uuid",
|
||||||
"webpki-roots",
|
"webpki-roots",
|
||||||
"widestring",
|
"widestring",
|
||||||
@@ -3433,6 +3430,21 @@ dependencies = [
|
|||||||
"xml",
|
"xml",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "rustpython-unicode"
|
||||||
|
version = "0.5.0"
|
||||||
|
dependencies = [
|
||||||
|
"caseless",
|
||||||
|
"icu_normalizer",
|
||||||
|
"icu_properties",
|
||||||
|
"itertools 0.14.0",
|
||||||
|
"rustpython-wtf8",
|
||||||
|
"ucd",
|
||||||
|
"unic-ucd-age",
|
||||||
|
"unicode-casing",
|
||||||
|
"unicode_names2 2.0.0",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "rustpython-venvlauncher"
|
name = "rustpython-venvlauncher"
|
||||||
version = "0.5.0"
|
version = "0.5.0"
|
||||||
@@ -3445,7 +3457,6 @@ dependencies = [
|
|||||||
"ascii",
|
"ascii",
|
||||||
"bitflags 2.11.0",
|
"bitflags 2.11.0",
|
||||||
"bstr",
|
"bstr",
|
||||||
"caseless",
|
|
||||||
"cfg-if",
|
"cfg-if",
|
||||||
"chrono",
|
"chrono",
|
||||||
"constant_time_eq",
|
"constant_time_eq",
|
||||||
@@ -3458,7 +3469,6 @@ dependencies = [
|
|||||||
"glob",
|
"glob",
|
||||||
"half",
|
"half",
|
||||||
"hex",
|
"hex",
|
||||||
"icu_properties",
|
|
||||||
"indexmap",
|
"indexmap",
|
||||||
"is-macro",
|
"is-macro",
|
||||||
"itertools 0.14.0",
|
"itertools 0.14.0",
|
||||||
@@ -3492,6 +3502,7 @@ dependencies = [
|
|||||||
"rustpython-ruff_python_parser",
|
"rustpython-ruff_python_parser",
|
||||||
"rustpython-ruff_text_size",
|
"rustpython-ruff_text_size",
|
||||||
"rustpython-sre_engine",
|
"rustpython-sre_engine",
|
||||||
|
"rustpython-unicode",
|
||||||
"rustyline",
|
"rustyline",
|
||||||
"scoped-tls",
|
"scoped-tls",
|
||||||
"scopeguard",
|
"scopeguard",
|
||||||
|
|||||||
@@ -153,6 +153,7 @@ rustpython-vm = { path = "crates/vm", default-features = false, version = "0.5.0
|
|||||||
rustpython-pylib = { path = "crates/pylib", version = "0.5.0" }
|
rustpython-pylib = { path = "crates/pylib", version = "0.5.0" }
|
||||||
rustpython-stdlib = { path = "crates/stdlib", default-features = false, version = "0.5.0" }
|
rustpython-stdlib = { path = "crates/stdlib", default-features = false, version = "0.5.0" }
|
||||||
rustpython-sre_engine = { path = "crates/sre_engine", version = "0.5.0" }
|
rustpython-sre_engine = { path = "crates/sre_engine", version = "0.5.0" }
|
||||||
|
rustpython-unicode = { path = "crates/unicode", default-features = false, version = "0.5.0" }
|
||||||
rustpython-wtf8 = { path = "crates/wtf8", version = "0.5.0" }
|
rustpython-wtf8 = { path = "crates/wtf8", version = "0.5.0" }
|
||||||
rustpython-doc = { path = "crates/doc", version = "0.5.0" }
|
rustpython-doc = { path = "crates/doc", version = "0.5.0" }
|
||||||
|
|
||||||
|
|||||||
@@ -14,6 +14,7 @@ std = ["thiserror/std", "itertools/use_std"]
|
|||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
rustpython-compiler-core = { workspace = true }
|
rustpython-compiler-core = { workspace = true }
|
||||||
|
rustpython-unicode = { workspace = true, default-features = false }
|
||||||
rustpython-literal = {workspace = true }
|
rustpython-literal = {workspace = true }
|
||||||
rustpython-wtf8 = { workspace = true }
|
rustpython-wtf8 = { workspace = true }
|
||||||
ruff_python_ast = { workspace = true }
|
ruff_python_ast = { workspace = true }
|
||||||
@@ -29,7 +30,6 @@ num-traits = { workspace = true }
|
|||||||
thiserror = { workspace = true }
|
thiserror = { workspace = true }
|
||||||
malachite-bigint = { workspace = true }
|
malachite-bigint = { workspace = true }
|
||||||
memchr = { workspace = true }
|
memchr = { workspace = true }
|
||||||
unicode_names2 = { workspace = true }
|
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
ruff_python_parser = { workspace = true }
|
ruff_python_parser = { workspace = true }
|
||||||
|
|||||||
@@ -113,7 +113,9 @@ impl StringParser {
|
|||||||
let name_and_ending = self.skip_bytes(close_idx + 1);
|
let name_and_ending = self.skip_bytes(close_idx + 1);
|
||||||
let name = &name_and_ending[..name_and_ending.len() - 1];
|
let name = &name_and_ending[..name_and_ending.len() - 1];
|
||||||
|
|
||||||
unicode_names2::character(name).ok_or_else(|| unreachable!())
|
rustpython_unicode::data::lookup(name)
|
||||||
|
.and_then(char::from_u32)
|
||||||
|
.ok_or_else(|| unreachable!())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Parse an escaped character, returning the new character.
|
/// Parse an escaped character, returning the new character.
|
||||||
|
|||||||
@@ -16,6 +16,7 @@ wasm_js = ["getrandom/wasm_js"]
|
|||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
rustpython-literal = { workspace = true }
|
rustpython-literal = { workspace = true }
|
||||||
|
rustpython-unicode = { workspace = true, default-features = false }
|
||||||
rustpython-wtf8 = { workspace = true }
|
rustpython-wtf8 = { workspace = true }
|
||||||
|
|
||||||
ascii = { workspace = true }
|
ascii = { workspace = true }
|
||||||
@@ -29,7 +30,6 @@ malachite-q = { workspace = true }
|
|||||||
malachite-base = { workspace = true }
|
malachite-base = { workspace = true }
|
||||||
num-traits = { workspace = true }
|
num-traits = { workspace = true }
|
||||||
parking_lot = { workspace = true, optional = true }
|
parking_lot = { workspace = true, optional = true }
|
||||||
unicode_names2 = { workspace = true }
|
|
||||||
radium = { workspace = true }
|
radium = { workspace = true }
|
||||||
|
|
||||||
lock_api = "0.4"
|
lock_api = "0.4"
|
||||||
|
|||||||
@@ -414,7 +414,7 @@ pub mod errors {
|
|||||||
let mut out = String::with_capacity(num_chars * 4);
|
let mut out = String::with_capacity(num_chars * 4);
|
||||||
for c in err_str.code_points() {
|
for c in err_str.code_points() {
|
||||||
let c_u32 = c.to_u32();
|
let c_u32 = c.to_u32();
|
||||||
if let Some(c_name) = c.to_char().and_then(unicode_names2::name) {
|
if let Some(c_name) = rustpython_unicode::data::name(c_u32) {
|
||||||
write!(out, "\\N{{{c_name}}}").unwrap();
|
write!(out, "\\N{{{c_name}}}").unwrap();
|
||||||
} else if c_u32 >= 0x10000 {
|
} else if c_u32 >= 0x10000 {
|
||||||
write!(out, "\\U{c_u32:08x}").unwrap();
|
write!(out, "\\U{c_u32:08x}").unwrap();
|
||||||
|
|||||||
@@ -9,13 +9,13 @@ license = { workspace = true }
|
|||||||
rust-version = { workspace = true }
|
rust-version = { workspace = true }
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
|
rustpython-unicode = { workspace = true, default-features = false }
|
||||||
rustpython-wtf8 = { workspace = true }
|
rustpython-wtf8 = { workspace = true }
|
||||||
|
|
||||||
hexf-parse = "0.2.1"
|
hexf-parse = "0.2.1"
|
||||||
is-macro.workspace = true
|
is-macro.workspace = true
|
||||||
lexical-parse-float = { version = "1.0.6", features = ["format"] }
|
lexical-parse-float = { version = "1.0.6", features = ["format"] }
|
||||||
num-traits = { workspace = true }
|
num-traits = { workspace = true }
|
||||||
icu_properties = { workspace = true }
|
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
rand = { workspace = true }
|
rand = { workspace = true }
|
||||||
|
|||||||
@@ -1,26 +0,0 @@
|
|||||||
use icu_properties::props::{EnumeratedProperty, GeneralCategory};
|
|
||||||
|
|
||||||
/// According to python following categories aren't printable:
|
|
||||||
/// * Cc (Other, Control)
|
|
||||||
/// * Cf (Other, Format)
|
|
||||||
/// * Cs (Other, Surrogate)
|
|
||||||
/// * Co (Other, Private Use)
|
|
||||||
/// * Cn (Other, Not Assigned)
|
|
||||||
/// * Zl Separator, Line ('\u2028', LINE SEPARATOR)
|
|
||||||
/// * Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR)
|
|
||||||
/// * Zs (Separator, Space) other than ASCII space('\x20').
|
|
||||||
pub fn is_printable(c: char) -> bool {
|
|
||||||
let cat = GeneralCategory::for_char(c);
|
|
||||||
|
|
||||||
!matches!(
|
|
||||||
cat,
|
|
||||||
GeneralCategory::SpaceSeparator
|
|
||||||
| GeneralCategory::LineSeparator
|
|
||||||
| GeneralCategory::ParagraphSeparator
|
|
||||||
| GeneralCategory::Control
|
|
||||||
| GeneralCategory::Format
|
|
||||||
| GeneralCategory::Surrogate
|
|
||||||
| GeneralCategory::PrivateUse
|
|
||||||
| GeneralCategory::Unassigned
|
|
||||||
)
|
|
||||||
}
|
|
||||||
@@ -204,7 +204,7 @@ impl UnicodeEscape<'_> {
|
|||||||
'\\' | '\t' | '\r' | '\n' => 2,
|
'\\' | '\t' | '\r' | '\n' => 2,
|
||||||
ch if ch < ' ' || ch as u32 == 0x7f => 4, // \xHH
|
ch if ch < ' ' || ch as u32 == 0x7f => 4, // \xHH
|
||||||
ch if ch.is_ascii() => 1,
|
ch if ch.is_ascii() => 1,
|
||||||
ch if crate::char::is_printable(ch) => {
|
ch if rustpython_unicode::classify::is_repr_printable(ch as u32) => {
|
||||||
// max = std::cmp::max(ch, max);
|
// max = std::cmp::max(ch, max);
|
||||||
ch.len_utf8()
|
ch.len_utf8()
|
||||||
}
|
}
|
||||||
@@ -238,7 +238,9 @@ impl UnicodeEscape<'_> {
|
|||||||
ch if ch.is_ascii() => {
|
ch if ch.is_ascii() => {
|
||||||
write!(formatter, "\\x{:02x}", ch as u8)
|
write!(formatter, "\\x{:02x}", ch as u8)
|
||||||
}
|
}
|
||||||
ch if crate::char::is_printable(ch) => formatter.write_char(ch),
|
ch if rustpython_unicode::classify::is_repr_printable(ch as u32) => {
|
||||||
|
formatter.write_char(ch)
|
||||||
|
}
|
||||||
'\0'..='\u{ff}' => {
|
'\0'..='\u{ff}' => {
|
||||||
write!(formatter, "\\x{:02x}", ch as u32)
|
write!(formatter, "\\x{:02x}", ch as u32)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,7 +2,6 @@
|
|||||||
|
|
||||||
extern crate alloc;
|
extern crate alloc;
|
||||||
|
|
||||||
pub mod char;
|
|
||||||
pub mod complex;
|
pub mod complex;
|
||||||
pub mod escape;
|
pub mod escape;
|
||||||
pub mod float;
|
pub mod float;
|
||||||
|
|||||||
@@ -15,6 +15,7 @@ name = "benches"
|
|||||||
harness = false
|
harness = false
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
|
rustpython-unicode = { workspace = true, default-features = false }
|
||||||
rustpython-wtf8 = { workspace = true }
|
rustpython-wtf8 = { workspace = true }
|
||||||
num_enum = { workspace = true }
|
num_enum = { workspace = true }
|
||||||
bitflags = { workspace = true }
|
bitflags = { workspace = true }
|
||||||
|
|||||||
@@ -1,14 +1,10 @@
|
|||||||
// good luck to those that follow; here be dragons
|
// good luck to those that follow; here be dragons
|
||||||
|
|
||||||
use crate::string::{
|
|
||||||
is_digit, is_linebreak, is_loc_word, is_space, is_uni_digit, is_uni_linebreak, is_uni_space,
|
|
||||||
is_uni_word, is_word, lower_ascii, lower_locate, lower_unicode, upper_locate, upper_unicode,
|
|
||||||
};
|
|
||||||
|
|
||||||
use super::{MAXREPEAT, SreAtCode, SreCatCode, SreInfo, SreOpcode, StrDrive, StringCursor};
|
use super::{MAXREPEAT, SreAtCode, SreCatCode, SreInfo, SreOpcode, StrDrive, StringCursor};
|
||||||
use alloc::{vec, vec::Vec};
|
use alloc::{vec, vec::Vec};
|
||||||
use core::{convert::TryFrom, ptr::null};
|
use core::{convert::TryFrom, ptr::null};
|
||||||
use optional::Optioned;
|
use optional::Optioned;
|
||||||
|
use rustpython_unicode::regex as unicode_regex;
|
||||||
|
|
||||||
#[derive(Debug, Clone, Copy)]
|
#[derive(Debug, Clone, Copy)]
|
||||||
pub struct Request<'a, S> {
|
pub struct Request<'a, S> {
|
||||||
@@ -659,10 +655,10 @@ fn _match<S: StrDrive>(req: &Request<'_, S>, state: &mut State, mut ctx: MatchCo
|
|||||||
}
|
}
|
||||||
SreOpcode::IN => general_op_in!(charset),
|
SreOpcode::IN => general_op_in!(charset),
|
||||||
SreOpcode::IN_IGNORE => {
|
SreOpcode::IN_IGNORE => {
|
||||||
general_op_in!(|set, c| charset(set, lower_ascii(c)))
|
general_op_in!(|set, c| charset(set, unicode_regex::lower_ascii(c)))
|
||||||
}
|
}
|
||||||
SreOpcode::IN_UNI_IGNORE => {
|
SreOpcode::IN_UNI_IGNORE => {
|
||||||
general_op_in!(|set, c| charset(set, lower_unicode(c)))
|
general_op_in!(|set, c| charset(set, unicode_regex::lower_unicode(c)))
|
||||||
}
|
}
|
||||||
SreOpcode::IN_LOC_IGNORE => general_op_in!(charset_loc_ignore),
|
SreOpcode::IN_LOC_IGNORE => general_op_in!(charset_loc_ignore),
|
||||||
SreOpcode::MARK => {
|
SreOpcode::MARK => {
|
||||||
@@ -803,25 +799,31 @@ fn _match<S: StrDrive>(req: &Request<'_, S>, state: &mut State, mut ctx: MatchCo
|
|||||||
SreOpcode::LITERAL => general_op_literal!(|code, c| code == c),
|
SreOpcode::LITERAL => general_op_literal!(|code, c| code == c),
|
||||||
SreOpcode::NOT_LITERAL => general_op_literal!(|code, c| code != c),
|
SreOpcode::NOT_LITERAL => general_op_literal!(|code, c| code != c),
|
||||||
SreOpcode::LITERAL_IGNORE => {
|
SreOpcode::LITERAL_IGNORE => {
|
||||||
general_op_literal!(|code, c| code == lower_ascii(c))
|
general_op_literal!(|code, c| code == unicode_regex::lower_ascii(c))
|
||||||
}
|
}
|
||||||
SreOpcode::NOT_LITERAL_IGNORE => {
|
SreOpcode::NOT_LITERAL_IGNORE => {
|
||||||
general_op_literal!(|code, c| code != lower_ascii(c))
|
general_op_literal!(|code, c| code != unicode_regex::lower_ascii(c))
|
||||||
}
|
}
|
||||||
SreOpcode::LITERAL_UNI_IGNORE => {
|
SreOpcode::LITERAL_UNI_IGNORE => {
|
||||||
general_op_literal!(|code, c| code == lower_unicode(c))
|
general_op_literal!(|code, c| code == unicode_regex::lower_unicode(c))
|
||||||
}
|
}
|
||||||
SreOpcode::NOT_LITERAL_UNI_IGNORE => {
|
SreOpcode::NOT_LITERAL_UNI_IGNORE => {
|
||||||
general_op_literal!(|code, c| code != lower_unicode(c))
|
general_op_literal!(|code, c| code != unicode_regex::lower_unicode(c))
|
||||||
}
|
}
|
||||||
SreOpcode::LITERAL_LOC_IGNORE => general_op_literal!(char_loc_ignore),
|
SreOpcode::LITERAL_LOC_IGNORE => general_op_literal!(char_loc_ignore),
|
||||||
SreOpcode::NOT_LITERAL_LOC_IGNORE => {
|
SreOpcode::NOT_LITERAL_LOC_IGNORE => {
|
||||||
general_op_literal!(|code, c| !char_loc_ignore(code, c))
|
general_op_literal!(|code, c| !char_loc_ignore(code, c))
|
||||||
}
|
}
|
||||||
SreOpcode::GROUPREF => general_op_groupref!(|x| x),
|
SreOpcode::GROUPREF => general_op_groupref!(|x| x),
|
||||||
SreOpcode::GROUPREF_IGNORE => general_op_groupref!(lower_ascii),
|
SreOpcode::GROUPREF_IGNORE => {
|
||||||
SreOpcode::GROUPREF_LOC_IGNORE => general_op_groupref!(lower_locate),
|
general_op_groupref!(unicode_regex::lower_ascii)
|
||||||
SreOpcode::GROUPREF_UNI_IGNORE => general_op_groupref!(lower_unicode),
|
}
|
||||||
|
SreOpcode::GROUPREF_LOC_IGNORE => {
|
||||||
|
general_op_groupref!(unicode_regex::lower_locale)
|
||||||
|
}
|
||||||
|
SreOpcode::GROUPREF_UNI_IGNORE => {
|
||||||
|
general_op_groupref!(unicode_regex::lower_unicode)
|
||||||
|
}
|
||||||
SreOpcode::GROUPREF_EXISTS => {
|
SreOpcode::GROUPREF_EXISTS => {
|
||||||
let (group_start, group_end) =
|
let (group_start, group_end) =
|
||||||
state.marks.get(ctx.peek_code(req, 1) as usize);
|
state.marks.get(ctx.peek_code(req, 1) as usize);
|
||||||
@@ -1125,7 +1127,7 @@ impl MatchContext {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn at_linebreak<S: StrDrive>(&self, req: &Request<'_, S>) -> bool {
|
fn at_linebreak<S: StrDrive>(&self, req: &Request<'_, S>) -> bool {
|
||||||
!self.at_end(req) && is_linebreak(self.peek_char::<S>())
|
!self.at_end(req) && unicode_regex::is_linebreak(self.peek_char::<S>())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn at_boundary<S: StrDrive, F: FnMut(u32) -> bool>(
|
fn at_boundary<S: StrDrive, F: FnMut(u32) -> bool>(
|
||||||
@@ -1192,54 +1194,56 @@ impl MatchContext {
|
|||||||
fn at<S: StrDrive>(req: &Request<'_, S>, ctx: &MatchContext, at_code: SreAtCode) -> bool {
|
fn at<S: StrDrive>(req: &Request<'_, S>, ctx: &MatchContext, at_code: SreAtCode) -> bool {
|
||||||
match at_code {
|
match at_code {
|
||||||
SreAtCode::BEGINNING | SreAtCode::BEGINNING_STRING => ctx.at_beginning(),
|
SreAtCode::BEGINNING | SreAtCode::BEGINNING_STRING => ctx.at_beginning(),
|
||||||
SreAtCode::BEGINNING_LINE => ctx.at_beginning() || is_linebreak(ctx.back_peek_char::<S>()),
|
SreAtCode::BEGINNING_LINE => {
|
||||||
SreAtCode::BOUNDARY => ctx.at_boundary(req, is_word),
|
ctx.at_beginning() || unicode_regex::is_linebreak(ctx.back_peek_char::<S>())
|
||||||
SreAtCode::NON_BOUNDARY => ctx.at_non_boundary(req, is_word),
|
}
|
||||||
|
SreAtCode::BOUNDARY => ctx.at_boundary(req, unicode_regex::is_word),
|
||||||
|
SreAtCode::NON_BOUNDARY => ctx.at_non_boundary(req, unicode_regex::is_word),
|
||||||
SreAtCode::END => {
|
SreAtCode::END => {
|
||||||
(ctx.remaining_chars(req) == 1 && ctx.at_linebreak(req)) || ctx.at_end(req)
|
(ctx.remaining_chars(req) == 1 && ctx.at_linebreak(req)) || ctx.at_end(req)
|
||||||
}
|
}
|
||||||
SreAtCode::END_LINE => ctx.at_linebreak(req) || ctx.at_end(req),
|
SreAtCode::END_LINE => ctx.at_linebreak(req) || ctx.at_end(req),
|
||||||
SreAtCode::END_STRING => ctx.at_end(req),
|
SreAtCode::END_STRING => ctx.at_end(req),
|
||||||
SreAtCode::LOC_BOUNDARY => ctx.at_boundary(req, is_loc_word),
|
SreAtCode::LOC_BOUNDARY => ctx.at_boundary(req, unicode_regex::is_locale_word),
|
||||||
SreAtCode::LOC_NON_BOUNDARY => ctx.at_non_boundary(req, is_loc_word),
|
SreAtCode::LOC_NON_BOUNDARY => ctx.at_non_boundary(req, unicode_regex::is_locale_word),
|
||||||
SreAtCode::UNI_BOUNDARY => ctx.at_boundary(req, is_uni_word),
|
SreAtCode::UNI_BOUNDARY => ctx.at_boundary(req, unicode_regex::is_unicode_word),
|
||||||
SreAtCode::UNI_NON_BOUNDARY => ctx.at_non_boundary(req, is_uni_word),
|
SreAtCode::UNI_NON_BOUNDARY => ctx.at_non_boundary(req, unicode_regex::is_unicode_word),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn char_loc_ignore(code: u32, c: u32) -> bool {
|
fn char_loc_ignore(code: u32, c: u32) -> bool {
|
||||||
code == c || code == lower_locate(c) || code == upper_locate(c)
|
code == c || code == unicode_regex::lower_locale(c) || code == unicode_regex::upper_locale(c)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn charset_loc_ignore(set: &[u32], c: u32) -> bool {
|
fn charset_loc_ignore(set: &[u32], c: u32) -> bool {
|
||||||
let lo = lower_locate(c);
|
let lo = unicode_regex::lower_locale(c);
|
||||||
if charset(set, c) {
|
if charset(set, c) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
let up = upper_locate(c);
|
let up = unicode_regex::upper_locale(c);
|
||||||
up != lo && charset(set, up)
|
up != lo && charset(set, up)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn category(cat_code: SreCatCode, c: u32) -> bool {
|
fn category(cat_code: SreCatCode, c: u32) -> bool {
|
||||||
match cat_code {
|
match cat_code {
|
||||||
SreCatCode::DIGIT => is_digit(c),
|
SreCatCode::DIGIT => unicode_regex::is_digit(c),
|
||||||
SreCatCode::NOT_DIGIT => !is_digit(c),
|
SreCatCode::NOT_DIGIT => !unicode_regex::is_digit(c),
|
||||||
SreCatCode::SPACE => is_space(c),
|
SreCatCode::SPACE => unicode_regex::is_space(c),
|
||||||
SreCatCode::NOT_SPACE => !is_space(c),
|
SreCatCode::NOT_SPACE => !unicode_regex::is_space(c),
|
||||||
SreCatCode::WORD => is_word(c),
|
SreCatCode::WORD => unicode_regex::is_word(c),
|
||||||
SreCatCode::NOT_WORD => !is_word(c),
|
SreCatCode::NOT_WORD => !unicode_regex::is_word(c),
|
||||||
SreCatCode::LINEBREAK => is_linebreak(c),
|
SreCatCode::LINEBREAK => unicode_regex::is_linebreak(c),
|
||||||
SreCatCode::NOT_LINEBREAK => !is_linebreak(c),
|
SreCatCode::NOT_LINEBREAK => !unicode_regex::is_linebreak(c),
|
||||||
SreCatCode::LOC_WORD => is_loc_word(c),
|
SreCatCode::LOC_WORD => unicode_regex::is_locale_word(c),
|
||||||
SreCatCode::LOC_NOT_WORD => !is_loc_word(c),
|
SreCatCode::LOC_NOT_WORD => !unicode_regex::is_locale_word(c),
|
||||||
SreCatCode::UNI_DIGIT => is_uni_digit(c),
|
SreCatCode::UNI_DIGIT => unicode_regex::is_unicode_digit(c),
|
||||||
SreCatCode::UNI_NOT_DIGIT => !is_uni_digit(c),
|
SreCatCode::UNI_NOT_DIGIT => !unicode_regex::is_unicode_digit(c),
|
||||||
SreCatCode::UNI_SPACE => is_uni_space(c),
|
SreCatCode::UNI_SPACE => unicode_regex::is_unicode_space(c),
|
||||||
SreCatCode::UNI_NOT_SPACE => !is_uni_space(c),
|
SreCatCode::UNI_NOT_SPACE => !unicode_regex::is_unicode_space(c),
|
||||||
SreCatCode::UNI_WORD => is_uni_word(c),
|
SreCatCode::UNI_WORD => unicode_regex::is_unicode_word(c),
|
||||||
SreCatCode::UNI_NOT_WORD => !is_uni_word(c),
|
SreCatCode::UNI_NOT_WORD => !unicode_regex::is_unicode_word(c),
|
||||||
SreCatCode::UNI_LINEBREAK => is_uni_linebreak(c),
|
SreCatCode::UNI_LINEBREAK => unicode_regex::is_unicode_linebreak(c),
|
||||||
SreCatCode::UNI_NOT_LINEBREAK => !is_uni_linebreak(c),
|
SreCatCode::UNI_NOT_LINEBREAK => !unicode_regex::is_unicode_linebreak(c),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1320,7 +1324,7 @@ fn charset(set: &[u32], ch: u32) -> bool {
|
|||||||
if set[i + 1] <= ch && ch <= set[i + 2] {
|
if set[i + 1] <= ch && ch <= set[i + 2] {
|
||||||
return ok;
|
return ok;
|
||||||
}
|
}
|
||||||
let ch = upper_unicode(ch);
|
let ch = unicode_regex::upper_unicode(ch);
|
||||||
if set[i + 1] <= ch && ch <= set[i + 2] {
|
if set[i + 1] <= ch && ch <= set[i + 2] {
|
||||||
return ok;
|
return ok;
|
||||||
}
|
}
|
||||||
@@ -1368,10 +1372,14 @@ fn _count<S: StrDrive>(
|
|||||||
general_count_literal(req, ctx, end, |code, c| code != c);
|
general_count_literal(req, ctx, end, |code, c| code != c);
|
||||||
}
|
}
|
||||||
SreOpcode::LITERAL_IGNORE => {
|
SreOpcode::LITERAL_IGNORE => {
|
||||||
general_count_literal(req, ctx, end, |code, c| code == lower_ascii(c));
|
general_count_literal(req, ctx, end, |code, c| {
|
||||||
|
code == unicode_regex::lower_ascii(c)
|
||||||
|
});
|
||||||
}
|
}
|
||||||
SreOpcode::NOT_LITERAL_IGNORE => {
|
SreOpcode::NOT_LITERAL_IGNORE => {
|
||||||
general_count_literal(req, ctx, end, |code, c| code != lower_ascii(c));
|
general_count_literal(req, ctx, end, |code, c| {
|
||||||
|
code != unicode_regex::lower_ascii(c)
|
||||||
|
});
|
||||||
}
|
}
|
||||||
SreOpcode::LITERAL_LOC_IGNORE => {
|
SreOpcode::LITERAL_LOC_IGNORE => {
|
||||||
general_count_literal(req, ctx, end, char_loc_ignore);
|
general_count_literal(req, ctx, end, char_loc_ignore);
|
||||||
@@ -1380,10 +1388,14 @@ fn _count<S: StrDrive>(
|
|||||||
general_count_literal(req, ctx, end, |code, c| !char_loc_ignore(code, c));
|
general_count_literal(req, ctx, end, |code, c| !char_loc_ignore(code, c));
|
||||||
}
|
}
|
||||||
SreOpcode::LITERAL_UNI_IGNORE => {
|
SreOpcode::LITERAL_UNI_IGNORE => {
|
||||||
general_count_literal(req, ctx, end, |code, c| code == lower_unicode(c));
|
general_count_literal(req, ctx, end, |code, c| {
|
||||||
|
code == unicode_regex::lower_unicode(c)
|
||||||
|
});
|
||||||
}
|
}
|
||||||
SreOpcode::NOT_LITERAL_UNI_IGNORE => {
|
SreOpcode::NOT_LITERAL_UNI_IGNORE => {
|
||||||
general_count_literal(req, ctx, end, |code, c| code != lower_unicode(c));
|
general_count_literal(req, ctx, end, |code, c| {
|
||||||
|
code != unicode_regex::lower_unicode(c)
|
||||||
|
});
|
||||||
}
|
}
|
||||||
_ => {
|
_ => {
|
||||||
/* General case */
|
/* General case */
|
||||||
|
|||||||
@@ -331,136 +331,3 @@ const fn utf8_is_cont_byte(byte: u8) -> bool {
|
|||||||
|
|
||||||
/// Mask of the value bits of a continuation byte.
|
/// Mask of the value bits of a continuation byte.
|
||||||
const CONT_MASK: u8 = 0b0011_1111;
|
const CONT_MASK: u8 = 0b0011_1111;
|
||||||
|
|
||||||
const fn is_py_ascii_whitespace(b: u8) -> bool {
|
|
||||||
matches!(b, b'\t' | b'\n' | b'\x0C' | b'\r' | b' ' | b'\x0B')
|
|
||||||
}
|
|
||||||
|
|
||||||
#[inline]
|
|
||||||
pub(crate) fn is_word(ch: u32) -> bool {
|
|
||||||
ch == '_' as u32
|
|
||||||
|| u8::try_from(ch)
|
|
||||||
.map(|x| x.is_ascii_alphanumeric())
|
|
||||||
.unwrap_or(false)
|
|
||||||
}
|
|
||||||
#[inline]
|
|
||||||
pub(crate) fn is_space(ch: u32) -> bool {
|
|
||||||
u8::try_from(ch)
|
|
||||||
.map(is_py_ascii_whitespace)
|
|
||||||
.unwrap_or(false)
|
|
||||||
}
|
|
||||||
#[inline]
|
|
||||||
pub(crate) fn is_digit(ch: u32) -> bool {
|
|
||||||
u8::try_from(ch)
|
|
||||||
.map(|x| x.is_ascii_digit())
|
|
||||||
.unwrap_or(false)
|
|
||||||
}
|
|
||||||
#[inline]
|
|
||||||
pub(crate) fn is_loc_alnum(ch: u32) -> bool {
|
|
||||||
// FIXME: Ignore the locales
|
|
||||||
u8::try_from(ch)
|
|
||||||
.map(|x| x.is_ascii_alphanumeric())
|
|
||||||
.unwrap_or(false)
|
|
||||||
}
|
|
||||||
#[inline]
|
|
||||||
pub(crate) fn is_loc_word(ch: u32) -> bool {
|
|
||||||
ch == '_' as u32 || is_loc_alnum(ch)
|
|
||||||
}
|
|
||||||
#[inline]
|
|
||||||
pub(crate) const fn is_linebreak(ch: u32) -> bool {
|
|
||||||
ch == '\n' as u32
|
|
||||||
}
|
|
||||||
#[inline]
|
|
||||||
pub fn lower_ascii(ch: u32) -> u32 {
|
|
||||||
u8::try_from(ch)
|
|
||||||
.map(|x| x.to_ascii_lowercase() as u32)
|
|
||||||
.unwrap_or(ch)
|
|
||||||
}
|
|
||||||
#[inline]
|
|
||||||
pub(crate) fn lower_locate(ch: u32) -> u32 {
|
|
||||||
// FIXME: Ignore the locales
|
|
||||||
lower_ascii(ch)
|
|
||||||
}
|
|
||||||
#[inline]
|
|
||||||
pub(crate) fn upper_locate(ch: u32) -> u32 {
|
|
||||||
// FIXME: Ignore the locales
|
|
||||||
u8::try_from(ch)
|
|
||||||
.map(|x| x.to_ascii_uppercase() as u32)
|
|
||||||
.unwrap_or(ch)
|
|
||||||
}
|
|
||||||
#[inline]
|
|
||||||
pub(crate) fn is_uni_digit(ch: u32) -> bool {
|
|
||||||
// TODO: check with cpython
|
|
||||||
char::try_from(ch)
|
|
||||||
.map(|x| x.is_ascii_digit())
|
|
||||||
.unwrap_or(false)
|
|
||||||
}
|
|
||||||
#[inline]
|
|
||||||
pub(crate) fn is_uni_space(ch: u32) -> bool {
|
|
||||||
// TODO: check with cpython
|
|
||||||
is_space(ch)
|
|
||||||
|| matches!(
|
|
||||||
ch,
|
|
||||||
0x0009
|
|
||||||
| 0x000A
|
|
||||||
| 0x000B
|
|
||||||
| 0x000C
|
|
||||||
| 0x000D
|
|
||||||
| 0x001C
|
|
||||||
| 0x001D
|
|
||||||
| 0x001E
|
|
||||||
| 0x001F
|
|
||||||
| 0x0020
|
|
||||||
| 0x0085
|
|
||||||
| 0x00A0
|
|
||||||
| 0x1680
|
|
||||||
| 0x2000
|
|
||||||
| 0x2001
|
|
||||||
| 0x2002
|
|
||||||
| 0x2003
|
|
||||||
| 0x2004
|
|
||||||
| 0x2005
|
|
||||||
| 0x2006
|
|
||||||
| 0x2007
|
|
||||||
| 0x2008
|
|
||||||
| 0x2009
|
|
||||||
| 0x200A
|
|
||||||
| 0x2028
|
|
||||||
| 0x2029
|
|
||||||
| 0x202F
|
|
||||||
| 0x205F
|
|
||||||
| 0x3000
|
|
||||||
)
|
|
||||||
}
|
|
||||||
#[inline]
|
|
||||||
pub(crate) const fn is_uni_linebreak(ch: u32) -> bool {
|
|
||||||
matches!(
|
|
||||||
ch,
|
|
||||||
0x000A | 0x000B | 0x000C | 0x000D | 0x001C | 0x001D | 0x001E | 0x0085 | 0x2028 | 0x2029
|
|
||||||
)
|
|
||||||
}
|
|
||||||
#[inline]
|
|
||||||
pub(crate) fn is_uni_alnum(ch: u32) -> bool {
|
|
||||||
// TODO: check with cpython
|
|
||||||
char::try_from(ch)
|
|
||||||
.map(|x| x.is_alphanumeric())
|
|
||||||
.unwrap_or(false)
|
|
||||||
}
|
|
||||||
#[inline]
|
|
||||||
pub(crate) fn is_uni_word(ch: u32) -> bool {
|
|
||||||
ch == '_' as u32 || is_uni_alnum(ch)
|
|
||||||
}
|
|
||||||
#[inline]
|
|
||||||
pub fn lower_unicode(ch: u32) -> u32 {
|
|
||||||
// TODO: check with cpython
|
|
||||||
char::try_from(ch)
|
|
||||||
.map(|x| x.to_lowercase().next().unwrap() as u32)
|
|
||||||
.unwrap_or(ch)
|
|
||||||
}
|
|
||||||
#[inline]
|
|
||||||
pub fn upper_unicode(ch: u32) -> u32 {
|
|
||||||
// TODO: check with cpython
|
|
||||||
char::try_from(ch)
|
|
||||||
.map(|x| x.to_uppercase().next().unwrap() as u32)
|
|
||||||
.unwrap_or(ch)
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -28,6 +28,7 @@ flame-it = ["flame"]
|
|||||||
[dependencies]
|
[dependencies]
|
||||||
# rustpython crates
|
# rustpython crates
|
||||||
rustpython-derive = { workspace = true }
|
rustpython-derive = { workspace = true }
|
||||||
|
rustpython-unicode = { workspace = true, features = ["casefold"] }
|
||||||
rustpython-vm = { workspace = true, default-features = false, features = ["compiler"]}
|
rustpython-vm = { workspace = true, default-features = false, features = ["compiler"]}
|
||||||
rustpython-common = { workspace = true }
|
rustpython-common = { workspace = true }
|
||||||
|
|
||||||
@@ -76,13 +77,6 @@ pbkdf2 = { version = "0.12", features = ["hmac"] }
|
|||||||
constant_time_eq = { workspace = true }
|
constant_time_eq = { workspace = true }
|
||||||
|
|
||||||
## unicode stuff
|
## unicode stuff
|
||||||
unicode_names2 = { workspace = true }
|
|
||||||
# update version all at the same time
|
|
||||||
icu_properties = { workspace = true }
|
|
||||||
icu_normalizer = { workspace = true }
|
|
||||||
unic-ucd-age = { workspace = true }
|
|
||||||
ucd = "0.1.1"
|
|
||||||
|
|
||||||
# compression
|
# compression
|
||||||
adler32 = "1.2.0"
|
adler32 = "1.2.0"
|
||||||
crc32fast = "1.3.2"
|
crc32fast = "1.3.2"
|
||||||
|
|||||||
@@ -6,59 +6,30 @@
|
|||||||
|
|
||||||
pub(crate) use unicodedata::module_def;
|
pub(crate) use unicodedata::module_def;
|
||||||
|
|
||||||
use crate::vm::{
|
|
||||||
PyObject, PyResult, VirtualMachine, builtins::PyStr, convert::TryFromBorrowedObject,
|
|
||||||
};
|
|
||||||
|
|
||||||
enum NormalizeForm {
|
|
||||||
Nfc,
|
|
||||||
Nfkc,
|
|
||||||
Nfd,
|
|
||||||
Nfkd,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'a> TryFromBorrowedObject<'a> for NormalizeForm {
|
|
||||||
fn try_from_borrowed_object(vm: &VirtualMachine, obj: &'a PyObject) -> PyResult<Self> {
|
|
||||||
obj.try_value_with(
|
|
||||||
|form: &PyStr| match form.as_bytes() {
|
|
||||||
b"NFC" => Ok(Self::Nfc),
|
|
||||||
b"NFKC" => Ok(Self::Nfkc),
|
|
||||||
b"NFD" => Ok(Self::Nfd),
|
|
||||||
b"NFKD" => Ok(Self::Nfkd),
|
|
||||||
_ => Err(vm.new_value_error("invalid normalization form")),
|
|
||||||
},
|
|
||||||
vm,
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[pymodule]
|
#[pymodule]
|
||||||
mod unicodedata {
|
mod unicodedata {
|
||||||
use super::NormalizeForm::*;
|
|
||||||
use crate::vm::{
|
use crate::vm::{
|
||||||
Py, PyObjectRef, PyPayload, PyRef, PyResult, VirtualMachine,
|
Py, PyObjectRef, PyPayload, PyRef, PyResult, VirtualMachine,
|
||||||
builtins::{PyModule, PyStrRef},
|
builtins::{PyModule, PyStrRef},
|
||||||
function::OptionalArg,
|
function::OptionalArg,
|
||||||
};
|
};
|
||||||
|
|
||||||
use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed};
|
|
||||||
use icu_properties::{
|
|
||||||
CodePointSetData,
|
|
||||||
props::{
|
|
||||||
BidiClass, BidiMirrored, CanonicalCombiningClass, EastAsianWidth, EnumeratedProperty,
|
|
||||||
GeneralCategory, NamedEnumeratedProperty,
|
|
||||||
},
|
|
||||||
};
|
|
||||||
use itertools::Itertools;
|
use itertools::Itertools;
|
||||||
use rustpython_common::wtf8::{CodePoint, Wtf8Buf};
|
use rustpython_common::wtf8::{CodePoint, Wtf8Buf};
|
||||||
use ucd::{Codepoint, DecompositionType, Number, NumericType};
|
use rustpython_unicode::{NormalizeForm, UNICODE_VERSION, UnicodeVersion, data};
|
||||||
use unic_ucd_age::{Age, UNICODE_VERSION, UnicodeVersion};
|
|
||||||
|
fn parse_normalize_form(form: PyStrRef, vm: &VirtualMachine) -> PyResult<NormalizeForm> {
|
||||||
|
form.to_str()
|
||||||
|
.ok_or_else(|| vm.new_value_error("invalid normalization form"))?
|
||||||
|
.parse()
|
||||||
|
.map_err(|()| vm.new_value_error("invalid normalization form"))
|
||||||
|
}
|
||||||
|
|
||||||
pub(crate) fn module_exec(vm: &VirtualMachine, module: &Py<PyModule>) -> PyResult<()> {
|
pub(crate) fn module_exec(vm: &VirtualMachine, module: &Py<PyModule>) -> PyResult<()> {
|
||||||
__module_exec(vm, module);
|
__module_exec(vm, module);
|
||||||
|
|
||||||
// Add UCD methods as module-level functions
|
// Add UCD methods as module-level functions
|
||||||
let ucd: PyObjectRef = Ucd::new(UNICODE_VERSION).into_ref(&vm.ctx).into();
|
let ucd: PyObjectRef = PyUcd::new(data::Ucd::default()).into_ref(&vm.ctx).into();
|
||||||
|
|
||||||
for attr in [
|
for attr in [
|
||||||
"category",
|
"category",
|
||||||
@@ -84,56 +55,40 @@ mod unicodedata {
|
|||||||
#[pyattr]
|
#[pyattr]
|
||||||
#[pyclass(name = "UCD")]
|
#[pyclass(name = "UCD")]
|
||||||
#[derive(Debug, PyPayload)]
|
#[derive(Debug, PyPayload)]
|
||||||
pub(super) struct Ucd {
|
pub(super) struct PyUcd(data::Ucd);
|
||||||
unic_version: UnicodeVersion,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Ucd {
|
impl PyUcd {
|
||||||
pub const fn new(unic_version: UnicodeVersion) -> Self {
|
pub const fn new(ucd: data::Ucd) -> Self {
|
||||||
Self { unic_version }
|
Self(ucd)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn check_age(&self, c: CodePoint) -> bool {
|
fn extract_char(character: PyStrRef, vm: &VirtualMachine) -> PyResult<CodePoint> {
|
||||||
c.to_char()
|
character
|
||||||
.is_none_or(|c| Age::of(c).is_some_and(|age| age.actual() <= self.unic_version))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn extract_char(
|
|
||||||
&self,
|
|
||||||
character: PyStrRef,
|
|
||||||
vm: &VirtualMachine,
|
|
||||||
) -> PyResult<Option<CodePoint>> {
|
|
||||||
let c = character
|
|
||||||
.as_wtf8()
|
.as_wtf8()
|
||||||
.code_points()
|
.code_points()
|
||||||
.exactly_one()
|
.exactly_one()
|
||||||
.map_err(|_| vm.new_type_error("argument must be an unicode character, not str"))?;
|
.map_err(|_| vm.new_type_error("argument must be a Unicode character, not str"))
|
||||||
|
|
||||||
Ok(self.check_age(c).then_some(c))
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[pyclass(flags(DISALLOW_INSTANTIATION))]
|
#[pyclass(flags(DISALLOW_INSTANTIATION))]
|
||||||
impl Ucd {
|
impl PyUcd {
|
||||||
#[pymethod]
|
#[pymethod]
|
||||||
fn category(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<String> {
|
fn category(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<String> {
|
||||||
Ok(self
|
Ok(self
|
||||||
.extract_char(character, vm)?
|
.0
|
||||||
.map_or(GeneralCategory::Unassigned, |c| {
|
.category(Self::extract_char(character, vm)?.to_u32())
|
||||||
c.to_char()
|
|
||||||
.map_or(GeneralCategory::Surrogate, GeneralCategory::for_char)
|
|
||||||
})
|
|
||||||
.short_name()
|
|
||||||
.to_owned())
|
.to_owned())
|
||||||
}
|
}
|
||||||
|
|
||||||
#[pymethod]
|
#[pymethod]
|
||||||
fn lookup(&self, name: PyStrRef, vm: &VirtualMachine) -> PyResult<String> {
|
fn lookup(&self, name: PyStrRef, vm: &VirtualMachine) -> PyResult<String> {
|
||||||
if let Some(name_str) = name.to_str()
|
if let Some(name_str) = name.to_str()
|
||||||
&& let Some(character) = unicode_names2::character(name_str)
|
&& let Some(character) = self.0.lookup(name_str)
|
||||||
&& self.check_age(character.into())
|
|
||||||
{
|
{
|
||||||
return Ok(character.to_string());
|
return Ok(char::from_u32(character)
|
||||||
|
.expect("unicode_names2 only returns Unicode scalar values")
|
||||||
|
.to_string());
|
||||||
}
|
}
|
||||||
Err(vm.new_key_error(
|
Err(vm.new_key_error(
|
||||||
vm.ctx
|
vm.ctx
|
||||||
@@ -149,13 +104,8 @@ mod unicodedata {
|
|||||||
default: OptionalArg<PyObjectRef>,
|
default: OptionalArg<PyObjectRef>,
|
||||||
vm: &VirtualMachine,
|
vm: &VirtualMachine,
|
||||||
) -> PyResult {
|
) -> PyResult {
|
||||||
let c = self.extract_char(character, vm)?;
|
if let Some(name) = self.0.name(Self::extract_char(character, vm)?.to_u32()) {
|
||||||
|
return Ok(vm.ctx.new_str(name).into());
|
||||||
if let Some(c) = c
|
|
||||||
&& self.check_age(c)
|
|
||||||
&& let Some(name) = c.to_char().and_then(unicode_names2::name)
|
|
||||||
{
|
|
||||||
return Ok(vm.ctx.new_str(name.to_string()).into());
|
|
||||||
}
|
}
|
||||||
default.ok_or_else(|| vm.new_value_error("no such name"))
|
default.ok_or_else(|| vm.new_value_error("no such name"))
|
||||||
}
|
}
|
||||||
@@ -166,14 +116,9 @@ mod unicodedata {
|
|||||||
character: PyStrRef,
|
character: PyStrRef,
|
||||||
vm: &VirtualMachine,
|
vm: &VirtualMachine,
|
||||||
) -> PyResult<&'static str> {
|
) -> PyResult<&'static str> {
|
||||||
let bidi = match self.extract_char(character, vm)? {
|
Ok(self
|
||||||
Some(c) => c
|
.0
|
||||||
.to_char()
|
.bidirectional(Self::extract_char(character, vm)?.to_u32()))
|
||||||
.map_or(BidiClass::LeftToRight, BidiClass::for_char)
|
|
||||||
.short_name(),
|
|
||||||
None => "",
|
|
||||||
};
|
|
||||||
Ok(bidi)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// NOTE: This function uses 9.0.0 database instead of 3.2.0
|
/// NOTE: This function uses 9.0.0 database instead of 3.2.0
|
||||||
@@ -184,111 +129,51 @@ mod unicodedata {
|
|||||||
vm: &VirtualMachine,
|
vm: &VirtualMachine,
|
||||||
) -> PyResult<&'static str> {
|
) -> PyResult<&'static str> {
|
||||||
Ok(self
|
Ok(self
|
||||||
.extract_char(character, vm)?
|
.0
|
||||||
.and_then(|c| c.to_char())
|
.east_asian_width(Self::extract_char(character, vm)?.to_u32()))
|
||||||
.map_or(EastAsianWidth::Neutral, EastAsianWidth::for_char)
|
|
||||||
.short_name())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[pymethod]
|
#[pymethod]
|
||||||
fn normalize(&self, form: super::NormalizeForm, unistr: PyStrRef) -> PyResult<Wtf8Buf> {
|
fn normalize(
|
||||||
let text = unistr.as_wtf8();
|
&self,
|
||||||
let normalized_text = match form {
|
form: PyStrRef,
|
||||||
Nfc => {
|
unistr: PyStrRef,
|
||||||
let normalizer = ComposingNormalizerBorrowed::new_nfc();
|
vm: &VirtualMachine,
|
||||||
text.map_utf8(|s| normalizer.normalize_iter(s.chars()))
|
) -> PyResult<Wtf8Buf> {
|
||||||
.collect()
|
Ok(self
|
||||||
}
|
.0
|
||||||
Nfkc => {
|
.normalize(parse_normalize_form(form, vm)?, unistr.as_wtf8()))
|
||||||
let normalizer = ComposingNormalizerBorrowed::new_nfkc();
|
|
||||||
text.map_utf8(|s| normalizer.normalize_iter(s.chars()))
|
|
||||||
.collect()
|
|
||||||
}
|
|
||||||
Nfd => {
|
|
||||||
let normalizer = DecomposingNormalizerBorrowed::new_nfd();
|
|
||||||
text.map_utf8(|s| normalizer.normalize_iter(s.chars()))
|
|
||||||
.collect()
|
|
||||||
}
|
|
||||||
Nfkd => {
|
|
||||||
let normalizer = DecomposingNormalizerBorrowed::new_nfkd();
|
|
||||||
text.map_utf8(|s| normalizer.normalize_iter(s.chars()))
|
|
||||||
.collect()
|
|
||||||
}
|
|
||||||
};
|
|
||||||
Ok(normalized_text)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[pymethod]
|
#[pymethod]
|
||||||
fn is_normalized(&self, form: super::NormalizeForm, unistr: PyStrRef) -> PyResult<bool> {
|
fn is_normalized(
|
||||||
let text = unistr.as_wtf8();
|
&self,
|
||||||
let normalized: Wtf8Buf = match form {
|
form: PyStrRef,
|
||||||
Nfc => {
|
unistr: PyStrRef,
|
||||||
let normalizer = ComposingNormalizerBorrowed::new_nfc();
|
vm: &VirtualMachine,
|
||||||
text.map_utf8(|s| normalizer.normalize_iter(s.chars()))
|
) -> PyResult<bool> {
|
||||||
.collect()
|
Ok(self
|
||||||
}
|
.0
|
||||||
Nfkc => {
|
.is_normalized(parse_normalize_form(form, vm)?, unistr.as_wtf8()))
|
||||||
let normalizer = ComposingNormalizerBorrowed::new_nfkc();
|
|
||||||
text.map_utf8(|s| normalizer.normalize_iter(s.chars()))
|
|
||||||
.collect()
|
|
||||||
}
|
|
||||||
Nfd => {
|
|
||||||
let normalizer = DecomposingNormalizerBorrowed::new_nfd();
|
|
||||||
text.map_utf8(|s| normalizer.normalize_iter(s.chars()))
|
|
||||||
.collect()
|
|
||||||
}
|
|
||||||
Nfkd => {
|
|
||||||
let normalizer = DecomposingNormalizerBorrowed::new_nfkd();
|
|
||||||
text.map_utf8(|s| normalizer.normalize_iter(s.chars()))
|
|
||||||
.collect()
|
|
||||||
}
|
|
||||||
};
|
|
||||||
Ok(text == &*normalized)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[pymethod]
|
#[pymethod]
|
||||||
fn mirrored(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<i32> {
|
fn mirrored(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<i32> {
|
||||||
match self.extract_char(character, vm)? {
|
Ok(self.0.mirrored(Self::extract_char(character, vm)?.to_u32()) as i32)
|
||||||
Some(c) => {
|
|
||||||
if let Some(ch) = c.to_char() {
|
|
||||||
// Check if the character is mirrored in bidirectional text using Unicode standard
|
|
||||||
let bidi_mirrored = CodePointSetData::new::<BidiMirrored>();
|
|
||||||
Ok(if bidi_mirrored.contains(ch) { 1 } else { 0 })
|
|
||||||
} else {
|
|
||||||
Ok(0)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
None => Ok(0),
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[pymethod]
|
#[pymethod]
|
||||||
fn combining(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<u8> {
|
fn combining(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<u8> {
|
||||||
Ok(self
|
Ok(self
|
||||||
.extract_char(character, vm)?
|
.0
|
||||||
.and_then(|c| c.to_char())
|
.combining(Self::extract_char(character, vm)?.to_u32()))
|
||||||
.map_or(0, |ch| {
|
|
||||||
CanonicalCombiningClass::for_char(ch).to_icu4c_value()
|
|
||||||
}))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[pymethod]
|
#[pymethod]
|
||||||
fn decomposition(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<String> {
|
fn decomposition(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<String> {
|
||||||
let ch = match self.extract_char(character, vm)?.and_then(|c| c.to_char()) {
|
Ok(self
|
||||||
Some(ch) => ch,
|
.0
|
||||||
None => return Ok(String::new()),
|
.decomposition(Self::extract_char(character, vm)?.to_u32()))
|
||||||
};
|
|
||||||
let chars: Vec<char> = ch.decomposition_map().collect();
|
|
||||||
// If decomposition maps to just the character itself, there's no decomposition
|
|
||||||
if chars.len() == 1 && chars[0] == ch {
|
|
||||||
return Ok(String::new());
|
|
||||||
}
|
|
||||||
let hex_parts = chars.iter().map(|c| format!("{:04X}", *c as u32)).join(" ");
|
|
||||||
let tag = match ch.decomposition_type() {
|
|
||||||
Some(DecompositionType::Canonical) | None => return Ok(hex_parts),
|
|
||||||
Some(dt) => decomposition_type_tag(dt),
|
|
||||||
};
|
|
||||||
Ok(format!("<{tag}> {hex_parts}"))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[pymethod]
|
#[pymethod]
|
||||||
@@ -298,15 +183,8 @@ mod unicodedata {
|
|||||||
default: OptionalArg<PyObjectRef>,
|
default: OptionalArg<PyObjectRef>,
|
||||||
vm: &VirtualMachine,
|
vm: &VirtualMachine,
|
||||||
) -> PyResult {
|
) -> PyResult {
|
||||||
let ch = self.extract_char(character, vm)?.and_then(|c| c.to_char());
|
if let Some(value) = self.0.digit(Self::extract_char(character, vm)?.to_u32()) {
|
||||||
if let Some(ch) = ch
|
return Ok(vm.ctx.new_int(value).into());
|
||||||
&& matches!(
|
|
||||||
ch.numeric_type(),
|
|
||||||
Some(NumericType::Decimal) | Some(NumericType::Digit)
|
|
||||||
)
|
|
||||||
&& let Some(Number::Integer(n)) = ch.numeric_value()
|
|
||||||
{
|
|
||||||
return Ok(vm.ctx.new_int(n).into());
|
|
||||||
}
|
}
|
||||||
default.ok_or_else(|| vm.new_value_error("not a digit"))
|
default.ok_or_else(|| vm.new_value_error("not a digit"))
|
||||||
}
|
}
|
||||||
@@ -318,12 +196,8 @@ mod unicodedata {
|
|||||||
default: OptionalArg<PyObjectRef>,
|
default: OptionalArg<PyObjectRef>,
|
||||||
vm: &VirtualMachine,
|
vm: &VirtualMachine,
|
||||||
) -> PyResult {
|
) -> PyResult {
|
||||||
let ch = self.extract_char(character, vm)?.and_then(|c| c.to_char());
|
if let Some(value) = self.0.decimal(Self::extract_char(character, vm)?.to_u32()) {
|
||||||
if let Some(ch) = ch
|
return Ok(vm.ctx.new_int(value).into());
|
||||||
&& ch.numeric_type() == Some(NumericType::Decimal)
|
|
||||||
&& let Some(Number::Integer(n)) = ch.numeric_value()
|
|
||||||
{
|
|
||||||
return Ok(vm.ctx.new_int(n).into());
|
|
||||||
}
|
}
|
||||||
default.ok_or_else(|| vm.new_value_error("not a decimal"))
|
default.ok_or_else(|| vm.new_value_error("not a decimal"))
|
||||||
}
|
}
|
||||||
@@ -335,58 +209,29 @@ mod unicodedata {
|
|||||||
default: OptionalArg<PyObjectRef>,
|
default: OptionalArg<PyObjectRef>,
|
||||||
vm: &VirtualMachine,
|
vm: &VirtualMachine,
|
||||||
) -> PyResult {
|
) -> PyResult {
|
||||||
let ch = self.extract_char(character, vm)?.and_then(|c| c.to_char());
|
if let Some(value) = self.0.numeric(Self::extract_char(character, vm)?.to_u32()) {
|
||||||
if let Some(ch) = ch {
|
let value = match value {
|
||||||
match ch.numeric_value() {
|
data::NumericValue::Integer(n) => n as f64,
|
||||||
Some(Number::Integer(n)) => {
|
data::NumericValue::Rational(num, den) => num as f64 / den as f64,
|
||||||
return Ok(vm.ctx.new_float(n as f64).into());
|
};
|
||||||
}
|
return Ok(vm.ctx.new_float(value).into());
|
||||||
Some(Number::Rational(num, den)) => {
|
|
||||||
return Ok(vm.ctx.new_float(num as f64 / den as f64).into());
|
|
||||||
}
|
|
||||||
None => {}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
default.ok_or_else(|| vm.new_value_error("not a numeric character"))
|
default.ok_or_else(|| vm.new_value_error("not a numeric character"))
|
||||||
}
|
}
|
||||||
|
|
||||||
#[pygetset]
|
#[pygetset]
|
||||||
fn unidata_version(&self) -> String {
|
fn unidata_version(&self) -> String {
|
||||||
self.unic_version.to_string()
|
self.0.unicode_version().to_string()
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn decomposition_type_tag(dt: DecompositionType) -> &'static str {
|
|
||||||
match dt {
|
|
||||||
DecompositionType::Canonical => "canonical",
|
|
||||||
DecompositionType::Compat => "compat",
|
|
||||||
DecompositionType::Circle => "circle",
|
|
||||||
DecompositionType::Final => "final",
|
|
||||||
DecompositionType::Font => "font",
|
|
||||||
DecompositionType::Fraction => "fraction",
|
|
||||||
DecompositionType::Initial => "initial",
|
|
||||||
DecompositionType::Isolated => "isolated",
|
|
||||||
DecompositionType::Medial => "medial",
|
|
||||||
DecompositionType::Narrow => "narrow",
|
|
||||||
DecompositionType::Nobreak => "noBreak",
|
|
||||||
DecompositionType::Small => "small",
|
|
||||||
DecompositionType::Square => "square",
|
|
||||||
DecompositionType::Sub => "sub",
|
|
||||||
DecompositionType::Super => "super",
|
|
||||||
DecompositionType::Vertical => "vertical",
|
|
||||||
DecompositionType::Wide => "wide",
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[pyattr]
|
#[pyattr]
|
||||||
fn ucd_3_2_0(vm: &VirtualMachine) -> PyRef<Ucd> {
|
fn ucd_3_2_0(vm: &VirtualMachine) -> PyRef<PyUcd> {
|
||||||
Ucd {
|
PyUcd::new(data::Ucd::new(UnicodeVersion {
|
||||||
unic_version: UnicodeVersion {
|
major: 3,
|
||||||
major: 3,
|
minor: 2,
|
||||||
minor: 2,
|
micro: 0,
|
||||||
micro: 0,
|
}))
|
||||||
},
|
|
||||||
}
|
|
||||||
.into_ref(&vm.ctx)
|
.into_ref(&vm.ctx)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
29
crates/unicode/Cargo.toml
Normal file
29
crates/unicode/Cargo.toml
Normal file
@@ -0,0 +1,29 @@
|
|||||||
|
[package]
|
||||||
|
name = "rustpython-unicode"
|
||||||
|
description = "Shared Unicode semantics and data for RustPython and related Python tooling."
|
||||||
|
version.workspace = true
|
||||||
|
authors.workspace = true
|
||||||
|
edition.workspace = true
|
||||||
|
rust-version.workspace = true
|
||||||
|
repository.workspace = true
|
||||||
|
license.workspace = true
|
||||||
|
|
||||||
|
[features]
|
||||||
|
default = ["std", "casefold"]
|
||||||
|
std = []
|
||||||
|
casefold = ["std", "dep:caseless"]
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
rustpython-wtf8 = { workspace = true }
|
||||||
|
|
||||||
|
icu_normalizer = { workspace = true }
|
||||||
|
icu_properties = { workspace = true }
|
||||||
|
itertools = { workspace = true }
|
||||||
|
unicode-casing = { workspace = true }
|
||||||
|
unicode_names2 = { version = "2.0.0", default-features = false, features = ["no_std"] }
|
||||||
|
unic-ucd-age = { workspace = true }
|
||||||
|
ucd = "0.1.1"
|
||||||
|
caseless = { version = "0.2.2", optional = true }
|
||||||
|
|
||||||
|
[lints]
|
||||||
|
workspace = true
|
||||||
111
crates/unicode/src/case.rs
Normal file
111
crates/unicode/src/case.rs
Normal file
@@ -0,0 +1,111 @@
|
|||||||
|
#[cfg(feature = "casefold")]
|
||||||
|
use alloc::string::String;
|
||||||
|
|
||||||
|
#[cfg(feature = "casefold")]
|
||||||
|
use rustpython_wtf8::Wtf8Chunk;
|
||||||
|
use rustpython_wtf8::{Wtf8, Wtf8Buf};
|
||||||
|
use unicode_casing::CharExt;
|
||||||
|
|
||||||
|
use crate::char_from_codepoint;
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||||
|
pub struct CaseMapping {
|
||||||
|
len: u8,
|
||||||
|
codepoints: [u32; 3],
|
||||||
|
}
|
||||||
|
|
||||||
|
impl CaseMapping {
|
||||||
|
pub const fn identity(cp: u32) -> Self {
|
||||||
|
Self {
|
||||||
|
len: 1,
|
||||||
|
codepoints: [cp, 0, 0],
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub const fn first(self) -> Option<u32> {
|
||||||
|
if self.len == 0 {
|
||||||
|
None
|
||||||
|
} else {
|
||||||
|
Some(self.codepoints[0])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn iter(self) -> impl Iterator<Item = u32> {
|
||||||
|
self.codepoints.into_iter().take(usize::from(self.len))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn mapping_from_chars(chars: impl Iterator<Item = char>) -> CaseMapping {
|
||||||
|
let mut codepoints = [0; 3];
|
||||||
|
let mut len = 0;
|
||||||
|
for ch in chars.take(codepoints.len()) {
|
||||||
|
codepoints[len] = ch as u32;
|
||||||
|
len += 1;
|
||||||
|
}
|
||||||
|
CaseMapping {
|
||||||
|
len: len as u8,
|
||||||
|
codepoints,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(feature = "casefold")]
|
||||||
|
fn mapping_from_string(text: String) -> CaseMapping {
|
||||||
|
mapping_from_chars(text.chars())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn to_lowercase(cp: u32) -> CaseMapping {
|
||||||
|
char_from_codepoint(cp).map_or_else(
|
||||||
|
|| CaseMapping::identity(cp),
|
||||||
|
|ch| mapping_from_chars(ch.to_lowercase()),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn to_uppercase(cp: u32) -> CaseMapping {
|
||||||
|
char_from_codepoint(cp).map_or_else(
|
||||||
|
|| CaseMapping::identity(cp),
|
||||||
|
|ch| mapping_from_chars(ch.to_uppercase()),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn to_titlecase(cp: u32) -> CaseMapping {
|
||||||
|
char_from_codepoint(cp).map_or_else(
|
||||||
|
|| CaseMapping::identity(cp),
|
||||||
|
|ch| mapping_from_chars(ch.to_titlecase()),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn to_lowercase_wtf8(text: &Wtf8) -> Wtf8Buf {
|
||||||
|
text.map_utf8(|s| s.chars().flat_map(char::to_lowercase))
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn to_uppercase_wtf8(text: &Wtf8) -> Wtf8Buf {
|
||||||
|
text.map_utf8(|s| s.chars().flat_map(char::to_uppercase))
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(feature = "casefold")]
|
||||||
|
pub fn casefold(cp: u32) -> CaseMapping {
|
||||||
|
char_from_codepoint(cp).map_or_else(
|
||||||
|
|| CaseMapping::identity(cp),
|
||||||
|
|ch| {
|
||||||
|
let mut buf = [0; 4];
|
||||||
|
mapping_from_string(caseless::default_case_fold_str(ch.encode_utf8(&mut buf)))
|
||||||
|
},
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(feature = "casefold")]
|
||||||
|
pub fn casefold_str(text: &str) -> String {
|
||||||
|
caseless::default_case_fold_str(text)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(feature = "casefold")]
|
||||||
|
pub fn casefold_wtf8(text: &Wtf8) -> Wtf8Buf {
|
||||||
|
text.chunks()
|
||||||
|
.map(|chunk| match chunk {
|
||||||
|
Wtf8Chunk::Utf8(s) => Wtf8Buf::from_string(casefold_str(s)),
|
||||||
|
Wtf8Chunk::Surrogate(c) => Wtf8Buf::from(c),
|
||||||
|
})
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
67
crates/unicode/src/classify.rs
Normal file
67
crates/unicode/src/classify.rs
Normal file
@@ -0,0 +1,67 @@
|
|||||||
|
use icu_properties::props::{BidiClass, EnumeratedProperty, GeneralCategory};
|
||||||
|
use ucd::{Codepoint, NumericType};
|
||||||
|
|
||||||
|
use crate::{char_from_codepoint, is_surrogate};
|
||||||
|
|
||||||
|
pub fn general_category(cp: u32) -> GeneralCategory {
|
||||||
|
if is_surrogate(cp) {
|
||||||
|
GeneralCategory::Surrogate
|
||||||
|
} else {
|
||||||
|
char_from_codepoint(cp).map_or(GeneralCategory::Unassigned, GeneralCategory::for_char)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn is_alpha(cp: u32) -> bool {
|
||||||
|
char_from_codepoint(cp).is_some_and(char::is_alphabetic)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn is_alnum(cp: u32) -> bool {
|
||||||
|
char_from_codepoint(cp).is_some_and(char::is_alphanumeric)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn is_decimal(cp: u32) -> bool {
|
||||||
|
matches!(general_category(cp), GeneralCategory::DecimalNumber)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn is_digit(cp: u32) -> bool {
|
||||||
|
char_from_codepoint(cp).is_some_and(|ch| {
|
||||||
|
matches!(
|
||||||
|
ch.numeric_type(),
|
||||||
|
Some(NumericType::Decimal) | Some(NumericType::Digit)
|
||||||
|
)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn is_numeric(cp: u32) -> bool {
|
||||||
|
char_from_codepoint(cp).is_some_and(|ch| ch.numeric_value().is_some())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn is_space(cp: u32) -> bool {
|
||||||
|
char_from_codepoint(cp).is_some_and(|ch| {
|
||||||
|
matches!(general_category(cp), GeneralCategory::SpaceSeparator)
|
||||||
|
|| matches!(
|
||||||
|
BidiClass::for_char(ch),
|
||||||
|
BidiClass::WhiteSpace | BidiClass::ParagraphSeparator | BidiClass::SegmentSeparator
|
||||||
|
)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Python's `str.isprintable()` semantics, which treat ASCII space as printable.
|
||||||
|
pub fn is_printable(cp: u32) -> bool {
|
||||||
|
cp == '\u{0020}' as u32 || is_repr_printable(cp)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Repr/escape printable semantics, which exclude all Unicode space separators.
|
||||||
|
pub fn is_repr_printable(cp: u32) -> bool {
|
||||||
|
!matches!(
|
||||||
|
general_category(cp),
|
||||||
|
GeneralCategory::SpaceSeparator
|
||||||
|
| GeneralCategory::LineSeparator
|
||||||
|
| GeneralCategory::ParagraphSeparator
|
||||||
|
| GeneralCategory::Control
|
||||||
|
| GeneralCategory::Format
|
||||||
|
| GeneralCategory::Surrogate
|
||||||
|
| GeneralCategory::PrivateUse
|
||||||
|
| GeneralCategory::Unassigned
|
||||||
|
)
|
||||||
|
}
|
||||||
230
crates/unicode/src/data.rs
Normal file
230
crates/unicode/src/data.rs
Normal file
@@ -0,0 +1,230 @@
|
|||||||
|
use alloc::{format, string::String, vec::Vec};
|
||||||
|
|
||||||
|
use icu_properties::{
|
||||||
|
CodePointSetData,
|
||||||
|
props::{
|
||||||
|
BidiClass, BidiMirrored, CanonicalCombiningClass, EastAsianWidth, EnumeratedProperty,
|
||||||
|
NamedEnumeratedProperty,
|
||||||
|
},
|
||||||
|
};
|
||||||
|
use itertools::Itertools;
|
||||||
|
use ucd::{Codepoint, DecompositionType, Number, NumericType};
|
||||||
|
use unic_ucd_age::{Age, UNICODE_VERSION, UnicodeVersion};
|
||||||
|
|
||||||
|
use crate::{char_from_codepoint, classify, is_surrogate};
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Copy, PartialEq)]
|
||||||
|
pub enum NumericValue {
|
||||||
|
Integer(i64),
|
||||||
|
Rational(i64, i64),
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||||
|
pub struct Ucd {
|
||||||
|
unic_version: UnicodeVersion,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for Ucd {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self::new(UNICODE_VERSION)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Ucd {
|
||||||
|
pub const fn new(unic_version: UnicodeVersion) -> Self {
|
||||||
|
Self { unic_version }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub const fn unicode_version(&self) -> UnicodeVersion {
|
||||||
|
self.unic_version
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn category(&self, cp: u32) -> &'static str {
|
||||||
|
if self.contains(cp) {
|
||||||
|
category(cp)
|
||||||
|
} else {
|
||||||
|
"Cn"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn lookup(&self, name: &str) -> Option<u32> {
|
||||||
|
let cp = lookup(name)?;
|
||||||
|
self.contains(cp).then_some(cp)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn name(&self, cp: u32) -> Option<String> {
|
||||||
|
self.contains(cp).then(|| name(cp)).flatten()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn bidirectional(&self, cp: u32) -> &'static str {
|
||||||
|
if self.contains(cp) {
|
||||||
|
bidirectional(cp)
|
||||||
|
} else {
|
||||||
|
""
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn east_asian_width(&self, cp: u32) -> &'static str {
|
||||||
|
if self.contains(cp) {
|
||||||
|
east_asian_width(cp)
|
||||||
|
} else {
|
||||||
|
"N"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn normalize(
|
||||||
|
&self,
|
||||||
|
form: crate::NormalizeForm,
|
||||||
|
text: &rustpython_wtf8::Wtf8,
|
||||||
|
) -> rustpython_wtf8::Wtf8Buf {
|
||||||
|
crate::normalize::normalize(form, text)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn is_normalized(&self, form: crate::NormalizeForm, text: &rustpython_wtf8::Wtf8) -> bool {
|
||||||
|
crate::normalize::is_normalized(form, text)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn mirrored(&self, cp: u32) -> bool {
|
||||||
|
self.contains(cp) && mirrored(cp)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn combining(&self, cp: u32) -> u8 {
|
||||||
|
if self.contains(cp) { combining(cp) } else { 0 }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn decomposition(&self, cp: u32) -> String {
|
||||||
|
if self.contains(cp) {
|
||||||
|
decomposition(cp)
|
||||||
|
} else {
|
||||||
|
String::new()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn digit(&self, cp: u32) -> Option<u32> {
|
||||||
|
self.contains(cp).then(|| digit(cp)).flatten()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn decimal(&self, cp: u32) -> Option<u32> {
|
||||||
|
self.contains(cp).then(|| decimal(cp)).flatten()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn numeric(&self, cp: u32) -> Option<NumericValue> {
|
||||||
|
self.contains(cp).then(|| numeric(cp)).flatten()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn contains(&self, cp: u32) -> bool {
|
||||||
|
is_assigned_in_version(cp, self.unic_version)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn is_assigned_in_version(cp: u32, version: UnicodeVersion) -> bool {
|
||||||
|
if is_surrogate(cp) {
|
||||||
|
true
|
||||||
|
} else {
|
||||||
|
char_from_codepoint(cp)
|
||||||
|
.is_some_and(|ch| Age::of(ch).is_some_and(|age| age.actual() <= version))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn category(cp: u32) -> &'static str {
|
||||||
|
classify::general_category(cp).short_name()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn lookup(name: &str) -> Option<u32> {
|
||||||
|
unicode_names2::character(name).map(u32::from)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn name(cp: u32) -> Option<String> {
|
||||||
|
char_from_codepoint(cp)
|
||||||
|
.and_then(unicode_names2::name)
|
||||||
|
.map(|name| name.collect())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn bidirectional(cp: u32) -> &'static str {
|
||||||
|
char_from_codepoint(cp)
|
||||||
|
.map_or(BidiClass::LeftToRight, BidiClass::for_char)
|
||||||
|
.short_name()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn east_asian_width(cp: u32) -> &'static str {
|
||||||
|
char_from_codepoint(cp)
|
||||||
|
.map_or(EastAsianWidth::Neutral, EastAsianWidth::for_char)
|
||||||
|
.short_name()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn mirrored(cp: u32) -> bool {
|
||||||
|
char_from_codepoint(cp).is_some_and(|ch| CodePointSetData::new::<BidiMirrored>().contains(ch))
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn combining(cp: u32) -> u8 {
|
||||||
|
char_from_codepoint(cp).map_or(0, |ch| {
|
||||||
|
CanonicalCombiningClass::for_char(ch).to_icu4c_value()
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn decomposition(cp: u32) -> String {
|
||||||
|
let ch = match char_from_codepoint(cp) {
|
||||||
|
Some(ch) => ch,
|
||||||
|
None => return String::new(),
|
||||||
|
};
|
||||||
|
let chars: Vec<char> = ch.decomposition_map().collect();
|
||||||
|
if chars.len() == 1 && chars[0] == ch {
|
||||||
|
return String::new();
|
||||||
|
}
|
||||||
|
let hex_parts = chars.iter().map(|c| format!("{:04X}", *c as u32)).join(" ");
|
||||||
|
match ch.decomposition_type() {
|
||||||
|
Some(DecompositionType::Canonical) | None => hex_parts,
|
||||||
|
Some(dt) => format!("<{}> {hex_parts}", decomposition_type_tag(dt)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn digit(cp: u32) -> Option<u32> {
|
||||||
|
let ch = char_from_codepoint(cp)?;
|
||||||
|
if matches!(
|
||||||
|
ch.numeric_type(),
|
||||||
|
Some(NumericType::Decimal) | Some(NumericType::Digit)
|
||||||
|
) && let Some(Number::Integer(value)) = ch.numeric_value()
|
||||||
|
{
|
||||||
|
return u32::try_from(value).ok();
|
||||||
|
}
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn decimal(cp: u32) -> Option<u32> {
|
||||||
|
let ch = char_from_codepoint(cp)?;
|
||||||
|
if ch.numeric_type() == Some(NumericType::Decimal)
|
||||||
|
&& let Some(Number::Integer(value)) = ch.numeric_value()
|
||||||
|
{
|
||||||
|
return u32::try_from(value).ok();
|
||||||
|
}
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn numeric(cp: u32) -> Option<NumericValue> {
|
||||||
|
match char_from_codepoint(cp)?.numeric_value()? {
|
||||||
|
Number::Integer(value) => Some(NumericValue::Integer(value)),
|
||||||
|
Number::Rational(num, den) => Some(NumericValue::Rational(num.into(), den.into())),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn decomposition_type_tag(dt: DecompositionType) -> &'static str {
|
||||||
|
match dt {
|
||||||
|
DecompositionType::Canonical => "canonical",
|
||||||
|
DecompositionType::Compat => "compat",
|
||||||
|
DecompositionType::Circle => "circle",
|
||||||
|
DecompositionType::Final => "final",
|
||||||
|
DecompositionType::Font => "font",
|
||||||
|
DecompositionType::Fraction => "fraction",
|
||||||
|
DecompositionType::Initial => "initial",
|
||||||
|
DecompositionType::Isolated => "isolated",
|
||||||
|
DecompositionType::Medial => "medial",
|
||||||
|
DecompositionType::Narrow => "narrow",
|
||||||
|
DecompositionType::Nobreak => "noBreak",
|
||||||
|
DecompositionType::Small => "small",
|
||||||
|
DecompositionType::Square => "square",
|
||||||
|
DecompositionType::Sub => "sub",
|
||||||
|
DecompositionType::Super => "super",
|
||||||
|
DecompositionType::Vertical => "vertical",
|
||||||
|
DecompositionType::Wide => "wide",
|
||||||
|
}
|
||||||
|
}
|
||||||
27
crates/unicode/src/identifier.rs
Normal file
27
crates/unicode/src/identifier.rs
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
use icu_properties::props::{BinaryProperty, XidContinue, XidStart};
|
||||||
|
|
||||||
|
use crate::char_from_codepoint;
|
||||||
|
|
||||||
|
pub fn is_xid_start(cp: u32) -> bool {
|
||||||
|
char_from_codepoint(cp).is_some_and(XidStart::for_char)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn is_xid_continue(cp: u32) -> bool {
|
||||||
|
char_from_codepoint(cp).is_some_and(XidContinue::for_char)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn is_python_identifier_start(cp: u32) -> bool {
|
||||||
|
cp == '_' as u32 || is_xid_start(cp)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns `true` when `cp` may appear after the first character of a Python
/// identifier.
///
/// This is a direct delegation to [`is_xid_continue`].
pub fn is_python_identifier_continue(cp: u32) -> bool {
    is_xid_continue(cp)
}
|
||||||
|
|
||||||
|
pub fn is_python_identifier(text: &str) -> bool {
|
||||||
|
let mut chars = text.chars();
|
||||||
|
let is_identifier_start = chars
|
||||||
|
.next()
|
||||||
|
.is_some_and(|ch| is_python_identifier_start(ch as u32));
|
||||||
|
is_identifier_start && chars.all(|ch| is_python_identifier_continue(ch as u32))
|
||||||
|
}
|
||||||
77
crates/unicode/src/lib.rs
Normal file
77
crates/unicode/src/lib.rs
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
#![cfg_attr(not(feature = "std"), no_std)]
|
||||||
|
|
||||||
|
extern crate alloc;
|
||||||
|
|
||||||
|
pub mod case;
|
||||||
|
pub mod classify;
|
||||||
|
pub mod data;
|
||||||
|
pub mod identifier;
|
||||||
|
pub mod normalize;
|
||||||
|
pub mod regex;
|
||||||
|
|
||||||
|
pub use normalize::NormalizeForm;
|
||||||
|
pub use unic_ucd_age::{UNICODE_VERSION, UnicodeVersion};
|
||||||
|
|
||||||
|
use core::char;
|
||||||
|
|
||||||
|
/// Converts a raw code point into a `char`.
///
/// Returns `None` when `cp` is not a valid Unicode scalar value, i.e. whenever
/// `char::from_u32` rejects it.
pub(crate) fn char_from_codepoint(cp: u32) -> Option<char> {
    char::from_u32(cp)
}
|
||||||
|
|
||||||
|
/// Returns `true` when `cp` lies in the surrogate range `0xD800..=0xDFFF`.
pub(crate) const fn is_surrogate(cp: u32) -> bool {
    // Explicit bound checks keep this usable in const contexts.
    0xD800 <= cp && cp <= 0xDFFF
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    // Smoke tests that exercise a few representative queries from each
    // public module of this crate.
    use alloc::vec::Vec;
    use rustpython_wtf8::Wtf8Buf;

    use crate::{NormalizeForm, case, classify, data, identifier, normalize, regex};

    // A plain space counts as printable but not as repr-printable, while a
    // newline is not printable at all.
    #[test]
    fn printable_and_repr_printable_follow_python_rules() {
        assert!(classify::is_printable(' ' as u32));
        assert!(!classify::is_repr_printable(' ' as u32));
        assert!(!classify::is_printable('\n' as u32));
    }

    // Identifier and regex predicates must accept non-ASCII letters, digits
    // and whitespace (Hangul, Devanagari digit five, ideographic space).
    #[test]
    fn identifier_and_regex_predicates_share_unicode_tables() {
        assert!(identifier::is_python_identifier_start('_' as u32));
        assert!(identifier::is_python_identifier("유니코드"));
        assert!(regex::is_unicode_word('가' as u32));
        assert!(regex::is_unicode_digit('५' as u32));
        assert!(regex::is_unicode_space('\u{3000}' as u32));
    }

    // Uppercasing `ß` expands to two code points, and NFC composes
    // `e` + combining acute into a single precomposed character.
    #[test]
    fn case_and_normalization_helpers_support_full_mappings() {
        let upper: Vec<_> = case::to_uppercase('ß' as u32).iter().collect();
        assert_eq!(upper, vec!['S' as u32, 'S' as u32]);

        let text = Wtf8Buf::from("e\u{301}");
        assert_eq!(
            normalize::normalize(NormalizeForm::Nfc, &text),
            Wtf8Buf::from("é")
        );
        // Text that was just normalized to NFD must report as NFD-normalized.
        assert!(normalize::is_normalized(
            NormalizeForm::Nfd,
            &normalize::normalize(NormalizeForm::Nfd, &Wtf8Buf::from("é"))
        ));
    }

    // Category, name lookup and numeric queries, including a lone surrogate
    // code point (0xD800) which reports category "Cs".
    #[test]
    fn unicode_data_queries_match_existing_unicodedata_behavior() {
        assert_eq!(data::category('A' as u32), "Lu");
        assert_eq!(data::category(0xD800), "Cs");
        assert_eq!(data::lookup("SNOWMAN"), Some('☃' as u32));
        assert_eq!(data::name('☃' as u32).as_deref(), Some("SNOWMAN"));
        assert_eq!(data::decimal('५' as u32), Some(5));
        assert_eq!(data::digit('²' as u32), Some(2));
        assert_eq!(
            data::numeric('⅓' as u32),
            Some(data::NumericValue::Rational(1, 3))
        );
    }
}
|
||||||
55
crates/unicode/src/normalize.rs
Normal file
55
crates/unicode/src/normalize.rs
Normal file
@@ -0,0 +1,55 @@
|
|||||||
|
use core::str::FromStr;
|
||||||
|
use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed};
|
||||||
|
use rustpython_wtf8::{Wtf8, Wtf8Buf};
|
||||||
|
|
||||||
|
/// The Unicode normalization form to apply.
///
/// Parsed from the upper-case names `"NFC"`, `"NFKC"`, `"NFD"` and `"NFKD"`
/// through the `FromStr` implementation.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum NormalizeForm {
    /// Form selected by the name `"NFC"`.
    Nfc,
    /// Form selected by the name `"NFKC"`.
    Nfkc,
    /// Form selected by the name `"NFD"`.
    Nfd,
    /// Form selected by the name `"NFKD"`.
    Nfkd,
}
|
||||||
|
|
||||||
|
impl FromStr for NormalizeForm {
|
||||||
|
type Err = ();
|
||||||
|
|
||||||
|
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||||
|
match s {
|
||||||
|
"NFC" => Ok(Self::Nfc),
|
||||||
|
"NFKC" => Ok(Self::Nfkc),
|
||||||
|
"NFD" => Ok(Self::Nfd),
|
||||||
|
"NFKD" => Ok(Self::Nfkd),
|
||||||
|
_ => Err(()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn normalize(form: NormalizeForm, text: &Wtf8) -> Wtf8Buf {
|
||||||
|
match form {
|
||||||
|
NormalizeForm::Nfc => {
|
||||||
|
let normalizer = ComposingNormalizerBorrowed::new_nfc();
|
||||||
|
text.map_utf8(|s| normalizer.normalize_iter(s.chars()))
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
NormalizeForm::Nfkc => {
|
||||||
|
let normalizer = ComposingNormalizerBorrowed::new_nfkc();
|
||||||
|
text.map_utf8(|s| normalizer.normalize_iter(s.chars()))
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
NormalizeForm::Nfd => {
|
||||||
|
let normalizer = DecomposingNormalizerBorrowed::new_nfd();
|
||||||
|
text.map_utf8(|s| normalizer.normalize_iter(s.chars()))
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
NormalizeForm::Nfkd => {
|
||||||
|
let normalizer = DecomposingNormalizerBorrowed::new_nfkd();
|
||||||
|
text.map_utf8(|s| normalizer.normalize_iter(s.chars()))
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn is_normalized(form: NormalizeForm, text: &Wtf8) -> bool {
|
||||||
|
let normalized = normalize(form, text);
|
||||||
|
text == &*normalized
|
||||||
|
}
|
||||||
87
crates/unicode/src/regex.rs
Normal file
87
crates/unicode/src/regex.rs
Normal file
@@ -0,0 +1,87 @@
|
|||||||
|
use crate::{case, classify};
|
||||||
|
|
||||||
|
/// Code point of `_`, which the word predicates in this module accept in
/// addition to alphanumerics.
const UNDERSCORE: u32 = '_' as u32;
|
||||||
|
|
||||||
|
/// Returns `true` for the six ASCII whitespace bytes: tab, line feed,
/// vertical tab, form feed, carriage return and space.
const fn is_py_ascii_whitespace(byte: u8) -> bool {
    match byte {
        // 0x09..=0x0D covers \t, \n, \x0B, \x0C and \r.
        0x09..=0x0D | b' ' => true,
        _ => false,
    }
}
|
||||||
|
|
||||||
|
pub fn is_word(cp: u32) -> bool {
|
||||||
|
cp == UNDERSCORE
|
||||||
|
|| u8::try_from(cp)
|
||||||
|
.map(|byte| byte.is_ascii_alphanumeric())
|
||||||
|
.unwrap_or(false)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn is_space(cp: u32) -> bool {
|
||||||
|
u8::try_from(cp)
|
||||||
|
.map(is_py_ascii_whitespace)
|
||||||
|
.unwrap_or(false)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns `true` when `cp` is one of the ASCII digits `0` through `9`.
pub fn is_digit(cp: u32) -> bool {
    matches!(u8::try_from(cp), Ok(byte) if byte.is_ascii_digit())
}
|
||||||
|
|
||||||
|
/// Returns `true` when `cp` fits in a byte and that byte is an ASCII letter
/// or digit.
pub fn is_locale_alnum(cp: u32) -> bool {
    let Ok(byte) = u8::try_from(cp) else {
        return false;
    };
    byte.is_ascii_alphanumeric()
}
|
||||||
|
|
||||||
|
pub fn is_locale_word(cp: u32) -> bool {
|
||||||
|
cp == UNDERSCORE || is_locale_alnum(cp)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns `true` only for the line feed character (`\n`).
pub const fn is_linebreak(cp: u32) -> bool {
    const LINE_FEED: u32 = '\n' as u32;
    cp == LINE_FEED
}
|
||||||
|
|
||||||
|
/// Lowercases `cp` using ASCII rules only.
///
/// Code points that do not fit in a byte, and bytes that are not ASCII
/// uppercase letters, are returned unchanged.
pub fn lower_ascii(cp: u32) -> u32 {
    match u8::try_from(cp) {
        Ok(byte) => u32::from(byte.to_ascii_lowercase()),
        Err(_) => cp,
    }
}
|
||||||
|
|
||||||
|
/// Lowercases `cp` for locale-flagged matching.
///
/// Currently identical to [`lower_ascii`]: it simply forwards to it.
pub fn lower_locale(cp: u32) -> u32 {
    lower_ascii(cp)
}
|
||||||
|
|
||||||
|
/// Uppercases `cp` using ASCII rules only.
///
/// Code points that do not fit in a byte, and bytes that are not ASCII
/// lowercase letters, are returned unchanged.
pub fn upper_locale(cp: u32) -> u32 {
    u8::try_from(cp).map_or(cp, |byte| u32::from(byte.to_ascii_uppercase()))
}
|
||||||
|
|
||||||
|
/// Unicode-aware digit test; forwards to `classify::is_decimal`.
pub fn is_unicode_digit(cp: u32) -> bool {
    classify::is_decimal(cp)
}
|
||||||
|
|
||||||
|
/// Unicode-aware whitespace test; forwards to `classify::is_space`.
pub fn is_unicode_space(cp: u32) -> bool {
    classify::is_space(cp)
}
|
||||||
|
|
||||||
|
/// Returns `true` when `cp` is one of the ten code points treated as a line
/// break: U+000A..=U+000D, U+001C..=U+001E, U+0085, U+2028 and U+2029.
pub const fn is_unicode_linebreak(cp: u32) -> bool {
    match cp {
        0x000A..=0x000D => true,
        0x001C..=0x001E => true,
        0x0085 | 0x2028 | 0x2029 => true,
        _ => false,
    }
}
|
||||||
|
|
||||||
|
/// Unicode-aware alphanumeric test; forwards to `classify::is_alnum`.
pub fn is_unicode_alnum(cp: u32) -> bool {
    classify::is_alnum(cp)
}
|
||||||
|
|
||||||
|
pub fn is_unicode_word(cp: u32) -> bool {
|
||||||
|
cp == UNDERSCORE || is_unicode_alnum(cp)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn lower_unicode(cp: u32) -> u32 {
|
||||||
|
case::to_lowercase(cp).first().unwrap_or(cp)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn upper_unicode(cp: u32) -> u32 {
|
||||||
|
case::to_uppercase(cp).first().unwrap_or(cp)
|
||||||
|
}
|
||||||
@@ -41,6 +41,7 @@ ruff_text_size = { workspace = true, optional = true }
|
|||||||
rustpython-compiler-core = { workspace = true }
|
rustpython-compiler-core = { workspace = true }
|
||||||
rustpython-literal = { workspace = true }
|
rustpython-literal = { workspace = true }
|
||||||
rustpython-sre_engine = { workspace = true }
|
rustpython-sre_engine = { workspace = true }
|
||||||
|
rustpython-unicode = { workspace = true, features = ["casefold"] }
|
||||||
|
|
||||||
ascii = { workspace = true }
|
ascii = { workspace = true }
|
||||||
ahash = { workspace = true }
|
ahash = { workspace = true }
|
||||||
@@ -74,7 +75,6 @@ strum_macros = { workspace = true }
|
|||||||
thiserror = { workspace = true }
|
thiserror = { workspace = true }
|
||||||
memchr = { workspace = true }
|
memchr = { workspace = true }
|
||||||
|
|
||||||
caseless = "0.2.2"
|
|
||||||
flamer = { version = "0.5", optional = true }
|
flamer = { version = "0.5", optional = true }
|
||||||
half = "2"
|
half = "2"
|
||||||
psm = "0.1"
|
psm = "0.1"
|
||||||
@@ -86,7 +86,6 @@ timsort = "0.1.2"
|
|||||||
# TODO: use unic for this; needed for title case:
|
# TODO: use unic for this; needed for title case:
|
||||||
# https://github.com/RustPython/RustPython/pull/832#discussion_r275428939
|
# https://github.com/RustPython/RustPython/pull/832#discussion_r275428939
|
||||||
unicode-casing = { workspace = true }
|
unicode-casing = { workspace = true }
|
||||||
icu_properties = { workspace = true }
|
|
||||||
|
|
||||||
[target.'cfg(unix)'.dependencies]
|
[target.'cfg(unix)'.dependencies]
|
||||||
rustix = { workspace = true }
|
rustix = { workspace = true }
|
||||||
|
|||||||
@@ -41,12 +41,9 @@ use rustpython_common::{
|
|||||||
hash,
|
hash,
|
||||||
lock::PyMutex,
|
lock::PyMutex,
|
||||||
str::DeduceStrKind,
|
str::DeduceStrKind,
|
||||||
wtf8::{CodePoint, Wtf8, Wtf8Buf, Wtf8Chunk, Wtf8Concat},
|
wtf8::{CodePoint, Wtf8, Wtf8Buf, Wtf8Concat},
|
||||||
};
|
};
|
||||||
|
|
||||||
use icu_properties::props::{
|
|
||||||
BidiClass, BinaryProperty, EnumeratedProperty, GeneralCategory, XidContinue, XidStart,
|
|
||||||
};
|
|
||||||
use unicode_casing::CharExt;
|
use unicode_casing::CharExt;
|
||||||
|
|
||||||
impl<'a> TryFromBorrowedObject<'a> for String {
|
impl<'a> TryFromBorrowedObject<'a> for String {
|
||||||
@@ -698,7 +695,7 @@ impl PyStr {
|
|||||||
match self.as_str_kind() {
|
match self.as_str_kind() {
|
||||||
PyKindStr::Ascii(s) => s.to_ascii_lowercase().into(),
|
PyKindStr::Ascii(s) => s.to_ascii_lowercase().into(),
|
||||||
PyKindStr::Utf8(s) => s.to_lowercase().into(),
|
PyKindStr::Utf8(s) => s.to_lowercase().into(),
|
||||||
PyKindStr::Wtf8(w) => w.to_lowercase().into(),
|
PyKindStr::Wtf8(w) => rustpython_unicode::case::to_lowercase_wtf8(w).into(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -706,16 +703,9 @@ impl PyStr {
|
|||||||
#[pymethod]
|
#[pymethod]
|
||||||
fn casefold(&self) -> Self {
|
fn casefold(&self) -> Self {
|
||||||
match self.as_str_kind() {
|
match self.as_str_kind() {
|
||||||
PyKindStr::Ascii(s) => caseless::default_case_fold_str(s.as_str()).into(),
|
PyKindStr::Ascii(s) => rustpython_unicode::case::casefold_str(s.as_str()).into(),
|
||||||
PyKindStr::Utf8(s) => caseless::default_case_fold_str(s).into(),
|
PyKindStr::Utf8(s) => rustpython_unicode::case::casefold_str(s).into(),
|
||||||
PyKindStr::Wtf8(w) => w
|
PyKindStr::Wtf8(w) => rustpython_unicode::case::casefold_wtf8(w).into(),
|
||||||
.chunks()
|
|
||||||
.map(|c| match c {
|
|
||||||
Wtf8Chunk::Utf8(s) => Wtf8Buf::from_string(caseless::default_case_fold_str(s)),
|
|
||||||
Wtf8Chunk::Surrogate(c) => Wtf8Buf::from(c),
|
|
||||||
})
|
|
||||||
.collect::<Wtf8Buf>()
|
|
||||||
.into(),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -724,7 +714,7 @@ impl PyStr {
|
|||||||
match self.as_str_kind() {
|
match self.as_str_kind() {
|
||||||
PyKindStr::Ascii(s) => s.to_ascii_uppercase().into(),
|
PyKindStr::Ascii(s) => s.to_ascii_uppercase().into(),
|
||||||
PyKindStr::Utf8(s) => s.to_uppercase().into(),
|
PyKindStr::Utf8(s) => s.to_uppercase().into(),
|
||||||
PyKindStr::Wtf8(w) => w.to_uppercase().into(),
|
PyKindStr::Wtf8(w) => rustpython_unicode::case::to_uppercase_wtf8(w).into(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -967,9 +957,7 @@ impl PyStr {
|
|||||||
#[pymethod]
|
#[pymethod]
|
||||||
fn isdecimal(&self) -> bool {
|
fn isdecimal(&self) -> bool {
|
||||||
!self.data.is_empty()
|
!self.data.is_empty()
|
||||||
&& self.char_all(|c| {
|
&& self.char_all(|c| rustpython_unicode::classify::is_decimal(c as u32))
|
||||||
matches!(GeneralCategory::for_char(c), GeneralCategory::DecimalNumber)
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn __mod__(&self, values: PyObjectRef, vm: &VirtualMachine) -> PyResult<Wtf8Buf> {
|
fn __mod__(&self, values: PyObjectRef, vm: &VirtualMachine) -> PyResult<Wtf8Buf> {
|
||||||
@@ -1089,23 +1077,12 @@ impl PyStr {
|
|||||||
|
|
||||||
#[pymethod]
|
#[pymethod]
|
||||||
fn isprintable(&self) -> bool {
|
fn isprintable(&self) -> bool {
|
||||||
self.char_all(|c| c == '\u{0020}' || rustpython_literal::char::is_printable(c))
|
self.char_all(|c| rustpython_unicode::classify::is_printable(c as u32))
|
||||||
}
|
}
|
||||||
|
|
||||||
#[pymethod]
|
#[pymethod]
|
||||||
fn isspace(&self) -> bool {
|
fn isspace(&self) -> bool {
|
||||||
!self.data.is_empty()
|
!self.data.is_empty() && self.char_all(|c| rustpython_unicode::classify::is_space(c as u32))
|
||||||
&& self.char_all(|c| {
|
|
||||||
matches!(
|
|
||||||
GeneralCategory::for_char(c),
|
|
||||||
GeneralCategory::SpaceSeparator
|
|
||||||
) || matches!(
|
|
||||||
BidiClass::for_char(c),
|
|
||||||
BidiClass::WhiteSpace
|
|
||||||
| BidiClass::ParagraphSeparator
|
|
||||||
| BidiClass::SegmentSeparator
|
|
||||||
)
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Return true if all cased characters in the string are lowercase and there is at least one cased character, false otherwise.
|
// Return true if all cased characters in the string are lowercase and there is at least one cased character, false otherwise.
|
||||||
@@ -1362,15 +1339,8 @@ impl PyStr {
|
|||||||
|
|
||||||
#[pymethod]
|
#[pymethod]
|
||||||
pub fn isidentifier(&self) -> bool {
|
pub fn isidentifier(&self) -> bool {
|
||||||
let Some(s) = self.to_str() else { return false };
|
self.to_str()
|
||||||
let mut chars = s.chars();
|
.is_some_and(rustpython_unicode::identifier::is_python_identifier)
|
||||||
|
|
||||||
let is_identifier_start = chars
|
|
||||||
.next()
|
|
||||||
.is_some_and(|c| c == '_' || XidStart::for_char(c));
|
|
||||||
|
|
||||||
// a string is not an identifier if it has whitespace or starts with a number
|
|
||||||
is_identifier_start && chars.all(XidContinue::for_char)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// https://docs.python.org/3/library/stdtypes.html#str.translate
|
// https://docs.python.org/3/library/stdtypes.html#str.translate
|
||||||
|
|||||||
@@ -21,10 +21,8 @@ mod _sre {
|
|||||||
use crossbeam_utils::atomic::AtomicCell;
|
use crossbeam_utils::atomic::AtomicCell;
|
||||||
use itertools::Itertools;
|
use itertools::Itertools;
|
||||||
use num_traits::ToPrimitive;
|
use num_traits::ToPrimitive;
|
||||||
use rustpython_sre_engine::{
|
use rustpython_sre_engine::{Request, SearchIter, SreFlag, State, StrDrive};
|
||||||
Request, SearchIter, SreFlag, State, StrDrive,
|
use rustpython_unicode::regex as unicode_regex;
|
||||||
string::{lower_ascii, lower_unicode, upper_unicode},
|
|
||||||
};
|
|
||||||
|
|
||||||
#[pyattr]
|
#[pyattr]
|
||||||
pub use rustpython_sre_engine::{CODESIZE, MAXGROUPS, MAXREPEAT, SRE_MAGIC as MAGIC};
|
pub use rustpython_sre_engine::{CODESIZE, MAXGROUPS, MAXREPEAT, SRE_MAGIC as MAGIC};
|
||||||
@@ -42,17 +40,17 @@ mod _sre {
|
|||||||
#[pyfunction]
|
#[pyfunction]
|
||||||
fn unicode_iscased(ch: i32) -> bool {
|
fn unicode_iscased(ch: i32) -> bool {
|
||||||
let ch = ch as u32;
|
let ch = ch as u32;
|
||||||
ch != lower_unicode(ch) || ch != upper_unicode(ch)
|
ch != unicode_regex::lower_unicode(ch) || ch != unicode_regex::upper_unicode(ch)
|
||||||
}
|
}
|
||||||
|
|
||||||
#[pyfunction]
|
#[pyfunction]
|
||||||
fn ascii_tolower(ch: i32) -> i32 {
|
fn ascii_tolower(ch: i32) -> i32 {
|
||||||
lower_ascii(ch as u32) as i32
|
unicode_regex::lower_ascii(ch as u32) as i32
|
||||||
}
|
}
|
||||||
|
|
||||||
#[pyfunction]
|
#[pyfunction]
|
||||||
fn unicode_tolower(ch: i32) -> i32 {
|
fn unicode_tolower(ch: i32) -> i32 {
|
||||||
lower_unicode(ch as u32) as i32
|
unicode_regex::lower_unicode(ch as u32) as i32
|
||||||
}
|
}
|
||||||
|
|
||||||
trait SreStr: StrDrive {
|
trait SreStr: StrDrive {
|
||||||
|
|||||||
20
extra_tests/snippets/stdlib_unicode_shared.py
Normal file
20
extra_tests/snippets/stdlib_unicode_shared.py
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
# Checks that str predicates, unicodedata queries and re character classes
# all handle the same non-ASCII inputs.
import re
import unicodedata

# str predicates on non-ASCII text (Hangul, Devanagari digit, ideographic space).
assert "유니코드".isidentifier()
assert "५".isdecimal()
assert "\u3000".isspace()
assert " ".isprintable()
assert not "\n".isprintable()

# unicodedata lookups, including a lone surrogate code point.
assert unicodedata.category("\ud800") == "Cs"
assert unicodedata.lookup("SNOWMAN") == "☃"
assert unicodedata.name("☃") == "SNOWMAN"
assert unicodedata.normalize("NFC", "e\u0301") == "é"
assert unicodedata.digit("²") == 2
assert unicodedata.decimal("५") == 5
assert unicodedata.numeric("⅓") == 1 / 3

# Regex character classes must match the same non-ASCII characters.
assert re.fullmatch(r"\w+", "가나다")
assert re.fullmatch(r"\d+", "५६७")
assert re.fullmatch(r"\s+", "\u3000")
|
||||||
Reference in New Issue
Block a user