Files
RustPython/stdlib/src/unicodedata.rs
2021-10-01 01:54:04 +09:00

157 lines
4.5 KiB
Rust

/* Access to the unicode database.
See also: https://docs.python.org/3/library/unicodedata.html
*/
use crate::vm::{
builtins::PyStrRef, extend_module, function::OptionalArg, py_module, PyClassImpl, PyObject,
PyObjectRef, PyResult, PyValue, VirtualMachine,
};
use itertools::Itertools;
use unic_char_property::EnumeratedCharProperty;
use unic_normal::StrNormalForm;
use unic_ucd_age::{Age, UnicodeVersion, UNICODE_VERSION};
use unic_ucd_bidi::BidiClass;
use unic_ucd_category::GeneralCategory;
/// Builds the `unicodedata` module object.
///
/// The module exposes the `UCD` class, a `ucd_3_2_0` instance pinned to
/// Unicode 3.2.0, a `unidata_version` string, and the `category`, `lookup`,
/// `name`, `bidirectional` and `normalize` attributes taken from a `UCD`
/// instance built with the default database version.
pub fn make_module(vm: &VirtualMachine) -> PyObjectRef {
    // Attributes fetched from the default instance and re-exported on the module.
    const FORWARDED: [&str; 5] = ["category", "lookup", "name", "bidirectional", "normalize"];

    let ctx = &vm.ctx;
    let ucd_class = PyUCD::make_class(ctx);

    // Instance backed by the default version; source of the forwarded attributes.
    let ucd = PyObject::new(PyUCD::default(), ucd_class.clone(), None);

    // Instance restricted to the Unicode 3.2.0 character set.
    let legacy_version = UnicodeVersion {
        major: 3,
        minor: 2,
        micro: 0,
    };
    let ucd_3_2_0 = PyObject::new(
        PyUCD {
            unic_version: legacy_version,
        },
        ucd_class.clone(),
        None,
    );

    // we do unidata_version here because the getter tries to do PyUCD::class() before
    // the module is in the VM
    let unidata_version = ctx.new_utf8_str(PyUCD::default().unic_version.to_string());

    let module = py_module!(vm, "unicodedata", {
        "UCD" => ucd_class.into_object(),
        "ucd_3_2_0" => ucd_3_2_0,
        "unidata_version" => unidata_version,
    });

    for &attr in FORWARDED.iter() {
        extend_module!(vm, &module, {
            attr => vm.get_attribute(ucd.clone(), attr).unwrap(),
        });
    }

    module
}
// Payload of the Python-visible `UCD` class (module `unicodedata`).
// Each instance carries the Unicode version it answers queries for.
#[pyclass(module = "unicodedata", name = "UCD")]
#[derive(Debug, PyValue)]
struct PyUCD {
    // Version that `check_age` compares a character's age against.
    unic_version: UnicodeVersion,
}
impl Default for PyUCD {
    /// Creates a `PyUCD` whose version is the `UNICODE_VERSION` constant
    /// imported from `unic_ucd_age`.
    #[inline(always)]
    fn default() -> Self {
        Self {
            unic_version: UNICODE_VERSION,
        }
    }
}
#[pyimpl]
impl PyUCD {
    // Returns `true` when `c` has an age assigned and that age is not newer
    // than this instance's Unicode version.
    fn check_age(&self, c: char) -> bool {
        Age::of(c).map_or(false, |age| age.actual() <= self.unic_version)
    }

    // Extracts the single character contained in `character`.
    //
    // Raises a type error unless the string holds exactly one character.
    // Returns `Ok(None)` when the character fails `check_age`, so callers
    // never need to repeat the age check on a `Some` result.
    fn extract_char(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<Option<char>> {
        let c = character.as_str().chars().exactly_one().map_err(|_| {
            vm.new_type_error("argument must be an unicode character, not str".to_owned())
        })?;
        Ok(Some(c).filter(|&c| self.check_age(c)))
    }

    // Returns the abbreviated general category of `character`; characters
    // rejected by `extract_char` are reported as `Unassigned`.
    #[pymethod]
    fn category(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<String> {
        let category = self
            .extract_char(character, vm)?
            .map_or(GeneralCategory::Unassigned, GeneralCategory::of);
        Ok(category.abbr_name().to_owned())
    }

    // Looks a character up by `name` and returns it as a one-character string.
    //
    // Raises a lookup error when the name is unknown or the character fails
    // `check_age`.
    #[pymethod]
    fn lookup(&self, name: PyStrRef, vm: &VirtualMachine) -> PyResult<String> {
        unicode_names2::character(name.as_str())
            .filter(|&c| self.check_age(c))
            .map(|c| c.to_string())
            .ok_or_else(|| vm.new_lookup_error(format!("undefined character name '{}'", name)))
    }

    // Returns the name of `character`.
    //
    // When no name is available (the character was rejected by `extract_char`
    // or `unicode_names2` has no name for it), returns `default` if supplied
    // and raises a value error otherwise.
    #[pymethod]
    fn name(
        &self,
        character: PyStrRef,
        default: OptionalArg<PyObjectRef>,
        vm: &VirtualMachine,
    ) -> PyResult {
        // `extract_char` has already applied the age filter, so a `Some`
        // result can be passed straight to the name table.
        let found = self
            .extract_char(character, vm)?
            .and_then(unicode_names2::name);
        if let Some(name) = found {
            return Ok(vm.ctx.new_utf8_str(name.to_string()));
        }
        match default {
            OptionalArg::Present(obj) => Ok(obj),
            OptionalArg::Missing => Err(vm.new_value_error("character name not found!".to_owned())),
        }
    }

    // Returns the abbreviated bidirectional class of `character`, or an empty
    // string for characters rejected by `extract_char`.
    #[pymethod]
    fn bidirectional(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<String> {
        let bidi = self
            .extract_char(character, vm)?
            .map_or("", |c| BidiClass::of(c).abbr_name());
        Ok(bidi.to_owned())
    }

    // Normalizes `unistr` using `form`, which must be one of "NFC", "NFKC",
    // "NFD" or "NFKD"; any other value raises a value error.
    #[pymethod]
    fn normalize(&self, form: PyStrRef, unistr: PyStrRef, vm: &VirtualMachine) -> PyResult<String> {
        let text = unistr.as_str();
        let normalized_text = match form.as_str() {
            "NFC" => text.nfc().collect::<String>(),
            "NFKC" => text.nfkc().collect::<String>(),
            "NFD" => text.nfd().collect::<String>(),
            "NFKD" => text.nfkd().collect::<String>(),
            _ => return Err(vm.new_value_error("invalid normalization form".to_owned())),
        };
        Ok(normalized_text)
    }

    // Unicode version of this instance, formatted as a string.
    #[pyproperty]
    fn unidata_version(&self) -> String {
        self.unic_version.to_string()
    }
}