From 3b875291eef74565699f228e28686f899101c318 Mon Sep 17 00:00:00 2001 From: Noah <33094578+coolreader18@users.noreply.github.com> Date: Wed, 30 Sep 2020 21:20:13 -0500 Subject: [PATCH] WIP - native _sre --- Lib/sre_constants.py | 66 ++++++++++++----- vm/src/builtins/dict.rs | 15 ++++ vm/src/builtins/pystr.rs | 11 +++ vm/src/stdlib/mod.rs | 2 + vm/src/stdlib/sre.rs | 59 +++++++++++++++ vm/src/stdlib/sre/constants.rs | 114 +++++++++++++++++++++++++++++ vm/src/stdlib/sre/interp.rs | 126 +++++++++++++++++++++++++++++++++ 7 files changed, 374 insertions(+), 19 deletions(-) create mode 100644 vm/src/stdlib/sre.rs create mode 100644 vm/src/stdlib/sre/constants.rs create mode 100644 vm/src/stdlib/sre/interp.rs diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py index d80e50b23..33882190c 100644 --- a/Lib/sre_constants.py +++ b/Lib/sre_constants.py @@ -179,11 +179,22 @@ SRE_INFO_LITERAL = 2 # entire pattern is literal (given by prefix) SRE_INFO_CHARSET = 4 # pattern starts with character from given set if __name__ == "__main__": - def dump(f, d, prefix): + def dump(f, d, typ, int_t, prefix): items = sorted(d) + f.write(f"""\ +#[derive(num_enum::TryFromPrimitive, Debug)] +#[repr({int_t})] +#[allow(non_camel_case_types)] +pub enum {typ} {{ +""") for item in items: - f.write("#define %s_%s %d\n" % (prefix, item, item)) - with open("sre_constants.h", "w") as f: + name = str(item).removeprefix(prefix) + val = int(item) + f.write(f" {name} = {val},\n") + f.write("""\ +} +""") + with open("vm/src/stdlib/sre/constants.rs", "w") as f: f.write("""\ /* * Secret Labs' Regular Expression Engine @@ -200,24 +211,41 @@ if __name__ == "__main__": """) - f.write("#define SRE_MAGIC %d\n" % MAGIC) + f.write("use bitflags::bitflags;\n\n"); - dump(f, OPCODES, "SRE_OP") - dump(f, ATCODES, "SRE") - dump(f, CHCODES, "SRE") + f.write("pub const SRE_MAGIC: usize = %d;\n" % MAGIC) - f.write("#define SRE_FLAG_TEMPLATE %d\n" % SRE_FLAG_TEMPLATE) - f.write("#define SRE_FLAG_IGNORECASE %d\n" % SRE_FLAG_IGNORECASE) - f.write("#define SRE_FLAG_LOCALE %d\n" % SRE_FLAG_LOCALE) - f.write("#define SRE_FLAG_MULTILINE %d\n" % SRE_FLAG_MULTILINE) - f.write("#define SRE_FLAG_DOTALL %d\n" % SRE_FLAG_DOTALL) - f.write("#define SRE_FLAG_UNICODE %d\n" % SRE_FLAG_UNICODE) - f.write("#define SRE_FLAG_VERBOSE %d\n" % SRE_FLAG_VERBOSE) - f.write("#define SRE_FLAG_DEBUG %d\n" % SRE_FLAG_DEBUG) - f.write("#define SRE_FLAG_ASCII %d\n" % SRE_FLAG_ASCII) + dump(f, OPCODES, "SreOpcode", "u32", "") + dump(f, ATCODES, "SreAtCode", "u32", "AT_") + dump(f, CHCODES, "SreCatCode", "u32", "CATEGORY_") - f.write("#define SRE_INFO_PREFIX %d\n" % SRE_INFO_PREFIX) - f.write("#define SRE_INFO_LITERAL %d\n" % SRE_INFO_LITERAL) - f.write("#define SRE_INFO_CHARSET %d\n" % SRE_INFO_CHARSET) + def bitflags(typ, int_t, prefix, flags): + f.write(f"""\ +bitflags! {{ + pub struct {typ}: {int_t} {{ +""") + for name in flags: + val = globals()[prefix + name] + f.write(f" const {name} = {val};\n") + f.write("""\ + } +} +""") + + bitflags("SreFlag", "u16", "SRE_FLAG_", [ + "TEMPLATE", + "IGNORECASE", + "LOCALE", + "MULTILINE", + "DOTALL", + "UNICODE", + "VERBOSE", + "DEBUG", + "ASCII", + ]) + + bitflags("SreInfo", "u32", "SRE_INFO_", [ + "PREFIX", "LITERAL", "CHARSET", + ]) print("done") diff --git a/vm/src/builtins/dict.rs b/vm/src/builtins/dict.rs index 2e09b6811..6bb1b44ac 100644 --- a/vm/src/builtins/dict.rs +++ b/vm/src/builtins/dict.rs @@ -860,3 +860,18 @@ impl PyMapping { self.dict } } + +impl TryFromObject for std::collections::HashMap +where + K: TryFromObject + std::hash::Hash + Eq, + V: TryFromObject, +{ + fn try_from_object(vm: &VirtualMachine, obj: PyObjectRef) -> PyResult { + let mapping = PyMapping::try_from_object(vm, obj)?; + mapping + .into_dict() + .into_iter() + .map(|(k, v)| Ok((K::try_from_object(vm, k)?, V::try_from_object(vm, v)?))) + .collect() + } +} diff --git a/vm/src/builtins/pystr.rs b/vm/src/builtins/pystr.rs index f35a284be..a42603153 100644 --- a/vm/src/builtins/pystr.rs +++ b/vm/src/builtins/pystr.rs @@ -285,6 +285,9 @@ impl PyStr { len }) } + pub fn char_len(&self) -> usize { + self.len() + } #[pymethod(name = "__sizeof__")] fn sizeof(&self) -> usize { @@ -1457,3 +1460,11 @@ impl<'s> AnyStr<'s> for str { splited } } + +impl TryFromObject for String { + fn try_from_object(vm: &VirtualMachine, obj: PyObjectRef) -> PyResult { + Ok(PyStrRef::try_from_object(vm, obj)? + .borrow_value() + .to_owned()) + } +} diff --git a/vm/src/stdlib/mod.rs b/vm/src/stdlib/mod.rs index 970b27718..2bf47149a 100644 --- a/vm/src/stdlib/mod.rs +++ b/vm/src/stdlib/mod.rs @@ -29,6 +29,7 @@ mod re; mod serde_json; #[cfg(not(target_arch = "wasm32"))] pub mod socket; +mod sre; mod string; #[cfg(feature = "rustpython-compiler")] mod symtable; @@ -94,6 +95,7 @@ pub fn get_module_inits() -> HashMap "regex_crate".to_owned() => Box::new(re::make_module), "_random".to_owned() => Box::new(random::make_module), "_serde_json".to_owned() => Box::new(serde_json::make_module), + "_sre".to_owned() => Box::new(sre::make_module), "_string".to_owned() => Box::new(string::make_module), "_struct".to_owned() => Box::new(pystruct::make_module), "time".to_owned() => Box::new(time_module::make_module), diff --git a/vm/src/stdlib/sre.rs b/vm/src/stdlib/sre.rs new file mode 100644 index 000000000..7581aa155 --- /dev/null +++ b/vm/src/stdlib/sre.rs @@ -0,0 +1,59 @@ +mod constants; +mod interp; + +pub(crate) use _sre::make_module; + +#[pymodule] +mod _sre { + use super::constants::SreFlag; + use crate::builtins::PyTypeRef; + use crate::pyobject::{PyObjectRef, PyResult, PyValue, StaticType}; + use crate::VirtualMachine; + use std::collections::HashMap; + + #[pyattr] + use super::constants::SRE_MAGIC as MAGIC; + + #[pyfunction] + fn compile( + pattern: PyObjectRef, + flags: u16, + code: PyObjectRef, + groups: usize, + groupindex: HashMap, + indexgroup: PyObjectRef, + vm: &VirtualMachine, + ) -> PyResult { + let code = vm.extract_elements::(&code)?; + + Ok(Pattern { + pattern, + flags: SreFlag::from_bits_truncate(flags), + code, + groups, + groupindex, + indexgroup: vm.extract_elements(&indexgroup)?, + }) + } + + #[pyattr] + #[pyclass(name = "Pattern")] + #[derive(Debug)] + struct Pattern { + pattern: PyObjectRef, + flags: SreFlag, + code: Vec, + groups: usize, + groupindex: HashMap, + indexgroup: Vec>, + } + + impl PyValue for Pattern { + fn class(_vm: &VirtualMachine) -> &PyTypeRef { + Self::static_type() + } + } + + #[pyimpl] + impl Pattern {} +} diff --git a/vm/src/stdlib/sre/constants.rs b/vm/src/stdlib/sre/constants.rs new file mode 100644 index 000000000..f6aeb3182 --- /dev/null +++ b/vm/src/stdlib/sre/constants.rs @@ -0,0 +1,114 @@ +/* + * Secret Labs' Regular Expression Engine + * + * regular expression matching engine + * + * NOTE: This file is generated by sre_constants.py. If you need + * to change anything in here, edit sre_constants.py and run it. + * + * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved. + * + * See the _sre.c file for information on usage and redistribution. + */ + +use bitflags::bitflags; + +pub const SRE_MAGIC: usize = 20140917; +#[derive(num_enum::TryFromPrimitive, Debug)] +#[repr(u32)] +#[allow(non_camel_case_types)] +pub enum SreOpcode { + FAILURE = 0, + SUCCESS = 1, + ANY = 2, + ANY_ALL = 3, + ASSERT = 4, + ASSERT_NOT = 5, + AT = 6, + BRANCH = 7, + CALL = 8, + CATEGORY = 9, + CHARSET = 10, + BIGCHARSET = 11, + GROUPREF = 12, + GROUPREF_EXISTS = 13, + GROUPREF_IGNORE = 14, + IN = 15, + IN_IGNORE = 16, + INFO = 17, + JUMP = 18, + LITERAL = 19, + LITERAL_IGNORE = 20, + MARK = 21, + MAX_UNTIL = 22, + MIN_UNTIL = 23, + NOT_LITERAL = 24, + NOT_LITERAL_IGNORE = 25, + NEGATE = 26, + RANGE = 27, + REPEAT = 28, + REPEAT_ONE = 29, + SUBPATTERN = 30, + MIN_REPEAT_ONE = 31, + RANGE_IGNORE = 32, +} +#[derive(num_enum::TryFromPrimitive, Debug)] +#[repr(u32)] +#[allow(non_camel_case_types)] +pub enum SreAtCode { + BEGINNING = 0, + BEGINNING_LINE = 1, + BEGINNING_STRING = 2, + BOUNDARY = 3, + NON_BOUNDARY = 4, + END = 5, + END_LINE = 6, + END_STRING = 7, + LOC_BOUNDARY = 8, + LOC_NON_BOUNDARY = 9, + UNI_BOUNDARY = 10, + UNI_NON_BOUNDARY = 11, +} +#[derive(num_enum::TryFromPrimitive, Debug)] +#[repr(u32)] +#[allow(non_camel_case_types)] +pub enum SreCatCode { + DIGIT = 0, + NOT_DIGIT = 1, + SPACE = 2, + NOT_SPACE = 3, + WORD = 4, + NOT_WORD = 5, + LINEBREAK = 6, + NOT_LINEBREAK = 7, + LOC_WORD = 8, + LOC_NOT_WORD = 9, + UNI_DIGIT = 10, + UNI_NOT_DIGIT = 11, + UNI_SPACE = 12, + UNI_NOT_SPACE = 13, + UNI_WORD = 14, + UNI_NOT_WORD = 15, + UNI_LINEBREAK = 16, + UNI_NOT_LINEBREAK = 17, +} +bitflags! { + pub struct SreFlag: u16 { + const TEMPLATE = 1; + const IGNORECASE = 2; + const LOCALE = 4; + const MULTILINE = 8; + const DOTALL = 16; + const UNICODE = 32; + const VERBOSE = 64; + const DEBUG = 128; + const ASCII = 256; + } +} +bitflags! { + pub struct SreInfo: u32 { + const PREFIX = 1; + const LITERAL = 2; + const CHARSET = 4; + } +} diff --git a/vm/src/stdlib/sre/interp.rs b/vm/src/stdlib/sre/interp.rs new file mode 100644 index 000000000..7f93a82eb --- /dev/null +++ b/vm/src/stdlib/sre/interp.rs @@ -0,0 +1,126 @@ +// good luck to those that follow; here be dragons + +use crate::builtins::PyStrRef; + +use super::constants::{SreFlag, SreOpcode}; + +use std::convert::TryFrom; +use std::{iter, slice}; + +pub struct State { + start: usize, + s_pos: usize, + end: usize, + pos: usize, + flags: SreFlag, + marks: Vec, + lastindex: isize, + marks_stack: Vec, + context_stack: Vec, + repeat: Option, + s: PyStrRef, +} + +// struct State1<'a> { +// state: &'a mut State, +// } + +struct MatchContext { + s_pos: usize, + code_pos: usize, +} + +// struct Context<'a> { +// context_stack: &mut Vec, +// } + +impl State { + pub fn new(s: PyStrRef, start: usize, end: usize, flags: SreFlag) -> Self { + let end = std::cmp::min(end, s.char_len()); + Self { + start, + s_pos: start, + end, + pos: start, + flags, + marks: Vec::new(), + lastindex: -1, + marks_stack: Vec::new(), + context_stack: Vec::new(), + repeat: None, + s, + } + } +} + +// struct OpcodeDispatcher { +// executing_contexts: HashMap>, +// } + +pub struct BadSreCode; + +pub fn parse_ops(code: &[u32]) -> impl Iterator> + '_ { + let mut it = code.iter().copied(); + std::iter::from_fn(move || -> Option> { + let op = it.next()?; + let op = SreOpcode::try_from(op) + .ok() + .and_then(|op| extract_code(op, &mut it)); + Some(op) + }) + .map(|x| x.ok_or(BadSreCode)) +} + +type It<'a> = iter::Copied>; +fn extract_code(op: SreOpcode, it: &mut It) -> Option { + let skip = |it: &mut It| { + let skip = it.next()? as usize; + if skip > it.len() { + None + } else { + Some(skip) + } + }; + match op { + SreOpcode::FAILURE => {} + SreOpcode::SUCCESS => {} + SreOpcode::ANY => {} + SreOpcode::ANY_ALL => {} + SreOpcode::ASSERT => {} + SreOpcode::ASSERT_NOT => {} + SreOpcode::AT => {} + SreOpcode::BRANCH => {} + SreOpcode::CALL => {} + SreOpcode::CATEGORY => {} + SreOpcode::CHARSET => {} + SreOpcode::BIGCHARSET => {} + SreOpcode::GROUPREF => {} + SreOpcode::GROUPREF_EXISTS => {} + SreOpcode::GROUPREF_IGNORE => {} + SreOpcode::IN => {} + SreOpcode::IN_IGNORE => {} + SreOpcode::INFO => { + // let skip = it.next()?; + } + SreOpcode::JUMP => {} + SreOpcode::LITERAL => {} + SreOpcode::LITERAL_IGNORE => {} + SreOpcode::MARK => {} + SreOpcode::MAX_UNTIL => {} + SreOpcode::MIN_UNTIL => {} + SreOpcode::NOT_LITERAL => {} + SreOpcode::NOT_LITERAL_IGNORE => {} + SreOpcode::NEGATE => {} + SreOpcode::RANGE => {} + SreOpcode::REPEAT => {} + SreOpcode::REPEAT_ONE => {} + SreOpcode::SUBPATTERN => {} + SreOpcode::MIN_REPEAT_ONE => {} + SreOpcode::RANGE_IGNORE => {} + } + todo!() +} + +pub enum Op { + Info {}, +}