WIP - native _sre

This commit is contained in:
Noah
2020-09-30 21:20:13 -05:00
committed by Kangzhi Shi
parent 2ab02e34f1
commit 3b875291ee
7 changed files with 374 additions and 19 deletions

66
Lib/sre_constants.py vendored
View File

@@ -179,11 +179,22 @@ SRE_INFO_LITERAL = 2 # entire pattern is literal (given by prefix)
SRE_INFO_CHARSET = 4 # pattern starts with character from given set
if __name__ == "__main__":
def dump(f, d, prefix):
def dump(f, d, typ, int_t, prefix):
items = sorted(d)
f.write(f"""\
#[derive(num_enum::TryFromPrimitive, Debug)]
#[repr({int_t})]
#[allow(non_camel_case_types)]
pub enum {typ} {{
""")
for item in items:
f.write("#define %s_%s %d\n" % (prefix, item, item))
with open("sre_constants.h", "w") as f:
name = str(item).removeprefix(prefix)
val = int(item)
f.write(f" {name} = {val},\n")
f.write("""\
}
""")
with open("vm/src/stdlib/sre/constants.rs", "w") as f:
f.write("""\
/*
* Secret Labs' Regular Expression Engine
@@ -200,24 +211,41 @@ if __name__ == "__main__":
""")
f.write("#define SRE_MAGIC %d\n" % MAGIC)
f.write("use bitflags::bitflags;\n\n");
dump(f, OPCODES, "SRE_OP")
dump(f, ATCODES, "SRE")
dump(f, CHCODES, "SRE")
f.write("pub const SRE_MAGIC: usize = %d;\n" % MAGIC)
f.write("#define SRE_FLAG_TEMPLATE %d\n" % SRE_FLAG_TEMPLATE)
f.write("#define SRE_FLAG_IGNORECASE %d\n" % SRE_FLAG_IGNORECASE)
f.write("#define SRE_FLAG_LOCALE %d\n" % SRE_FLAG_LOCALE)
f.write("#define SRE_FLAG_MULTILINE %d\n" % SRE_FLAG_MULTILINE)
f.write("#define SRE_FLAG_DOTALL %d\n" % SRE_FLAG_DOTALL)
f.write("#define SRE_FLAG_UNICODE %d\n" % SRE_FLAG_UNICODE)
f.write("#define SRE_FLAG_VERBOSE %d\n" % SRE_FLAG_VERBOSE)
f.write("#define SRE_FLAG_DEBUG %d\n" % SRE_FLAG_DEBUG)
f.write("#define SRE_FLAG_ASCII %d\n" % SRE_FLAG_ASCII)
dump(f, OPCODES, "SreOpcode", "u32", "")
dump(f, ATCODES, "SreAtCode", "u32", "AT_")
dump(f, CHCODES, "SreCatCode", "u32", "CATEGORY_")
f.write("#define SRE_INFO_PREFIX %d\n" % SRE_INFO_PREFIX)
f.write("#define SRE_INFO_LITERAL %d\n" % SRE_INFO_LITERAL)
f.write("#define SRE_INFO_CHARSET %d\n" % SRE_INFO_CHARSET)
def bitflags(typ, int_t, prefix, flags):
f.write(f"""\
bitflags! {{
pub struct {typ}: {int_t} {{
""")
for name in flags:
val = globals()[prefix + name]
f.write(f" const {name} = {val};\n")
f.write("""\
}
}
""")
bitflags("SreFlag", "u16", "SRE_FLAG_", [
"TEMPLATE",
"IGNORECASE",
"LOCALE",
"MULTILINE",
"DOTALL",
"UNICODE",
"VERBOSE",
"DEBUG",
"ASCII",
])
bitflags("SreInfo", "u32", "SRE_INFO_", [
"PREFIX", "LITERAL", "CHARSET",
])
print("done")

View File

@@ -860,3 +860,18 @@ impl PyMapping {
self.dict
}
}
/// Lets a Python mapping be received directly as a Rust `HashMap`.
///
/// The object is first converted through `PyMapping`, then every key and
/// value is converted with its own `TryFromObject` implementation. The first
/// conversion failure aborts the whole conversion and is returned as-is.
impl<K, V> TryFromObject for std::collections::HashMap<K, V>
where
    K: TryFromObject + std::hash::Hash + Eq,
    V: TryFromObject,
{
    fn try_from_object(vm: &VirtualMachine, obj: PyObjectRef) -> PyResult<Self> {
        let entries = PyMapping::try_from_object(vm, obj)?
            .into_dict()
            .into_iter();

        let mut converted = Self::new();
        for (py_key, py_value) in entries {
            // Key first, then value: matches the order in which errors surface.
            let key = K::try_from_object(vm, py_key)?;
            let value = V::try_from_object(vm, py_value)?;
            converted.insert(key, value);
        }
        Ok(converted)
    }
}

View File

@@ -285,6 +285,9 @@ impl PyStr {
len
})
}
/// Returns the length of this string, exactly as reported by `self.len()`.
///
/// NOTE(review): the name suggests this counts characters rather than bytes;
/// confirm against the implementation of `len`.
pub fn char_len(&self) -> usize {
self.len()
}
#[pymethod(name = "__sizeof__")]
fn sizeof(&self) -> usize {
@@ -1457,3 +1460,11 @@ impl<'s> AnyStr<'s> for str {
splited
}
}
/// Lets a Python string be received directly as an owned Rust `String`.
///
/// Fails with whatever error `PyStrRef::try_from_object` produces when the
/// object cannot be converted to a `PyStrRef`.
impl TryFromObject for String {
    fn try_from_object(vm: &VirtualMachine, obj: PyObjectRef) -> PyResult<Self> {
        let py_str = PyStrRef::try_from_object(vm, obj)?;
        // Copy the text out so the result does not borrow from the Python object.
        let owned = py_str.borrow_value().to_owned();
        Ok(owned)
    }
}

View File

@@ -29,6 +29,7 @@ mod re;
mod serde_json;
#[cfg(not(target_arch = "wasm32"))]
pub mod socket;
mod sre;
mod string;
#[cfg(feature = "rustpython-compiler")]
mod symtable;
@@ -94,6 +95,7 @@ pub fn get_module_inits() -> HashMap<String, StdlibInitFunc, ahash::RandomState>
"regex_crate".to_owned() => Box::new(re::make_module),
"_random".to_owned() => Box::new(random::make_module),
"_serde_json".to_owned() => Box::new(serde_json::make_module),
"_sre".to_owned() => Box::new(sre::make_module),
"_string".to_owned() => Box::new(string::make_module),
"_struct".to_owned() => Box::new(pystruct::make_module),
"time".to_owned() => Box::new(time_module::make_module),

59
vm/src/stdlib/sre.rs Normal file
View File

@@ -0,0 +1,59 @@
mod constants;
mod interp;
pub(crate) use _sre::make_module;
#[pymodule]
mod _sre {
    use super::constants::SreFlag;
    use crate::builtins::PyTypeRef;
    use crate::pyobject::{PyObjectRef, PyResult, PyValue, StaticType};
    use crate::VirtualMachine;
    use std::collections::HashMap;

    // Re-exported under the name `MAGIC` as a module attribute.
    #[pyattr]
    use super::constants::SRE_MAGIC as MAGIC;

    // Builds a `Pattern` from the arguments supplied by the caller.
    //
    // `code` and `indexgroup` are unpacked into Rust vectors (in that order,
    // so a bad `code` is reported before a bad `indexgroup`); flag bits that
    // `SreFlag` does not define are silently dropped.
    #[pyfunction]
    fn compile(
        pattern: PyObjectRef,
        flags: u16,
        code: PyObjectRef,
        groups: usize,
        groupindex: HashMap<String, usize>,
        indexgroup: PyObjectRef,
        vm: &VirtualMachine,
    ) -> PyResult<Pattern> {
        let code: Vec<u32> = vm.extract_elements(&code)?;
        let flags = SreFlag::from_bits_truncate(flags);
        let indexgroup: Vec<Option<String>> = vm.extract_elements(&indexgroup)?;

        Ok(Pattern {
            pattern,
            flags,
            code,
            groups,
            groupindex,
            indexgroup,
        })
    }

    // Compiled pattern object returned by `compile`; it only stores its
    // inputs for now and exposes no methods yet.
    #[pyattr]
    #[pyclass(name = "Pattern")]
    #[derive(Debug)]
    struct Pattern {
        pattern: PyObjectRef,
        flags: SreFlag,
        code: Vec<u32>,
        groups: usize,
        groupindex: HashMap<String, usize>,
        indexgroup: Vec<Option<String>>,
    }

    impl PyValue for Pattern {
        fn class(_vm: &VirtualMachine) -> &PyTypeRef {
            Self::static_type()
        }
    }

    // Python-visible methods will be added here.
    #[pyimpl]
    impl Pattern {}
}

View File

@@ -0,0 +1,114 @@
/*
* Secret Labs' Regular Expression Engine
*
* regular expression matching engine
*
* NOTE: This file is generated by sre_constants.py. If you need
* to change anything in here, edit sre_constants.py and run it.
*
* Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
*
* See the _sre.c file for information on usage and redistribution.
*/
use bitflags::bitflags;
/// Value of `MAGIC` in `sre_constants.py` at the time this file was generated.
///
/// Re-exported as the `MAGIC` attribute of the `_sre` module.
pub const SRE_MAGIC: usize = 20140917;
/// Opcodes of the SRE code, numbered exactly as in `sre_constants.py`
/// (`OPCODES`).
///
/// `num_enum::TryFromPrimitive` supplies the fallible `u32` -> `SreOpcode`
/// conversion (`SreOpcode::try_from`). This enum is generated; change
/// `sre_constants.py` and re-run it instead of editing the variants by hand.
#[derive(num_enum::TryFromPrimitive, Debug)]
#[repr(u32)]
#[allow(non_camel_case_types)]
pub enum SreOpcode {
FAILURE = 0,
SUCCESS = 1,
ANY = 2,
ANY_ALL = 3,
ASSERT = 4,
ASSERT_NOT = 5,
AT = 6,
BRANCH = 7,
CALL = 8,
CATEGORY = 9,
CHARSET = 10,
BIGCHARSET = 11,
GROUPREF = 12,
GROUPREF_EXISTS = 13,
GROUPREF_IGNORE = 14,
IN = 15,
IN_IGNORE = 16,
INFO = 17,
JUMP = 18,
LITERAL = 19,
LITERAL_IGNORE = 20,
MARK = 21,
MAX_UNTIL = 22,
MIN_UNTIL = 23,
NOT_LITERAL = 24,
NOT_LITERAL_IGNORE = 25,
NEGATE = 26,
RANGE = 27,
REPEAT = 28,
REPEAT_ONE = 29,
SUBPATTERN = 30,
MIN_REPEAT_ONE = 31,
RANGE_IGNORE = 32,
}
/// Operand values generated from `ATCODES` in `sre_constants.py`, with the
/// `AT_` prefix stripped from each name.
///
/// NOTE(review): presumably these are the position assertions accepted by
/// `SreOpcode::AT`; confirm once the interpreter consumes them.
/// Generated code: edit `sre_constants.py`, not this enum.
#[derive(num_enum::TryFromPrimitive, Debug)]
#[repr(u32)]
#[allow(non_camel_case_types)]
pub enum SreAtCode {
BEGINNING = 0,
BEGINNING_LINE = 1,
BEGINNING_STRING = 2,
BOUNDARY = 3,
NON_BOUNDARY = 4,
END = 5,
END_LINE = 6,
END_STRING = 7,
LOC_BOUNDARY = 8,
LOC_NON_BOUNDARY = 9,
UNI_BOUNDARY = 10,
UNI_NON_BOUNDARY = 11,
}
/// Operand values generated from `CHCODES` in `sre_constants.py`, with the
/// `CATEGORY_` prefix stripped from each name.
///
/// NOTE(review): presumably these are the character categories accepted by
/// `SreOpcode::CATEGORY`; confirm once the interpreter consumes them.
/// Generated code: edit `sre_constants.py`, not this enum.
#[derive(num_enum::TryFromPrimitive, Debug)]
#[repr(u32)]
#[allow(non_camel_case_types)]
pub enum SreCatCode {
DIGIT = 0,
NOT_DIGIT = 1,
SPACE = 2,
NOT_SPACE = 3,
WORD = 4,
NOT_WORD = 5,
LINEBREAK = 6,
NOT_LINEBREAK = 7,
LOC_WORD = 8,
LOC_NOT_WORD = 9,
UNI_DIGIT = 10,
UNI_NOT_DIGIT = 11,
UNI_SPACE = 12,
UNI_NOT_SPACE = 13,
UNI_WORD = 14,
UNI_NOT_WORD = 15,
UNI_LINEBREAK = 16,
UNI_NOT_LINEBREAK = 17,
}
// Bit set generated from the `SRE_FLAG_*` constants in `sre_constants.py`.
// `_sre.compile` builds it from its `flags: u16` argument with
// `from_bits_truncate`, so bits not listed here are dropped.
// Generated code: edit `sre_constants.py`, not this block.
bitflags! {
pub struct SreFlag: u16 {
const TEMPLATE = 1;
const IGNORECASE = 2;
const LOCALE = 4;
const MULTILINE = 8;
const DOTALL = 16;
const UNICODE = 32;
const VERBOSE = 64;
const DEBUG = 128;
const ASCII = 256;
}
}
// Bit set generated from the `SRE_INFO_*` constants in `sre_constants.py`.
// NOTE(review): presumably the flag word carried by an `INFO` opcode; not
// referenced by any code visible here yet.
// Generated code: edit `sre_constants.py`, not this block.
bitflags! {
pub struct SreInfo: u32 {
const PREFIX = 1;
const LITERAL = 2;
const CHARSET = 4;
}
}

126
vm/src/stdlib/sre/interp.rs Normal file
View File

@@ -0,0 +1,126 @@
// good luck to those that follow; here be dragons
use crate::builtins::PyStrRef;
use super::constants::{SreFlag, SreOpcode};
use std::convert::TryFrom;
use std::{iter, slice};
/// Matcher state created by `State::new` for one string `s`.
///
/// NOTE(review): nothing reads these fields yet, so the per-field notes below
/// only record how `State::new` initialises them; the intended meaning is an
/// assumption to confirm once the interpreter is written.
pub struct State {
/// Start offset passed to `State::new`.
start: usize,
/// Initialised to `start`; presumably the current position in `s` (TODO confirm).
s_pos: usize,
/// End offset passed to `State::new`, clamped to `s.char_len()`.
end: usize,
/// Initialised to `start`.
pos: usize,
/// Flags passed to `State::new`.
flags: SreFlag,
/// Starts empty.
marks: Vec<usize>,
/// Starts at `-1`; presumably "no group matched yet" (TODO confirm).
lastindex: isize,
/// Starts empty.
marks_stack: Vec<usize>,
/// Starts empty.
context_stack: Vec<MatchContext>,
/// Starts as `None`.
repeat: Option<usize>,
/// The string being matched against.
s: PyStrRef,
}
// struct State1<'a> {
// state: &'a mut State,
// }
/// One entry of `State::context_stack`.
///
/// NOTE(review): presumably pairs a position in the string (`s_pos`) with a
/// position in the pattern code (`code_pos`); nothing constructs it yet, so
/// confirm when the interpreter is implemented.
struct MatchContext {
s_pos: usize,
code_pos: usize,
}
// struct Context<'a> {
// context_stack: &mut Vec<MatchContext>,
// }
impl State {
    /// Creates a fresh state for matching `s` between `start` and `end`.
    ///
    /// `end` is clamped to `s.char_len()`; `start` is stored as given. Both
    /// position fields begin at `start`, all stacks begin empty, and
    /// `lastindex` begins at `-1`.
    pub fn new(s: PyStrRef, start: usize, end: usize, flags: SreFlag) -> Self {
        // Computed before `s` is moved into the struct below.
        let clamped_end = end.min(s.char_len());

        Self {
            s,
            flags,
            start,
            end: clamped_end,
            s_pos: start,
            pos: start,
            lastindex: -1,
            repeat: None,
            marks: Vec::new(),
            marks_stack: Vec::new(),
            context_stack: Vec::new(),
        }
    }
}
// struct OpcodeDispatcher {
// executing_contexts: HashMap<usize, Rc<State>>,
// }
/// Error yielded by `parse_ops` when a code word is not a known opcode or
/// its operands cannot be decoded.
///
/// It derives `Debug` so that `Result<_, BadSreCode>` supports `unwrap` and
/// `expect`, and implements `std::error::Error` so it can be propagated like
/// any other error value.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct BadSreCode;

impl std::fmt::Display for BadSreCode {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str("invalid SRE code")
    }
}

impl std::error::Error for BadSreCode {}
/// Lazily decodes `code` into a stream of operations.
///
/// Each step takes the next word as an opcode and hands the remaining words
/// to `extract_code` for operand decoding. An unknown opcode, or a `None`
/// from `extract_code`, is reported as `Err(BadSreCode)`; the iterator does
/// not stop on an error and only ends when the words run out.
pub fn parse_ops(code: &[u32]) -> impl Iterator<Item = Result<Op, BadSreCode>> + '_ {
    let mut words = code.iter().copied();
    std::iter::from_fn(move || {
        let raw = words.next()?;
        let decoded = match SreOpcode::try_from(raw) {
            Ok(opcode) => extract_code(opcode, &mut words),
            Err(_) => None,
        };
        Some(decoded.ok_or(BadSreCode))
    })
}
/// Iterator over code words, as created by `parse_ops` and consumed by
/// `extract_code`.
type It<'a> = iter::Copied<slice::Iter<'a, u32>>;
/// Decodes the operands of `op` from `it` into an `Op`.
///
/// NOTE: work in progress. Every match arm is still empty and the function
/// unconditionally ends in `todo!()`, so calling it currently panics for any
/// opcode. `parse_ops` turns a `None` result into `BadSreCode`, so `None` is
/// presumably meant to signal malformed operands.
fn extract_code(op: SreOpcode, it: &mut It) -> Option<Op> {
// Reads the next word as a skip count. Yields `None` when the iterator is
// exhausted or the count is larger than the number of words left.
// Not called anywhere yet.
let skip = |it: &mut It| {
let skip = it.next()? as usize;
if skip > it.len() {
None
} else {
Some(skip)
}
};
// Exhaustive on purpose: adding an opcode to `SreOpcode` forces a new arm here.
match op {
SreOpcode::FAILURE => {}
SreOpcode::SUCCESS => {}
SreOpcode::ANY => {}
SreOpcode::ANY_ALL => {}
SreOpcode::ASSERT => {}
SreOpcode::ASSERT_NOT => {}
SreOpcode::AT => {}
SreOpcode::BRANCH => {}
SreOpcode::CALL => {}
SreOpcode::CATEGORY => {}
SreOpcode::CHARSET => {}
SreOpcode::BIGCHARSET => {}
SreOpcode::GROUPREF => {}
SreOpcode::GROUPREF_EXISTS => {}
SreOpcode::GROUPREF_IGNORE => {}
SreOpcode::IN => {}
SreOpcode::IN_IGNORE => {}
SreOpcode::INFO => {
// let skip = it.next()?;
}
SreOpcode::JUMP => {}
SreOpcode::LITERAL => {}
SreOpcode::LITERAL_IGNORE => {}
SreOpcode::MARK => {}
SreOpcode::MAX_UNTIL => {}
SreOpcode::MIN_UNTIL => {}
SreOpcode::NOT_LITERAL => {}
SreOpcode::NOT_LITERAL_IGNORE => {}
SreOpcode::NEGATE => {}
SreOpcode::RANGE => {}
SreOpcode::REPEAT => {}
SreOpcode::REPEAT_ONE => {}
SreOpcode::SUBPATTERN => {}
SreOpcode::MIN_REPEAT_ONE => {}
SreOpcode::RANGE_IGNORE => {}
}
// No arm produces a value yet; this placeholder panics when reached.
todo!()
}
/// Decoded operation yielded by `parse_ops`.
///
/// Only a field-less `Info` placeholder exists so far.
pub enum Op {
Info {},
}