mirror of
https://github.com/RustPython/RustPython.git
synced 2026-06-02 19:39:49 +09:00
WIP - native _sre
This commit is contained in:
66
Lib/sre_constants.py
vendored
66
Lib/sre_constants.py
vendored
@@ -179,11 +179,22 @@ SRE_INFO_LITERAL = 2 # entire pattern is literal (given by prefix)
|
||||
SRE_INFO_CHARSET = 4 # pattern starts with character from given set
|
||||
|
||||
if __name__ == "__main__":
|
||||
def dump(f, d, prefix):
|
||||
def dump(f, d, typ, int_t, prefix):
|
||||
items = sorted(d)
|
||||
f.write(f"""\
|
||||
#[derive(num_enum::TryFromPrimitive, Debug)]
|
||||
#[repr({int_t})]
|
||||
#[allow(non_camel_case_types)]
|
||||
pub enum {typ} {{
|
||||
""")
|
||||
for item in items:
|
||||
f.write("#define %s_%s %d\n" % (prefix, item, item))
|
||||
with open("sre_constants.h", "w") as f:
|
||||
name = str(item).removeprefix(prefix)
|
||||
val = int(item)
|
||||
f.write(f" {name} = {val},\n")
|
||||
f.write("""\
|
||||
}
|
||||
""")
|
||||
with open("vm/src/stdlib/sre/constants.rs", "w") as f:
|
||||
f.write("""\
|
||||
/*
|
||||
* Secret Labs' Regular Expression Engine
|
||||
@@ -200,24 +211,41 @@ if __name__ == "__main__":
|
||||
|
||||
""")
|
||||
|
||||
f.write("#define SRE_MAGIC %d\n" % MAGIC)
|
||||
f.write("use bitflags::bitflags;\n\n");
|
||||
|
||||
dump(f, OPCODES, "SRE_OP")
|
||||
dump(f, ATCODES, "SRE")
|
||||
dump(f, CHCODES, "SRE")
|
||||
f.write("pub const SRE_MAGIC: usize = %d;\n" % MAGIC)
|
||||
|
||||
f.write("#define SRE_FLAG_TEMPLATE %d\n" % SRE_FLAG_TEMPLATE)
|
||||
f.write("#define SRE_FLAG_IGNORECASE %d\n" % SRE_FLAG_IGNORECASE)
|
||||
f.write("#define SRE_FLAG_LOCALE %d\n" % SRE_FLAG_LOCALE)
|
||||
f.write("#define SRE_FLAG_MULTILINE %d\n" % SRE_FLAG_MULTILINE)
|
||||
f.write("#define SRE_FLAG_DOTALL %d\n" % SRE_FLAG_DOTALL)
|
||||
f.write("#define SRE_FLAG_UNICODE %d\n" % SRE_FLAG_UNICODE)
|
||||
f.write("#define SRE_FLAG_VERBOSE %d\n" % SRE_FLAG_VERBOSE)
|
||||
f.write("#define SRE_FLAG_DEBUG %d\n" % SRE_FLAG_DEBUG)
|
||||
f.write("#define SRE_FLAG_ASCII %d\n" % SRE_FLAG_ASCII)
|
||||
dump(f, OPCODES, "SreOpcode", "u32", "")
|
||||
dump(f, ATCODES, "SreAtCode", "u32", "AT_")
|
||||
dump(f, CHCODES, "SreCatCode", "u32", "CATEGORY_")
|
||||
|
||||
f.write("#define SRE_INFO_PREFIX %d\n" % SRE_INFO_PREFIX)
|
||||
f.write("#define SRE_INFO_LITERAL %d\n" % SRE_INFO_LITERAL)
|
||||
f.write("#define SRE_INFO_CHARSET %d\n" % SRE_INFO_CHARSET)
|
||||
def bitflags(typ, int_t, prefix, flags):
|
||||
f.write(f"""\
|
||||
bitflags! {{
|
||||
pub struct {typ}: {int_t} {{
|
||||
""")
|
||||
for name in flags:
|
||||
val = globals()[prefix + name]
|
||||
f.write(f" const {name} = {val};\n")
|
||||
f.write("""\
|
||||
}
|
||||
}
|
||||
""")
|
||||
|
||||
bitflags("SreFlag", "u16", "SRE_FLAG_", [
|
||||
"TEMPLATE",
|
||||
"IGNORECASE",
|
||||
"LOCALE",
|
||||
"MULTILINE",
|
||||
"DOTALL",
|
||||
"UNICODE",
|
||||
"VERBOSE",
|
||||
"DEBUG",
|
||||
"ASCII",
|
||||
])
|
||||
|
||||
bitflags("SreInfo", "u32", "SRE_INFO_", [
|
||||
"PREFIX", "LITERAL", "CHARSET",
|
||||
])
|
||||
|
||||
print("done")
|
||||
|
||||
@@ -860,3 +860,18 @@ impl PyMapping {
|
||||
self.dict
|
||||
}
|
||||
}
|
||||
|
||||
impl<K, V> TryFromObject for std::collections::HashMap<K, V>
|
||||
where
|
||||
K: TryFromObject + std::hash::Hash + Eq,
|
||||
V: TryFromObject,
|
||||
{
|
||||
fn try_from_object(vm: &VirtualMachine, obj: PyObjectRef) -> PyResult<Self> {
|
||||
let mapping = PyMapping::try_from_object(vm, obj)?;
|
||||
mapping
|
||||
.into_dict()
|
||||
.into_iter()
|
||||
.map(|(k, v)| Ok((K::try_from_object(vm, k)?, V::try_from_object(vm, v)?)))
|
||||
.collect()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -285,6 +285,9 @@ impl PyStr {
|
||||
len
|
||||
})
|
||||
}
|
||||
pub fn char_len(&self) -> usize {
|
||||
self.len()
|
||||
}
|
||||
|
||||
#[pymethod(name = "__sizeof__")]
|
||||
fn sizeof(&self) -> usize {
|
||||
@@ -1457,3 +1460,11 @@ impl<'s> AnyStr<'s> for str {
|
||||
splited
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFromObject for String {
|
||||
fn try_from_object(vm: &VirtualMachine, obj: PyObjectRef) -> PyResult<Self> {
|
||||
Ok(PyStrRef::try_from_object(vm, obj)?
|
||||
.borrow_value()
|
||||
.to_owned())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -29,6 +29,7 @@ mod re;
|
||||
mod serde_json;
|
||||
#[cfg(not(target_arch = "wasm32"))]
|
||||
pub mod socket;
|
||||
mod sre;
|
||||
mod string;
|
||||
#[cfg(feature = "rustpython-compiler")]
|
||||
mod symtable;
|
||||
@@ -94,6 +95,7 @@ pub fn get_module_inits() -> HashMap<String, StdlibInitFunc, ahash::RandomState>
|
||||
"regex_crate".to_owned() => Box::new(re::make_module),
|
||||
"_random".to_owned() => Box::new(random::make_module),
|
||||
"_serde_json".to_owned() => Box::new(serde_json::make_module),
|
||||
"_sre".to_owned() => Box::new(sre::make_module),
|
||||
"_string".to_owned() => Box::new(string::make_module),
|
||||
"_struct".to_owned() => Box::new(pystruct::make_module),
|
||||
"time".to_owned() => Box::new(time_module::make_module),
|
||||
|
||||
59
vm/src/stdlib/sre.rs
Normal file
59
vm/src/stdlib/sre.rs
Normal file
@@ -0,0 +1,59 @@
|
||||
mod constants;
|
||||
mod interp;
|
||||
|
||||
pub(crate) use _sre::make_module;
|
||||
|
||||
#[pymodule]
|
||||
mod _sre {
|
||||
use super::constants::SreFlag;
|
||||
use crate::builtins::PyTypeRef;
|
||||
use crate::pyobject::{PyObjectRef, PyResult, PyValue, StaticType};
|
||||
use crate::VirtualMachine;
|
||||
use std::collections::HashMap;
|
||||
|
||||
#[pyattr]
|
||||
use super::constants::SRE_MAGIC as MAGIC;
|
||||
|
||||
#[pyfunction]
|
||||
fn compile(
|
||||
pattern: PyObjectRef,
|
||||
flags: u16,
|
||||
code: PyObjectRef,
|
||||
groups: usize,
|
||||
groupindex: HashMap<String, usize>,
|
||||
indexgroup: PyObjectRef,
|
||||
vm: &VirtualMachine,
|
||||
) -> PyResult<Pattern> {
|
||||
let code = vm.extract_elements::<u32>(&code)?;
|
||||
|
||||
Ok(Pattern {
|
||||
pattern,
|
||||
flags: SreFlag::from_bits_truncate(flags),
|
||||
code,
|
||||
groups,
|
||||
groupindex,
|
||||
indexgroup: vm.extract_elements(&indexgroup)?,
|
||||
})
|
||||
}
|
||||
|
||||
#[pyattr]
|
||||
#[pyclass(name = "Pattern")]
|
||||
#[derive(Debug)]
|
||||
struct Pattern {
|
||||
pattern: PyObjectRef,
|
||||
flags: SreFlag,
|
||||
code: Vec<u32>,
|
||||
groups: usize,
|
||||
groupindex: HashMap<String, usize>,
|
||||
indexgroup: Vec<Option<String>>,
|
||||
}
|
||||
|
||||
impl PyValue for Pattern {
|
||||
fn class(_vm: &VirtualMachine) -> &PyTypeRef {
|
||||
Self::static_type()
|
||||
}
|
||||
}
|
||||
|
||||
#[pyimpl]
|
||||
impl Pattern {}
|
||||
}
|
||||
114
vm/src/stdlib/sre/constants.rs
Normal file
114
vm/src/stdlib/sre/constants.rs
Normal file
@@ -0,0 +1,114 @@
|
||||
/*
|
||||
* Secret Labs' Regular Expression Engine
|
||||
*
|
||||
* regular expression matching engine
|
||||
*
|
||||
* NOTE: This file is generated by sre_constants.py. If you need
|
||||
* to change anything in here, edit sre_constants.py and run it.
|
||||
*
|
||||
* Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
|
||||
*
|
||||
* See the _sre.c file for information on usage and redistribution.
|
||||
*/
|
||||
|
||||
use bitflags::bitflags;
|
||||
|
||||
/// Value of `MAGIC` written out by `sre_constants.py` when this file was generated.
pub const SRE_MAGIC: usize = 20_140_917;
|
||||
#[derive(num_enum::TryFromPrimitive, Debug)]
|
||||
#[repr(u32)]
|
||||
#[allow(non_camel_case_types)]
|
||||
pub enum SreOpcode {
|
||||
FAILURE = 0,
|
||||
SUCCESS = 1,
|
||||
ANY = 2,
|
||||
ANY_ALL = 3,
|
||||
ASSERT = 4,
|
||||
ASSERT_NOT = 5,
|
||||
AT = 6,
|
||||
BRANCH = 7,
|
||||
CALL = 8,
|
||||
CATEGORY = 9,
|
||||
CHARSET = 10,
|
||||
BIGCHARSET = 11,
|
||||
GROUPREF = 12,
|
||||
GROUPREF_EXISTS = 13,
|
||||
GROUPREF_IGNORE = 14,
|
||||
IN = 15,
|
||||
IN_IGNORE = 16,
|
||||
INFO = 17,
|
||||
JUMP = 18,
|
||||
LITERAL = 19,
|
||||
LITERAL_IGNORE = 20,
|
||||
MARK = 21,
|
||||
MAX_UNTIL = 22,
|
||||
MIN_UNTIL = 23,
|
||||
NOT_LITERAL = 24,
|
||||
NOT_LITERAL_IGNORE = 25,
|
||||
NEGATE = 26,
|
||||
RANGE = 27,
|
||||
REPEAT = 28,
|
||||
REPEAT_ONE = 29,
|
||||
SUBPATTERN = 30,
|
||||
MIN_REPEAT_ONE = 31,
|
||||
RANGE_IGNORE = 32,
|
||||
}
|
||||
#[derive(num_enum::TryFromPrimitive, Debug)]
|
||||
#[repr(u32)]
|
||||
#[allow(non_camel_case_types)]
|
||||
pub enum SreAtCode {
|
||||
BEGINNING = 0,
|
||||
BEGINNING_LINE = 1,
|
||||
BEGINNING_STRING = 2,
|
||||
BOUNDARY = 3,
|
||||
NON_BOUNDARY = 4,
|
||||
END = 5,
|
||||
END_LINE = 6,
|
||||
END_STRING = 7,
|
||||
LOC_BOUNDARY = 8,
|
||||
LOC_NON_BOUNDARY = 9,
|
||||
UNI_BOUNDARY = 10,
|
||||
UNI_NON_BOUNDARY = 11,
|
||||
}
|
||||
#[derive(num_enum::TryFromPrimitive, Debug)]
|
||||
#[repr(u32)]
|
||||
#[allow(non_camel_case_types)]
|
||||
pub enum SreCatCode {
|
||||
DIGIT = 0,
|
||||
NOT_DIGIT = 1,
|
||||
SPACE = 2,
|
||||
NOT_SPACE = 3,
|
||||
WORD = 4,
|
||||
NOT_WORD = 5,
|
||||
LINEBREAK = 6,
|
||||
NOT_LINEBREAK = 7,
|
||||
LOC_WORD = 8,
|
||||
LOC_NOT_WORD = 9,
|
||||
UNI_DIGIT = 10,
|
||||
UNI_NOT_DIGIT = 11,
|
||||
UNI_SPACE = 12,
|
||||
UNI_NOT_SPACE = 13,
|
||||
UNI_WORD = 14,
|
||||
UNI_NOT_WORD = 15,
|
||||
UNI_LINEBREAK = 16,
|
||||
UNI_NOT_LINEBREAK = 17,
|
||||
}
|
||||
bitflags! {
    /// Pattern flags, generated from the `SRE_FLAG_*` constants in
    /// `sre_constants.py`; each flag occupies one bit.
    pub struct SreFlag: u16 {
        const TEMPLATE = 1 << 0;
        const IGNORECASE = 1 << 1;
        const LOCALE = 1 << 2;
        const MULTILINE = 1 << 3;
        const DOTALL = 1 << 4;
        const UNICODE = 1 << 5;
        const VERBOSE = 1 << 6;
        const DEBUG = 1 << 7;
        const ASCII = 1 << 8;
    }
}
|
||||
bitflags! {
    /// Info-block flags, generated from the `SRE_INFO_*` constants in
    /// `sre_constants.py`.
    pub struct SreInfo: u32 {
        const PREFIX = 1 << 0;
        /// Entire pattern is literal (given by prefix).
        const LITERAL = 1 << 1;
        /// Pattern starts with a character from a given set.
        const CHARSET = 1 << 2;
    }
}
|
||||
126
vm/src/stdlib/sre/interp.rs
Normal file
126
vm/src/stdlib/sre/interp.rs
Normal file
@@ -0,0 +1,126 @@
|
||||
// good luck to those that follow; here be dragons
|
||||
|
||||
use crate::builtins::PyStrRef;
|
||||
|
||||
use super::constants::{SreFlag, SreOpcode};
|
||||
|
||||
use std::convert::TryFrom;
|
||||
use std::{iter, slice};
|
||||
|
||||
pub struct State {
|
||||
start: usize,
|
||||
s_pos: usize,
|
||||
end: usize,
|
||||
pos: usize,
|
||||
flags: SreFlag,
|
||||
marks: Vec<usize>,
|
||||
lastindex: isize,
|
||||
marks_stack: Vec<usize>,
|
||||
context_stack: Vec<MatchContext>,
|
||||
repeat: Option<usize>,
|
||||
s: PyStrRef,
|
||||
}
|
||||
|
||||
// struct State1<'a> {
|
||||
// state: &'a mut State,
|
||||
// }
|
||||
|
||||
/// One entry of `State::context_stack`: a pair of positions.
// NOTE(review): presumably `s_pos` indexes the subject string and `code_pos`
// the pattern code — confirm once the interpreter loop exists.
struct MatchContext {
    s_pos: usize,
    code_pos: usize,
}
|
||||
|
||||
// struct Context<'a> {
|
||||
// context_stack: &mut Vec<MatchContext>,
|
||||
// }
|
||||
|
||||
impl State {
|
||||
pub fn new(s: PyStrRef, start: usize, end: usize, flags: SreFlag) -> Self {
|
||||
let end = std::cmp::min(end, s.char_len());
|
||||
Self {
|
||||
start,
|
||||
s_pos: start,
|
||||
end,
|
||||
pos: start,
|
||||
flags,
|
||||
marks: Vec::new(),
|
||||
lastindex: -1,
|
||||
marks_stack: Vec::new(),
|
||||
context_stack: Vec::new(),
|
||||
repeat: None,
|
||||
s,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// struct OpcodeDispatcher {
|
||||
// executing_contexts: HashMap<usize, Rc<State>>,
|
||||
// }
|
||||
|
||||
/// Error yielded by `parse_ops` when a code word is not a known opcode or its
/// operands cannot be decoded.
///
/// Derives `Debug` so a `Result<_, BadSreCode>` can be `unwrap`ped, `expect`ed
/// or debug-printed.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct BadSreCode;
|
||||
|
||||
pub fn parse_ops(code: &[u32]) -> impl Iterator<Item = Result<Op, BadSreCode>> + '_ {
|
||||
let mut it = code.iter().copied();
|
||||
std::iter::from_fn(move || -> Option<Option<Op>> {
|
||||
let op = it.next()?;
|
||||
let op = SreOpcode::try_from(op)
|
||||
.ok()
|
||||
.and_then(|op| extract_code(op, &mut it));
|
||||
Some(op)
|
||||
})
|
||||
.map(|x| x.ok_or(BadSreCode))
|
||||
}
|
||||
|
||||
type It<'a> = iter::Copied<slice::Iter<'a, u32>>;
|
||||
fn extract_code(op: SreOpcode, it: &mut It) -> Option<Op> {
|
||||
let skip = |it: &mut It| {
|
||||
let skip = it.next()? as usize;
|
||||
if skip > it.len() {
|
||||
None
|
||||
} else {
|
||||
Some(skip)
|
||||
}
|
||||
};
|
||||
match op {
|
||||
SreOpcode::FAILURE => {}
|
||||
SreOpcode::SUCCESS => {}
|
||||
SreOpcode::ANY => {}
|
||||
SreOpcode::ANY_ALL => {}
|
||||
SreOpcode::ASSERT => {}
|
||||
SreOpcode::ASSERT_NOT => {}
|
||||
SreOpcode::AT => {}
|
||||
SreOpcode::BRANCH => {}
|
||||
SreOpcode::CALL => {}
|
||||
SreOpcode::CATEGORY => {}
|
||||
SreOpcode::CHARSET => {}
|
||||
SreOpcode::BIGCHARSET => {}
|
||||
SreOpcode::GROUPREF => {}
|
||||
SreOpcode::GROUPREF_EXISTS => {}
|
||||
SreOpcode::GROUPREF_IGNORE => {}
|
||||
SreOpcode::IN => {}
|
||||
SreOpcode::IN_IGNORE => {}
|
||||
SreOpcode::INFO => {
|
||||
// let skip = it.next()?;
|
||||
}
|
||||
SreOpcode::JUMP => {}
|
||||
SreOpcode::LITERAL => {}
|
||||
SreOpcode::LITERAL_IGNORE => {}
|
||||
SreOpcode::MARK => {}
|
||||
SreOpcode::MAX_UNTIL => {}
|
||||
SreOpcode::MIN_UNTIL => {}
|
||||
SreOpcode::NOT_LITERAL => {}
|
||||
SreOpcode::NOT_LITERAL_IGNORE => {}
|
||||
SreOpcode::NEGATE => {}
|
||||
SreOpcode::RANGE => {}
|
||||
SreOpcode::REPEAT => {}
|
||||
SreOpcode::REPEAT_ONE => {}
|
||||
SreOpcode::SUBPATTERN => {}
|
||||
SreOpcode::MIN_REPEAT_ONE => {}
|
||||
SreOpcode::RANGE_IGNORE => {}
|
||||
}
|
||||
todo!()
|
||||
}
|
||||
|
||||
/// A decoded pattern operation, as yielded by `parse_ops`.
///
/// Only the (still empty) `Info` variant exists so far.
pub enum Op {
    /// Placeholder for the decoded form of the `INFO` opcode; carries no data yet.
    Info {},
}
|
||||
Reference in New Issue
Block a user