WIP structure

This commit is contained in:
Kangzhi Shi
2020-12-01 17:51:11 +02:00
parent c2ee9ca3e0
commit e1362ead3c
2 changed files with 380 additions and 93 deletions

View File

@@ -14,6 +14,10 @@
use bitflags::bitflags;
/// Format magic number for compiled patterns (presumably mirrors CPython's `sre_constants.MAGIC`
/// — confirm against the Python side).
pub const SRE_MAGIC: usize = 20_140_917;
/// Size of one pattern code unit (presumably in bytes — confirm).
pub const SRE_CODESIZE: usize = 4;
/// Sentinel repeat count: the largest value a `usize` can hold.
pub const SRE_MAXREPEAT: usize = usize::MAX;
/// Upper bound on the number of groups, derived from the largest `usize` and the size of a `usize`.
pub const SRE_MAXGROUPS: usize = usize::MAX / std::mem::size_of::<usize>() / 2;
#[derive(num_enum::TryFromPrimitive, Debug)]
#[repr(u32)]
#[allow(non_camel_case_types)]

469
interp.rs
View File

@@ -1,126 +1,409 @@
// good luck to those that follow; here be dragons
use super::constants::{SreFlag, SreOpcode, SRE_MAXREPEAT};
use crate::builtins::PyStrRef;
use super::constants::{SreFlag, SreOpcode};
use rustpython_common::borrow::BorrowValue;
use std::collections::HashMap;
use std::convert::TryFrom;
use std::{iter, slice};
pub struct State {
pub struct State<'a> {
// py_string: PyStrRef,
string: &'a str,
start: usize,
s_pos: usize,
end: usize,
pos: usize,
flags: SreFlag,
pattern_codes: Vec<u32>,
marks: Vec<usize>,
lastindex: isize,
marks_stack: Vec<usize>,
context_stack: Vec<MatchContext>,
repeat: Option<usize>,
s: PyStrRef,
string_position: usize,
}
// struct State1<'a> {
// state: &'a mut State,
// }
struct MatchContext {
s_pos: usize,
code_pos: usize,
}
// struct Context<'a> {
// context_stack: &mut Vec<MatchContext>,
// }
impl State {
pub fn new(s: PyStrRef, start: usize, end: usize, flags: SreFlag) -> Self {
let end = std::cmp::min(end, s.char_len());
impl<'a> State<'a> {
pub(crate) fn new(
// py_string: PyStrRef,
string: &'a str,
start: usize,
end: usize,
flags: SreFlag,
pattern_codes: Vec<u32>,
) -> Self {
// let string = py_string.borrow_value();
Self {
// py_string,
string,
start,
s_pos: start,
end,
pos: start,
flags,
marks: Vec::new(),
pattern_codes,
lastindex: -1,
marks_stack: Vec::new(),
context_stack: Vec::new(),
repeat: None,
s,
marks: Vec::new(),
string_position: start,
}
}
fn reset(&mut self) {
self.marks.clear();
self.lastindex = -1;
self.marks_stack.clear();
self.context_stack.clear();
self.repeat = None;
}
}
// struct OpcodeDispatcher {
// executing_contexts: HashMap<usize, Rc<State>>,
// }
/// Error marker yielded by `parse_ops` for a code word that could not be decoded into an `Op`.
pub struct BadSreCode;
/// Lazily decodes a slice of raw pattern code words into `Op`s.
///
/// Each step pulls one word, converts it to an `SreOpcode` and lets `extract_code` consume the
/// operands it needs from the same iterator. A word that is not a valid opcode, or an opcode
/// that `extract_code` rejects, is reported as `Err(BadSreCode)`; iteration ends when the words
/// run out.
pub fn parse_ops(code: &[u32]) -> impl Iterator<Item = Result<Op, BadSreCode>> + '_ {
    let mut words = code.iter().copied();
    std::iter::from_fn(move || {
        let raw = words.next()?;
        let decoded = match SreOpcode::try_from(raw) {
            Ok(opcode) => extract_code(opcode, &mut words),
            Err(_) => None,
        };
        Some(decoded.ok_or(BadSreCode))
    })
}
/// By-value iterator over pattern code words; this is the type `code.iter().copied()` produces
/// in `parse_ops`.
type It<'a> = iter::Copied<slice::Iter<'a, u32>>;
fn extract_code(op: SreOpcode, it: &mut It) -> Option<Op> {
let skip = |it: &mut It| {
let skip = it.next()? as usize;
if skip > it.len() {
None
} else {
Some(skip)
}
pub(crate) fn pymatch(mut state: State) -> bool {
let ctx = MatchContext {
string_position: state.start,
code_position: 0,
has_matched: None,
};
match op {
SreOpcode::FAILURE => {}
SreOpcode::SUCCESS => {}
SreOpcode::ANY => {}
SreOpcode::ANY_ALL => {}
SreOpcode::ASSERT => {}
SreOpcode::ASSERT_NOT => {}
SreOpcode::AT => {}
SreOpcode::BRANCH => {}
SreOpcode::CALL => {}
SreOpcode::CATEGORY => {}
SreOpcode::CHARSET => {}
SreOpcode::BIGCHARSET => {}
SreOpcode::GROUPREF => {}
SreOpcode::GROUPREF_EXISTS => {}
SreOpcode::GROUPREF_IGNORE => {}
SreOpcode::IN => {}
SreOpcode::IN_IGNORE => {}
SreOpcode::INFO => {
// let skip = it.next()?;
state.context_stack.push(ctx);
let mut has_matched = None;
loop {
if state.context_stack.is_empty() {
break;
}
let ctx_id = state.context_stack.len() - 1;
let mut drive = MatchContextDrive::drive(ctx_id, state);
let mut dispatcher = OpcodeDispatcher::new();
has_matched = dispatcher.pymatch(&mut drive);
state = drive.take();
if has_matched.is_some() {
state.context_stack.pop();
}
SreOpcode::JUMP => {}
SreOpcode::LITERAL => {}
SreOpcode::LITERAL_IGNORE => {}
SreOpcode::MARK => {}
SreOpcode::MAX_UNTIL => {}
SreOpcode::MIN_UNTIL => {}
SreOpcode::NOT_LITERAL => {}
SreOpcode::NOT_LITERAL_IGNORE => {}
SreOpcode::NEGATE => {}
SreOpcode::RANGE => {}
SreOpcode::REPEAT => {}
SreOpcode::REPEAT_ONE => {}
SreOpcode::SUBPATTERN => {}
SreOpcode::MIN_REPEAT_ONE => {}
SreOpcode::RANGE_IGNORE => {}
}
todo!()
has_matched.unwrap_or(false)
}
pub enum Op {
Info {},
/// Cursor of one matching context; instances live on `State::context_stack`.
#[derive(Debug, Copy, Clone)]
struct MatchContext {
// Offset into the subject string. `MatchContextDrive::str` uses it as a byte index.
string_position: usize,
// Index into `State::pattern_codes` of the code word this context is looking at.
code_position: usize,
// `None` while the outcome is undecided, `Some(result)` once the context matched or failed.
has_matched: Option<bool>,
}
/// Cursor that runs one `MatchContext` against the shared `State`.
///
/// The drive owns the `State` while opcodes execute and addresses its context by index
/// (`ctx_id`) into `state.context_stack`; `take` hands the state back to the caller.
struct MatchContextDrive<'a> {
    state: State<'a>,
    ctx_id: usize,
}

impl<'a> MatchContextDrive<'a> {
    /// Index of the driven context inside `state.context_stack`.
    fn id(&self) -> usize {
        self.ctx_id
    }

    /// Mutable access to the driven context.
    fn ctx_mut(&mut self) -> &mut MatchContext {
        &mut self.state.context_stack[self.ctx_id]
    }

    /// Shared access to the driven context.
    fn ctx(&self) -> &MatchContext {
        &self.state.context_stack[self.ctx_id]
    }

    /// Pushes a child context that starts at the current string position and `pattern_offset`
    /// code words past the current code position. Returns the child's index on the stack.
    fn push_new_context(&mut self, pattern_offset: usize) -> usize {
        let parent = self.ctx();
        let child = MatchContext {
            string_position: parent.string_position,
            code_position: parent.code_position + pattern_offset,
            has_matched: None,
        };
        self.state.context_stack.push(child);
        self.state.context_stack.len() - 1
    }

    /// Wraps `state` so that the context at index `ctx_id` can be driven.
    fn drive(ctx_id: usize, state: State<'a>) -> Self {
        Self { state, ctx_id }
    }

    /// Releases the wrapped state.
    fn take(self) -> State<'a> {
        self.state
    }

    /// The subject string from the current position to its end.
    ///
    /// `string_position` is a byte offset. Slicing is done with the checked `str` index, so an
    /// offset that is out of range or not on a char boundary panics instead of producing an
    /// invalid `&str`.
    fn str(&self) -> &str {
        &self.state.string[self.ctx().string_position..]
    }

    /// The char at the current position.
    ///
    /// # Panics
    /// Panics when the position is at the end of the string; check `at_end` first.
    fn peek_char(&self) -> char {
        self.str()
            .chars()
            .next()
            .expect("peek_char called at the end of the subject string")
    }

    /// The code word `peek` entries past the current code position.
    fn peek_code(&self, peek: usize) -> u32 {
        self.state.pattern_codes[self.ctx().code_position + peek]
    }

    /// Advances the string position by `skip_count` chars.
    ///
    /// Skipping exactly the remaining chars lands on the end of the string (previously this
    /// panicked, e.g. when a literal matched the final char); a larger count is clamped to the
    /// end as well.
    fn skip_char(&mut self, skip_count: usize) {
        let rest = self.str();
        let skipped = rest
            .char_indices()
            .nth(skip_count)
            .map_or(rest.len(), |(offset, _)| offset);
        self.ctx_mut().string_position += skipped;
    }

    /// Advances the code position by `skip_count` code words.
    fn skip_code(&mut self, skip_count: usize) {
        self.ctx_mut().code_position += skip_count;
    }

    /// Number of chars that start at or after the current position and before `state.end`.
    ///
    /// NOTE(review): this treats `state.end` as a byte offset, the same unit as
    /// `string_position` — confirm once the unit of `end` is settled. The count is a linear
    /// scan of the remaining input.
    fn remaining_chars(&self) -> usize {
        let position = self.ctx().string_position;
        let end = self.state.end;
        self.str()
            .char_indices()
            .take_while(|&(offset, _)| position + offset < end)
            .count()
    }

    /// Number of code words from the current code position to the end of the pattern.
    fn remaining_codes(&self) -> usize {
        self.state.pattern_codes.len() - self.ctx().code_position
    }

    /// Whether the position is at the very start of the subject string.
    fn at_beginning(&self) -> bool {
        self.ctx().string_position == 0
    }

    /// Whether no input is left after the current position (`state.end` is not consulted).
    fn at_end(&self) -> bool {
        self.str().is_empty()
    }

    /// Whether the char at the current position is `'\n'`.
    fn at_linebreak(&self) -> bool {
        self.str().starts_with('\n')
    }
}
/// Selects an executor for each opcode and parks executors that are not finished yet.
struct OpcodeDispatcher {
// Executors whose `next` returned `Some(())`, keyed by the id of the context they ran for.
executing_contexts: HashMap<usize, Box<dyn OpcodeExecutor>>,
}
// Placeholder used for opcodes that have no implementation yet: the argument is ignored and
// the expansion is always a boxed `OpEmpty`.
macro_rules! once {
($val:expr) => {
Box::new(OpEmpty {})
};
}
/// Step-wise implementation of one opcode.
trait OpcodeExecutor {
/// Runs the next step. `OpcodeDispatcher::dispatch` keeps the executor for a later call when
/// this returns `Some(())` and drops it when this returns `None`.
fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()>;
}
/// Executor for the `FAILURE` opcode.
struct OpFailure {}

impl OpcodeExecutor for OpFailure {
    /// Marks the driven context as not matched and finishes in the same step.
    fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()> {
        let context = drive.ctx_mut();
        context.has_matched = Some(false);
        None
    }
}
/// Executor that does nothing; the `once!` macro expands to a boxed instance of it.
struct OpEmpty {}

impl OpcodeExecutor for OpEmpty {
    /// Finishes immediately and leaves the drive untouched.
    fn next(&mut self, _drive: &mut MatchContextDrive) -> Option<()> {
        None
    }
}
/// Executor that runs a closure a single time.
struct OpOnce<F> {
    // `Some` until the closure has been run; `None` afterwards.
    f: Option<F>,
}

impl<F: FnOnce(&mut MatchContextDrive)> OpcodeExecutor for OpOnce<F> {
    /// Runs the stored closure if it has not run yet. Always reports completion (`None`).
    fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()> {
        if let Some(action) = self.f.take() {
            action(drive);
        }
        None
    }
}
/// Boxes `f` in an `OpOnce` executor, which calls it on the first `next` and never again.
fn once<F: FnOnce(&mut MatchContextDrive)>(f: F) -> Box<OpOnce<F>> {
Box::new(OpOnce { f: Some(f) })
}
/// Resumable executor for the `MIN_REPEAT_ONE` opcode.
///
/// The work is split into the stages `_0` … `_3`; `trace_id` records which stage runs on the
/// next call to `next`. `Default` starts at stage 0 with every counter zeroed.
#[derive(Default)]
struct OpMinRepeatOne {
    // Stage to run on the next call to `next`.
    trace_id: usize,
    // Minimum repeat count, read from the pattern at code offset 2.
    mincount: usize,
    // Maximum repeat count, read from the pattern at code offset 3.
    maxcount: usize,
    // Repetitions consumed so far.
    count: usize,
    // Stack index of the child context pushed to try the rest of the pattern.
    child_ctx_id: usize,
}

impl OpcodeExecutor for OpMinRepeatOne {
    /// Runs the stage selected by `trace_id`.
    ///
    /// Every stage is dispatched: stage `_1` parks the executor with `trace_id == 2`, so a
    /// resumed executor must be able to continue at a stage other than 0.
    fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()> {
        match self.trace_id {
            0 => self._0(drive),
            1 => self._1(drive),
            2 => self._2(drive),
            3 => self._3(drive),
            other => unreachable!("OpMinRepeatOne has no stage {}", other),
        }
    }
}
// Stages of the MIN_REPEAT_ONE state machine. Each stage returns `Some(())` when the executor
// has to be resumed later and `None` when it is finished; the outcome is written to the driven
// context's `has_matched`.
impl OpMinRepeatOne {
// Stage 0: read the repeat bounds and consume the mandatory minimum.
fn _0(&mut self, drive: &mut MatchContextDrive) -> Option<()> {
self.mincount = drive.peek_code(2) as usize;
self.maxcount = drive.peek_code(3) as usize;
// Not enough input left to satisfy the minimum: fail right away.
if drive.remaining_chars() < self.mincount {
drive.ctx_mut().has_matched = Some(false);
return None;
}
drive.state.string_position = drive.ctx().string_position;
self.count = if self.mincount == 0 {
0
} else {
let count = count_repetitions(drive, self.mincount);
if count < self.mincount {
drive.ctx_mut().has_matched = Some(false);
return None;
}
drive.skip_char(count);
count
};
// The code word at offset `peek_code(1) + 1` is where the child context of stage 1 would
// start; if it is SUCCESS the whole match succeeds here.
if drive.peek_code(drive.peek_code(1) as usize + 1) == SreOpcode::SUCCESS as u32 {
drive.state.string_position = drive.ctx().string_position;
drive.ctx_mut().has_matched = Some(true);
return None;
}
// mark push
self.trace_id = 1;
self._1(drive)
}
// Stage 1: while the repeat limit allows it, push a child context `peek_code(1) + 1` code
// words ahead and suspend (stage 2 runs on resume); otherwise fail.
fn _1(&mut self, drive: &mut MatchContextDrive) -> Option<()> {
if self.maxcount == SRE_MAXREPEAT || self.count <= self.maxcount {
drive.state.string_position = drive.ctx().string_position;
self.child_ctx_id = drive.push_new_context(drive.peek_code(1) as usize + 1);
self.trace_id = 2;
return Some(());
}
// mark discard
drive.ctx_mut().has_matched = Some(false);
None
}
// Stage 2: inspect the child's result. On success the match succeeds; otherwise try to
// consume one more repetition and go back to stage 1, or give up via stage 3.
// NOTE(review): this indexes `context_stack` with `child_ctx_id`, so the child entry must
// still be on the stack when the executor is resumed — confirm against the driver loop.
fn _2(&mut self, drive: &mut MatchContextDrive) -> Option<()> {
if let Some(true) = drive.state.context_stack[self.child_ctx_id].has_matched {
drive.ctx_mut().has_matched = Some(true);
return None;
}
drive.state.string_position = drive.ctx().string_position;
if count_repetitions(drive, 1) == 0 {
self.trace_id = 3;
return self._3(drive);
}
drive.skip_char(1);
self.count += 1;
// marks pop keep
self.trace_id = 1;
self._1(drive)
}
// Stage 3: no further repetition is possible; the context fails.
fn _3(&mut self, drive: &mut MatchContextDrive) -> Option<()> {
// mark discard
drive.ctx_mut().has_matched = Some(false);
None
}
}
impl OpcodeDispatcher {
    /// Creates a dispatcher with no suspended executors.
    fn new() -> Self {
        Self {
            executing_contexts: HashMap::new(),
        }
    }

    /// Runs the driven context until it is decided or has to wait for a child context.
    ///
    /// Returns `Some(true)` if the context matches, `Some(false)` if it does not, and `None`
    /// if matching is not finished, i.e. it must be resumed after child contexts have been
    /// matched. Suspended executors are parked in this dispatcher, so the resume has to go
    /// through the same dispatcher instance.
    ///
    /// # Panics
    /// Panics if the current code word is not a valid `SreOpcode`.
    fn pymatch(&mut self, drive: &mut MatchContextDrive) -> Option<bool> {
        while drive.remaining_codes() > 0 && drive.ctx().has_matched.is_none() {
            let code = drive.peek_code(0);
            let opcode =
                SreOpcode::try_from(code).expect("pattern code word is not a valid SreOpcode");
            // A `false` result means the executor suspended itself (for example after pushing
            // a child context). Yield to the caller instead of re-entering the executor before
            // the child has run.
            if !self.dispatch(opcode, drive) {
                return None;
            }
        }
        match drive.ctx().has_matched {
            Some(matched) => Some(matched),
            None => {
                // Ran out of pattern codes without a verdict: treat as a failed match.
                drive.ctx_mut().has_matched = Some(false);
                Some(false)
            }
        }
    }

    /// Dispatches a context on a given opcode. Returns `true` if the executor is done, `false`
    /// if it must be resumed when the context is next encountered.
    ///
    /// An executor parked for this context takes precedence over `opcode`.
    fn dispatch(&mut self, opcode: SreOpcode, drive: &mut MatchContextDrive) -> bool {
        let mut executor = match self.executing_contexts.remove(&drive.id()) {
            Some(parked) => parked,
            None => self.dispatch_table(opcode, drive),
        };
        match executor.next(drive) {
            Some(()) => {
                self.executing_contexts.insert(drive.id(), executor);
                false
            }
            None => true,
        }
    }

    /// Builds the executor for `opcode`.
    ///
    /// Opcodes mapped to `once!(true)` are placeholders that do nothing yet. `LITERAL` does its
    /// work right here, before the (empty) executor is returned.
    fn dispatch_table(
        &mut self,
        opcode: SreOpcode,
        drive: &mut MatchContextDrive,
    ) -> Box<dyn OpcodeExecutor> {
        match opcode {
            SreOpcode::FAILURE => Box::new(OpFailure {}),
            SreOpcode::SUCCESS => once(|drive| {
                drive.state.string_position = drive.ctx().string_position;
                drive.ctx_mut().has_matched = Some(true);
            }),
            SreOpcode::ANY => once!(true),
            SreOpcode::ANY_ALL => once!(true),
            SreOpcode::ASSERT => once!(true),
            SreOpcode::ASSERT_NOT => once!(true),
            SreOpcode::AT => once!(true),
            SreOpcode::BRANCH => once!(true),
            SreOpcode::CALL => once!(true),
            SreOpcode::CATEGORY => once!(true),
            SreOpcode::CHARSET => once!(true),
            SreOpcode::BIGCHARSET => once!(true),
            SreOpcode::GROUPREF => once!(true),
            SreOpcode::GROUPREF_EXISTS => once!(true),
            SreOpcode::GROUPREF_IGNORE => once!(true),
            SreOpcode::IN => once!(true),
            SreOpcode::IN_IGNORE => once!(true),
            SreOpcode::INFO => once!(true),
            SreOpcode::JUMP => once!(true),
            SreOpcode::LITERAL => {
                // Operand 1 is the expected char; fail on end of input or a different char,
                // otherwise consume it. The opcode and its operand are skipped either way.
                if drive.at_end() || drive.peek_char() as u32 != drive.peek_code(1) {
                    drive.ctx_mut().has_matched = Some(false);
                } else {
                    drive.skip_char(1);
                }
                drive.skip_code(2);
                once!(true)
            }
            SreOpcode::LITERAL_IGNORE => once!(true),
            SreOpcode::MARK => once!(true),
            SreOpcode::MAX_UNTIL => once!(true),
            SreOpcode::MIN_UNTIL => once!(true),
            SreOpcode::NOT_LITERAL => once!(true),
            SreOpcode::NOT_LITERAL_IGNORE => once!(true),
            SreOpcode::NEGATE => once!(true),
            SreOpcode::RANGE => once!(true),
            SreOpcode::REPEAT => once!(true),
            SreOpcode::REPEAT_ONE => once!(true),
            SreOpcode::SUBPATTERN => once!(true),
            SreOpcode::MIN_REPEAT_ONE => Box::new(OpMinRepeatOne::default()),
            SreOpcode::RANGE_IGNORE => once!(true),
        }
    }
}
// Returns the number of repetitions of a single item, starting from the
// current string position. The code pointer is expected to point to a
// REPEAT_ONE operation (with the repeated 4 ahead).
//
// Only the scan limit is computed so far; the scan itself is not implemented and the
// result is always 0.
fn count_repetitions(drive: &mut MatchContextDrive, maxcount: usize) -> usize {
    // Saturate so that a position past `end` yields "nothing available" instead of an
    // arithmetic underflow.
    let available = drive
        .state
        .end
        .saturating_sub(drive.ctx().string_position);
    let _scan_limit = if maxcount < available && maxcount != SRE_MAXREPEAT {
        maxcount
    } else {
        available
    };
    // TODO: match the repeated item up to `_scan_limit` times and return the real count.
    0
}