From 344afc96ce1393dc7e467120e96214da29b4cfce Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 1 Dec 2020 17:51:11 +0200 Subject: [PATCH] WIP structure --- vm/src/stdlib/sre.rs | 82 +++++- vm/src/stdlib/sre/constants.rs | 4 + vm/src/stdlib/sre/interp.rs | 469 ++++++++++++++++++++++++++------- 3 files changed, 458 insertions(+), 97 deletions(-) diff --git a/vm/src/stdlib/sre.rs b/vm/src/stdlib/sre.rs index 7581aa155..5b082bfed 100644 --- a/vm/src/stdlib/sre.rs +++ b/vm/src/stdlib/sre.rs @@ -5,14 +5,24 @@ pub(crate) use _sre::make_module; #[pymodule] mod _sre { - use super::constants::SreFlag; - use crate::builtins::PyTypeRef; - use crate::pyobject::{PyObjectRef, PyResult, PyValue, StaticType}; + use super::{constants::SreFlag, interp::{self, State}}; + use crate::common::borrow::BorrowValue; use crate::VirtualMachine; + use crate::{ + builtins::PyStrRef, + pyobject::{Either, PyCallable, PyObjectRef, PyResult, PyValue, StaticType}, + }; + use crate::{builtins::PyTypeRef, byteslike::PyBytesLike}; use std::collections::HashMap; + #[pyattr] + use super::constants::SRE_CODESIZE as CODESIZE; #[pyattr] use super::constants::SRE_MAGIC as MAGIC; + #[pyattr] + use super::constants::SRE_MAXGROUPS as MAXGROUPS; + #[pyattr] + use super::constants::SRE_MAXREPEAT as MAXREPEAT; #[pyfunction] fn compile( @@ -25,6 +35,7 @@ mod _sre { vm: &VirtualMachine, ) -> PyResult { let code = vm.extract_elements::(&code)?; + dbg!(&code); Ok(Pattern { pattern, @@ -36,6 +47,26 @@ mod _sre { }) } + #[derive(FromArgs)] + struct StringArgs { + #[pyarg(any)] + string: PyStrRef, + #[pyarg(any, default = "0")] + pos: usize, + #[pyarg(any, default = "std::isize::MAX as usize")] + endpos: usize, + } + + #[derive(FromArgs)] + struct SubArgs { + #[pyarg(any)] + repl: Either, + #[pyarg(any)] + string: PyBytesLike, + #[pyarg(any, default = "0")] + count: usize, + } + #[pyattr] #[pyclass(name = "Pattern")] #[derive(Debug)] @@ -55,5 +86,48 @@ mod _sre { } #[pyimpl] - impl Pattern {} + impl Pattern { + #[pymethod(name = "match")] + fn pymatch(&self, string_args: StringArgs) -> Option { + let start = string_args.pos; + let end = string_args.endpos; + let flags = self.flags; + let pattern_codes = self.code.clone(); + let string = string_args.string.borrow_value(); + let mut state = State::new( + // string_args.string, + string, + start, + end, + flags, + pattern_codes + ); + interp::pymatch(state); + None + } + #[pymethod] + fn fullmatch(&self, string_args: StringArgs) -> Option { + None + } + #[pymethod] + fn search(&self, string_args: StringArgs) -> Option { + None + } + #[pymethod] + fn findall(&self, string_args: StringArgs) -> Option { + None + } + #[pymethod] + fn finditer(&self, string_args: StringArgs) -> Option { + None + } + #[pymethod] + fn scanner(&self, string_args: StringArgs) -> Option { + None + } + #[pymethod] + fn sub(&self, sub_args: SubArgs, vm: &VirtualMachine) -> PyResult { + Err(vm.new_not_implemented_error("".to_owned())) + } + } } diff --git a/vm/src/stdlib/sre/constants.rs b/vm/src/stdlib/sre/constants.rs index f6aeb3182..a80534d70 100644 --- a/vm/src/stdlib/sre/constants.rs +++ b/vm/src/stdlib/sre/constants.rs @@ -14,6 +14,10 @@ use bitflags::bitflags; pub const SRE_MAGIC: usize = 20140917; +pub const SRE_CODESIZE: usize = 4; +pub const SRE_MAXREPEAT: usize = usize::max_value(); +pub const SRE_MAXGROUPS: usize = usize::max_value() / std::mem::size_of::() / 2; + #[derive(num_enum::TryFromPrimitive, Debug)] #[repr(u32)] #[allow(non_camel_case_types)] diff --git a/vm/src/stdlib/sre/interp.rs b/vm/src/stdlib/sre/interp.rs index 7f93a82eb..43c2a5451 100644 --- a/vm/src/stdlib/sre/interp.rs +++ b/vm/src/stdlib/sre/interp.rs @@ -1,126 +1,409 @@ // good luck to those that follow; here be dragons +use super::constants::{SreFlag, SreOpcode, SRE_MAXREPEAT}; use crate::builtins::PyStrRef; - -use super::constants::{SreFlag, SreOpcode}; - +use rustpython_common::borrow::BorrowValue; +use std::collections::HashMap; use std::convert::TryFrom; -use std::{iter, slice}; -pub struct State { +pub struct State<'a> { + // py_string: PyStrRef, + string: &'a str, start: usize, - s_pos: usize, end: usize, - pos: usize, flags: SreFlag, + pattern_codes: Vec, marks: Vec, lastindex: isize, marks_stack: Vec, context_stack: Vec, repeat: Option, - s: PyStrRef, + string_position: usize, } -// struct State1<'a> { -// state: &'a mut State, -// } - -struct MatchContext { - s_pos: usize, - code_pos: usize, -} - -// struct Context<'a> { -// context_stack: &mut Vec, -// } - -impl State { - pub fn new(s: PyStrRef, start: usize, end: usize, flags: SreFlag) -> Self { - let end = std::cmp::min(end, s.char_len()); +impl<'a> State<'a> { + pub(crate) fn new( + // py_string: PyStrRef, + string: &'a str, + start: usize, + end: usize, + flags: SreFlag, + pattern_codes: Vec, + ) -> Self { + // let string = py_string.borrow_value(); Self { + // py_string, + string, start, - s_pos: start, end, - pos: start, flags, - marks: Vec::new(), + pattern_codes, lastindex: -1, marks_stack: Vec::new(), context_stack: Vec::new(), repeat: None, - s, + marks: Vec::new(), + string_position: start, } } + + fn reset(&mut self) { + self.marks.clear(); + self.lastindex = -1; + self.marks_stack.clear(); + self.context_stack.clear(); + self.repeat = None; + } } -// struct OpcodeDispatcher { -// executing_contexts: HashMap>, -// } - -pub struct BadSreCode; - -pub fn parse_ops(code: &[u32]) -> impl Iterator> + '_ { - let mut it = code.iter().copied(); - std::iter::from_fn(move || -> Option> { - let op = it.next()?; - let op = SreOpcode::try_from(op) - .ok() - .and_then(|op| extract_code(op, &mut it)); - Some(op) - }) - .map(|x| x.ok_or(BadSreCode)) -} - -type It<'a> = iter::Copied>; -fn extract_code(op: SreOpcode, it: &mut It) -> Option { - let skip = |it: &mut It| { - let skip = it.next()? as usize; - if skip > it.len() { - None - } else { - Some(skip) - } +pub(crate) fn pymatch(mut state: State) -> bool { + let ctx = MatchContext { + string_position: state.start, + code_position: 0, + has_matched: None, }; - match op { - SreOpcode::FAILURE => {} - SreOpcode::SUCCESS => {} - SreOpcode::ANY => {} - SreOpcode::ANY_ALL => {} - SreOpcode::ASSERT => {} - SreOpcode::ASSERT_NOT => {} - SreOpcode::AT => {} - SreOpcode::BRANCH => {} - SreOpcode::CALL => {} - SreOpcode::CATEGORY => {} - SreOpcode::CHARSET => {} - SreOpcode::BIGCHARSET => {} - SreOpcode::GROUPREF => {} - SreOpcode::GROUPREF_EXISTS => {} - SreOpcode::GROUPREF_IGNORE => {} - SreOpcode::IN => {} - SreOpcode::IN_IGNORE => {} - SreOpcode::INFO => { - // let skip = it.next()?; + state.context_stack.push(ctx); + + let mut has_matched = None; + loop { + if state.context_stack.is_empty() { + break; + } + let ctx_id = state.context_stack.len() - 1; + let mut drive = MatchContextDrive::drive(ctx_id, state); + let mut dispatcher = OpcodeDispatcher::new(); + + has_matched = dispatcher.pymatch(&mut drive); + state = drive.take(); + if has_matched.is_some() { + state.context_stack.pop(); } - SreOpcode::JUMP => {} - SreOpcode::LITERAL => {} - SreOpcode::LITERAL_IGNORE => {} - SreOpcode::MARK => {} - SreOpcode::MAX_UNTIL => {} - SreOpcode::MIN_UNTIL => {} - SreOpcode::NOT_LITERAL => {} - SreOpcode::NOT_LITERAL_IGNORE => {} - SreOpcode::NEGATE => {} - SreOpcode::RANGE => {} - SreOpcode::REPEAT => {} - SreOpcode::REPEAT_ONE => {} - SreOpcode::SUBPATTERN => {} - SreOpcode::MIN_REPEAT_ONE => {} - SreOpcode::RANGE_IGNORE => {} } - todo!() + has_matched.unwrap_or(false) } -pub enum Op { - Info {}, +#[derive(Debug, Copy, Clone)] +struct MatchContext { + string_position: usize, + code_position: usize, + has_matched: Option, +} + +struct MatchContextDrive<'a> { + state: State<'a>, + ctx_id: usize, +} + +impl<'a> MatchContextDrive<'a> { + fn id(&self) -> usize { + self.ctx_id + } + fn ctx_mut(&mut self) -> &mut MatchContext { + &mut self.state.context_stack[self.ctx_id] + } + fn ctx(&self) -> &MatchContext { + &self.state.context_stack[self.ctx_id] + } + fn push_new_context(&mut self, pattern_offset: usize) -> usize { + let ctx = self.ctx(); + let child_ctx = MatchContext { + string_position: ctx.string_position, + code_position: ctx.code_position + pattern_offset, + has_matched: None, + }; + self.state.context_stack.push(child_ctx); + self.state.context_stack.len() - 1 + } + fn drive(ctx_id: usize, state: State<'a>) -> Self { + Self { state, ctx_id } + } + fn take(self) -> State<'a> { + self.state + } + fn str(&self) -> &str { + unsafe { + std::str::from_utf8_unchecked( + &self.state.string.as_bytes()[self.ctx().string_position..], + ) + } + } + fn peek_char(&self) -> char { + self.str().chars().next().unwrap() + } + fn peek_code(&self, peek: usize) -> u32 { + self.state.pattern_codes[self.ctx().code_position + peek] + } + fn skip_char(&mut self, skip_count: usize) { + let skipped = self.str().char_indices().nth(skip_count).unwrap().0; + self.ctx_mut().string_position += skipped; + } + fn skip_code(&mut self, skip_count: usize) { + self.ctx_mut().code_position += skip_count; + } + fn remaining_chars(&self) -> usize { + let end = self.state.end; + end - self.ctx().string_position + self.str().len() + } + fn remaining_codes(&self) -> usize { + self.state.pattern_codes.len() - self.ctx().code_position + } + fn at_beginning(&self) -> bool { + self.ctx().string_position == 0 + } + fn at_end(&self) -> bool { + self.str().is_empty() + } + fn at_linebreak(&self) -> bool { + match self.str().chars().next() { + Some(c) => c == '\n', + None => false, + } + } +} + +struct OpcodeDispatcher { + executing_contexts: HashMap>, +} + +macro_rules! once { + ($val:expr) => { + Box::new(OpEmpty {}) + }; +} + +trait OpcodeExecutor { + fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()>; +} + +struct OpFailure {} +impl OpcodeExecutor for OpFailure { + fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + drive.ctx_mut().has_matched = Some(false); + None + } +} + +struct OpEmpty {} +impl OpcodeExecutor for OpEmpty { + fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + None + } +} + +struct OpOnce { + f: Option, +} +impl OpcodeExecutor for OpOnce { + fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + let f = self.f.take()?; + f(drive); + None + } +} +fn once(f: F) -> Box> { + Box::new(OpOnce { f: Some(f) }) +} + +struct OpMinRepeatOne { + trace_id: usize, + mincount: usize, + maxcount: usize, + count: usize, + child_ctx_id: usize, +} +impl OpcodeExecutor for OpMinRepeatOne { + fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + match self.trace_id { + 0 => self._0(drive), + _ => unreachable!(), + } + } +} +impl Default for OpMinRepeatOne { + fn default() -> Self { + OpMinRepeatOne { + trace_id: 0, + mincount: 0, + maxcount: 0, + count: 0, + child_ctx_id: 0, + } + } +} +impl OpMinRepeatOne { + fn _0(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + self.mincount = drive.peek_code(2) as usize; + self.maxcount = drive.peek_code(3) as usize; + + if drive.remaining_chars() < self.mincount { + drive.ctx_mut().has_matched = Some(false); + return None; + } + + drive.state.string_position = drive.ctx().string_position; + + self.count = if self.mincount == 0 { + 0 + } else { + let count = count_repetitions(drive, self.mincount); + if count < self.mincount { + drive.ctx_mut().has_matched = Some(false); + return None; + } + drive.skip_char(count); + count + }; + + if drive.peek_code(drive.peek_code(1) as usize + 1) == SreOpcode::SUCCESS as u32 { + drive.state.string_position = drive.ctx().string_position; + drive.ctx_mut().has_matched = Some(true); + return None; + } + + // mark push + self.trace_id = 1; + self._1(drive) + } + fn _1(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + if self.maxcount == SRE_MAXREPEAT || self.count <= self.maxcount { + drive.state.string_position = drive.ctx().string_position; + self.child_ctx_id = drive.push_new_context(drive.peek_code(1) as usize + 1); + self.trace_id = 2; + return Some(()); + } + + // mark discard + drive.ctx_mut().has_matched = Some(false); + None + } + fn _2(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + if let Some(true) = drive.state.context_stack[self.child_ctx_id].has_matched { + drive.ctx_mut().has_matched = Some(true); + return None; + } + drive.state.string_position = drive.ctx().string_position; + if count_repetitions(drive, 1) == 0 { + self.trace_id = 3; + return self._3(drive); + } + drive.skip_char(1); + self.count += 1; + // marks pop keep + self.trace_id = 1; + self._1(drive) + } + fn _3(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + // mark discard + drive.ctx_mut().has_matched = Some(false); + None + } +} + +impl OpcodeDispatcher { + fn new() -> Self { + Self { + executing_contexts: HashMap::new(), + } + } + // Returns True if the current context matches, False if it doesn't and + // None if matching is not finished, ie must be resumed after child + // contexts have been matched. + fn pymatch(&mut self, drive: &mut MatchContextDrive) -> Option { + while drive.remaining_codes() > 0 && drive.ctx().has_matched.is_none() { + let code = drive.peek_code(0); + let opcode = SreOpcode::try_from(code).unwrap(); + self.dispatch(opcode, drive); + // self.drive = self.drive; + } + match drive.ctx().has_matched { + Some(matched) => Some(matched), + None => { + drive.ctx_mut().has_matched = Some(false); + Some(false) + } + } + } + + // Dispatches a context on a given opcode. Returns True if the context + // is done matching, False if it must be resumed when next encountered. + fn dispatch(&mut self, opcode: SreOpcode, drive: &mut MatchContextDrive) -> bool { + let mut executor = match self.executing_contexts.remove_entry(&drive.id()) { + Some((_, mut executor)) => executor, + None => self.dispatch_table(opcode, drive), + }; + if let Some(()) = executor.next(drive) { + self.executing_contexts.insert(drive.id(), executor); + false + } else { + true + } + } + + fn dispatch_table( + &mut self, + opcode: SreOpcode, + drive: &mut MatchContextDrive, + ) -> Box { + // move || { + match opcode { + SreOpcode::FAILURE => { + Box::new(OpFailure {}) + } + SreOpcode::SUCCESS => once(|drive| { + drive.state.string_position = drive.ctx().string_position; + drive.ctx_mut().has_matched = Some(true); + }), + SreOpcode::ANY => once!(true), + SreOpcode::ANY_ALL => once!(true), + SreOpcode::ASSERT => once!(true), + SreOpcode::ASSERT_NOT => once!(true), + SreOpcode::AT => once!(true), + SreOpcode::BRANCH => once!(true), + SreOpcode::CALL => once!(true), + SreOpcode::CATEGORY => once!(true), + SreOpcode::CHARSET => once!(true), + SreOpcode::BIGCHARSET => once!(true), + SreOpcode::GROUPREF => once!(true), + SreOpcode::GROUPREF_EXISTS => once!(true), + SreOpcode::GROUPREF_IGNORE => once!(true), + SreOpcode::IN => once!(true), + SreOpcode::IN_IGNORE => once!(true), + SreOpcode::INFO => once!(true), + SreOpcode::JUMP => once!(true), + SreOpcode::LITERAL => { + if drive.at_end() || drive.peek_char() as u32 != drive.peek_code(1) { + drive.ctx_mut().has_matched = Some(false); + } else { + drive.skip_char(1); + } + drive.skip_code(2); + once!(true) + } + SreOpcode::LITERAL_IGNORE => once!(true), + SreOpcode::MARK => once!(true), + SreOpcode::MAX_UNTIL => once!(true), + SreOpcode::MIN_UNTIL => once!(true), + SreOpcode::NOT_LITERAL => once!(true), + SreOpcode::NOT_LITERAL_IGNORE => once!(true), + SreOpcode::NEGATE => once!(true), + SreOpcode::RANGE => once!(true), + SreOpcode::REPEAT => once!(true), + SreOpcode::REPEAT_ONE => once!(true), + SreOpcode::SUBPATTERN => once!(true), + SreOpcode::MIN_REPEAT_ONE => Box::new(OpMinRepeatOne::default()), + SreOpcode::RANGE_IGNORE => once!(true), + } + } +} + +// Returns the number of repetitions of a single item, starting from the +// current string position. The code pointer is expected to point to a +// REPEAT_ONE operation (with the repeated 4 ahead). +fn count_repetitions(drive: &mut MatchContextDrive, maxcount: usize) -> usize { + let mut count = 0; + let mut real_maxcount = drive.state.end - drive.ctx().string_position; + if maxcount < real_maxcount && maxcount != SRE_MAXREPEAT { + real_maxcount = maxcount; + } + count }