Files
RustPython/interp.rs
2021-01-20 08:29:57 +02:00

1394 lines
46 KiB
Rust

// good luck to those that follow; here be dragons
use super::_sre::{Match, Pattern, MAXREPEAT};
use super::constants::{SreAtCode, SreCatCode, SreFlag, SreOpcode};
use crate::builtins::PyStrRef;
use crate::pyobject::PyRef;
use rustpython_common::borrow::BorrowValue;
use std::collections::HashMap;
use std::convert::TryFrom;
/// Matching state shared by every context of a single match attempt.
#[derive(Debug)]
pub(crate) struct State<'a> {
/// Subject string being matched.
string: &'a str,
/// Length of `string` counted in chars, not bytes.
string_len: usize,
/// First char position of the attempt (clamped to `end` by `new`).
pub start: usize,
/// Char position where matching stops (clamped to `string_len` by `new`).
pub end: usize,
/// Pattern flags, stored as passed to `new`.
flags: SreFlag,
/// Compiled pattern code words.
pattern_codes: &'a [u32],
/// Group boundary positions; group `g` uses slots `2 * g` and `2 * g + 1`.
pub marks: Vec<Option<usize>>,
/// Updated by `set_mark` on odd mark numbers; `-1` until then.
pub lastindex: isize,
/// Saved `(marks, lastindex)` snapshots used for backtracking.
marks_stack: Vec<(Vec<Option<usize>>, isize)>,
/// Contexts still being matched; the last entry is the one being driven.
context_stack: Vec<MatchContext>,
/// Open `REPEAT` operators, innermost last.
repeat_stack: Vec<RepeatContext>,
/// Char position published by opcodes such as `SUCCESS`; starts at `start`.
pub string_position: usize,
/// The context most recently popped off `context_stack`.
popped_context: Option<MatchContext>,
}
impl<'a> State<'a> {
    /// Creates the state for one match attempt over `string`.
    ///
    /// `end` is clamped to the char length of `string` and `start` to `end`.
    pub(crate) fn new(
        string: &'a str,
        start: usize,
        end: usize,
        flags: SreFlag,
        pattern_codes: &'a [u32],
    ) -> Self {
        let string_len = string.chars().count();
        let end = end.min(string_len);
        let start = start.min(end);
        Self {
            string,
            string_len,
            start,
            end,
            flags,
            pattern_codes,
            marks: Vec::new(),
            lastindex: -1,
            marks_stack: Vec::new(),
            context_stack: Vec::new(),
            repeat_stack: Vec::new(),
            string_position: start,
            popped_context: None,
        }
    }

    /// Forgets all marks, stacks and the last popped context.
    fn reset(&mut self) {
        self.popped_context = None;
        self.repeat_stack.clear();
        self.context_stack.clear();
        self.marks_stack.clear();
        self.lastindex = -1;
        self.marks.clear();
    }

    /// Records `position` for mark `mark_nr`, growing the mark table on demand.
    ///
    /// Odd mark numbers also update `lastindex`.
    fn set_mark(&mut self, mark_nr: usize, position: usize) {
        let is_odd_mark = mark_nr % 2 == 1;
        if is_odd_mark {
            self.lastindex = mark_nr as isize / 2 + 1;
        }
        if self.marks.len() <= mark_nr {
            self.marks.resize(mark_nr + 1, None);
        }
        self.marks[mark_nr] = Some(position);
    }

    /// Returns the two marks stored for `group_index`, or `(None, None)` when
    /// the mark table does not reach that far.
    fn get_marks(&self, group_index: usize) -> (Option<usize>, Option<usize>) {
        let first = 2 * group_index;
        match (self.marks.get(first), self.marks.get(first + 1)) {
            (Some(&open), Some(&close)) => (open, close),
            _ => (None, None),
        }
    }

    /// Saves a snapshot of the current marks and `lastindex`.
    fn marks_push(&mut self) {
        let snapshot = (self.marks.clone(), self.lastindex);
        self.marks_stack.push(snapshot);
    }

    /// Restores the newest snapshot and removes it; panics if there is none.
    fn marks_pop(&mut self) {
        let (saved_marks, saved_lastindex) = self.marks_stack.pop().unwrap();
        self.lastindex = saved_lastindex;
        self.marks = saved_marks;
    }

    /// Restores the newest snapshot but leaves it on the stack; panics if there is none.
    fn marks_pop_keep(&mut self) {
        let (saved_marks, saved_lastindex) = self.marks_stack.last().unwrap();
        self.marks = saved_marks.clone();
        self.lastindex = *saved_lastindex;
    }

    /// Drops the newest snapshot (if any) without restoring it.
    fn marks_pop_discard(&mut self) {
        drop(self.marks_stack.pop());
    }
}
/// Tries to match `pattern` against `string` starting exactly at char
/// position `start` and not reading past `end`.
///
/// Contexts are driven from an explicit stack: the top context runs until it
/// either reaches a verdict (and is popped) or suspends after pushing a child.
/// The verdict of the last context driven decides the result.
pub(crate) fn pymatch(
    string: PyStrRef,
    start: usize,
    end: usize,
    pattern: PyRef<Pattern>,
) -> Option<Match> {
    let mut state = State::new(
        string.borrow_value(),
        start,
        end,
        pattern.flags,
        &pattern.code,
    );

    let root = MatchContext {
        string_position: state.start,
        string_offset: calc_string_offset(state.string, state.start),
        code_position: 0,
        has_matched: None,
    };
    state.context_stack.push(root);

    let mut dispatcher = OpcodeDispatcher::new();
    let mut verdict = None;
    while let Some(top_id) = state.context_stack.len().checked_sub(1) {
        let mut drive = StackDrive::drive(top_id, state);
        verdict = dispatcher.pymatch(&mut drive);
        state = drive.take();
        // a finished context is popped and handed to its parent
        if verdict.is_some() {
            state.popped_context = state.context_stack.pop();
        }
    }

    match verdict {
        Some(true) => Some(Match::new(&state, pattern.clone(), string.clone())),
        _ => None,
    }
}
/// Scans `string` for the first position in `start..=end` at which `pattern`
/// matches and returns that match.
///
/// `end` is clamped to the char length of the string. The end position itself
/// is tried too, so patterns that can match the empty string (for example an
/// end anchor, or any pattern against an empty subject) are found there.
pub(crate) fn search(
    string: PyStrRef,
    start: usize,
    end: usize,
    pattern: PyRef<Pattern>,
) -> Option<Match> {
    // TODO: optimize by op info and skip prefix
    let end = std::cmp::min(end, string.char_len());
    (start..=end).find_map(|position| pymatch(string.clone(), position, end, pattern.clone()))
}
/// Cursor of one (sub)pattern evaluation: where it is in the subject string
/// and in the pattern code, and whether it has reached a verdict.
#[derive(Debug, Copy, Clone)]
struct MatchContext {
/// Current position in the subject, counted in chars.
string_position: usize,
/// Byte offset in the subject that goes with `string_position`.
string_offset: usize,
/// Index of the current code word in the pattern.
code_position: usize,
/// `None` while matching is still in progress, otherwise the verdict.
has_matched: Option<bool>,
}
/// Cursor operations over one [`MatchContext`] with read access to the shared
/// [`State`].
///
/// Implementors provide the three accessors; every other method is derived
/// from them.
trait MatchContextDrive {
    fn ctx_mut(&mut self) -> &mut MatchContext;
    fn ctx(&self) -> &MatchContext;
    fn state(&self) -> &State;

    /// Innermost open repeat. Panics when no repeat is open.
    fn repeat_ctx(&self) -> &RepeatContext {
        self.state().repeat_stack.last().unwrap()
    }

    /// The part of the subject string that has not been consumed yet.
    fn str(&self) -> &str {
        &self.state().string[self.ctx().string_offset..]
    }

    /// Pattern code words from the current code position onwards.
    fn pattern(&self) -> &[u32] {
        &self.state().pattern_codes[self.ctx().code_position..]
    }

    /// Char under the cursor. Panics at the end of the string.
    fn peek_char(&self) -> char {
        self.str().chars().next().unwrap()
    }

    /// Code word `peek` slots after the current code position.
    fn peek_code(&self, peek: usize) -> u32 {
        self.state().pattern_codes[self.ctx().code_position + peek]
    }

    /// Advances the cursor by `skip_count` chars.
    ///
    /// When fewer chars remain in the string, the cursor is parked at
    /// `state.end` with the byte offset at the end of the string.
    fn skip_char(&mut self, skip_count: usize) {
        let skipped_bytes = self
            .str()
            .char_indices()
            .nth(skip_count)
            .map(|(offset, _)| offset);
        match skipped_bytes {
            Some(skipped) => {
                let ctx = self.ctx_mut();
                ctx.string_position += skip_count;
                ctx.string_offset += skipped;
            }
            None => {
                let end = self.state().end;
                let byte_len = self.state().string.len();
                let ctx = self.ctx_mut();
                ctx.string_position = end;
                ctx.string_offset = byte_len;
            }
        }
    }

    /// Advances the code position by `skip_count`, clamped to the pattern length.
    fn skip_code(&mut self, skip_count: usize) {
        let code_len = self.state().pattern_codes.len();
        let ctx = self.ctx_mut();
        ctx.code_position = std::cmp::min(ctx.code_position + skip_count, code_len);
    }

    /// Number of chars between the cursor and `state.end`.
    fn remaining_chars(&self) -> usize {
        self.state().end - self.ctx().string_position
    }

    /// Number of code words between the code position and the pattern end.
    fn remaining_codes(&self) -> usize {
        self.state().pattern_codes.len() - self.ctx().code_position
    }

    fn at_beginning(&self) -> bool {
        self.ctx().string_position == self.state().start
    }

    fn at_end(&self) -> bool {
        self.ctx().string_position == self.state().end
    }

    fn at_linebreak(&self) -> bool {
        !self.at_end() && is_linebreak(self.peek_char())
    }

    /// True when exactly one of the chars before and under the cursor
    /// satisfies `word_checker`. An empty search window has no boundary.
    fn at_boundary<F: FnMut(char) -> bool>(&self, mut word_checker: F) -> bool {
        if self.at_beginning() && self.at_end() {
            return false;
        }
        let that = !self.at_beginning() && word_checker(self.back_peek_char());
        let this = !self.at_end() && word_checker(self.peek_char());
        this != that
    }

    /// Byte offset of the char just before the cursor.
    /// Panics when the cursor is at byte offset 0.
    fn back_peek_offset(&self) -> usize {
        let consumed = &self.state().string[..self.ctx().string_offset];
        match consumed.char_indices().next_back() {
            Some((offset, _)) => offset,
            None => panic!("no character before the start of the string"),
        }
    }

    /// Char just before the cursor, decoded from its UTF-8 bytes.
    /// Panics when the cursor is at byte offset 0.
    fn back_peek_char(&self) -> char {
        // Decode through `str` instead of reinterpreting the raw UTF-8 bytes
        // as a scalar value: the bytes of an encoded char are not its code
        // point, and transmuting them can create an invalid `char`.
        let consumed = &self.state().string[..self.ctx().string_offset];
        match consumed.chars().next_back() {
            Some(c) => c,
            None => panic!("no character before the start of the string"),
        }
    }

    /// Moves the cursor back by `skip_count` chars.
    fn back_skip_char(&mut self, skip_count: usize) {
        self.ctx_mut().string_position -= skip_count;
        for _ in 0..skip_count {
            let offset = self.back_peek_offset();
            self.ctx_mut().string_offset = offset;
        }
    }
}
/// Drives the context stored at `ctx_id` on the state's context stack.
/// Owns the [`State`] while it is driving; `take` hands it back.
struct StackDrive<'a> {
    state: State<'a>,
    ctx_id: usize,
}

impl<'a> StackDrive<'a> {
    /// Wraps `state` to drive the context at index `ctx_id`.
    fn drive(ctx_id: usize, state: State<'a>) -> Self {
        StackDrive { ctx_id, state }
    }

    /// Releases the wrapped state.
    fn take(self) -> State<'a> {
        let StackDrive { state, .. } = self;
        state
    }

    /// Index of the driven context on the context stack.
    fn id(&self) -> usize {
        self.ctx_id
    }

    /// Pushes a copy of the driven context whose code position is advanced by
    /// `pattern_offset`, clamped to the end of the pattern.
    fn push_new_context(&mut self, pattern_offset: usize) {
        let code_limit = self.state.pattern_codes.len();
        let parent = *self.ctx();
        let child = MatchContext {
            code_position: (parent.code_position + pattern_offset).min(code_limit),
            ..parent
        };
        self.state.context_stack.push(child);
    }

    /// Mutable access to the innermost open repeat. Panics when none is open.
    fn repeat_ctx_mut(&mut self) -> &mut RepeatContext {
        self.state.repeat_stack.last_mut().unwrap()
    }
}

impl MatchContextDrive for StackDrive<'_> {
    fn state(&self) -> &State {
        &self.state
    }

    fn ctx(&self) -> &MatchContext {
        &self.state.context_stack[self.ctx_id]
    }

    fn ctx_mut(&mut self) -> &mut MatchContext {
        &mut self.state.context_stack[self.ctx_id]
    }
}
/// Drives a private copy of a context while only borrowing the state of an
/// existing [`StackDrive`]; cursor moves never touch the context stack.
struct WrapDrive<'a> {
    stack_drive: &'a StackDrive<'a>,
    ctx: MatchContext,
}

impl<'a> WrapDrive<'a> {
    /// Creates a drive over `ctx` that reads state through `stack_drive`.
    fn drive(ctx: MatchContext, stack_drive: &'a StackDrive<'a>) -> Self {
        WrapDrive { ctx, stack_drive }
    }
}

impl MatchContextDrive for WrapDrive<'_> {
    fn state(&self) -> &State {
        self.stack_drive.state()
    }

    fn ctx(&self) -> &MatchContext {
        &self.ctx
    }

    fn ctx_mut(&mut self) -> &mut MatchContext {
        &mut self.ctx
    }
}
/// One opcode's execution, possibly split into several resumable steps.
trait OpcodeExecutor {
/// Runs the next step. `Some(())` means the executor must be kept and
/// resumed later; `None` means the opcode has finished.
fn next(&mut self, drive: &mut StackDrive) -> Option<()>;
}
/// Executor for opcodes that complete in a single step.
struct OpOnce<F> {
    f: Option<F>,
}

impl<F> OpcodeExecutor for OpOnce<F>
where
    F: FnOnce(&mut StackDrive),
{
    /// Runs the closure if it has not run yet; always reports completion.
    fn next(&mut self, drive: &mut StackDrive) -> Option<()> {
        if let Some(op) = self.f.take() {
            op(drive);
        }
        None
    }
}

/// Boxes `f` as a single-step executor.
fn once<F>(f: F) -> Box<OpOnce<F>>
where
    F: FnOnce(&mut StackDrive),
{
    let executor = OpOnce { f: Some(f) };
    Box::new(executor)
}
/// Executor for opcodes that run one step, suspend, and finish with a second
/// step when resumed.
///
/// `F1` and `F2` have the same signature; they are separate type parameters
/// only because every closure has its own distinct type.
struct OpTwice<F1, F2> {
    f1: Option<F1>,
    f2: Option<F2>,
}

impl<F1, F2> OpcodeExecutor for OpTwice<F1, F2>
where
    F1: FnOnce(&mut StackDrive),
    F2: FnOnce(&mut StackDrive),
{
    /// First call runs `f1` and asks to be resumed; second call runs `f2` and
    /// reports completion. A third call is a logic error.
    fn next(&mut self, drive: &mut StackDrive) -> Option<()> {
        match self.f1.take() {
            Some(first) => {
                first(drive);
                Some(())
            }
            None => match self.f2.take() {
                Some(second) => {
                    second(drive);
                    None
                }
                None => unreachable!(),
            },
        }
    }
}

/// Boxes `f1` and `f2` as a two-step executor.
fn twice<F1, F2>(f1: F1, f2: F2) -> Box<OpTwice<F1, F2>>
where
    F1: FnOnce(&mut StackDrive),
    F2: FnOnce(&mut StackDrive),
{
    let executor = OpTwice {
        f1: Some(f1),
        f2: Some(f2),
    };
    Box::new(executor)
}
/// Executes opcodes for match contexts and keeps the executors that are
/// suspended while a child context runs.
struct OpcodeDispatcher {
    /// Suspended executors, keyed by the id of the context they belong to.
    executing_contexts: HashMap<usize, Box<dyn OpcodeExecutor>>,
}

impl OpcodeDispatcher {
    fn new() -> Self {
        Self {
            executing_contexts: HashMap::new(),
        }
    }

    /// Runs the driven context until it reaches a verdict or suspends.
    ///
    /// Returns `Some(true)` if the context matches, `Some(false)` if it does
    /// not, and `None` if matching is not finished, i.e. it must be resumed
    /// after child contexts have been matched.
    fn pymatch(&mut self, drive: &mut StackDrive) -> Option<bool> {
        while drive.remaining_codes() > 0 && drive.ctx().has_matched.is_none() {
            let code = drive.peek_code(0);
            let opcode = SreOpcode::try_from(code).unwrap();
            if !self.dispatch(opcode, drive) {
                return None;
            }
        }
        // running out of pattern codes without a verdict is a failed match
        Some(*drive.ctx_mut().has_matched.get_or_insert(false))
    }

    /// Runs one step of `opcode` for the driven context, resuming a suspended
    /// executor for that context if there is one.
    ///
    /// Returns `true` if the opcode finished, `false` if the context must be
    /// resumed when next encountered.
    fn dispatch(&mut self, opcode: SreOpcode, drive: &mut StackDrive) -> bool {
        let mut executor = match self.executing_contexts.remove(&drive.id()) {
            Some(executor) => executor,
            None => self.dispatch_table(opcode),
        };
        if executor.next(drive).is_some() {
            self.executing_contexts.insert(drive.id(), executor);
            false
        } else {
            true
        }
    }

    /// Creates a fresh executor for `opcode`.
    fn dispatch_table(&mut self, opcode: SreOpcode) -> Box<dyn OpcodeExecutor> {
        match opcode {
            SreOpcode::FAILURE => once(|drive| {
                drive.ctx_mut().has_matched = Some(false);
            }),
            SreOpcode::SUCCESS => once(|drive| {
                drive.state.string_position = drive.ctx().string_position;
                drive.ctx_mut().has_matched = Some(true);
            }),
            SreOpcode::ANY => once(|drive| {
                if drive.at_end() || drive.at_linebreak() {
                    drive.ctx_mut().has_matched = Some(false);
                } else {
                    drive.skip_code(1);
                    drive.skip_char(1);
                }
            }),
            SreOpcode::ANY_ALL => once(|drive| {
                if drive.at_end() {
                    drive.ctx_mut().has_matched = Some(false);
                } else {
                    drive.skip_code(1);
                    drive.skip_char(1);
                }
            }),
            // <ASSERT> <skip> <back> <pattern>
            SreOpcode::ASSERT => twice(
                |drive| push_lookaround_context(drive),
                |drive| {
                    let child_ctx = drive.state.popped_context.unwrap();
                    if child_ctx.has_matched == Some(true) {
                        drive.skip_code(drive.peek_code(1) as usize + 1);
                    } else {
                        drive.ctx_mut().has_matched = Some(false);
                    }
                },
            ),
            // <ASSERT_NOT> <skip> <back> <pattern>
            SreOpcode::ASSERT_NOT => twice(
                |drive| push_lookaround_context(drive),
                |drive| {
                    let child_ctx = drive.state.popped_context.unwrap();
                    if child_ctx.has_matched == Some(true) {
                        drive.ctx_mut().has_matched = Some(false);
                    } else {
                        drive.skip_code(drive.peek_code(1) as usize + 1);
                    }
                },
            ),
            SreOpcode::AT => once(|drive| {
                let atcode = SreAtCode::try_from(drive.peek_code(1)).unwrap();
                if !at(drive, atcode) {
                    drive.ctx_mut().has_matched = Some(false);
                } else {
                    drive.skip_code(2);
                }
            }),
            SreOpcode::BRANCH => Box::new(OpBranch::default()),
            SreOpcode::CATEGORY => once(|drive| {
                let catcode = SreCatCode::try_from(drive.peek_code(1)).unwrap();
                if drive.at_end() || !category(catcode, drive.peek_char()) {
                    drive.ctx_mut().has_matched = Some(false);
                } else {
                    drive.skip_code(2);
                    drive.skip_char(1);
                }
            }),
            SreOpcode::IN => once(|drive| {
                general_op_in(drive, |set, c| charset(set, c));
            }),
            SreOpcode::IN_IGNORE => once(|drive| {
                general_op_in(drive, |set, c| charset(set, lower_ascii(c)));
            }),
            SreOpcode::IN_UNI_IGNORE => once(|drive| {
                general_op_in(drive, |set, c| charset(set, lower_unicode(c)));
            }),
            SreOpcode::IN_LOC_IGNORE => once(|drive| {
                general_op_in(drive, |set, c| charset_loc_ignore(set, c));
            }),
            SreOpcode::INFO | SreOpcode::JUMP => once(|drive| {
                drive.skip_code(drive.peek_code(1) as usize + 1);
            }),
            SreOpcode::LITERAL => once(|drive| {
                general_op_literal(drive, |code, c| code == c as u32);
            }),
            SreOpcode::NOT_LITERAL => once(|drive| {
                general_op_literal(drive, |code, c| code != c as u32);
            }),
            SreOpcode::LITERAL_IGNORE => once(|drive| {
                general_op_literal(drive, |code, c| code == lower_ascii(c) as u32);
            }),
            SreOpcode::NOT_LITERAL_IGNORE => once(|drive| {
                general_op_literal(drive, |code, c| code != lower_ascii(c) as u32);
            }),
            SreOpcode::LITERAL_UNI_IGNORE => once(|drive| {
                general_op_literal(drive, |code, c| code == lower_unicode(c) as u32);
            }),
            SreOpcode::NOT_LITERAL_UNI_IGNORE => once(|drive| {
                general_op_literal(drive, |code, c| code != lower_unicode(c) as u32);
            }),
            SreOpcode::LITERAL_LOC_IGNORE => once(|drive| {
                general_op_literal(drive, char_loc_ignore);
            }),
            SreOpcode::NOT_LITERAL_LOC_IGNORE => once(|drive| {
                general_op_literal(drive, |code, c| !char_loc_ignore(code, c));
            }),
            SreOpcode::MARK => once(|drive| {
                drive
                    .state
                    .set_mark(drive.peek_code(1) as usize, drive.ctx().string_position);
                drive.skip_code(2);
            }),
            SreOpcode::REPEAT => twice(
                // create repeat context. all the hard work is done by the UNTIL
                // operator (MAX_UNTIL, MIN_UNTIL)
                // <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail
                |drive| {
                    let repeat = RepeatContext {
                        count: -1,
                        code_position: drive.ctx().code_position,
                        last_position: std::usize::MAX,
                    };
                    drive.state.repeat_stack.push(repeat);
                    drive.state.string_position = drive.ctx().string_position;
                    // execute UNTIL operator
                    drive.push_new_context(drive.peek_code(1) as usize + 1);
                },
                |drive| {
                    drive.state.repeat_stack.pop();
                    let child_ctx = drive.state.popped_context.unwrap();
                    drive.ctx_mut().has_matched = child_ctx.has_matched;
                },
            ),
            SreOpcode::MAX_UNTIL => Box::new(OpMaxUntil::default()),
            SreOpcode::MIN_UNTIL => todo!("min until"),
            SreOpcode::REPEAT_ONE => Box::new(OpRepeatOne::default()),
            SreOpcode::MIN_REPEAT_ONE => Box::new(OpMinRepeatOne::default()),
            SreOpcode::GROUPREF => once(|drive| general_op_groupref(drive, |x| x)),
            SreOpcode::GROUPREF_IGNORE => once(|drive| general_op_groupref(drive, lower_ascii)),
            SreOpcode::GROUPREF_LOC_IGNORE => {
                once(|drive| general_op_groupref(drive, lower_locate))
            }
            SreOpcode::GROUPREF_UNI_IGNORE => {
                once(|drive| general_op_groupref(drive, lower_unicode))
            }
            SreOpcode::GROUPREF_EXISTS => once(|drive| {
                let (group_start, group_end) = drive.state.get_marks(drive.peek_code(1) as usize);
                match (group_start, group_end) {
                    (Some(start), Some(end)) if start <= end => {
                        drive.skip_code(3);
                    }
                    _ => drive.skip_code(drive.peek_code(2) as usize + 1),
                }
            }),
            _ => {
                // TODO python exception?
                unreachable!("unexpected opcode")
            }
        }
    }
}

/// First step of `ASSERT` / `ASSERT_NOT`: pushes the child context that
/// evaluates the assertion body `<back>` chars before the cursor.
///
/// The step always pushes a child, because the two-step executor always
/// suspends after its first step and its second step reads the verdict of the
/// context popped in between. When there are fewer than `<back>` chars before
/// the cursor the body cannot match, so the child is placed at the end of the
/// pattern, where it fails immediately.
fn push_lookaround_context(drive: &mut StackDrive) {
    let back = drive.peek_code(2) as usize;
    let parent = *drive.ctx();
    if back > parent.string_position {
        let past_the_end = drive.state.pattern_codes.len();
        drive.push_new_context(past_the_end);
        return;
    }

    let target = parent.string_position - back;
    drive.state.string_position = target;
    drive.push_new_context(3);
    if back == 0 {
        // lookahead: the child starts exactly where the parent is
        return;
    }

    // lookbehind: move the child's cursor `back` chars towards the start.
    // Walk backwards from the parent's byte offset; fall back to counting
    // from the start of the string if that offset does not cover enough chars.
    let string = drive.state.string;
    let consumed = string.get(..parent.string_offset).unwrap_or("");
    let offset = consumed
        .char_indices()
        .rev()
        .nth(back - 1)
        .map(|(offset, _)| offset)
        .or_else(|| string.char_indices().nth(target).map(|(offset, _)| offset))
        .unwrap_or_else(|| string.len());
    if let Some(child) = drive.state.context_stack.last_mut() {
        child.string_position = target;
        child.string_offset = offset;
    }
}
/// Converts a char position in `string` to the matching byte offset.
///
/// A position at (or past) the end of the string maps to `string.len()`, the
/// same offset `skip_char` uses for a cursor parked at the end.
fn calc_string_offset(string: &str, position: usize) -> usize {
    string
        .char_indices()
        .nth(position)
        .map_or(string.len(), |(offset, _)| offset)
}
/// True when `code` equals `c` itself or its lower-/upper-case form as
/// produced by `lower_locate` / `upper_locate`.
fn char_loc_ignore(code: u32, c: char) -> bool {
    if code == c as u32 {
        return true;
    }
    if code == lower_locate(c) as u32 {
        return true;
    }
    code == upper_locate(c) as u32
}
/// Case-insensitive set membership for locale-flagged patterns.
///
/// The set is checked against the lower-case form of `c` first and, when the
/// upper-case form differs from it, against the upper-case form as well.
fn charset_loc_ignore(set: &[u32], c: char) -> bool {
    let lo = lower_locate(c);
    if charset(set, lo) {
        return true;
    }
    let up = upper_locate(c);
    up != lo && charset(set, up)
}
/// Shared body of the `GROUPREF*` opcodes: the text at the cursor must repeat
/// the text captured by the referenced group, comparing chars after mapping
/// both through `f`.
///
/// On success the cursor moves past the repeated text and the opcode (2 code
/// words) is skipped; otherwise the context is marked as failed. An unset or
/// inverted group always fails.
fn general_op_groupref<F: FnMut(char) -> char>(drive: &mut StackDrive, mut f: F) {
    let marks = drive.state.get_marks(drive.peek_code(1) as usize);
    let (group_start, group_end) = match marks {
        (Some(open), Some(close)) if open <= close => (open, close),
        _ => {
            drive.ctx_mut().has_matched = Some(false);
            return;
        }
    };

    let here = *drive.ctx();
    let group_ctx = MatchContext {
        string_position: group_start,
        // TODO: cache the offset
        string_offset: calc_string_offset(drive.state.string, group_start),
        ..here
    };
    // two scratch cursors: one over the subject, one over the captured group
    let mut wdrive = WrapDrive::drive(here, &*drive);
    let mut gdrive = WrapDrive::drive(group_ctx, &*drive);

    let mut remaining = group_end - group_start;
    while remaining > 0 {
        if wdrive.at_end() || f(wdrive.peek_char()) != f(gdrive.peek_char()) {
            drive.ctx_mut().has_matched = Some(false);
            return;
        }
        wdrive.skip_char(1);
        gdrive.skip_char(1);
        remaining -= 1;
    }

    let new_position = wdrive.ctx().string_position;
    let new_offset = wdrive.ctx().string_offset;
    drive.skip_code(2);
    let ctx = drive.ctx_mut();
    ctx.string_position = new_position;
    ctx.string_offset = new_offset;
}
/// Shared body of the `LITERAL*` opcodes: `f` receives the literal code word
/// and the char under the cursor. On acceptance the opcode (2 code words) and
/// one char are consumed; otherwise, or at the end of input, the context fails.
fn general_op_literal<F: FnOnce(u32, char) -> bool>(drive: &mut StackDrive, f: F) {
    let accepted = !drive.at_end() && f(drive.peek_code(1), drive.peek_char());
    if accepted {
        drive.skip_code(2);
        drive.skip_char(1);
    } else {
        drive.ctx_mut().has_matched = Some(false);
    }
}
/// Shared body of the `IN*` opcodes: `f` receives the set description (the
/// pattern from two code words after the opcode) and the char under the
/// cursor. On acceptance the whole set and one char are consumed; otherwise,
/// or at the end of input, the context fails.
fn general_op_in<F: FnOnce(&[u32], char) -> bool>(drive: &mut StackDrive, f: F) {
    let skip = drive.peek_code(1) as usize;
    let accepted = !drive.at_end() && f(&drive.pattern()[2..], drive.peek_char());
    if accepted {
        drive.skip_code(skip + 1);
        drive.skip_char(1);
    } else {
        drive.ctx_mut().has_matched = Some(false);
    }
}
/// Evaluates the zero-width position test `atcode` at the cursor of `drive`.
fn at(drive: &StackDrive, atcode: SreAtCode) -> bool {
    match atcode {
        // start anchors
        SreAtCode::BEGINNING | SreAtCode::BEGINNING_STRING => drive.at_beginning(),
        SreAtCode::BEGINNING_LINE => {
            if drive.at_beginning() {
                true
            } else {
                is_linebreak(drive.back_peek_char())
            }
        }
        // end anchors
        SreAtCode::END => {
            let before_final_linebreak = drive.remaining_chars() == 1 && drive.at_linebreak();
            before_final_linebreak || drive.at_end()
        }
        SreAtCode::END_LINE => drive.at_linebreak() || drive.at_end(),
        SreAtCode::END_STRING => drive.at_end(),
        // word boundaries and their negations
        SreAtCode::BOUNDARY => drive.at_boundary(is_word),
        SreAtCode::NON_BOUNDARY => !drive.at_boundary(is_word),
        SreAtCode::LOC_BOUNDARY => drive.at_boundary(is_loc_word),
        SreAtCode::LOC_NON_BOUNDARY => !drive.at_boundary(is_loc_word),
        SreAtCode::UNI_BOUNDARY => drive.at_boundary(is_uni_word),
        SreAtCode::UNI_NON_BOUNDARY => !drive.at_boundary(is_uni_word),
    }
}
/// Tests `c` against the character category `catcode`.
///
/// Every category comes as a positive / negated pair, so each arm yields the
/// outcome wanted from the underlying predicate together with its result.
fn category(catcode: SreCatCode, c: char) -> bool {
    let (wanted, found) = match catcode {
        SreCatCode::DIGIT => (true, is_digit(c)),
        SreCatCode::NOT_DIGIT => (false, is_digit(c)),
        SreCatCode::SPACE => (true, is_space(c)),
        SreCatCode::NOT_SPACE => (false, is_space(c)),
        SreCatCode::WORD => (true, is_word(c)),
        SreCatCode::NOT_WORD => (false, is_word(c)),
        SreCatCode::LINEBREAK => (true, is_linebreak(c)),
        SreCatCode::NOT_LINEBREAK => (false, is_linebreak(c)),
        SreCatCode::LOC_WORD => (true, is_loc_word(c)),
        SreCatCode::LOC_NOT_WORD => (false, is_loc_word(c)),
        SreCatCode::UNI_DIGIT => (true, is_uni_digit(c)),
        SreCatCode::UNI_NOT_DIGIT => (false, is_uni_digit(c)),
        SreCatCode::UNI_SPACE => (true, is_uni_space(c)),
        SreCatCode::UNI_NOT_SPACE => (false, is_uni_space(c)),
        SreCatCode::UNI_WORD => (true, is_uni_word(c)),
        SreCatCode::UNI_NOT_WORD => (false, is_uni_word(c)),
        SreCatCode::UNI_LINEBREAK => (true, is_uni_linebreak(c)),
        SreCatCode::UNI_NOT_LINEBREAK => (false, is_uni_linebreak(c)),
    };
    found == wanted
}
/// Checks whether `c` is a member of the set described by `set`.
///
/// `set` is a sequence of set items (`CATEGORY`, `CHARSET`, `BIGCHARSET`,
/// `LITERAL`, `RANGE`, `RANGE_UNI_IGNORE`, optionally preceded by `NEGATE`)
/// terminated by `FAILURE`. Items are scanned in order; the first item that
/// contains `c` decides the result. Returns `false` when the description is
/// malformed (unknown opcode or missing terminator).
fn charset(set: &[u32], c: char) -> bool {
    let ch = c as u32;
    // result to report when an item contains `c`; flipped by NEGATE
    let mut ok = true;
    let mut i = 0;
    while i < set.len() {
        let opcode = match SreOpcode::try_from(set[i]) {
            Ok(code) => code,
            Err(_) => {
                break;
            }
        };
        match opcode {
            SreOpcode::FAILURE => {
                return !ok;
            }
            SreOpcode::CATEGORY => {
                /* <CATEGORY> <code> */
                let catcode = match SreCatCode::try_from(set[i + 1]) {
                    Ok(code) => code,
                    Err(_) => {
                        break;
                    }
                };
                if category(catcode, c) {
                    return ok;
                }
                i += 2;
            }
            SreOpcode::CHARSET => {
                /* <CHARSET> <bitmap> */
                // the 256-bit bitmap (8 words) follows the opcode of THIS item
                let bitmap = &set[i + 1..];
                if ch < 256 && ((bitmap[(ch >> 5) as usize] & (1u32 << (ch & 31))) != 0) {
                    return ok;
                }
                // one opcode word plus eight bitmap words
                i += 1 + 8;
            }
            SreOpcode::BIGCHARSET => {
                /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
                let count = set[i + 1] as usize;
                if ch < 0x10000 {
                    // payload of THIS item, right after opcode and block count
                    let payload = &set[i + 2..];
                    let block_index = ch >> 8;
                    // SAFETY: `u8` has alignment 1 and every bit pattern is a
                    // valid `u8`, so viewing the `u32` words as bytes is sound.
                    let (_, blockindices, _) = unsafe { payload.align_to::<u8>() };
                    // 256 one-byte block indices occupy 64 words
                    let blocks = &payload[64..];
                    let block = blockindices[block_index as usize];
                    if blocks[((block as u32 * 256 + (ch & 255)) / 32) as usize]
                        & (1u32 << (ch & 31))
                        != 0
                    {
                        return ok;
                    }
                }
                i += 2 + 64 + count * 8;
            }
            SreOpcode::LITERAL => {
                /* <LITERAL> <code> */
                if ch == set[i + 1] {
                    return ok;
                }
                i += 2;
            }
            SreOpcode::NEGATE => {
                ok = !ok;
                i += 1;
            }
            SreOpcode::RANGE => {
                /* <RANGE> <lower> <upper> */
                if set[i + 1] <= ch && ch <= set[i + 2] {
                    return ok;
                }
                i += 3;
            }
            SreOpcode::RANGE_UNI_IGNORE => {
                /* <RANGE_UNI_IGNORE> <lower> <upper> */
                if set[i + 1] <= ch && ch <= set[i + 2] {
                    return ok;
                }
                let ch = upper_unicode(c) as u32;
                if set[i + 1] <= ch && ch <= set[i + 2] {
                    return ok;
                }
                i += 3;
            }
            _ => {
                break;
            }
        }
    }
    /* internal error -- there's not much we can do about it
    here, so let's just pretend it didn't match... */
    false
}
/// Counts how many times in a row the single-item pattern of a
/// `REPEAT_ONE`-style opcode matches at the cursor, up to `maxcount` and never
/// past the remaining input.
///
/// The item starts 4 code words after the opcode. The driven context is
/// restored before returning, so neither cursor nor verdict is changed.
fn count(drive: &mut StackDrive, maxcount: usize) -> usize {
    let limit = maxcount.min(drive.remaining_chars());
    let saved_ctx = *drive.ctx();

    drive.skip_code(4);
    let item_position = drive.ctx().code_position;

    let mut dispatcher = OpcodeDispatcher::new();
    let mut matched = 0;
    while matched < limit {
        // rewind to the item and run it once more
        drive.ctx_mut().code_position = item_position;
        let opcode = SreOpcode::try_from(drive.peek_code(0)).unwrap();
        dispatcher.dispatch(opcode, drive);
        if drive.ctx().has_matched == Some(false) {
            break;
        }
        matched += 1;
    }

    *drive.ctx_mut() = saved_ctx;
    matched
}
/// Specialised repetition counter: counts how many consecutive chars at the
/// cursor match a single-character item, without touching `stack_drive`.
///
/// At most `maxcount` chars (and never more than remain) are consumed on a
/// scratch cursor. The result is the distance between the scratch cursor and
/// `state.string_position`.
///
/// NOTE(review): the item opcode is read one code word after the current code
/// position, whereas `count` skips four code words to reach the item — confirm
/// the expected code position before wiring this function in.
fn _count(stack_drive: &StackDrive, maxcount: usize) -> usize {
    let mut drive = WrapDrive::drive(*stack_drive.ctx(), stack_drive);
    let maxcount = std::cmp::min(maxcount, drive.remaining_chars());
    let end = drive.ctx().string_position + maxcount;
    let opcode = match SreOpcode::try_from(drive.peek_code(1)) {
        Ok(code) => code,
        Err(_) => {
            panic!("FIXME:COUNT1");
        }
    };

    match opcode {
        SreOpcode::ANY => {
            while drive.ctx().string_position < end && !drive.at_linebreak() {
                drive.skip_char(1);
            }
        }
        SreOpcode::ANY_ALL => {
            drive.skip_char(maxcount);
        }
        SreOpcode::IN => {
            // TODO: pattern[2 or 1..]?
            while drive.ctx().string_position < end
                && charset(&drive.pattern()[2..], drive.peek_char())
            {
                drive.skip_char(1);
            }
        }
        SreOpcode::LITERAL => {
            general_count_literal(&mut drive, end, |code, c| code == c as u32);
        }
        SreOpcode::NOT_LITERAL => {
            general_count_literal(&mut drive, end, |code, c| code != c as u32);
        }
        SreOpcode::LITERAL_IGNORE => {
            general_count_literal(&mut drive, end, |code, c| code == lower_ascii(c) as u32);
        }
        SreOpcode::NOT_LITERAL_IGNORE => {
            general_count_literal(&mut drive, end, |code, c| code != lower_ascii(c) as u32);
        }
        SreOpcode::LITERAL_LOC_IGNORE => {
            general_count_literal(&mut drive, end, char_loc_ignore);
        }
        SreOpcode::NOT_LITERAL_LOC_IGNORE => {
            general_count_literal(&mut drive, end, |code, c| !char_loc_ignore(code, c));
        }
        SreOpcode::LITERAL_UNI_IGNORE => {
            general_count_literal(&mut drive, end, |code, c| code == lower_unicode(c) as u32);
        }
        SreOpcode::NOT_LITERAL_UNI_IGNORE => {
            general_count_literal(&mut drive, end, |code, c| code != lower_unicode(c) as u32);
        }
        _ => {
            todo!("repeated single character pattern?");
        }
    }

    drive.ctx().string_position - drive.state().string_position
}
/// Advances `drive` over consecutive chars accepted by `f`, stopping at char
/// position `end`. `f` receives the literal code word (one word after the
/// current code position) and the char under the cursor.
fn general_count_literal<F: FnMut(u32, char) -> bool>(drive: &mut WrapDrive, end: usize, mut f: F) {
    let ch = drive.peek_code(1);
    while drive.ctx().string_position < end && f(ch, drive.peek_char()) {
        drive.skip_char(1);
    }
}
/// True when `code` equals `c`, its lower-case form or its upper-case form
/// (as produced by `lower_locate` / `upper_locate`).
fn eq_loc_ignore(code: u32, c: char) -> bool {
    let same_char = code == c as u32;
    same_char || code == lower_locate(c) as u32 || code == upper_locate(c) as u32
}
/// ASCII word character: a letter, a digit or an underscore.
fn is_word(c: char) -> bool {
    matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '_')
}
/// ASCII whitespace as Python's ASCII `\s` defines it: space, `\t`, `\n`,
/// vertical tab, form feed and `\r`.
///
/// `char::is_ascii_whitespace` is not used because it excludes the vertical
/// tab (U+000B).
fn is_space(c: char) -> bool {
    matches!(c, ' ' | '\t' | '\n' | '\x0b' | '\x0c' | '\r')
}
/// ASCII decimal digit, `0` through `9`.
fn is_digit(c: char) -> bool {
    matches!(c, '0'..='9')
}
/// Alphanumeric test used for locale-flagged patterns.
fn is_loc_alnum(c: char) -> bool {
    // TODO: check with cpython
    c.is_alphabetic() || c.is_numeric()
}

/// Word character for locale-flagged patterns: alphanumeric or underscore.
fn is_loc_word(c: char) -> bool {
    c == '_' || is_loc_alnum(c)
}
/// Only the line feed counts as a line break here.
fn is_linebreak(c: char) -> bool {
    matches!(c, '\n')
}
/// Lower-cases ASCII letters; every other char is returned unchanged.
pub(crate) fn lower_ascii(c: char) -> char {
    match c {
        'A'..='Z' => ((c as u8) + (b'a' - b'A')) as char,
        _ => c,
    }
}
/// First char of the Unicode lower-case mapping of `c`.
fn lower_locate(c: char) -> char {
    // TODO: check with cpython
    // https://doc.rust-lang.org/std/primitive.char.html#method.to_lowercase
    let mut lowered = c.to_lowercase();
    lowered.next().unwrap()
}

/// First char of the Unicode upper-case mapping of `c`.
fn upper_locate(c: char) -> char {
    // TODO: check with cpython
    // https://doc.rust-lang.org/std/primitive.char.html#method.to_uppercase
    let mut raised = c.to_uppercase();
    raised.next().unwrap()
}
/// Digit test used for Unicode patterns; accepts exactly the chars that have
/// a value in radix 10.
fn is_uni_digit(c: char) -> bool {
    // TODO: check with cpython
    c.to_digit(10).is_some()
}
/// Whitespace test used for Unicode patterns, following Python's
/// `str.isspace`.
///
/// That is the Unicode `White_Space` set plus the four information separators
/// U+001C..=U+001F, which `char::is_whitespace` does not include.
fn is_uni_space(c: char) -> bool {
    c.is_whitespace() || matches!(c, '\u{001C}'..='\u{001F}')
}
/// Line-break test used for Unicode patterns: U+000A..=U+000D,
/// U+001C..=U+001E, U+0085, U+2028 and U+2029.
fn is_uni_linebreak(c: char) -> bool {
    match c {
        '\u{000A}'..='\u{000D}' => true,
        '\u{001C}'..='\u{001E}' => true,
        '\u{0085}' | '\u{2028}' | '\u{2029}' => true,
        _ => false,
    }
}
/// Alphanumeric test used for Unicode patterns.
fn is_uni_alnum(c: char) -> bool {
    // TODO: check with cpython
    c.is_alphabetic() || c.is_numeric()
}

/// Word character for Unicode patterns: alphanumeric or underscore.
fn is_uni_word(c: char) -> bool {
    c == '_' || is_uni_alnum(c)
}
/// First char of the Unicode lower-case mapping of `c`.
pub(crate) fn lower_unicode(c: char) -> char {
    // TODO: check with cpython
    let mut lowered = c.to_lowercase();
    lowered.next().unwrap()
}

/// First char of the Unicode upper-case mapping of `c`.
pub(crate) fn upper_unicode(c: char) -> char {
    // TODO: check with cpython
    let mut raised = c.to_uppercase();
    raised.next().unwrap()
}
/// True when `b` starts a UTF-8 encoded char.
fn is_utf8_first_byte(b: u8) -> bool {
    // UTF-8 bytes come in three kinds:
    //   0xxxxxxx : ASCII
    //   10xxxxxx : 2nd, 3rd or 4th byte of a multi-byte sequence
    //   11xxxxxx : 1st byte of a multi-byte sequence
    // so everything except the `10` prefix is a first byte.
    const PREFIX_MASK: u8 = 0b1100_0000;
    const CONTINUATION: u8 = 0b1000_0000;
    b & PREFIX_MASK != CONTINUATION
}
/// Bookkeeping for one open `REPEAT` operator.
#[derive(Debug, Copy, Clone)]
struct RepeatContext {
/// Repetition counter maintained by the UNTIL operator; `REPEAT` creates it as `-1`.
count: isize,
/// Code position of the `REPEAT` opcode that opened this context.
code_position: usize,
// zero-width match protection
/// String position recorded before trying another repetition;
/// `REPEAT` initialises it to `usize::MAX`.
last_position: usize,
}
/// Resumable executor for `MIN_REPEAT_ONE`: a lazy repeat of a single-item
/// pattern, `<MIN_REPEAT_ONE> <skip> <min> <max> item <SUCCESS> tail`.
///
/// The minimum number of items is matched first. The tail is then tried, and
/// one more item is consumed each time the tail fails.
#[derive(Default)]
struct OpMinRepeatOne {
    /// Step to run on the next call to `next`.
    jump_id: usize,
    mincount: usize,
    maxcount: usize,
    /// Items consumed so far.
    count: usize,
}

impl OpcodeExecutor for OpMinRepeatOne {
    fn next(&mut self, drive: &mut StackDrive) -> Option<()> {
        match self.jump_id {
            0 => self._0(drive),
            1 => self._1(drive),
            2 => self._2(drive),
            _ => unreachable!(),
        }
    }
}

impl OpMinRepeatOne {
    /// Entry step: consume the mandatory `mincount` items.
    fn _0(&mut self, drive: &mut StackDrive) -> Option<()> {
        self.mincount = drive.peek_code(2) as usize;
        self.maxcount = drive.peek_code(3) as usize;

        if drive.remaining_chars() < self.mincount {
            drive.ctx_mut().has_matched = Some(false);
            return None;
        }

        drive.state.string_position = drive.ctx().string_position;

        self.count = if self.mincount == 0 {
            0
        } else {
            let count = count(drive, self.mincount);
            if count < self.mincount {
                drive.ctx_mut().has_matched = Some(false);
                return None;
            }
            drive.skip_char(count);
            count
        };

        if drive.peek_code(drive.peek_code(1) as usize + 1) == SreOpcode::SUCCESS as u32 {
            // tail is empty. we're finished
            drive.state.string_position = drive.ctx().string_position;
            drive.ctx_mut().has_matched = Some(true);
            return None;
        }

        drive.state.marks_push();
        self.jump_id = 1;
        self._1(drive)
    }

    /// Try the tail at the current position, unless the maximum was exceeded.
    fn _1(&mut self, drive: &mut StackDrive) -> Option<()> {
        if self.maxcount == MAXREPEAT || self.count <= self.maxcount {
            drive.state.string_position = drive.ctx().string_position;
            drive.push_new_context(drive.peek_code(1) as usize + 1);
            self.jump_id = 2;
            return Some(());
        }

        drive.state.marks_pop_discard();
        drive.ctx_mut().has_matched = Some(false);
        None
    }

    /// The tail has been tried: succeed, or consume one more item and retry.
    fn _2(&mut self, drive: &mut StackDrive) -> Option<()> {
        let child_ctx = drive.state.popped_context.unwrap();
        if child_ctx.has_matched == Some(true) {
            drive.ctx_mut().has_matched = Some(true);
            return None;
        }
        drive.state.string_position = drive.ctx().string_position;
        if count(drive, 1) == 0 {
            // no further item matches: drop the mark snapshot pushed in `_0`,
            // exactly like the other failure exit does
            drive.state.marks_pop_discard();
            drive.ctx_mut().has_matched = Some(false);
            return None;
        }
        drive.skip_char(1);
        self.count += 1;
        drive.state.marks_pop_keep();
        self.jump_id = 1;
        self._1(drive)
    }
}
/// Resumable executor for `MAX_UNTIL`, the greedy end of
/// `<REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail`.
#[derive(Default)]
struct OpMaxUntil {
    /// Step to run on the next call to `next`.
    jump_id: usize,
    /// Repetition count including the item that has just been matched.
    count: isize,
    /// `last_position` of the repeat context, saved while another item is tried.
    save_last_position: usize,
}

impl OpcodeExecutor for OpMaxUntil {
    fn next(&mut self, drive: &mut StackDrive) -> Option<()> {
        match self.jump_id {
            0 => {
                let RepeatContext {
                    count,
                    code_position,
                    last_position,
                } = *drive.repeat_ctx();
                // read min/max relative to the REPEAT opcode
                drive.ctx_mut().code_position = code_position;
                let mincount = drive.peek_code(2) as usize;
                let maxcount = drive.peek_code(3) as usize;
                drive.state.string_position = drive.ctx().string_position;
                self.count = count + 1;

                if (self.count as usize) < mincount {
                    // not enough matches
                    drive.repeat_ctx_mut().count = self.count;
                    drive.push_new_context(4);
                    self.jump_id = 1;
                    return Some(());
                }

                // Compare the updated count with the maximum: the stored
                // count starts at -1 and must not be cast to `usize`.
                if ((self.count as usize) < maxcount || maxcount == MAXREPEAT)
                    && drive.state.string_position != last_position
                {
                    // we may have enough matches, if we can match another item, do so
                    drive.repeat_ctx_mut().count = self.count;
                    drive.state.marks_push();
                    // zero-width match protection
                    self.save_last_position = last_position;
                    drive.repeat_ctx_mut().last_position = drive.state.string_position;
                    drive.push_new_context(4);
                    self.jump_id = 2;
                    return Some(());
                }

                self.jump_id = 3;
                self.next(drive)
            }
            1 => {
                // a mandatory item has been tried
                let child_ctx = drive.state.popped_context.unwrap();
                drive.ctx_mut().has_matched = child_ctx.has_matched;
                if drive.ctx().has_matched != Some(true) {
                    drive.repeat_ctx_mut().count = self.count - 1;
                    drive.state.string_position = drive.ctx().string_position;
                }
                None
            }
            2 => {
                // an optional extra item has been tried
                drive.repeat_ctx_mut().last_position = self.save_last_position;
                let child_ctx = drive.state.popped_context.unwrap();
                if child_ctx.has_matched == Some(true) {
                    drive.state.marks_pop_discard();
                    drive.ctx_mut().has_matched = Some(true);
                    return None;
                }
                drive.state.marks_pop();
                drive.repeat_ctx_mut().count = self.count - 1;
                drive.state.string_position = drive.ctx().string_position;
                self.jump_id = 3;
                self.next(drive)
            }
            3 => {
                // cannot match more repeated items here. make sure the tail matches
                drive.skip_code(drive.peek_code(1) as usize + 1);
                drive.push_new_context(1);
                self.jump_id = 4;
                Some(())
            }
            4 => {
                // the tail has been tried
                let child_ctx = drive.state.popped_context.unwrap();
                drive.ctx_mut().has_matched = child_ctx.has_matched;
                if drive.ctx().has_matched != Some(true) {
                    drive.state.string_position = drive.ctx().string_position;
                }
                None
            }
            _ => unreachable!(),
        }
    }
}
/// Placeholder executor for `MIN_UNTIL`.
///
/// The only live code in `next` is `None`; the intended state machine is kept
/// below as a commented-out draft.
struct OpMinUntil {
/// Step selector used by the draft state machine.
jump_id: usize,
/// Repetition count used by the draft state machine.
count: isize,
}
impl Default for OpMinUntil {
fn default() -> Self {
Self {
jump_id: 0,
count: 0,
}
}
}
impl OpcodeExecutor for OpMinUntil {
/// Does nothing and reports completion immediately.
fn next(&mut self, drive: &mut StackDrive) -> Option<()> {
None
// Draft implementation, not compiled:
// match self.jump_id {
// 0 => {
// drive.state.string_position = drive.ctx().string_position;
// let repeat = match drive.state.repeat_stack.last_mut() {
// Some(repeat) => repeat,
// None => {
// todo!("Internal re error: MAX_UNTIL without REPEAT.");
// }
// };
// self.count = repeat.count + 1;
// if self.count < repeat.mincount as isize {
// // not enough matches
// repeat.count = self.count;
// drive.push_new_context(4);
// self.jump_id = 1;
// return Some(());
// }
// // see if the tail matches
// drive.state.marks_push();
// drive.push_new_context(1);
// self.jump_id = 2;
// Some(())
// }
// 1 => {
// let child_ctx = drive.state.popped_context.unwrap();
// drive.ctx_mut().has_matched = child_ctx.has_matched;
// if drive.ctx().has_matched != Some(true) {
// drive.state.string_position = drive.ctx().string_position;
// let repeat = drive.state.repeat_stack.last_mut().unwrap();
// repeat.count = self.count - 1;
// }
// None
// }
// 2 => {
// let child_ctx = drive.state.popped_context.unwrap();
// if child_ctx.has_matched == Some(true) {
// drive.state.repeat_stack.pop();
// drive.ctx_mut().has_matched = Some(true);
// return None;
// }
// drive.state.string_position = drive.ctx().string_position;
// drive.state.marks_pop();
// // match more until tail matches
// let repeat = drive.state.repeat_stack.last_mut().unwrap();
// if self.count >= repeat.maxcount as isize && repeat.maxcount != MAXREPEAT {
// drive.ctx_mut().has_matched = Some(false);
// return None;
// }
// repeat.count = self.count;
// drive.push_new_context(4);
// self.jump_id = 1;
// Some(())
// }
// _ => unreachable!(),
// }
}
}
/// Resumable executor for `BRANCH`: tries each alternative in turn until one
/// of them matches. A branch length of 0 ends the list of alternatives.
#[derive(Default)]
struct OpBranch {
    /// Step to run on the next call to `next`.
    jump_id: usize,
    /// Code words to skip to reach the next alternative.
    current_branch_length: usize,
}

impl OpcodeExecutor for OpBranch {
    fn next(&mut self, drive: &mut StackDrive) -> Option<()> {
        match self.jump_id {
            0 => self.enter(drive),
            1 => self.try_next_branch(drive),
            2 => self.after_branch(drive),
            _ => unreachable!(),
        }
    }
}

impl OpBranch {
    /// Step 0: snapshot the marks and step over the opcode itself.
    fn enter(&mut self, drive: &mut StackDrive) -> Option<()> {
        drive.state.marks_push();
        // jump out the head
        self.current_branch_length = 1;
        self.jump_id = 1;
        self.try_next_branch(drive)
    }

    /// Step 1: move to the next alternative and start matching it, or fail
    /// when the list is exhausted.
    fn try_next_branch(&mut self, drive: &mut StackDrive) -> Option<()> {
        drive.skip_code(self.current_branch_length);
        self.current_branch_length = drive.peek_code(0) as usize;
        if self.current_branch_length == 0 {
            drive.state.marks_pop_discard();
            drive.ctx_mut().has_matched = Some(false);
            return None;
        }
        drive.state.string_position = drive.ctx().string_position;
        drive.push_new_context(1);
        self.jump_id = 2;
        Some(())
    }

    /// Step 2: the alternative has been tried; succeed, or restore the marks
    /// and ask to be resumed at step 1.
    fn after_branch(&mut self, drive: &mut StackDrive) -> Option<()> {
        let finished = drive.state.popped_context.unwrap();
        if let Some(true) = finished.has_matched {
            drive.ctx_mut().has_matched = Some(true);
            return None;
        }
        drive.state.marks_pop_keep();
        self.jump_id = 1;
        Some(())
    }
}
/// Resumable executor for `REPEAT_ONE`: a greedy repeat of a single-item
/// pattern, `<REPEAT_ONE> <skip> <min> <max> item <SUCCESS> tail`.
///
/// As many items as possible are consumed first. One item is then given back
/// each time the tail fails, down to `mincount`.
#[derive(Default)]
struct OpRepeatOne {
    /// Step to run on the next call to `next`.
    jump_id: usize,
    mincount: usize,
    maxcount: usize,
    /// Items currently consumed.
    count: isize,
}

impl OpcodeExecutor for OpRepeatOne {
    fn next(&mut self, drive: &mut StackDrive) -> Option<()> {
        match self.jump_id {
            0 => {
                self.mincount = drive.peek_code(2) as usize;
                self.maxcount = drive.peek_code(3) as usize;

                if drive.remaining_chars() < self.mincount {
                    drive.ctx_mut().has_matched = Some(false);
                    return None;
                }
                drive.state.string_position = drive.ctx().string_position;
                self.count = count(drive, self.maxcount) as isize;
                drive.skip_char(self.count as usize);
                if self.count < self.mincount as isize {
                    drive.ctx_mut().has_matched = Some(false);
                    return None;
                }

                let next_code = drive.peek_code(drive.peek_code(1) as usize + 1);
                if next_code == SreOpcode::SUCCESS as u32 {
                    // tail is empty. we're finished
                    drive.state.string_position = drive.ctx().string_position;
                    drive.ctx_mut().has_matched = Some(true);
                    return None;
                }

                drive.state.marks_push();
                // TODO:
                // Special case: Tail starts with a literal. Skip positions where
                // the rest of the pattern cannot possibly match.
                self.jump_id = 1;
                self.next(drive)
            }
            1 => {
                // General case: backtracking
                if self.count >= self.mincount as isize {
                    drive.state.string_position = drive.ctx().string_position;
                    drive.push_new_context(drive.peek_code(1) as usize + 1);
                    self.jump_id = 2;
                    return Some(());
                }

                drive.state.marks_pop_discard();
                drive.ctx_mut().has_matched = Some(false);
                None
            }
            2 => {
                let child_ctx = drive.state.popped_context.unwrap();
                if child_ctx.has_matched == Some(true) {
                    drive.ctx_mut().has_matched = Some(true);
                    return None;
                }
                if self.count <= self.mincount as isize {
                    // No item can be given back. Fail here rather than stepping
                    // the cursor back first: with nothing consumed at string
                    // position 0 that step would underflow. Restoring and
                    // dropping the mark snapshot is what the back-off followed
                    // by the step-1 failure would have done.
                    drive.state.marks_pop();
                    drive.ctx_mut().has_matched = Some(false);
                    return None;
                }
                // give one item back and try the tail again
                drive.back_skip_char(1);
                self.count -= 1;
                drive.state.marks_pop_keep();
                self.jump_id = 1;
                Some(())
            }
            _ => unreachable!(),
        }
    }
}