WIP - native _sre

This commit is contained in:
Noah
2020-09-30 21:20:13 -05:00
committed by Kangzhi Shi
commit c2ee9ca3e0
2 changed files with 240 additions and 0 deletions

114
constants.rs Normal file
View File

@@ -0,0 +1,114 @@
/*
* Secret Labs' Regular Expression Engine
*
* regular expression matching engine
*
* NOTE: This file is generated by sre_constants.py. If you need
* to change anything in here, edit sre_constants.py and run it.
*
* Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
*
* See the _sre.c file for information on usage and redistribution.
*/
use bitflags::bitflags;
/// Magic number of the SRE constant set defined in this file.
///
/// NOTE(review): presumably a version stamp that compiled patterns are checked
/// against; nothing in view reads it yet. Confirm against `sre_constants.py`.
pub const SRE_MAGIC: usize = 20140917;
/// Opcodes of a compiled SRE pattern program.
///
/// Each discriminant is the `u32` code word that identifies the instruction.
/// The `TryFromPrimitive` derive supplies the checked `SreOpcode::try_from(u32)`
/// conversion, which fails for words that are not listed here.
///
/// The enum is fieldless, so it also derives `Clone`, `Copy`, `PartialEq` and
/// `Eq`; opcodes can then be compared and passed by value freely.
///
/// NOTE(review): the header of this file says it is generated by
/// `sre_constants.py`; if that is still true for the Rust output, mirror the
/// derive list in the generator.
// Variant names are kept in SCREAMING_SNAKE_CASE (presumably to match the
// names used by `sre_constants.py`), hence the lint allowance below.
#[derive(num_enum::TryFromPrimitive, Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u32)]
#[allow(non_camel_case_types)]
pub enum SreOpcode {
    FAILURE = 0,
    SUCCESS = 1,
    ANY = 2,
    ANY_ALL = 3,
    ASSERT = 4,
    ASSERT_NOT = 5,
    AT = 6,
    BRANCH = 7,
    CALL = 8,
    CATEGORY = 9,
    CHARSET = 10,
    BIGCHARSET = 11,
    GROUPREF = 12,
    GROUPREF_EXISTS = 13,
    GROUPREF_IGNORE = 14,
    IN = 15,
    IN_IGNORE = 16,
    INFO = 17,
    JUMP = 18,
    LITERAL = 19,
    LITERAL_IGNORE = 20,
    MARK = 21,
    MAX_UNTIL = 22,
    MIN_UNTIL = 23,
    NOT_LITERAL = 24,
    NOT_LITERAL_IGNORE = 25,
    NEGATE = 26,
    RANGE = 27,
    REPEAT = 28,
    REPEAT_ONE = 29,
    SUBPATTERN = 30,
    MIN_REPEAT_ONE = 31,
    RANGE_IGNORE = 32,
}
/// Numeric codes for the "at" family of SRE constants.
///
/// Each discriminant is a `u32` value; the `TryFromPrimitive` derive supplies
/// the checked `SreAtCode::try_from(u32)` conversion.
///
/// The enum is fieldless, so it also derives `Clone`, `Copy`, `PartialEq` and
/// `Eq`.
///
/// NOTE(review): presumably these are the operand values of `SreOpcode::AT`
/// (position assertions such as line/string boundaries); nothing in view uses
/// this type yet. Confirm against `sre_constants.py`.
// Variant names are kept in SCREAMING_SNAKE_CASE (presumably to match the
// names used by `sre_constants.py`), hence the lint allowance below.
#[derive(num_enum::TryFromPrimitive, Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u32)]
#[allow(non_camel_case_types)]
pub enum SreAtCode {
    BEGINNING = 0,
    BEGINNING_LINE = 1,
    BEGINNING_STRING = 2,
    BOUNDARY = 3,
    NON_BOUNDARY = 4,
    END = 5,
    END_LINE = 6,
    END_STRING = 7,
    LOC_BOUNDARY = 8,
    LOC_NON_BOUNDARY = 9,
    UNI_BOUNDARY = 10,
    UNI_NON_BOUNDARY = 11,
}
/// Numeric codes for the "category" family of SRE constants.
///
/// Each discriminant is a `u32` value; the `TryFromPrimitive` derive supplies
/// the checked `SreCatCode::try_from(u32)` conversion.
///
/// The enum is fieldless, so it also derives `Clone`, `Copy`, `PartialEq` and
/// `Eq`.
///
/// NOTE(review): presumably these are the operand values of
/// `SreOpcode::CATEGORY` (character classes, with `LOC_*` and `UNI_*` as
/// locale/Unicode variants); nothing in view uses this type yet. Confirm
/// against `sre_constants.py`.
// Variant names are kept in SCREAMING_SNAKE_CASE (presumably to match the
// names used by `sre_constants.py`), hence the lint allowance below.
#[derive(num_enum::TryFromPrimitive, Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u32)]
#[allow(non_camel_case_types)]
pub enum SreCatCode {
    DIGIT = 0,
    NOT_DIGIT = 1,
    SPACE = 2,
    NOT_SPACE = 3,
    WORD = 4,
    NOT_WORD = 5,
    LINEBREAK = 6,
    NOT_LINEBREAK = 7,
    LOC_WORD = 8,
    LOC_NOT_WORD = 9,
    UNI_DIGIT = 10,
    UNI_NOT_DIGIT = 11,
    UNI_SPACE = 12,
    UNI_NOT_SPACE = 13,
    UNI_WORD = 14,
    UNI_NOT_WORD = 15,
    UNI_LINEBREAK = 16,
    UNI_NOT_LINEBREAK = 17,
}
bitflags! {
/// Pattern flags, stored as a set of single-bit values.
///
/// `State` keeps one of these in its `flags` field.
///
/// NOTE(review): names and bit values look like the flag constants of the
/// Python `re` module; confirm against `sre_constants.py`.
// Backed by `u16` because the highest flag, `ASCII` (256), needs a ninth bit.
pub struct SreFlag: u16 {
const TEMPLATE = 1;
const IGNORECASE = 2;
const LOCALE = 4;
const MULTILINE = 8;
const DOTALL = 16;
const UNICODE = 32;
const VERBOSE = 64;
const DEBUG = 128;
const ASCII = 256;
}
}
bitflags! {
/// Bit set with three single-bit flags: `PREFIX`, `LITERAL` and `CHARSET`.
///
/// NOTE(review): presumably the flag word carried by the `INFO` opcode;
/// nothing in view uses this type yet. Confirm against `sre_constants.py`.
pub struct SreInfo: u32 {
const PREFIX = 1;
const LITERAL = 2;
const CHARSET = 4;
}
}

126
interp.rs Normal file
View File

@@ -0,0 +1,126 @@
// good luck to those that follow; here be dragons
use crate::builtins::PyStrRef;
use super::constants::{SreFlag, SreOpcode};
use std::convert::TryFrom;
use std::{iter, slice};
/// Bookkeeping for one run of the regex interpreter over a subject string.
///
/// NOTE(review): the interpreter loop is not written yet, so the field notes
/// below describe only the initial values assigned by `State::new`; the
/// intended roles are inferred from the names and should be confirmed.
pub struct State {
/// Taken from the `start` argument of `State::new`.
start: usize,
/// Initialised to the same value as `start`; presumably the current position
/// in the subject string.
s_pos: usize,
/// Upper bound of the window; `State::new` caps it at `s.char_len()`.
end: usize,
/// Initialised to the same value as `start`.
pos: usize,
/// Flags the pattern runs with.
flags: SreFlag,
/// Starts empty; presumably the recorded group marks.
marks: Vec<usize>,
/// Starts at `-1`; presumably "no group has matched yet".
lastindex: isize,
/// Starts empty; presumably saved copies of `marks`.
marks_stack: Vec<usize>,
/// Starts empty; stack of `MatchContext` frames.
context_stack: Vec<MatchContext>,
/// Starts as `None`.
repeat: Option<usize>,
/// The subject string.
s: PyStrRef,
}
// struct State1<'a> {
// state: &'a mut State,
// }
/// One frame on `State::context_stack`: a pair of positions.
///
/// NOTE(review): presumably `s_pos` indexes the subject string and `code_pos`
/// indexes the pattern code; nothing in view constructs or reads this type
/// yet, so confirm once the interpreter loop exists.
struct MatchContext {
s_pos: usize,
code_pos: usize,
}
// struct Context<'a> {
// context_stack: &mut Vec<MatchContext>,
// }
impl State {
    /// Creates the initial interpreter state for running a pattern over `s`.
    ///
    /// `start` and `end` delimit the window of `s` to work on, in the same
    /// units as `s.char_len()`. Both are capped at `s.char_len()` so that no
    /// stored position can point past the end of the subject.
    ///
    /// The two bounds are capped independently: a `start` greater than `end`
    /// is kept as-is (it describes an empty window) rather than being pulled
    /// back to `end`.
    ///
    /// All bookkeeping starts out empty: no marks, no saved contexts, no
    /// active repeat, and `lastindex` set to `-1`.
    pub fn new(s: PyStrRef, start: usize, end: usize, flags: SreFlag) -> Self {
        let len = s.char_len();
        let end = end.min(len);
        // `start` seeds `s_pos` and `pos`, so it needs the same cap as `end`;
        // otherwise the very first position could already be out of bounds.
        let start = start.min(len);
        Self {
            start,
            s_pos: start,
            end,
            pos: start,
            flags,
            marks: Vec::new(),
            lastindex: -1,
            marks_stack: Vec::new(),
            context_stack: Vec::new(),
            repeat: None,
            s,
        }
    }
}
// struct OpcodeDispatcher {
// executing_contexts: HashMap<usize, Rc<State>>,
// }
/// Error reported when a slice of code words cannot be decoded.
///
/// `parse_ops` yields it when a code word is not a known `SreOpcode`, or when
/// operand extraction for a known opcode returns `None`.
///
/// It derives `Debug` so that `Result<_, BadSreCode>` can be used with
/// `unwrap`/`expect`, and implements `Display` and `std::error::Error` so it
/// can be propagated with `?` into boxed or wrapped errors.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct BadSreCode;

impl std::fmt::Display for BadSreCode {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str("invalid SRE bytecode")
    }
}

impl std::error::Error for BadSreCode {}
/// Lazily decodes a slice of `u32` code words into [`Op`]s.
///
/// Every step pulls one word from `code`, converts it to an [`SreOpcode`] and
/// hands the shared word iterator to `extract_code`, which consumes that
/// opcode's operands.
///
/// # Errors
///
/// An item is `Err(BadSreCode)` when the word is not a valid opcode or when
/// `extract_code` returns `None`. The iterator does not stop after an error;
/// it resumes decoding at the next unread word and ends only once `code` is
/// exhausted.
pub fn parse_ops(code: &[u32]) -> impl Iterator<Item = Result<Op, BadSreCode>> + '_ {
    let mut words = code.iter().copied();
    iter::from_fn(move || -> Option<Result<Op, BadSreCode>> {
        // Running out of words is the only thing that ends the iteration.
        let raw = words.next()?;
        let decoded = match SreOpcode::try_from(raw) {
            Ok(opcode) => extract_code(opcode, &mut words),
            Err(_) => None,
        };
        Some(decoded.ok_or(BadSreCode))
    })
}
/// By-value iterator over the words of a `&[u32]` code slice, i.e. the type of
/// `code.iter().copied()` as created in `parse_ops`.
type It<'a> = iter::Copied<slice::Iter<'a, u32>>;
/// Decodes the operands of `op` from `it` and builds the matching `Op`.
///
/// `parse_ops` calls this right after it has read the opcode word, so `it` is
/// positioned at the first operand. A `None` return is turned into
/// `BadSreCode` by `parse_ops`.
///
/// # Panics
///
/// Unfinished: every match arm is still empty and control always falls through
/// to the trailing `todo!()`, so any call currently panics.
fn extract_code(op: SreOpcode, it: &mut It) -> Option<Op> {
// Reads one word as a skip count and rejects it when it is larger than the
// number of words still left in `it`. Not called by any arm yet.
let skip = |it: &mut It| {
let skip = it.next()? as usize;
if skip > it.len() {
None
} else {
Some(skip)
}
};
// One placeholder arm per opcode, with no catch-all, so adding a variant to
// `SreOpcode` is a compile error here until it is handled.
match op {
SreOpcode::FAILURE => {}
SreOpcode::SUCCESS => {}
SreOpcode::ANY => {}
SreOpcode::ANY_ALL => {}
SreOpcode::ASSERT => {}
SreOpcode::ASSERT_NOT => {}
SreOpcode::AT => {}
SreOpcode::BRANCH => {}
SreOpcode::CALL => {}
SreOpcode::CATEGORY => {}
SreOpcode::CHARSET => {}
SreOpcode::BIGCHARSET => {}
SreOpcode::GROUPREF => {}
SreOpcode::GROUPREF_EXISTS => {}
SreOpcode::GROUPREF_IGNORE => {}
SreOpcode::IN => {}
SreOpcode::IN_IGNORE => {}
SreOpcode::INFO => {
// let skip = it.next()?;
}
SreOpcode::JUMP => {}
SreOpcode::LITERAL => {}
SreOpcode::LITERAL_IGNORE => {}
SreOpcode::MARK => {}
SreOpcode::MAX_UNTIL => {}
SreOpcode::MIN_UNTIL => {}
SreOpcode::NOT_LITERAL => {}
SreOpcode::NOT_LITERAL_IGNORE => {}
SreOpcode::NEGATE => {}
SreOpcode::RANGE => {}
SreOpcode::REPEAT => {}
SreOpcode::REPEAT_ONE => {}
SreOpcode::SUBPATTERN => {}
SreOpcode::MIN_REPEAT_ONE => {}
SreOpcode::RANGE_IGNORE => {}
}
// No arm produces an `Op` yet.
todo!()
}
/// A decoded pattern instruction, the item type produced by `parse_ops`.
///
/// Derives `Debug`, `Clone`, `PartialEq` and `Eq` so decoded programs can be
/// printed, copied and compared (for example with `assert_eq!` or
/// `Result::unwrap_err`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Op {
    /// Carries no data yet.
    ///
    /// NOTE(review): presumably the decoded form of the `INFO` opcode; confirm
    /// once `extract_code` constructs it.
    Info {},
}