Add 'vm/sre_engine/' from commit '21fc2059b70ebd5bf4a7c524c40e7d4347e065dc'

git-subtree-dir: vm/sre_engine
git-subtree-mainline: 426e582ba0
git-subtree-split: 21fc2059b7
This commit is contained in:
Jeong YunWon
2024-03-18 17:05:07 +09:00
11 changed files with 2346 additions and 0 deletions

2
vm/sre_engine/.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
/target
Cargo.lock

21
vm/sre_engine/.vscode/launch.json vendored Normal file
View File

@@ -0,0 +1,21 @@
{
"version": "0.2.0",
"configurations": [
{
"type": "lldb",
"request": "launch",
"name": "Debug Unit Test",
"cargo": {
"args": [
"test",
"--no-run"
],
"filter": {
"kind": "test"
}
},
"args": [],
"cwd": "${workspaceFolder}"
}
]
}

15
vm/sre_engine/Cargo.toml Normal file
View File

@@ -0,0 +1,15 @@
[package]
name = "sre-engine"
version = "0.6.0"
authors = ["Kangzhi Shi <shikangzhi@gmail.com>", "RustPython Team"]
description = "A low-level implementation of Python's SRE regex engine"
repository = "https://github.com/RustPython/sre-engine"
license = "MIT"
edition = "2021"
keywords = ["regex"]
include = ["LICENSE", "src/**/*.rs"]
[dependencies]
num_enum = "0.7"
bitflags = "2"
optional = "0.5"

21
vm/sre_engine/LICENSE Normal file
View File

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2020 RustPython Team
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@@ -0,0 +1,111 @@
#![feature(test)]
extern crate test;
use test::Bencher;
use sre_engine::{Request, State, StrDrive};
/// A pre-compiled SRE program used as benchmark input.
struct Pattern {
    /// Code words of the compiled pattern, as emitted by `generate_tests.py`.
    code: &'static [u32],
}

impl Pattern {
    /// Builds a fresh request/state pair over the range `0..usize::MAX`.
    fn state<'a, S: StrDrive>(&self, string: S) -> (Request<'a, S>, State) {
        self.state_range(string, 0..usize::MAX)
    }

    /// Builds a fresh request/state pair whose start and end are taken from
    /// `range`.
    ///
    /// NOTE(review): the trailing `false` handed to `Request::new` is
    /// presumably the initial `match_all` value — confirm against the engine.
    fn state_range<'a, S: StrDrive>(
        &self,
        string: S,
        range: std::ops::Range<usize>,
    ) -> (Request<'a, S>, State) {
        let std::ops::Range { start, end } = range;
        (
            Request::new(string, start, end, self.code, false),
            State::default(),
        )
    }
}
/// Runs `search` and `pymatch` for a fixed set of pre-compiled patterns, once
/// on the bare sample subject and once on the subject surrounded by 10000
/// spaces on each side, asserting that every call reports success.
#[bench]
fn benchmarks(b: &mut Bencher) {
    // # test common prefix
    // pattern p1 = re.compile('Python|Perl') # , 'Perl'), # Alternation
    // START GENERATED by generate_tests.py
    #[rustfmt::skip] let p1 = Pattern { code: &[14, 8, 1, 4, 6, 1, 1, 80, 0, 16, 80, 7, 13, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 15, 11, 9, 16, 101, 16, 114, 16, 108, 15, 2, 0, 1] };
    // END GENERATED
    // pattern p2 = re.compile('(Python|Perl)') #, 'Perl'), # Grouped alternation
    // START GENERATED by generate_tests.py
    #[rustfmt::skip] let p2 = Pattern { code: &[14, 8, 1, 4, 6, 1, 0, 80, 0, 17, 0, 16, 80, 7, 13, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 15, 11, 9, 16, 101, 16, 114, 16, 108, 15, 2, 0, 17, 1, 1] };
    // END GENERATED
    // pattern p3 = re.compile('Python|Perl|Tcl') #, 'Perl'), # Alternation
    // START GENERATED by generate_tests.py
    #[rustfmt::skip] let p3 = Pattern { code: &[14, 9, 4, 3, 6, 16, 80, 16, 84, 0, 7, 15, 16, 80, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 15, 22, 11, 16, 80, 16, 101, 16, 114, 16, 108, 15, 11, 9, 16, 84, 16, 99, 16, 108, 15, 2, 0, 1] };
    // END GENERATED
    // pattern p4 = re.compile('(Python|Perl|Tcl)') #, 'Perl'), # Grouped alternation
    // START GENERATED by generate_tests.py
    #[rustfmt::skip] let p4 = Pattern { code: &[14, 9, 4, 3, 6, 16, 80, 16, 84, 0, 17, 0, 7, 15, 16, 80, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 15, 22, 11, 16, 80, 16, 101, 16, 114, 16, 108, 15, 11, 9, 16, 84, 16, 99, 16, 108, 15, 2, 0, 17, 1, 1] };
    // END GENERATED
    // pattern p5 = re.compile('(Python)\\1') #, 'PythonPython'), # Backreference
    // START GENERATED by generate_tests.py
    #[rustfmt::skip] let p5 = Pattern { code: &[14, 18, 1, 12, 12, 6, 0, 80, 121, 116, 104, 111, 110, 0, 0, 0, 0, 0, 0, 17, 0, 16, 80, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 17, 1, 11, 0, 1] };
    // END GENERATED
    // pattern p6 = re.compile('([0a-z][a-z0-9]*,)+') #, 'a5,b7,c9,'), # Disable the fastmap optimization
    // START GENERATED by generate_tests.py
    #[rustfmt::skip] let p6 = Pattern { code: &[14, 4, 0, 2, 4294967295, 23, 31, 1, 4294967295, 17, 0, 13, 7, 16, 48, 22, 97, 122, 0, 24, 13, 0, 4294967295, 13, 8, 22, 97, 122, 22, 48, 57, 0, 1, 16, 44, 17, 1, 18, 1] };
    // END GENERATED
    // pattern p7 = re.compile('([a-z][a-z0-9]*,)+') #, 'a5,b7,c9,'), # A few sets
    // START GENERATED by generate_tests.py
    #[rustfmt::skip] let p7 = Pattern { code: &[14, 4, 0, 2, 4294967295, 23, 29, 1, 4294967295, 17, 0, 13, 5, 22, 97, 122, 0, 24, 13, 0, 4294967295, 13, 8, 22, 97, 122, 22, 48, 57, 0, 1, 16, 44, 17, 1, 18, 1] };
    // END GENERATED
    // pattern p8 = re.compile('Python') #, 'Python'), # Simple text literal
    // START GENERATED by generate_tests.py
    #[rustfmt::skip] let p8 = Pattern { code: &[14, 18, 3, 6, 6, 6, 6, 80, 121, 116, 104, 111, 110, 0, 0, 0, 0, 0, 0, 16, 80, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 1] };
    // END GENERATED
    // pattern p9 = re.compile('.*Python') #, 'Python'), # Bad text literal
    // START GENERATED by generate_tests.py
    #[rustfmt::skip] let p9 = Pattern { code: &[14, 4, 0, 6, 4294967295, 24, 5, 0, 4294967295, 2, 1, 16, 80, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 1] };
    // END GENERATED
    // pattern p10 = re.compile('.*Python.*') #, 'Python'), # Worse text literal
    // START GENERATED by generate_tests.py
    #[rustfmt::skip] let p10 = Pattern { code: &[14, 4, 0, 6, 4294967295, 24, 5, 0, 4294967295, 2, 1, 16, 80, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 24, 5, 0, 4294967295, 2, 1, 1] };
    // END GENERATED
    // pattern p11 = re.compile('.*(Python)') #, 'Python'), # Bad text literal with grouping
    // START GENERATED by generate_tests.py
    #[rustfmt::skip] let p11 = Pattern { code: &[14, 4, 0, 6, 4294967295, 24, 5, 0, 4294967295, 2, 1, 17, 0, 16, 80, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 17, 1, 1] };
    // END GENERATED
    let tests = [
        (p1, "Perl"),
        (p2, "Perl"),
        (p3, "Perl"),
        (p4, "Perl"),
        (p5, "PythonPython"),
        (p6, "a5,b7,c9,"),
        (p7, "a5,b7,c9,"),
        (p8, "Python"),
        (p9, "Python"),
        (p10, "Python"),
        (p11, "Python"),
    ];

    // The padded haystacks never change between iterations. Building them
    // here keeps ~20 KB of string formatting per case out of the timed
    // closure, so the numbers reflect the regex engine only.
    let padded: Vec<String> = tests
        .iter()
        .map(|(_, s)| format!("{}{}{}", " ".repeat(10000), s, " ".repeat(10000)))
        .collect();

    b.iter(move || {
        for ((p, s), s2) in tests.iter().zip(&padded) {
            // `s` arrives as `&&str`; `&str` is `Copy`, so a plain deref
            // replaces the former `clone()` calls.
            let s: &str = *s;
            let s2 = s2.as_str();

            // Bare subject: search, match, and match with `match_all`.
            let (req, mut state) = p.state(s);
            assert!(state.search(req));
            let (req, mut state) = p.state(s);
            assert!(state.pymatch(&req));
            let (mut req, mut state) = p.state(s);
            req.match_all = true;
            assert!(state.pymatch(&req));

            // Padded subject: search from the start, then match inside
            // ranges that begin right after the leading padding.
            let (req, mut state) = p.state_range(s2, 0..usize::MAX);
            assert!(state.search(req));
            let (req, mut state) = p.state_range(s2, 10000..usize::MAX);
            assert!(state.pymatch(&req));
            let (req, mut state) = p.state_range(s2, 10000..10000 + s.len());
            assert!(state.pymatch(&req));
            let (mut req, mut state) = p.state_range(s2, 10000..10000 + s.len());
            req.match_all = true;
            assert!(state.pymatch(&req));
        }
    })
}

View File

@@ -0,0 +1,47 @@
import os
from pathlib import Path
import re
import sre_constants
import sre_compile
import sre_parse
import json
from itertools import chain
# Read the SRE_MAGIC constant declared by the Rust crate and make sure the
# running Python's regex compiler targets the same opcode numbering; code
# generated by a different sre version would not be understood by the engine.
# Path.read_text() closes the file, unlike a bare open(...).read().
constants_source = Path("src/constants.rs").read_text()
m = re.search(r"const SRE_MAGIC: usize = (\d+);", constants_source)
if m is None:
    raise RuntimeError("`const SRE_MAGIC: usize = ...;` not found in src/constants.rs")
sre_engine_magic = int(m.group(1))
del m, constants_source
assert sre_constants.MAGIC == sre_engine_magic, (
    f"Python sre MAGIC {sre_constants.MAGIC} != engine SRE_MAGIC {sre_engine_magic}"
)
class CompiledPattern:
    """Stand-in for the `re` module used when evaluating `// pattern` comments.

    `replace_compiled` evaluates the text after `=` with the name `re` bound
    to this class, so `re.compile(...)` lands in `compile` below and flag
    names such as `re.IGNORECASE` resolve to the attributes copied onto the
    class after its definition.
    """

    @classmethod
    def compile(cls, pattern, flags=0):
        """Compile `pattern` to raw SRE code words.

        Returns an instance carrying `pattern` (the source text), `code` (the
        list produced by `sre_compile._code`) and `flags` (the requested flags
        merged with those the parser recorded).
        """
        # Hand the flags to the parser as well as the code generator, the way
        # `re.compile` does; otherwise parser-level flags such as re.VERBOSE
        # would be ignored while parsing.
        p = sre_parse.parse(pattern, flags)
        code = sre_compile._code(p, flags)
        self = cls()
        self.pattern = pattern
        self.code = code
        self.flags = re.RegexFlag(flags | p.state.flags)
        return self


# Expose every regex flag (aliases included) as a class attribute so that the
# evaluated snippets can spell them `re.I`, `re.IGNORECASE`, ...
for k, v in re.RegexFlag.__members__.items():
    setattr(CompiledPattern, k, v)
# matches `// pattern {varname} = re.compile(...)`, optionally followed by the
# START/END GENERATED block that a previous run emitted right below it.
#
# Groups: 1 = the whole comment line, 2 = its indentation, 3 = the variable
# name, 4 = the Python expression to evaluate.
#
# The indent only accepts spaces/tabs so that a preceding blank line is never
# absorbed into it, and the optional tail must start on the very next line;
# a pattern comment that has no generated block yet therefore cannot swallow
# unrelated code up to some later `END GENERATED` marker.
pattern_pattern = re.compile(
    r"^(([ \t]*)\/\/[ \t]*pattern[ \t]+(\w+)[ \t]+=[ \t]+(.+?))$"
    r"(?:\n[ \t]*\/\/ START GENERATED[^\n]*\n(?:[^\n]*\n)*?[ \t]*\/\/ END GENERATED)?",
    re.M,
)
def replace_compiled(m):
    """Build the replacement text for one `pattern_pattern` match.

    The original comment line is kept and followed by a freshly generated
    `START GENERATED` / `let <varname> = Pattern { ... };` / `END GENERATED`
    block at the same indentation.
    """
    line, indent, varname, pattern = m.groups()
    # NOTE(security): `eval` executes text taken from the scanned .rs files.
    # That is acceptable only because those files belong to this repository;
    # do not point this script at untrusted sources.
    compiled = eval(pattern, {"re": CompiledPattern})
    literal = f"Pattern {{ code: &{json.dumps(compiled.code)} }}"
    generated = [
        line,
        f"{indent}// START GENERATED by generate_tests.py",
        f"{indent}#[rustfmt::skip] let {varname} = {literal};",
        f"{indent}// END GENERATED",
    ]
    return "\n".join(generated)
# Regenerate the code blocks of every Rust file directly inside `tests/` and
# `benches/` (paths are relative to the current working directory).
with os.scandir("tests") as t, os.scandir("benches") as b:
    for entry in chain(t, b):
        path = Path(entry.path)
        if path.suffix != ".rs":
            continue
        path.write_text(pattern_pattern.sub(replace_compiled, path.read_text()))

View File

@@ -0,0 +1,125 @@
/*
* Secret Labs' Regular Expression Engine
*
* regular expression matching engine
*
* NOTE: This file is generated by sre_constants.py. If you need
* to change anything in here, edit sre_constants.py and run it.
*
* Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
*
* See the _sre.c file for information on usage and redistribution.
*/
use bitflags::bitflags;
/// Version stamp of the opcode numbering implemented by this crate.
///
/// `generate_tests.py` parses this declaration and asserts that it equals
/// `sre_constants.MAGIC` of the running Python before regenerating test code.
pub const SRE_MAGIC: usize = 20221023;
/// Instruction opcodes that appear in compiled pattern code.
///
/// The discriminants are the raw `u32` code words; the derived
/// `TryFromPrimitive` gives a fallible conversion from such a word.
/// NOTE(review): the numbering presumably mirrors CPython's `sre_constants`
/// for the `SRE_MAGIC` declared in this file — confirm when updating.
#[derive(num_enum::TryFromPrimitive, Debug, PartialEq, Eq)]
#[repr(u32)]
#[allow(non_camel_case_types, clippy::upper_case_acronyms)]
pub enum SreOpcode {
FAILURE = 0,
SUCCESS = 1,
ANY = 2,
ANY_ALL = 3,
ASSERT = 4,
ASSERT_NOT = 5,
AT = 6,
BRANCH = 7,
CATEGORY = 8,
CHARSET = 9,
BIGCHARSET = 10,
GROUPREF = 11,
GROUPREF_EXISTS = 12,
IN = 13,
INFO = 14,
JUMP = 15,
LITERAL = 16,
MARK = 17,
MAX_UNTIL = 18,
MIN_UNTIL = 19,
NOT_LITERAL = 20,
NEGATE = 21,
RANGE = 22,
REPEAT = 23,
REPEAT_ONE = 24,
SUBPATTERN = 25,
MIN_REPEAT_ONE = 26,
ATOMIC_GROUP = 27,
POSSESSIVE_REPEAT = 28,
POSSESSIVE_REPEAT_ONE = 29,
GROUPREF_IGNORE = 30,
IN_IGNORE = 31,
LITERAL_IGNORE = 32,
NOT_LITERAL_IGNORE = 33,
GROUPREF_LOC_IGNORE = 34,
IN_LOC_IGNORE = 35,
LITERAL_LOC_IGNORE = 36,
NOT_LITERAL_LOC_IGNORE = 37,
GROUPREF_UNI_IGNORE = 38,
IN_UNI_IGNORE = 39,
LITERAL_UNI_IGNORE = 40,
NOT_LITERAL_UNI_IGNORE = 41,
RANGE_UNI_IGNORE = 42,
}
/// Position-assertion codes, stored as raw `u32` words with a derived
/// fallible conversion from `u32`.
///
/// NOTE(review): these presumably are the operand of the `AT` opcode; e.g. the
/// compiled form of `\B` in the test suite is the word pair `6, 11`
/// (`AT`, `UNI_NON_BOUNDARY`) — confirm against the engine.
#[derive(num_enum::TryFromPrimitive, Debug, PartialEq, Eq)]
#[repr(u32)]
#[allow(non_camel_case_types, clippy::upper_case_acronyms)]
pub enum SreAtCode {
BEGINNING = 0,
BEGINNING_LINE = 1,
BEGINNING_STRING = 2,
BOUNDARY = 3,
NON_BOUNDARY = 4,
END = 5,
END_LINE = 6,
END_STRING = 7,
LOC_BOUNDARY = 8,
LOC_NON_BOUNDARY = 9,
UNI_BOUNDARY = 10,
UNI_NON_BOUNDARY = 11,
}
/// Character-category codes, stored as raw `u32` words with a derived
/// fallible conversion from `u32`.
///
/// Each category comes in a positive and a negated (`NOT_`) form; the `LOC_`
/// and `UNI_` prefixes name the locale and Unicode flavours.
/// NOTE(review): presumably the operand of the `CATEGORY` opcode — confirm
/// against the engine.
///
/// `PartialEq`/`Eq` are derived so the type can be compared like its sibling
/// enums `SreOpcode` and `SreAtCode`.
#[derive(num_enum::TryFromPrimitive, Debug, PartialEq, Eq)]
#[repr(u32)]
#[allow(non_camel_case_types, clippy::upper_case_acronyms)]
pub enum SreCatCode {
    DIGIT = 0,
    NOT_DIGIT = 1,
    SPACE = 2,
    NOT_SPACE = 3,
    WORD = 4,
    NOT_WORD = 5,
    LINEBREAK = 6,
    NOT_LINEBREAK = 7,
    LOC_WORD = 8,
    LOC_NOT_WORD = 9,
    UNI_DIGIT = 10,
    UNI_NOT_DIGIT = 11,
    UNI_SPACE = 12,
    UNI_NOT_SPACE = 13,
    UNI_WORD = 14,
    UNI_NOT_WORD = 15,
    UNI_LINEBREAK = 16,
    UNI_NOT_LINEBREAK = 17,
}
bitflags! {
/// Regex compilation flags as a 16-bit bit set.
///
/// NOTE(review): names and bit values presumably mirror Python's
/// `re.RegexFlag` members — confirm when updating `SRE_MAGIC`.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub struct SreFlag: u16 {
const TEMPLATE = 1;
const IGNORECASE = 2;
const LOCALE = 4;
const MULTILINE = 8;
const DOTALL = 16;
const UNICODE = 32;
const VERBOSE = 64;
const DEBUG = 128;
const ASCII = 256;
}
}
bitflags! {
    /// Bit set describing which optional parts an `INFO` block carries.
    ///
    /// NOTE(review): presumably this is the mask word that follows `INFO` and
    /// its skip count in compiled code (e.g. `14, 8, 1, ...` in the test
    /// suite has mask `1` = `PREFIX`) — confirm against the engine.
    ///
    /// The derives match those of `SreFlag`; with bitflags 2.x nothing is
    /// derived implicitly, so without them the type could not be copied,
    /// compared or debug-printed.
    #[derive(Debug, PartialEq, Eq, Clone, Copy)]
    pub struct SreInfo: u32 {
        const PREFIX = 1;
        const LITERAL = 2;
        const CHARSET = 4;
    }
}

1406
vm/sre_engine/src/engine.rs Normal file

File diff suppressed because it is too large Load Diff

19
vm/sre_engine/src/lib.rs Normal file
View File

@@ -0,0 +1,19 @@
pub mod constants;
pub mod engine;
pub mod string;
pub use constants::{SreAtCode, SreCatCode, SreFlag, SreInfo, SreOpcode, SRE_MAGIC};
pub use engine::{Request, SearchIter, State};
pub use string::{StrDrive, StringCursor};
/// NOTE(review): presumably the size in bytes of one pattern code word
/// (`u32`) — confirm against the users of this constant.
pub const CODESIZE: usize = 4;
/// Upper bound used for repeat counts, chosen per pointer width.
///
/// On 64-bit targets this is `u32::MAX`, the value that shows up as the
/// maximum repeat count of unbounded quantifiers in the generated test
/// patterns (`4294967295`).
/// NOTE(review): no definition exists for other pointer widths (e.g. 16-bit).
#[cfg(target_pointer_width = "32")]
pub const MAXREPEAT: usize = usize::MAX - 1;
#[cfg(target_pointer_width = "64")]
pub const MAXREPEAT: usize = u32::MAX as usize;
/// Upper bound on the number of groups, derived from `MAXREPEAT`.
/// NOTE(review): verify both variants against the CPython version that
/// matches `SRE_MAGIC`.
#[cfg(target_pointer_width = "32")]
pub const MAXGROUPS: usize = MAXREPEAT / 4 / 2;
#[cfg(target_pointer_width = "64")]
pub const MAXGROUPS: usize = MAXREPEAT / 2;

398
vm/sre_engine/src/string.rs Normal file
View File

@@ -0,0 +1,398 @@
/// A position inside a subject string, tracked both as a raw byte pointer and
/// as a logical index.
#[derive(Debug, Clone, Copy)]
pub struct StringCursor {
    /// Pointer to the byte under the cursor; null for a default cursor.
    pub(crate) ptr: *const u8,
    /// Logical index of the cursor: a byte index for `&[u8]`, a code point
    /// index for `&str`.
    pub position: usize,
}

impl Default for StringCursor {
    /// A cursor at position 0 that is not attached to any string yet
    /// (its pointer is null).
    fn default() -> Self {
        Self {
            ptr: std::ptr::null(),
            position: 0,
        }
    }
}

/// Abstraction over the subject string types the engine can walk.
///
/// The cursor-only functions dereference the cursor's raw pointer without any
/// bounds information. Callers must only pass cursors obtained from
/// `create_cursor`/`adjust_cursor` on a string that is still alive, and must
/// keep every move and read within that string.
pub trait StrDrive: Copy {
    /// Number of units in the string (bytes for `&[u8]`, code points for `&str`).
    fn count(&self) -> usize;
    /// Creates a cursor positioned at unit index `n`.
    fn create_cursor(&self, n: usize) -> StringCursor;
    /// Repositions an existing cursor to unit index `n`.
    fn adjust_cursor(&self, cursor: &mut StringCursor, n: usize);
    /// Returns the unit under the cursor and moves the cursor one unit forward.
    fn advance(cursor: &mut StringCursor) -> u32;
    /// Returns the unit under the cursor without moving it.
    fn peek(cursor: &StringCursor) -> u32;
    /// Moves the cursor `n` units forward.
    fn skip(cursor: &mut StringCursor, n: usize);
    /// Moves the cursor one unit backward and returns the unit it now rests on.
    fn back_advance(cursor: &mut StringCursor) -> u32;
    /// Returns the unit immediately before the cursor without moving it.
    fn back_peek(cursor: &StringCursor) -> u32;
    /// Moves the cursor `n` units backward.
    fn back_skip(cursor: &mut StringCursor, n: usize);
}

impl<'a> StrDrive for &'a [u8] {
    #[inline]
    fn count(&self) -> usize {
        self.len()
    }

    /// Panics if `n` is greater than the slice length.
    #[inline]
    fn create_cursor(&self, n: usize) -> StringCursor {
        StringCursor {
            ptr: self[n..].as_ptr(),
            position: n,
        }
    }

    /// Panics if `n` is greater than the slice length.
    #[inline]
    fn adjust_cursor(&self, cursor: &mut StringCursor, n: usize) {
        cursor.position = n;
        cursor.ptr = self[n..].as_ptr();
    }

    #[inline]
    fn advance(cursor: &mut StringCursor) -> u32 {
        // Read the current byte *before* stepping. Stepping first returned
        // the byte after the one consumed (unlike the `&str` implementation)
        // and read one past the end when advancing over the last byte.
        // SAFETY: per the trait contract the cursor rests on a readable byte.
        let byte = unsafe { *cursor.ptr };
        cursor.position += 1;
        // SAFETY: moving one past a valid element stays within, or one past
        // the end of, the same allocation.
        unsafe { cursor.ptr = cursor.ptr.add(1) };
        byte as u32
    }

    #[inline]
    fn peek(cursor: &StringCursor) -> u32 {
        // SAFETY: per the trait contract the cursor rests on a readable byte.
        unsafe { *cursor.ptr as u32 }
    }

    #[inline]
    fn skip(cursor: &mut StringCursor, n: usize) {
        cursor.position += n;
        // SAFETY: the caller keeps the cursor within the string.
        unsafe { cursor.ptr = cursor.ptr.add(n) };
    }

    #[inline]
    fn back_advance(cursor: &mut StringCursor) -> u32 {
        cursor.position -= 1;
        // SAFETY: the caller guarantees at least one byte precedes the cursor.
        unsafe { cursor.ptr = cursor.ptr.sub(1) };
        // SAFETY: the cursor now rests on that preceding, readable byte.
        unsafe { *cursor.ptr as u32 }
    }

    #[inline]
    fn back_peek(cursor: &StringCursor) -> u32 {
        // SAFETY: the caller guarantees at least one byte precedes the cursor.
        unsafe { *cursor.ptr.offset(-1) as u32 }
    }

    #[inline]
    fn back_skip(cursor: &mut StringCursor, n: usize) {
        cursor.position -= n;
        // SAFETY: the caller guarantees at least `n` bytes precede the cursor.
        unsafe { cursor.ptr = cursor.ptr.sub(n) };
    }
}
/// Code-point based driver: positions count `char`s, while the cursor pointer
/// walks the underlying UTF-8 bytes.
impl StrDrive for &str {
    /// Counts the code points by iterating over the whole string.
    #[inline]
    fn count(&self) -> usize {
        self.chars().count()
    }

    /// Starts at the first byte and decodes `n` code points forward.
    #[inline]
    fn create_cursor(&self, n: usize) -> StringCursor {
        let mut from_start = StringCursor {
            position: 0,
            ptr: self.as_ptr(),
        };
        Self::skip(&mut from_start, n);
        from_start
    }

    /// Moves `cursor` to code point index `n`, walking forward from its
    /// current position when possible and restarting from the beginning of
    /// the string when the cursor is unattached (null) or already past `n`.
    #[inline]
    fn adjust_cursor(&self, cursor: &mut StringCursor, n: usize) {
        let must_restart = cursor.ptr.is_null() || cursor.position > n;
        if must_restart {
            *cursor = self.create_cursor(n);
            return;
        }
        let remaining = n - cursor.position;
        if remaining > 0 {
            Self::skip(cursor, remaining);
        }
    }

    #[inline]
    fn advance(cursor: &mut StringCursor) -> u32 {
        cursor.position += 1;
        // SAFETY: the caller keeps the cursor on the first byte of a code
        // point inside a valid string.
        unsafe { next_code_point(&mut cursor.ptr) }
    }

    #[inline]
    fn peek(cursor: &StringCursor) -> u32 {
        // Decode through a scratch copy so the cursor itself stays put.
        let mut probe = cursor.ptr;
        // SAFETY: as for `advance`.
        unsafe { next_code_point(&mut probe) }
    }

    #[inline]
    fn skip(cursor: &mut StringCursor, n: usize) {
        cursor.position += n;
        (0..n).for_each(|_| {
            // SAFETY: the caller guarantees `n` code points follow the cursor.
            unsafe { next_code_point(&mut cursor.ptr) };
        });
    }

    #[inline]
    fn back_advance(cursor: &mut StringCursor) -> u32 {
        cursor.position -= 1;
        // SAFETY: the caller guarantees a code point precedes the cursor.
        unsafe { next_code_point_reverse(&mut cursor.ptr) }
    }

    #[inline]
    fn back_peek(cursor: &StringCursor) -> u32 {
        // Decode through a scratch copy so the cursor itself stays put.
        let mut probe = cursor.ptr;
        // SAFETY: as for `back_advance`.
        unsafe { next_code_point_reverse(&mut probe) }
    }

    #[inline]
    fn back_skip(cursor: &mut StringCursor, n: usize) {
        cursor.position -= n;
        (0..n).for_each(|_| {
            // SAFETY: the caller guarantees `n` code points precede the cursor.
            unsafe { next_code_point_reverse(&mut cursor.ptr) };
        });
    }
}
/// Reads the byte at `*ptr`, then steps `*ptr` one byte forward.
///
/// # Safety
///
/// `*ptr` must point at a readable byte.
#[inline]
unsafe fn read_then_step(ptr: &mut *const u8) -> u8 {
    let byte = unsafe { **ptr };
    *ptr = unsafe { (*ptr).add(1) };
    byte
}

/// Steps `*ptr` one byte backward, then reads the byte it now points at.
///
/// # Safety
///
/// The byte immediately before `*ptr` must be readable.
#[inline]
unsafe fn step_back_then_read(ptr: &mut *const u8) -> u8 {
    *ptr = unsafe { (*ptr).sub(1) };
    unsafe { **ptr }
}

/// Decodes the code point that starts at `*ptr` and leaves `*ptr` just past
/// it (assuming a UTF-8-like encoding).
///
/// # Safety
///
/// `*ptr` must point at the first byte of a complete, well-formed UTF-8-like
/// (UTF-8 or WTF-8) sequence whose bytes are all readable.
#[inline]
unsafe fn next_code_point(ptr: &mut *const u8) -> u32 {
    // Single-byte (ASCII) case.
    let x = unsafe { read_then_step(ptr) };
    if x < 128 {
        return x as u32;
    }

    // Multibyte case: decode from a byte combination out of [[[x y] z] w].
    // NOTE: performance is sensitive to the exact formulation here.
    let init = utf8_first_byte(x, 2);
    // SAFETY: a well-formed sequence has a continuation byte here.
    let y = unsafe { read_then_step(ptr) };
    if x < 0xE0 {
        // [x y]
        return utf8_acc_cont_byte(init, y);
    }

    // [[x y z] w] case.
    // The 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid.
    // SAFETY: a well-formed sequence has a continuation byte here.
    let z = unsafe { read_then_step(ptr) };
    let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z);
    if x < 0xF0 {
        return init << 12 | y_z;
    }

    // [x y z w] case: only the lower 3 bits of `init` are payload.
    // SAFETY: a well-formed sequence has a continuation byte here.
    let w = unsafe { read_then_step(ptr) };
    (init & 7) << 18 | utf8_acc_cont_byte(y_z, w)
}

/// Decodes the code point that ends just before `*ptr` and leaves `*ptr` on
/// its first byte (assuming a UTF-8-like encoding).
///
/// # Safety
///
/// The bytes immediately before `*ptr` must form a complete, well-formed
/// UTF-8-like (UTF-8 or WTF-8) sequence and must all be readable.
#[inline]
unsafe fn next_code_point_reverse(ptr: &mut *const u8) -> u32 {
    // Single-byte (ASCII) case.
    let w = unsafe { step_back_then_read(ptr) };
    if w < 128 {
        return w as u32;
    }

    // Multibyte case: decode from a byte combination out of [x [y [z w]]],
    // walking backward until the leading (non-continuation) byte is found.
    // SAFETY: a well-formed sequence has at least one more byte before `w`.
    let z = unsafe { step_back_then_read(ptr) };
    let high = if !utf8_is_cont_byte(z) {
        utf8_first_byte(z, 2)
    } else {
        // SAFETY: `z` was a continuation byte, so another byte precedes it.
        let y = unsafe { step_back_then_read(ptr) };
        let top = if !utf8_is_cont_byte(y) {
            utf8_first_byte(y, 3)
        } else {
            // SAFETY: `y` was a continuation byte, so the lead byte precedes it.
            let x = unsafe { step_back_then_read(ptr) };
            utf8_acc_cont_byte(utf8_first_byte(x, 4), y)
        };
        utf8_acc_cont_byte(top, z)
    };
    utf8_acc_cont_byte(high, w)
}

/// Returns the initial code point accumulator for the first byte.
/// The first byte is special: only the bottom 5 bits carry payload for
/// width 2, 4 bits for width 3, and 3 bits for width 4.
#[inline]
const fn utf8_first_byte(byte: u8, width: u32) -> u32 {
    let payload_mask: u8 = 0x7F >> width;
    (byte & payload_mask) as u32
}

/// Returns the value of `ch` updated with continuation byte `byte`.
#[inline]
const fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
    let payload = (byte & CONT_MASK) as u32;
    (ch << 6) | payload
}

/// Checks whether the byte is a UTF-8 continuation byte (i.e., starts with
/// the bits `10`).
#[inline]
const fn utf8_is_cont_byte(byte: u8) -> bool {
    (byte & 0b1100_0000) == 0b1000_0000
}

/// Mask of the value bits of a continuation byte.
const CONT_MASK: u8 = 0b0011_1111;
/// ASCII whitespace: tab, line feed, vertical tab, form feed, carriage
/// return and space.
const fn is_py_ascii_whitespace(b: u8) -> bool {
    matches!(b, b'\t'..=b'\r' | b' ')
}

/// ASCII word character: `_`, a letter or a digit.
#[inline]
pub(crate) fn is_word(ch: u32) -> bool {
    match u8::try_from(ch) {
        Ok(byte) => byte == b'_' || byte.is_ascii_alphanumeric(),
        Err(_) => false,
    }
}

/// ASCII whitespace test; code points that do not fit in a byte are never
/// whitespace.
#[inline]
pub(crate) fn is_space(ch: u32) -> bool {
    u8::try_from(ch).map_or(false, is_py_ascii_whitespace)
}

/// ASCII decimal digit (`0`-`9`).
#[inline]
pub(crate) fn is_digit(ch: u32) -> bool {
    matches!(ch, 0x30..=0x39)
}
/// Locale-flavoured alphanumeric test; currently plain ASCII.
#[inline]
pub(crate) fn is_loc_alnum(ch: u32) -> bool {
    // FIXME: Ignore the locales
    u8::try_from(ch).map_or(false, |byte| byte.is_ascii_alphanumeric())
}

/// Locale-flavoured word character: `_` or anything `is_loc_alnum` accepts.
#[inline]
pub(crate) fn is_loc_word(ch: u32) -> bool {
    is_loc_alnum(ch) || ch == u32::from('_')
}
/// True only for the line feed character (U+000A).
#[inline]
pub(crate) fn is_linebreak(ch: u32) -> bool {
    u32::from(b'\n') == ch
}
/// Lowercases ASCII letters; every other code point is returned unchanged.
#[inline]
pub fn lower_ascii(ch: u32) -> u32 {
    match u8::try_from(ch) {
        Ok(byte) => u32::from(byte.to_ascii_lowercase()),
        Err(_) => ch,
    }
}

/// Locale-flavoured lowercase; currently identical to `lower_ascii`.
#[inline]
pub(crate) fn lower_locate(ch: u32) -> u32 {
    // FIXME: Ignore the locales
    lower_ascii(ch)
}

/// Locale-flavoured uppercase; currently uppercases ASCII letters only.
#[inline]
pub(crate) fn upper_locate(ch: u32) -> u32 {
    // FIXME: Ignore the locales
    match u8::try_from(ch) {
        Ok(byte) => u32::from(byte.to_ascii_uppercase()),
        Err(_) => ch,
    }
}
/// Unicode-flavoured digit test.
///
/// Only ASCII `0`-`9` are accepted at the moment; invalid scalar values are
/// never digits.
#[inline]
pub(crate) fn is_uni_digit(ch: u32) -> bool {
    // TODO: check with cpython
    // NOTE(review): CPython presumably accepts every Unicode decimal digit
    // here, not just the ASCII ones — confirm before relying on this.
    char::from_u32(ch).map_or(false, |c| c.is_ascii_digit())
}
/// Unicode-flavoured whitespace test.
///
/// Accepts the ASCII whitespace characters (U+0009..=U+000D, U+0020), the
/// information separators U+001C..=U+001F, NEL (U+0085), no-break space
/// (U+00A0) and the space/separator characters listed below. Every ASCII
/// whitespace code point is part of the first two ranges, so no separate
/// ASCII check is needed.
#[inline]
pub(crate) fn is_uni_space(ch: u32) -> bool {
    // TODO: check with cpython
    matches!(
        ch,
        0x0009..=0x000D
            | 0x001C..=0x0020
            | 0x0085
            | 0x00A0
            | 0x1680
            | 0x2000..=0x200A
            | 0x2028
            | 0x2029
            | 0x202F
            | 0x205F
            | 0x3000
    )
}
/// Unicode-flavoured line break test: LF, VT, FF, CR, the separators
/// U+001C..=U+001E, NEL (U+0085), and the line/paragraph separators
/// U+2028 and U+2029.
#[inline]
pub(crate) fn is_uni_linebreak(ch: u32) -> bool {
    matches!(
        ch,
        0x000A..=0x000D | 0x001C..=0x001E | 0x0085 | 0x2028 | 0x2029
    )
}
/// Unicode-flavoured alphanumeric test, delegating to `char::is_alphanumeric`;
/// values that are not valid `char`s are rejected.
#[inline]
pub(crate) fn is_uni_alnum(ch: u32) -> bool {
    // TODO: check with cpython
    char::from_u32(ch).map_or(false, char::is_alphanumeric)
}

/// Unicode-flavoured word character: `_` or anything `is_uni_alnum` accepts.
#[inline]
pub(crate) fn is_uni_word(ch: u32) -> bool {
    is_uni_alnum(ch) || ch == u32::from('_')
}
/// Unicode-flavoured lowercase of a single code point.
///
/// When the lowercase mapping expands to several characters, only the first
/// one is returned; values that are not valid `char`s come back unchanged.
#[inline]
pub fn lower_unicode(ch: u32) -> u32 {
    // TODO: check with cpython
    match char::from_u32(ch) {
        Some(c) => c.to_lowercase().next().unwrap() as u32,
        None => ch,
    }
}
/// Unicode-flavoured uppercase of a single code point.
///
/// When the uppercase mapping expands to several characters (e.g. U+00DF),
/// only the first one is returned; values that are not valid `char`s come
/// back unchanged.
#[inline]
pub fn upper_unicode(ch: u32) -> u32 {
    // TODO: check with cpython
    match char::from_u32(ch) {
        Some(c) => c.to_uppercase().next().unwrap() as u32,
        None => ch,
    }
}

View File

@@ -0,0 +1,181 @@
use sre_engine::{Request, State, StrDrive};
/// A pre-compiled SRE program used as test input.
struct Pattern {
    /// Code words of the compiled pattern, as emitted by `generate_tests.py`.
    code: &'static [u32],
}

impl Pattern {
    /// Builds a fresh request/state pair for `string` over the range
    /// `0..usize::MAX`.
    ///
    /// NOTE(review): the trailing `false` handed to `Request::new` is
    /// presumably the initial `match_all` value — confirm against the engine.
    fn state<'a, S: StrDrive>(&self, string: S) -> (Request<'a, S>, State) {
        (
            Request::new(string, 0, usize::MAX, self.code, false),
            State::default(),
        )
    }
}
/// A negative lookbehind at the very start of the subject: `(?<!\.)x\b` must
/// match "x".
/// NOTE(review): the name presumably refers to issue #2427 — confirm.
#[test]
fn test_2427() {
// pattern lookbehind = re.compile(r'(?<!\.)x\b')
// START GENERATED by generate_tests.py
#[rustfmt::skip] let lookbehind = Pattern { code: &[14, 4, 0, 1, 1, 5, 5, 1, 16, 46, 1, 16, 120, 6, 10, 1] };
// END GENERATED
let (req, mut state) = lookbehind.state("x");
assert!(state.pymatch(&req));
}
/// A positive lookbehind: searching "abcdef" for `(?<=abc)def` must succeed.
#[test]
fn test_assert() {
// pattern positive_lookbehind = re.compile(r'(?<=abc)def')
// START GENERATED by generate_tests.py
#[rustfmt::skip] let positive_lookbehind = Pattern { code: &[14, 4, 0, 3, 3, 4, 9, 3, 16, 97, 16, 98, 16, 99, 1, 16, 100, 16, 101, 16, 102, 1] };
// END GENERATED
let (req, mut state) = positive_lookbehind.state("abcdef");
assert!(state.search(req));
}
/// `\B` (non-boundary) must not be found in the empty string.
#[test]
fn test_string_boundaries() {
// pattern big_b = re.compile(r'\B')
// START GENERATED by generate_tests.py
#[rustfmt::skip] let big_b = Pattern { code: &[14, 4, 0, 0, 0, 6, 11, 1] };
// END GENERATED
let (req, mut state) = big_b.state("");
assert!(!state.search(req));
}
/// With `must_advance` set, searching "a:" for `\b|:+` must succeed and leave
/// the cursor at position 1.
#[test]
fn test_zerowidth() {
// pattern p = re.compile(r'\b|:+')
// START GENERATED by generate_tests.py
#[rustfmt::skip] let p = Pattern { code: &[14, 4, 0, 0, 4294967295, 7, 5, 6, 10, 15, 12, 10, 24, 6, 1, 4294967295, 16, 58, 1, 15, 2, 0, 1] };
// END GENERATED
let (mut req, mut state) = p.state("a:");
req.must_advance = true;
assert!(state.search(req));
assert_eq!(state.cursor.position, 1);
}
/// Nested lazy repeats: matching "axxzaz" against `(?:a*?(xx)??z)*` must
/// succeed and leave the group marks at 1 and 3.
/// NOTE(review): going by the name, this presumably guards against a former
/// panic in repeat-context handling — confirm.
#[test]
fn test_repeat_context_panic() {
use optional::Optioned;
// pattern p = re.compile(r'(?:a*?(xx)??z)*')
// START GENERATED by generate_tests.py
#[rustfmt::skip] let p = Pattern { code: &[14, 4, 0, 0, 4294967295, 23, 25, 0, 4294967295, 26, 6, 0, 4294967295, 16, 97, 1, 23, 11, 0, 1, 17, 0, 16, 120, 16, 120, 17, 1, 19, 16, 122, 18, 1] };
// END GENERATED
let (req, mut state) = p.state("axxzaz");
assert!(state.pymatch(&req));
assert_eq!(
*state.marks.raw(),
vec![Optioned::some(1), Optioned::some(3)]
);
}
/// Two nested greedy repeats: `((1)?)*` must match all of "1111", ending at
/// position 4.
#[test]
fn test_double_max_until() {
// pattern p = re.compile(r'((1)?)*')
// START GENERATED by generate_tests.py
#[rustfmt::skip] let p = Pattern { code: &[14, 4, 0, 0, 4294967295, 23, 18, 0, 4294967295, 17, 0, 23, 9, 0, 1, 17, 2, 16, 49, 17, 3, 18, 17, 1, 18, 1] };
// END GENERATED
let (req, mut state) = p.state("1111");
assert!(state.pymatch(&req));
assert_eq!(state.cursor.position, 4);
}
/// Search with a one-character prefix in the INFO block: `aa*` in "baaaa"
/// must be found starting at 1 and ending at 5.
#[test]
fn test_info_single() {
// pattern p = re.compile(r'aa*')
// START GENERATED by generate_tests.py
#[rustfmt::skip] let p = Pattern { code: &[14, 8, 1, 1, 4294967295, 1, 1, 97, 0, 16, 97, 24, 6, 0, 4294967295, 16, 97, 1, 1] };
// END GENERATED
let (req, mut state) = p.state("baaaa");
assert!(state.search(req));
assert_eq!(state.start, 1);
assert_eq!(state.cursor.position, 5);
}
/// Search with a one-character common prefix followed by an alternation:
/// `Python|Perl` must be found in "Perl".
#[test]
fn test_info_single2() {
// pattern p = re.compile(r'Python|Perl')
// START GENERATED by generate_tests.py
#[rustfmt::skip] let p = Pattern { code: &[14, 8, 1, 4, 6, 1, 1, 80, 0, 16, 80, 7, 13, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 15, 11, 9, 16, 101, 16, 114, 16, 108, 15, 2, 0, 1] };
// END GENERATED
let (req, mut state) = p.state("Perl");
assert!(state.search(req));
}
/// Search with a multi-character literal prefix: `ababc+` must be found in
/// "!ababc".
#[test]
fn test_info_literal() {
// pattern p = re.compile(r'ababc+')
// START GENERATED by generate_tests.py
#[rustfmt::skip] let p = Pattern { code: &[14, 14, 1, 5, 4294967295, 4, 4, 97, 98, 97, 98, 0, 0, 1, 2, 16, 97, 16, 98, 16, 97, 16, 98, 24, 6, 1, 4294967295, 16, 99, 1, 1] };
// END GENERATED
let (req, mut state) = p.state("!ababc");
assert!(state.search(req));
}
/// Search with a literal prefix that lies inside a capture group, followed by
/// a backreference: `(python)\1` must be found in "pythonpython".
#[test]
fn test_info_literal2() {
// pattern p = re.compile(r'(python)\1')
// START GENERATED by generate_tests.py
#[rustfmt::skip] let p = Pattern { code: &[14, 18, 1, 12, 12, 6, 0, 112, 121, 116, 104, 111, 110, 0, 0, 0, 0, 0, 0, 17, 0, 16, 112, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 17, 1, 11, 0, 1] };
// END GENERATED
let (req, mut state) = p.state("pythonpython");
assert!(state.search(req));
}
/// A repeat nested inside a lookahead assertion, compiled case-insensitively:
/// `^([ab]*?)(?=(b)?)c` must be found in "abc".
#[test]
fn test_repeat_in_assertions() {
// pattern p = re.compile('^([ab]*?)(?=(b)?)c', re.IGNORECASE)
// START GENERATED by generate_tests.py
#[rustfmt::skip] let p = Pattern { code: &[14, 4, 0, 1, 4294967295, 6, 0, 17, 0, 26, 10, 0, 4294967295, 39, 5, 22, 97, 98, 0, 1, 17, 1, 4, 14, 0, 23, 9, 0, 1, 17, 2, 40, 98, 17, 3, 18, 1, 40, 99, 1] };
// END GENERATED
let (req, mut state) = p.state("abc");
assert!(state.search(req));
}
/// Possessive quantifier on a single character: `e++a` must match "eeea".
#[test]
fn test_possessive_quantifier() {
// pattern p = re.compile('e++a')
// START GENERATED by generate_tests.py
#[rustfmt::skip] let p = Pattern { code: &[14, 4, 0, 2, 4294967295, 29, 6, 1, 4294967295, 16, 101, 1, 16, 97, 1] };
// END GENERATED
let (req, mut state) = p.state("eeea");
assert!(state.pymatch(&req));
}
/// Possessive repeat of an atomic group: `(?>x)++x` must NOT match "xxx",
/// since the repeat consumes every `x` and never gives one back.
#[test]
fn test_possessive_atomic_group() {
// pattern p = re.compile('(?>x)++x')
// START GENERATED by generate_tests.py
#[rustfmt::skip] let p = Pattern { code: &[14, 4, 0, 2, 4294967295, 28, 8, 1, 4294967295, 27, 4, 16, 120, 1, 1, 16, 120, 1] };
// END GENERATED
let (req, mut state) = p.state("xxx");
assert!(!state.pymatch(&req));
}
/// Case-insensitive character range with `match_all` set: `[a-c]+` must match
/// all of "ABC", ending at position 3.
/// NOTE(review): the name presumably refers to CPython issue 20998 — confirm.
#[test]
fn test_bug_20998() {
// pattern p = re.compile('[a-c]+', re.I)
// START GENERATED by generate_tests.py
#[rustfmt::skip] let p = Pattern { code: &[14, 4, 0, 1, 4294967295, 24, 10, 1, 4294967295, 39, 5, 22, 97, 99, 0, 1, 1] };
// END GENERATED
let (mut req, mut state) = p.state("ABC");
req.match_all = true;
assert!(state.pymatch(&req));
assert_eq!(state.cursor.position, 3);
}
#[test]
fn test_bigcharset() {
// pattern p = re.compile('[a-z]*', re.I)
// START GENERATED by generate_tests.py
#[rustfmt::skip] let p = Pattern { code: &[14, 4, 0, 0, 4294967295, 24, 97, 0, 4294967295, 39, 92, 10, 3, 33685760, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 0, 0, 0, 134217726, 0, 0, 0, 0, 0, 131072, 0, 2147483648, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1] };
// END GENERATED
let (req, mut state) = p.state("x ");
assert!(state.pymatch(&req));
assert_eq!(state.cursor.position, 1);
}
#[test]
fn test_search_nonascii() {
// pattern p = re.compile('\xe0+')
}