Add 'vm/sre_engine/' from commit '21fc2059b70ebd5bf4a7c524c40e7d4347e065dc'

git-subtree-dir: vm/sre_engine git-subtree-mainline: 426e582ba0 git-subtree-split: 21fc2059b7
2026-06-02 19:39:49 +09:00 · 2024-03-18 17:05:07 +09:00
parent 426e582ba0 21fc2059b7
commit b3a606d9df
11 changed files with 2346 additions and 0 deletions
--- a/vm/sre_engine/.gitignore
+++ b/vm/sre_engine/.gitignore
@@ -0,0 +1,2 @@
+/target
+Cargo.lock
--- a/vm/sre_engine/.vscode/launch.json
+++ b/vm/sre_engine/.vscode/launch.json
@@ -0,0 +1,21 @@
+{
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "type": "lldb",
+            "request": "launch",
+            "name": "Debug Unit Test",
+            "cargo": {
+                "args": [
+                    "test",
+                    "--no-run"
+                ],
+                "filter": {
+                    "kind": "test"
+                }
+            },
+            "args": [],
+            "cwd": "${workspaceFolder}"
+        }
+    ]
+}
--- a/vm/sre_engine/Cargo.toml
+++ b/vm/sre_engine/Cargo.toml
@@ -0,0 +1,15 @@
+[package]
+name = "sre-engine"
+version = "0.6.0"
+authors = ["Kangzhi Shi <shikangzhi@gmail.com>", "RustPython Team"]
+description = "A low-level implementation of Python's SRE regex engine"
+repository = "https://github.com/RustPython/sre-engine"
+license = "MIT"
+edition = "2021"
+keywords = ["regex"]
+include = ["LICENSE", "src/**/*.rs"]
+
+[dependencies]
+num_enum = "0.7"
+bitflags = "2"
+optional = "0.5"
--- a/vm/sre_engine/LICENSE
+++ b/vm/sre_engine/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2020 RustPython Team
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/vm/sre_engine/benches/benches.rs
+++ b/vm/sre_engine/benches/benches.rs
@@ -0,0 +1,111 @@
+#![feature(test)]
+
+extern crate test;
+use test::Bencher;
+
+use sre_engine::{Request, State, StrDrive};
+
+/// Compiled SRE program for one benchmark pattern, as emitted between the
+/// `START GENERATED` / `END GENERATED` markers by `generate_tests.py`.
+struct Pattern {
+    code: &'static [u32],
+}
+
+impl Pattern {
+    /// Builds a fresh request/state pair for `string` using the open-ended
+    /// range `0..usize::MAX`.
+    fn state<'a, S: StrDrive>(&self, string: S) -> (Request<'a, S>, State) {
+        self.state_range(string, 0..usize::MAX)
+    }
+
+    /// Builds a fresh request/state pair for `string` restricted to `range`
+    /// (`range.start` and `range.end` are forwarded to `Request::new`).
+    fn state_range<'a, S: StrDrive>(
+        &self,
+        string: S,
+        range: std::ops::Range<usize>,
+    ) -> (Request<'a, S>, State) {
+        // NOTE(review): the trailing `false` is presumably the initial
+        // `match_all` flag (callers set `req.match_all = true` afterwards) --
+        // confirm against `Request::new` in engine.rs.
+        let req = Request::new(string, range.start, range.end, self.code, false);
+        let state = State::default();
+        (req, state)
+    }
+}
+
+#[bench]
+fn benchmarks(b: &mut Bencher) {
+    // # test common prefix
+    // pattern p1 = re.compile('Python|Perl') # , 'Perl'),    # Alternation
+    // START GENERATED by generate_tests.py
+    #[rustfmt::skip] let p1 = Pattern { code: &[14, 8, 1, 4, 6, 1, 1, 80, 0, 16, 80, 7, 13, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 15, 11, 9, 16, 101, 16, 114, 16, 108, 15, 2, 0, 1] };
+    // END GENERATED
+    // pattern p2 = re.compile('(Python|Perl)') #, 'Perl'),  # Grouped alternation
+    // START GENERATED by generate_tests.py
+    #[rustfmt::skip] let p2 = Pattern { code: &[14, 8, 1, 4, 6, 1, 0, 80, 0, 17, 0, 16, 80, 7, 13, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 15, 11, 9, 16, 101, 16, 114, 16, 108, 15, 2, 0, 17, 1, 1] };
+    // END GENERATED
+    // pattern p3 = re.compile('Python|Perl|Tcl') #, 'Perl'),        # Alternation
+    // START GENERATED by generate_tests.py
+    #[rustfmt::skip] let p3 = Pattern { code: &[14, 9, 4, 3, 6, 16, 80, 16, 84, 0, 7, 15, 16, 80, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 15, 22, 11, 16, 80, 16, 101, 16, 114, 16, 108, 15, 11, 9, 16, 84, 16, 99, 16, 108, 15, 2, 0, 1] };
+    // END GENERATED
+    // pattern p4 = re.compile('(Python|Perl|Tcl)') #, 'Perl'),      # Grouped alternation
+    // START GENERATED by generate_tests.py
+    #[rustfmt::skip] let p4 = Pattern { code: &[14, 9, 4, 3, 6, 16, 80, 16, 84, 0, 17, 0, 7, 15, 16, 80, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 15, 22, 11, 16, 80, 16, 101, 16, 114, 16, 108, 15, 11, 9, 16, 84, 16, 99, 16, 108, 15, 2, 0, 17, 1, 1] };
+    // END GENERATED
+    // pattern p5 = re.compile('(Python)\\1') #, 'PythonPython'),    # Backreference
+    // START GENERATED by generate_tests.py
+    #[rustfmt::skip] let p5 = Pattern { code: &[14, 18, 1, 12, 12, 6, 0, 80, 121, 116, 104, 111, 110, 0, 0, 0, 0, 0, 0, 17, 0, 16, 80, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 17, 1, 11, 0, 1] };
+    // END GENERATED
+    // pattern p6 = re.compile('([0a-z][a-z0-9]*,)+') #, 'a5,b7,c9,'), # Disable the fastmap optimization
+    // START GENERATED by generate_tests.py
+    #[rustfmt::skip] let p6 = Pattern { code: &[14, 4, 0, 2, 4294967295, 23, 31, 1, 4294967295, 17, 0, 13, 7, 16, 48, 22, 97, 122, 0, 24, 13, 0, 4294967295, 13, 8, 22, 97, 122, 22, 48, 57, 0, 1, 16, 44, 17, 1, 18, 1] };
+    // END GENERATED
+    // pattern p7 = re.compile('([a-z][a-z0-9]*,)+') #, 'a5,b7,c9,'), # A few sets
+    // START GENERATED by generate_tests.py
+    #[rustfmt::skip] let p7 = Pattern { code: &[14, 4, 0, 2, 4294967295, 23, 29, 1, 4294967295, 17, 0, 13, 5, 22, 97, 122, 0, 24, 13, 0, 4294967295, 13, 8, 22, 97, 122, 22, 48, 57, 0, 1, 16, 44, 17, 1, 18, 1] };
+    // END GENERATED
+    // pattern p8 = re.compile('Python') #, 'Python'),               # Simple text literal
+    // START GENERATED by generate_tests.py
+    #[rustfmt::skip] let p8 = Pattern { code: &[14, 18, 3, 6, 6, 6, 6, 80, 121, 116, 104, 111, 110, 0, 0, 0, 0, 0, 0, 16, 80, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 1] };
+    // END GENERATED
+    // pattern p9 = re.compile('.*Python') #, 'Python'),             # Bad text literal
+    // START GENERATED by generate_tests.py
+    #[rustfmt::skip] let p9 = Pattern { code: &[14, 4, 0, 6, 4294967295, 24, 5, 0, 4294967295, 2, 1, 16, 80, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 1] };
+    // END GENERATED
+    // pattern p10 = re.compile('.*Python.*') #, 'Python'),           # Worse text literal
+    // START GENERATED by generate_tests.py
+    #[rustfmt::skip] let p10 = Pattern { code: &[14, 4, 0, 6, 4294967295, 24, 5, 0, 4294967295, 2, 1, 16, 80, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 24, 5, 0, 4294967295, 2, 1, 1] };
+    // END GENERATED
+    // pattern p11 = re.compile('.*(Python)') #, 'Python'),           # Bad text literal with grouping
+    // START GENERATED by generate_tests.py
+    #[rustfmt::skip] let p11 = Pattern { code: &[14, 4, 0, 6, 4294967295, 24, 5, 0, 4294967295, 2, 1, 17, 0, 16, 80, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 17, 1, 1] };
+    // END GENERATED
+
+    let tests = [
+        (p1, "Perl"),
+        (p2, "Perl"),
+        (p3, "Perl"),
+        (p4, "Perl"),
+        (p5, "PythonPython"),
+        (p6, "a5,b7,c9,"),
+        (p7, "a5,b7,c9,"),
+        (p8, "Python"),
+        (p9, "Python"),
+        (p10, "Python"),
+        (p11, "Python"),
+    ];
+
+    b.iter(move || {
+        for (p, s) in &tests {
+            let (req, mut state) = p.state(s.clone());
+            assert!(state.search(req));
+            let (req, mut state) = p.state(s.clone());
+            assert!(state.pymatch(&req));
+            let (mut req, mut state) = p.state(s.clone());
+            req.match_all = true;
+            assert!(state.pymatch(&req));
+            let s2 = format!("{}{}{}", " ".repeat(10000), s, " ".repeat(10000));
+            let (req, mut state) = p.state_range(s2.as_str(), 0..usize::MAX);
+            assert!(state.search(req));
+            let (req, mut state) = p.state_range(s2.as_str(), 10000..usize::MAX);
+            assert!(state.pymatch(&req));
+            let (req, mut state) = p.state_range(s2.as_str(), 10000..10000 + s.len());
+            assert!(state.pymatch(&req));
+            let (mut req, mut state) = p.state_range(s2.as_str(), 10000..10000 + s.len());
+            req.match_all = true;
+            assert!(state.pymatch(&req));
+        }
+    })
+}
--- a/vm/sre_engine/generate_tests.py
+++ b/vm/sre_engine/generate_tests.py
@@ -0,0 +1,47 @@
+import os
+from pathlib import Path
+import re
+import sre_constants
+import sre_compile
+import sre_parse
+import json
+from itertools import chain
+
+m = re.search(r"const SRE_MAGIC: usize = (\d+);", open("src/constants.rs").read())
+sre_engine_magic = int(m.group(1))
+del m
+
+assert sre_constants.MAGIC == sre_engine_magic
+
+class CompiledPattern:
+    """Stand-in for the `re` module when the `re.compile(...)` expressions
+    found in `// pattern` comments are evaluated.
+
+    Instances carry `pattern`, `code` and `flags` attributes; no real
+    `re.Pattern` object is built.
+    """
+    @classmethod
+    def compile(cls, pattern, flags=0):
+        """Parse `pattern` and return an instance holding its SRE code list."""
+        # NOTE(review): `flags` is not forwarded to sre_parse.parse(), only to
+        # sre_compile._code() -- confirm this is intended for flags that
+        # influence parsing.
+        p = sre_parse.parse(pattern)
+        code = sre_compile._code(p, flags)
+        self = cls()
+        self.pattern = pattern
+        self.code = code
+        # Combine the caller's flags with the flags recorded by the parser.
+        self.flags = re.RegexFlag(flags | p.state.flags)
+        return self
+
+# Mirror every RegexFlag member (e.g. `I`, `IGNORECASE`) onto CompiledPattern
+# so that flag arguments written as `re.I` resolve when `re` is bound to
+# CompiledPattern during evaluation.
+for k, v in re.RegexFlag.__members__.items():
+    setattr(CompiledPattern, k, v)
+
+
+# matches `// pattern {varname} = re.compile(...)`
+# The optional trailing group also swallows everything up to a following
+# `END GENERATED`, so a previously generated block is replaced, not duplicated.
+pattern_pattern = re.compile(r"^((\s*)\/\/\s*pattern\s+(\w+)\s+=\s+(.+?))$(?:.+?END GENERATED)?", re.M | re.S)
+def replace_compiled(m):
+    """Return the replacement text for one `// pattern` comment match: the
+    original comment line followed by a generated `let {varname} = Pattern {...}`
+    statement wrapped in START/END GENERATED markers."""
+    line, indent, varname, pattern = m.groups()
+    # NOTE(review): eval() executes whatever expression the comment contains,
+    # with `re` bound to CompiledPattern. Only run this script on trusted
+    # source files.
+    pattern = eval(pattern, {"re": CompiledPattern})
+    # json.dumps renders the code list as `[1, 2, ...]`, which is pasted into
+    # the Rust source as a slice literal.
+    pattern = f"Pattern {{ code: &{json.dumps(pattern.code)} }}"
+    return f'''{line}
+{indent}// START GENERATED by generate_tests.py
+{indent}#[rustfmt::skip] let {varname} = {pattern};
+{indent}// END GENERATED'''
+
+# Regenerate the pattern tables in every `.rs` file directly under `tests/`
+# and `benches/`, rewriting each file in place. All paths are relative, so the
+# script has to be run from the directory that contains those folders.
+with os.scandir("tests") as t, os.scandir("benches") as b:
+    for f in chain(t, b):
+        path = Path(f.path)
+        if path.suffix == ".rs":
+            replaced = pattern_pattern.sub(replace_compiled, path.read_text())
+            path.write_text(replaced)
--- a/vm/sre_engine/src/constants.rs
+++ b/vm/sre_engine/src/constants.rs
@@ -0,0 +1,125 @@
+/*
+ * Secret Labs' Regular Expression Engine
+ *
+ * regular expression matching engine
+ *
+ * NOTE: This file is generated by sre_constants.py.  If you need
+ * to change anything in here, edit sre_constants.py and run it.
+ *
+ * Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
+ *
+ * See the _sre.c file for information on usage and redistribution.
+ */
+
+use bitflags::bitflags;
+
+pub const SRE_MAGIC: usize = 20221023;
+#[derive(num_enum::TryFromPrimitive, Debug, PartialEq, Eq)]
+#[repr(u32)]
+#[allow(non_camel_case_types, clippy::upper_case_acronyms)]
+pub enum SreOpcode {
+    FAILURE = 0,
+    SUCCESS = 1,
+    ANY = 2,
+    ANY_ALL = 3,
+    ASSERT = 4,
+    ASSERT_NOT = 5,
+    AT = 6,
+    BRANCH = 7,
+    CATEGORY = 8,
+    CHARSET = 9,
+    BIGCHARSET = 10,
+    GROUPREF = 11,
+    GROUPREF_EXISTS = 12,
+    IN = 13,
+    INFO = 14,
+    JUMP = 15,
+    LITERAL = 16,
+    MARK = 17,
+    MAX_UNTIL = 18,
+    MIN_UNTIL = 19,
+    NOT_LITERAL = 20,
+    NEGATE = 21,
+    RANGE = 22,
+    REPEAT = 23,
+    REPEAT_ONE = 24,
+    SUBPATTERN = 25,
+    MIN_REPEAT_ONE = 26,
+    ATOMIC_GROUP = 27,
+    POSSESSIVE_REPEAT = 28,
+    POSSESSIVE_REPEAT_ONE = 29,
+    GROUPREF_IGNORE = 30,
+    IN_IGNORE = 31,
+    LITERAL_IGNORE = 32,
+    NOT_LITERAL_IGNORE = 33,
+    GROUPREF_LOC_IGNORE = 34,
+    IN_LOC_IGNORE = 35,
+    LITERAL_LOC_IGNORE = 36,
+    NOT_LITERAL_LOC_IGNORE = 37,
+    GROUPREF_UNI_IGNORE = 38,
+    IN_UNI_IGNORE = 39,
+    LITERAL_UNI_IGNORE = 40,
+    NOT_LITERAL_UNI_IGNORE = 41,
+    RANGE_UNI_IGNORE = 42,
+}
+#[derive(num_enum::TryFromPrimitive, Debug, PartialEq, Eq)]
+#[repr(u32)]
+#[allow(non_camel_case_types, clippy::upper_case_acronyms)]
+pub enum SreAtCode {
+    BEGINNING = 0,
+    BEGINNING_LINE = 1,
+    BEGINNING_STRING = 2,
+    BOUNDARY = 3,
+    NON_BOUNDARY = 4,
+    END = 5,
+    END_LINE = 6,
+    END_STRING = 7,
+    LOC_BOUNDARY = 8,
+    LOC_NON_BOUNDARY = 9,
+    UNI_BOUNDARY = 10,
+    UNI_NON_BOUNDARY = 11,
+}
+#[derive(num_enum::TryFromPrimitive, Debug)]
+#[repr(u32)]
+#[allow(non_camel_case_types, clippy::upper_case_acronyms)]
+pub enum SreCatCode {
+    DIGIT = 0,
+    NOT_DIGIT = 1,
+    SPACE = 2,
+    NOT_SPACE = 3,
+    WORD = 4,
+    NOT_WORD = 5,
+    LINEBREAK = 6,
+    NOT_LINEBREAK = 7,
+    LOC_WORD = 8,
+    LOC_NOT_WORD = 9,
+    UNI_DIGIT = 10,
+    UNI_NOT_DIGIT = 11,
+    UNI_SPACE = 12,
+    UNI_NOT_SPACE = 13,
+    UNI_WORD = 14,
+    UNI_NOT_WORD = 15,
+    UNI_LINEBREAK = 16,
+    UNI_NOT_LINEBREAK = 17,
+}
+bitflags! {
+#[derive(Debug, PartialEq, Eq, Clone, Copy)]
+    pub struct SreFlag: u16 {
+        const TEMPLATE = 1;
+        const IGNORECASE = 2;
+        const LOCALE = 4;
+        const MULTILINE = 8;
+        const DOTALL = 16;
+        const UNICODE = 32;
+        const VERBOSE = 64;
+        const DEBUG = 128;
+        const ASCII = 256;
+    }
+}
+bitflags! {
+    pub struct SreInfo: u32 {
+        const PREFIX = 1;
+        const LITERAL = 2;
+        const CHARSET = 4;
+    }
+}
--- a/vm/sre_engine/src/engine.rs
+++ b/vm/sre_engine/src/engine.rs
--- a/vm/sre_engine/src/lib.rs
+++ b/vm/sre_engine/src/lib.rs
@@ -0,0 +1,19 @@
+pub mod constants;
+pub mod engine;
+pub mod string;
+
+pub use constants::{SreAtCode, SreCatCode, SreFlag, SreInfo, SreOpcode, SRE_MAGIC};
+pub use engine::{Request, SearchIter, State};
+pub use string::{StrDrive, StringCursor};
+
+pub const CODESIZE: usize = 4;
+
+#[cfg(target_pointer_width = "32")]
+pub const MAXREPEAT: usize = usize::MAX - 1;
+#[cfg(target_pointer_width = "64")]
+pub const MAXREPEAT: usize = u32::MAX as usize;
+
+#[cfg(target_pointer_width = "32")]
+pub const MAXGROUPS: usize = MAXREPEAT / 4 / 2;
+#[cfg(target_pointer_width = "64")]
+pub const MAXGROUPS: usize = MAXREPEAT / 2;
--- a/vm/sre_engine/src/string.rs
+++ b/vm/sre_engine/src/string.rs
@@ -0,0 +1,398 @@
+/// A location inside a subject string, tracked as a raw pointer and a logical
+/// index at the same time.
+///
+/// The cursor does not borrow the string it points into. The [`StrDrive`]
+/// functions that move it dereference `ptr` inside `unsafe` blocks, so a
+/// cursor must only be used with the string it was created from.
+#[derive(Debug, Clone, Copy)]
+pub struct StringCursor {
+    /// Raw pointer to the byte at the current location (null for the
+    /// `Default` cursor).
+    pub(crate) ptr: *const u8,
+    /// Logical index of the current location: a byte index for `&[u8]`, a
+    /// code-point index for `&str`.
+    pub position: usize,
+}
+
+impl Default for StringCursor {
+    /// Returns an unattached cursor: null `ptr` and `position` 0.
+    ///
+    /// `<&str as StrDrive>::adjust_cursor` treats the null pointer as "no
+    /// cursor yet" and builds a fresh one instead of skipping forward.
+    fn default() -> Self {
+        Self {
+            ptr: std::ptr::null(),
+            position: 0,
+        }
+    }
+}
+
+/// Abstraction over the subject string types that can be walked with a
+/// [`StringCursor`] (`&[u8]` and `&str` in this module).
+///
+/// Positions are counted in the driver's own units: bytes for `&[u8]`, code
+/// points for `&str`. The cursor-moving functions perform no bounds checks;
+/// callers must keep the cursor inside the string it was created from.
+pub trait StrDrive: Copy {
+    /// Number of units (bytes or code points) in the string.
+    fn count(&self) -> usize;
+    /// Creates a cursor located at position `n`.
+    fn create_cursor(&self, n: usize) -> StringCursor;
+    /// Moves an existing `cursor` to position `n`.
+    fn adjust_cursor(&self, cursor: &mut StringCursor, n: usize);
+    /// Steps the cursor forward by one unit. The `&str` driver returns the
+    /// code point it stepped over.
+    fn advance(cursor: &mut StringCursor) -> u32;
+    /// Returns the unit at the cursor without moving it.
+    fn peek(cursor: &StringCursor) -> u32;
+    /// Moves the cursor forward by `n` units.
+    fn skip(cursor: &mut StringCursor, n: usize);
+    /// Steps the cursor backward by one unit and returns the unit stepped
+    /// over.
+    fn back_advance(cursor: &mut StringCursor) -> u32;
+    /// Returns the unit immediately before the cursor without moving it.
+    fn back_peek(cursor: &StringCursor) -> u32;
+    /// Moves the cursor backward by `n` units.
+    fn back_skip(cursor: &mut StringCursor, n: usize);
+}
+
+/// Byte-string driver: one unit is one byte, so `position` is a byte index and
+/// `ptr` is always the slice start plus `position`.
+impl<'a> StrDrive for &'a [u8] {
+    /// Length of the slice in bytes.
+    #[inline]
+    fn count(&self) -> usize {
+        self.len()
+    }
+
+    /// Creates a cursor at byte index `n`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `n > self.len()` (slice indexing).
+    #[inline]
+    fn create_cursor(&self, n: usize) -> StringCursor {
+        StringCursor {
+            ptr: self[n..].as_ptr(),
+            position: n,
+        }
+    }
+
+    /// Re-seats `cursor` at byte index `n`, whatever its previous state.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `n > self.len()` (slice indexing).
+    #[inline]
+    fn adjust_cursor(&self, cursor: &mut StringCursor, n: usize) {
+        cursor.position = n;
+        cursor.ptr = self[n..].as_ptr();
+    }
+
+    /// Returns the byte at the cursor and then steps past it, matching the
+    /// `&str` driver, which returns the code point it consumed.
+    #[inline]
+    fn advance(cursor: &mut StringCursor) -> u32 {
+        // Read *before* moving. Dereferencing after the increment would return
+        // the following byte and, on the last byte of the slice, read one past
+        // the end.
+        // SAFETY: the caller guarantees the cursor is not at the end.
+        let byte = unsafe { *cursor.ptr as u32 };
+        cursor.position += 1;
+        // SAFETY: the new pointer is at most one past the end of the slice.
+        unsafe { cursor.ptr = cursor.ptr.add(1) };
+        byte
+    }
+
+    /// Returns the byte at the cursor without moving it.
+    #[inline]
+    fn peek(cursor: &StringCursor) -> u32 {
+        // SAFETY: the caller guarantees the cursor is not at the end.
+        unsafe { *cursor.ptr as u32 }
+    }
+
+    /// Moves the cursor forward by `n` bytes.
+    #[inline]
+    fn skip(cursor: &mut StringCursor, n: usize) {
+        cursor.position += n;
+        // SAFETY: the caller guarantees at least `n` bytes remain.
+        unsafe { cursor.ptr = cursor.ptr.add(n) };
+    }
+
+    /// Steps back one byte and returns the byte stepped over.
+    #[inline]
+    fn back_advance(cursor: &mut StringCursor) -> u32 {
+        cursor.position -= 1;
+        // SAFETY: the caller guarantees the cursor is not at the start.
+        unsafe { cursor.ptr = cursor.ptr.sub(1) };
+        // SAFETY: the pointer now addresses a byte inside the slice.
+        unsafe { *cursor.ptr as u32 }
+    }
+
+    /// Returns the byte immediately before the cursor without moving it.
+    #[inline]
+    fn back_peek(cursor: &StringCursor) -> u32 {
+        // SAFETY: the caller guarantees the cursor is not at the start.
+        unsafe { *cursor.ptr.offset(-1) as u32 }
+    }
+
+    /// Moves the cursor backward by `n` bytes.
+    #[inline]
+    fn back_skip(cursor: &mut StringCursor, n: usize) {
+        cursor.position -= n;
+        // SAFETY: the caller guarantees at least `n` bytes precede the cursor.
+        unsafe { cursor.ptr = cursor.ptr.sub(n) };
+    }
+}
+
+/// Text driver: one unit is one code point, so `position` counts code points
+/// while `ptr` moves by the encoded width of each one.
+impl StrDrive for &str {
+    /// Counts code points by walking the whole string.
+    #[inline]
+    fn count(&self) -> usize {
+        self.chars().count()
+    }
+
+    /// Builds a cursor at code point `n` by decoding forward from the start
+    /// of the string. No bounds check is performed on `n`.
+    #[inline]
+    fn create_cursor(&self, n: usize) -> StringCursor {
+        let mut cursor = StringCursor {
+            ptr: self.as_ptr(),
+            position: 0,
+        };
+        Self::skip(&mut cursor, n);
+        cursor
+    }
+
+    /// Moves `cursor` to code point `n`.
+    ///
+    /// A null (default) cursor, or a target behind the current position,
+    /// restarts from the beginning of the string; a target ahead of the
+    /// cursor is reached by skipping forward; an equal target is a no-op.
+    #[inline]
+    fn adjust_cursor(&self, cursor: &mut StringCursor, n: usize) {
+        if cursor.ptr.is_null() || cursor.position > n {
+            *cursor = Self::create_cursor(&self, n);
+        } else if cursor.position < n {
+            Self::skip(cursor, n - cursor.position);
+        }
+    }
+
+    /// Decodes the code point at the cursor, steps past it and returns it.
+    #[inline]
+    fn advance(cursor: &mut StringCursor) -> u32 {
+        cursor.position += 1;
+        unsafe { next_code_point(&mut cursor.ptr) }
+    }
+
+    /// Decodes the code point at the cursor without moving it (the decode
+    /// runs on a copy of the pointer).
+    #[inline]
+    fn peek(cursor: &StringCursor) -> u32 {
+        let mut ptr = cursor.ptr;
+        unsafe { next_code_point(&mut ptr) }
+    }
+
+    /// Moves forward by `n` code points, decoding each one to find its width.
+    #[inline]
+    fn skip(cursor: &mut StringCursor, n: usize) {
+        cursor.position += n;
+        for _ in 0..n {
+            unsafe { next_code_point(&mut cursor.ptr) };
+        }
+    }
+
+    /// Steps back over the code point before the cursor and returns it.
+    #[inline]
+    fn back_advance(cursor: &mut StringCursor) -> u32 {
+        cursor.position -= 1;
+        unsafe { next_code_point_reverse(&mut cursor.ptr) }
+    }
+
+    /// Decodes the code point before the cursor without moving it (the decode
+    /// runs on a copy of the pointer).
+    #[inline]
+    fn back_peek(cursor: &StringCursor) -> u32 {
+        let mut ptr = cursor.ptr;
+        unsafe { next_code_point_reverse(&mut ptr) }
+    }
+
+    /// Moves backward by `n` code points, decoding each one to find its width.
+    #[inline]
+    fn back_skip(cursor: &mut StringCursor, n: usize) {
+        cursor.position -= n;
+        for _ in 0..n {
+            unsafe { next_code_point_reverse(&mut cursor.ptr) };
+        }
+    }
+}
+
+/// Decodes the code point that starts at `*ptr` (assuming a UTF-8-like
+/// encoding) and advances `*ptr` past it.
+///
+/// # Safety
+///
+/// `*ptr` must point at the first byte of a complete code point inside a
+/// valid UTF-8-like (UTF-8 or WTF-8) string
+#[inline]
+unsafe fn next_code_point(ptr: &mut *const u8) -> u32 {
+    // Decode UTF-8
+    let x = **ptr;
+    *ptr = ptr.offset(1);
+
+    if x < 128 {
+        return x as u32;
+    }
+
+    // Multibyte case follows
+    // Decode from a byte combination out of: [[[x y] z] w]
+    // NOTE: Performance is sensitive to the exact formulation here
+    let init = utf8_first_byte(x, 2);
+    // SAFETY: `ptr` is inside a UTF-8-like string,
+    // so a continuation byte must follow the lead byte.
+    let y = **ptr;
+    *ptr = ptr.offset(1);
+    let mut ch = utf8_acc_cont_byte(init, y);
+    if x >= 0xE0 {
+        // [[x y z] w] case
+        // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
+        // SAFETY: `ptr` is inside a UTF-8-like string,
+        // so a second continuation byte must be present.
+        let z = **ptr;
+        *ptr = ptr.offset(1);
+        let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z);
+        ch = init << 12 | y_z;
+        if x >= 0xF0 {
+            // [x y z w] case
+            // use only the lower 3 bits of `init`
+            // SAFETY: `ptr` is inside a UTF-8-like string,
+            // so a third continuation byte must be present.
+            let w = **ptr;
+            *ptr = ptr.offset(1);
+            ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w);
+        }
+    }
+
+    ch
+}
+
+/// Decodes the code point that ends just before `*ptr` (assuming a
+/// UTF-8-like encoding) and moves `*ptr` back to its first byte.
+///
+/// # Safety
+///
+/// `*ptr` must sit on a code-point boundary, with at least one complete code
+/// point before it, inside a valid UTF-8-like (UTF-8 or WTF-8) string
+#[inline]
+unsafe fn next_code_point_reverse(ptr: &mut *const u8) -> u32 {
+    // Decode UTF-8
+    *ptr = ptr.offset(-1);
+    let w = match **ptr {
+        next_byte if next_byte < 128 => return next_byte as u32,
+        back_byte => back_byte,
+    };
+
+    // Multibyte case follows
+    // Decode from a byte combination out of: [x [y [z w]]]
+    let mut ch;
+    // SAFETY: `ptr` is inside a UTF-8-like string,
+    // so a non-ASCII byte must be preceded by another byte.
+    *ptr = ptr.offset(-1);
+    let z = **ptr;
+    ch = utf8_first_byte(z, 2);
+    if utf8_is_cont_byte(z) {
+        // SAFETY: `ptr` is inside a UTF-8-like string,
+        // so a continuation byte must be preceded by another byte.
+        *ptr = ptr.offset(-1);
+        let y = **ptr;
+        ch = utf8_first_byte(y, 3);
+        if utf8_is_cont_byte(y) {
+            // SAFETY: `ptr` is inside a UTF-8-like string,
+            // so a continuation byte must be preceded by another byte.
+            *ptr = ptr.offset(-1);
+            let x = **ptr;
+            ch = utf8_first_byte(x, 4);
+            ch = utf8_acc_cont_byte(ch, y);
+        }
+        ch = utf8_acc_cont_byte(ch, z);
+    }
+    ch = utf8_acc_cont_byte(ch, w);
+
+    ch
+}
+
+/// Returns the initial codepoint accumulator for the first byte.
+/// The first byte is special, only want bottom 5 bits for width 2, 4 bits
+/// for width 3, and 3 bits for width 4.
+#[inline]
+const fn utf8_first_byte(byte: u8, width: u32) -> u32 {
+    (byte & (0x7F >> width)) as u32
+}
+
+/// Returns the value of `ch` updated with continuation byte `byte`.
+#[inline]
+const fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
+    (ch << 6) | (byte & CONT_MASK) as u32
+}
+
+/// Checks whether the byte is a UTF-8 continuation byte (i.e., starts with the
+/// bits `10`).
+#[inline]
+const fn utf8_is_cont_byte(byte: u8) -> bool {
+    (byte as i8) < -64
+}
+
+/// Mask of the value bits of a continuation byte.
+const CONT_MASK: u8 = 0b0011_1111;
+
+/// Returns `true` for the six ASCII whitespace bytes: tab, line feed, vertical
+/// tab (`0x0B`), form feed (`0x0C`), carriage return and space.
+const fn is_py_ascii_whitespace(b: u8) -> bool {
+    matches!(b, b'\t' | b'\n' | b'\x0C' | b'\r' | b' ' | b'\x0B')
+}
+
+/// Returns `true` if `ch` is `_` or an ASCII letter or digit.
+#[inline]
+pub(crate) fn is_word(ch: u32) -> bool {
+    ch == '_' as u32
+        || u8::try_from(ch)
+            .map(|x| x.is_ascii_alphanumeric())
+            .unwrap_or(false)
+}
+/// Returns `true` if `ch` fits in a byte and is one of the ASCII whitespace
+/// characters accepted by `is_py_ascii_whitespace`.
+#[inline]
+pub(crate) fn is_space(ch: u32) -> bool {
+    u8::try_from(ch)
+        .map(is_py_ascii_whitespace)
+        .unwrap_or(false)
+}
+/// Returns `true` if `ch` is an ASCII decimal digit (`0`-`9`).
+#[inline]
+pub(crate) fn is_digit(ch: u32) -> bool {
+    u8::try_from(ch)
+        .map(|x| x.is_ascii_digit())
+        .unwrap_or(false)
+}
+/// Locale-mode alphanumeric test. Currently identical to the ASCII test: the
+/// process locale is not consulted.
+#[inline]
+pub(crate) fn is_loc_alnum(ch: u32) -> bool {
+    // FIXME: Ignore the locales
+    u8::try_from(ch)
+        .map(|x| x.is_ascii_alphanumeric())
+        .unwrap_or(false)
+}
+/// Locale-mode word-character test: `_` or anything `is_loc_alnum` accepts.
+#[inline]
+pub(crate) fn is_loc_word(ch: u32) -> bool {
+    ch == '_' as u32 || is_loc_alnum(ch)
+}
+/// Returns `true` only for line feed (`\n`).
+#[inline]
+pub(crate) fn is_linebreak(ch: u32) -> bool {
+    ch == '\n' as u32
+}
+/// Lowercases ASCII letters; any other value (including everything above
+/// `0xFF`) is returned unchanged.
+#[inline]
+pub fn lower_ascii(ch: u32) -> u32 {
+    u8::try_from(ch)
+        .map(|x| x.to_ascii_lowercase() as u32)
+        .unwrap_or(ch)
+}
+/// Locale-mode lowercase. Currently just delegates to `lower_ascii`: the
+/// process locale is not consulted.
+#[inline]
+pub(crate) fn lower_locate(ch: u32) -> u32 {
+    // FIXME: Ignore the locales
+    lower_ascii(ch)
+}
+/// Locale-mode uppercase. Currently uppercases ASCII letters only and returns
+/// every other value unchanged: the process locale is not consulted.
+#[inline]
+pub(crate) fn upper_locate(ch: u32) -> u32 {
+    // FIXME: Ignore the locales
+    u8::try_from(ch)
+        .map(|x| x.to_ascii_uppercase() as u32)
+        .unwrap_or(ch)
+}
+/// Unicode-mode digit test.
+///
+/// NOTE(review): despite the name this only accepts ASCII `0`-`9`
+/// (`is_ascii_digit`); non-ASCII decimal digits are rejected.
+#[inline]
+pub(crate) fn is_uni_digit(ch: u32) -> bool {
+    // TODO: check with cpython
+    char::try_from(ch)
+        .map(|x| x.is_ascii_digit())
+        .unwrap_or(false)
+}
+/// Unicode-mode whitespace test: everything `is_space` accepts plus the code
+/// points listed below.
+///
+/// The entries `0x0009`-`0x000D` and `0x0020` repeat what `is_space` already
+/// covers; they are harmless duplicates.
+#[inline]
+pub(crate) fn is_uni_space(ch: u32) -> bool {
+    // TODO: check with cpython
+    is_space(ch)
+        || matches!(
+            ch,
+            0x0009
+                | 0x000A
+                | 0x000B
+                | 0x000C
+                | 0x000D
+                | 0x001C
+                | 0x001D
+                | 0x001E
+                | 0x001F
+                | 0x0020
+                | 0x0085
+                | 0x00A0
+                | 0x1680
+                | 0x2000
+                | 0x2001
+                | 0x2002
+                | 0x2003
+                | 0x2004
+                | 0x2005
+                | 0x2006
+                | 0x2007
+                | 0x2008
+                | 0x2009
+                | 0x200A
+                | 0x2028
+                | 0x2029
+                | 0x202F
+                | 0x205F
+                | 0x3000
+        )
+}
+/// Unicode-mode line-break test: `true` for exactly the ten code points
+/// listed in the pattern below.
+#[inline]
+pub(crate) fn is_uni_linebreak(ch: u32) -> bool {
+    matches!(
+        ch,
+        0x000A | 0x000B | 0x000C | 0x000D | 0x001C | 0x001D | 0x001E | 0x0085 | 0x2028 | 0x2029
+    )
+}
+/// Unicode-mode alphanumeric test, delegating to `char::is_alphanumeric`.
+/// Values that are not valid `char`s yield `false`.
+#[inline]
+pub(crate) fn is_uni_alnum(ch: u32) -> bool {
+    // TODO: check with cpython
+    char::try_from(ch)
+        .map(|x| x.is_alphanumeric())
+        .unwrap_or(false)
+}
+/// Unicode-mode word-character test: `_` or anything `is_uni_alnum` accepts.
+#[inline]
+pub(crate) fn is_uni_word(ch: u32) -> bool {
+    ch == '_' as u32 || is_uni_alnum(ch)
+}
+/// Returns the first code point of `ch`'s Unicode lowercase mapping (any
+/// further code points of a multi-character mapping are dropped). Values that
+/// are not valid `char`s are returned unchanged.
+#[inline]
+pub fn lower_unicode(ch: u32) -> u32 {
+    // TODO: check with cpython
+    char::try_from(ch)
+        .map(|x| x.to_lowercase().next().unwrap() as u32)
+        .unwrap_or(ch)
+}
+/// Returns the first code point of `ch`'s Unicode uppercase mapping (any
+/// further code points of a multi-character mapping are dropped). Values that
+/// are not valid `char`s are returned unchanged.
+#[inline]
+pub fn upper_unicode(ch: u32) -> u32 {
+    // TODO: check with cpython
+    char::try_from(ch)
+        .map(|x| x.to_uppercase().next().unwrap() as u32)
+        .unwrap_or(ch)
+}
--- a/vm/sre_engine/tests/tests.rs
+++ b/vm/sre_engine/tests/tests.rs
@@ -0,0 +1,181 @@
+use sre_engine::{Request, State, StrDrive};
+
+/// Compiled SRE program for one test pattern, as emitted between the
+/// `START GENERATED` / `END GENERATED` markers by `generate_tests.py`.
+struct Pattern {
+    code: &'static [u32],
+}
+
+impl Pattern {
+    /// Builds a fresh request/state pair for `string` using the open-ended
+    /// range `0..usize::MAX`.
+    fn state<'a, S: StrDrive>(&self, string: S) -> (Request<'a, S>, State) {
+        // NOTE(review): the trailing `false` is presumably the initial
+        // `match_all` flag (tests set `req.match_all = true` afterwards) --
+        // confirm against `Request::new` in engine.rs.
+        let req = Request::new(string, 0, usize::MAX, self.code, false);
+        let state = State::default();
+        (req, state)
+    }
+}
+
+#[test]
+fn test_2427() {
+    // pattern lookbehind = re.compile(r'(?<!\.)x\b')
+    // START GENERATED by generate_tests.py
+    #[rustfmt::skip] let lookbehind = Pattern { code: &[14, 4, 0, 1, 1, 5, 5, 1, 16, 46, 1, 16, 120, 6, 10, 1] };
+    // END GENERATED
+    let (req, mut state) = lookbehind.state("x");
+    assert!(state.pymatch(&req));
+}
+
+#[test]
+fn test_assert() {
+    // pattern positive_lookbehind = re.compile(r'(?<=abc)def')
+    // START GENERATED by generate_tests.py
+    #[rustfmt::skip] let positive_lookbehind = Pattern { code: &[14, 4, 0, 3, 3, 4, 9, 3, 16, 97, 16, 98, 16, 99, 1, 16, 100, 16, 101, 16, 102, 1] };
+    // END GENERATED
+    let (req, mut state) = positive_lookbehind.state("abcdef");
+    assert!(state.search(req));
+}
+
+#[test]
+fn test_string_boundaries() {
+    // pattern big_b = re.compile(r'\B')
+    // START GENERATED by generate_tests.py
+    #[rustfmt::skip] let big_b = Pattern { code: &[14, 4, 0, 0, 0, 6, 11, 1] };
+    // END GENERATED
+    let (req, mut state) = big_b.state("");
+    assert!(!state.search(req));
+}
+
+#[test]
+fn test_zerowidth() {
+    // pattern p = re.compile(r'\b|:+')
+    // START GENERATED by generate_tests.py
+    #[rustfmt::skip] let p = Pattern { code: &[14, 4, 0, 0, 4294967295, 7, 5, 6, 10, 15, 12, 10, 24, 6, 1, 4294967295, 16, 58, 1, 15, 2, 0, 1] };
+    // END GENERATED
+    let (mut req, mut state) = p.state("a:");
+    req.must_advance = true;
+    assert!(state.search(req));
+    assert_eq!(state.cursor.position, 1);
+}
+
+#[test]
+fn test_repeat_context_panic() {
+    use optional::Optioned;
+    // pattern p = re.compile(r'(?:a*?(xx)??z)*')
+    // START GENERATED by generate_tests.py
+    #[rustfmt::skip] let p = Pattern { code: &[14, 4, 0, 0, 4294967295, 23, 25, 0, 4294967295, 26, 6, 0, 4294967295, 16, 97, 1, 23, 11, 0, 1, 17, 0, 16, 120, 16, 120, 17, 1, 19, 16, 122, 18, 1] };
+    // END GENERATED
+    let (req, mut state) = p.state("axxzaz");
+    assert!(state.pymatch(&req));
+    assert_eq!(
+        *state.marks.raw(),
+        vec![Optioned::some(1), Optioned::some(3)]
+    );
+}
+
+#[test]
+fn test_double_max_until() {
+    // pattern p = re.compile(r'((1)?)*')
+    // START GENERATED by generate_tests.py
+    #[rustfmt::skip] let p = Pattern { code: &[14, 4, 0, 0, 4294967295, 23, 18, 0, 4294967295, 17, 0, 23, 9, 0, 1, 17, 2, 16, 49, 17, 3, 18, 17, 1, 18, 1] };
+    // END GENERATED
+    let (req, mut state) = p.state("1111");
+    assert!(state.pymatch(&req));
+    assert_eq!(state.cursor.position, 4);
+}
+
+#[test]
+fn test_info_single() {
+    // pattern p = re.compile(r'aa*')
+    // START GENERATED by generate_tests.py
+    #[rustfmt::skip] let p = Pattern { code: &[14, 8, 1, 1, 4294967295, 1, 1, 97, 0, 16, 97, 24, 6, 0, 4294967295, 16, 97, 1, 1] };
+    // END GENERATED
+    let (req, mut state) = p.state("baaaa");
+    assert!(state.search(req));
+    assert_eq!(state.start, 1);
+    assert_eq!(state.cursor.position, 5);
+}
+
+#[test]
+fn test_info_single2() {
+    // pattern p = re.compile(r'Python|Perl')
+    // START GENERATED by generate_tests.py
+    #[rustfmt::skip] let p = Pattern { code: &[14, 8, 1, 4, 6, 1, 1, 80, 0, 16, 80, 7, 13, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 15, 11, 9, 16, 101, 16, 114, 16, 108, 15, 2, 0, 1] };
+    // END GENERATED
+    let (req, mut state) = p.state("Perl");
+    assert!(state.search(req));
+}
+
+#[test]
+fn test_info_literal() {
+    // pattern p = re.compile(r'ababc+')
+    // START GENERATED by generate_tests.py
+    #[rustfmt::skip] let p = Pattern { code: &[14, 14, 1, 5, 4294967295, 4, 4, 97, 98, 97, 98, 0, 0, 1, 2, 16, 97, 16, 98, 16, 97, 16, 98, 24, 6, 1, 4294967295, 16, 99, 1, 1] };
+    // END GENERATED
+    let (req, mut state) = p.state("!ababc");
+    assert!(state.search(req));
+}
+
+#[test]
+fn test_info_literal2() {
+    // pattern p = re.compile(r'(python)\1')
+    // START GENERATED by generate_tests.py
+    #[rustfmt::skip] let p = Pattern { code: &[14, 18, 1, 12, 12, 6, 0, 112, 121, 116, 104, 111, 110, 0, 0, 0, 0, 0, 0, 17, 0, 16, 112, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 17, 1, 11, 0, 1] };
+    // END GENERATED
+    let (req, mut state) = p.state("pythonpython");
+    assert!(state.search(req));
+}
+
+#[test]
+fn test_repeat_in_assertions() {
+    // pattern p = re.compile('^([ab]*?)(?=(b)?)c', re.IGNORECASE)
+    // START GENERATED by generate_tests.py
+    #[rustfmt::skip] let p = Pattern { code: &[14, 4, 0, 1, 4294967295, 6, 0, 17, 0, 26, 10, 0, 4294967295, 39, 5, 22, 97, 98, 0, 1, 17, 1, 4, 14, 0, 23, 9, 0, 1, 17, 2, 40, 98, 17, 3, 18, 1, 40, 99, 1] };
+    // END GENERATED
+    let (req, mut state) = p.state("abc");
+    assert!(state.search(req));
+}
+
+#[test]
+fn test_possessive_quantifier() {
+    // pattern p = re.compile('e++a')
+    // START GENERATED by generate_tests.py
+    #[rustfmt::skip] let p = Pattern { code: &[14, 4, 0, 2, 4294967295, 29, 6, 1, 4294967295, 16, 101, 1, 16, 97, 1] };
+    // END GENERATED
+    let (req, mut state) = p.state("eeea");
+    assert!(state.pymatch(&req));
+}
+
+#[test]
+fn test_possessive_atomic_group() {
+    // pattern p = re.compile('(?>x)++x')
+    // START GENERATED by generate_tests.py
+    #[rustfmt::skip] let p = Pattern { code: &[14, 4, 0, 2, 4294967295, 28, 8, 1, 4294967295, 27, 4, 16, 120, 1, 1, 16, 120, 1] };
+    // END GENERATED
+    let (req, mut state) = p.state("xxx");
+    assert!(!state.pymatch(&req));
+}
+
+#[test]
+fn test_bug_20998() {
+    // pattern p = re.compile('[a-c]+', re.I)
+    // START GENERATED by generate_tests.py
+    #[rustfmt::skip] let p = Pattern { code: &[14, 4, 0, 1, 4294967295, 24, 10, 1, 4294967295, 39, 5, 22, 97, 99, 0, 1, 1] };
+    // END GENERATED
+    let (mut req, mut state) = p.state("ABC");
+    req.match_all = true;
+    assert!(state.pymatch(&req));
+    assert_eq!(state.cursor.position, 3);
+}
+
+#[test]
+fn test_bigcharset() {
+    // pattern p = re.compile('[a-z]*', re.I)
+    // START GENERATED by generate_tests.py
+    #[rustfmt::skip] let p = Pattern { code: &[14, 4, 0, 0, 4294967295, 24, 97, 0, 4294967295, 39, 92, 10, 3, 33685760, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 0, 0, 0, 134217726, 0, 0, 0, 0, 0, 131072, 0, 2147483648, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1] };
+    // END GENERATED
+    let (req, mut state) = p.state("x ");
+    assert!(state.pymatch(&req));
+    assert_eq!(state.cursor.position, 1);
+}
+
+#[test]
+fn test_search_nonascii() {
+    // pattern p = re.compile('\xe0+')
+}