Improve: use StringCursor to replace index-based position

2026-06-02 19:39:49 +09:00 · 2024-01-13 16:03:38 +02:00
parent 118a00c012
commit c93ea30b3b
6 changed files with 562 additions and 380 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "sre-engine"
-version = "0.5.0"
+version = "0.6.0"
 authors = ["Kangzhi Shi <shikangzhi@gmail.com>", "RustPython Team"]
 description = "A low-level implementation of Python's SRE regex engine"
 repository = "https://github.com/RustPython/sre-engine"
--- a/benches/benches.rs
+++ b/benches/benches.rs
@@ -3,24 +3,24 @@
 extern crate test;
 use test::Bencher;

-use sre_engine::engine;
+use sre_engine::{Request, State, StrDrive};

 struct Pattern {
    code: &'static [u32],
 }

 impl Pattern {
-    fn state<'a, S: engine::StrDrive>(&self, string: S) -> (engine::Request<'a, S>, engine::State) {
+    fn state<'a, S: StrDrive>(&self, string: S) -> (Request<'a, S>, State) {
        self.state_range(string, 0..usize::MAX)
    }

-    fn state_range<'a, S: engine::StrDrive>(
+    fn state_range<'a, S: StrDrive>(
        &self,
        string: S,
        range: std::ops::Range<usize>,
-    ) -> (engine::Request<'a, S>, engine::State) {
-        let req = engine::Request::new(string, range.start, range.end, self.code, false);
-        let state = engine::State::default();
+    ) -> (Request<'a, S>, State) {
+        let req = Request::new(string, range.start, range.end, self.code, false);
+        let state = State::default();
        (req, state)
    }
 }
--- a/src/engine.rs
+++ b/src/engine.rs
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,5 +1,10 @@
 pub mod constants;
 pub mod engine;
+pub mod string;
+
+pub use constants::{SreAtCode, SreCatCode, SreFlag, SreInfo, SreOpcode, SRE_MAGIC};
+pub use engine::{Request, SearchIter, State};
+pub use string::{StrDrive, StringCursor};

 pub const CODESIZE: usize = 4;

--- a/src/string.rs
+++ b/src/string.rs
@@ -0,0 +1,381 @@
+#[derive(Debug, Clone, Copy)]
+pub struct StringCursor {
+    pub(crate) ptr: *const u8,
+    pub position: usize,
+}
+
+impl Default for StringCursor {
+    fn default() -> Self {
+        Self {
+            ptr: std::ptr::null(),
+            position: 0,
+        }
+    }
+}
+
+pub trait StrDrive: Copy {
+    fn count(&self) -> usize;
+    fn create_cursor(&self, n: usize) -> StringCursor;
+    fn advance(cursor: &mut StringCursor) -> u32;
+    fn peek(cursor: &StringCursor) -> u32;
+    fn skip(cursor: &mut StringCursor, n: usize);
+    fn back_advance(cursor: &mut StringCursor) -> u32;
+    fn back_peek(cursor: &StringCursor) -> u32;
+    fn back_skip(cursor: &mut StringCursor, n: usize);
+}
+
+impl<'a> StrDrive for &'a [u8] {
+    #[inline]
+    fn count(&self) -> usize {
+        self.len()
+    }
+
+    #[inline]
+    fn create_cursor(&self, n: usize) -> StringCursor {
+        StringCursor {
+            ptr: self[n..].as_ptr(),
+            position: n,
+        }
+    }
+
+    #[inline]
+    fn advance(cursor: &mut StringCursor) -> u32 {
+        cursor.position += 1;
+        unsafe { cursor.ptr = cursor.ptr.add(1) };
+        unsafe { *cursor.ptr.sub(1) as u32 } // byte just consumed (as the `&str` impl returns); never reads past the end
+    }
+
+    #[inline]
+    fn peek(cursor: &StringCursor) -> u32 {
+        unsafe { *cursor.ptr as u32 }
+    }
+
+    #[inline]
+    fn skip(cursor: &mut StringCursor, n: usize) {
+        cursor.position += n;
+        unsafe { cursor.ptr = cursor.ptr.add(n) };
+    }
+
+    #[inline]
+    fn back_advance(cursor: &mut StringCursor) -> u32 {
+        cursor.position -= 1;
+        unsafe { cursor.ptr = cursor.ptr.sub(1) };
+        unsafe { *cursor.ptr as u32 }
+    }
+
+    #[inline]
+    fn back_peek(cursor: &StringCursor) -> u32 {
+        unsafe { *cursor.ptr.offset(-1) as u32 }
+    }
+
+    #[inline]
+    fn back_skip(cursor: &mut StringCursor, n: usize) {
+        cursor.position -= n;
+        unsafe { cursor.ptr = cursor.ptr.sub(n) };
+    }
+}
+
+impl StrDrive for &str {
+    #[inline]
+    fn count(&self) -> usize {
+        self.chars().count()
+    }
+
+    #[inline]
+    fn create_cursor(&self, n: usize) -> StringCursor {
+        let mut ptr = self.as_ptr();
+        for _ in 0..n {
+            unsafe { next_code_point(&mut ptr) };
+        }
+        StringCursor { ptr, position: n }
+    }
+
+    #[inline]
+    fn advance(cursor: &mut StringCursor) -> u32 {
+        cursor.position += 1;
+        unsafe { next_code_point(&mut cursor.ptr) }
+    }
+
+    #[inline]
+    fn peek(cursor: &StringCursor) -> u32 {
+        let mut ptr = cursor.ptr;
+        unsafe { next_code_point(&mut ptr) }
+    }
+
+    #[inline]
+    fn skip(cursor: &mut StringCursor, n: usize) {
+        cursor.position += n;
+        for _ in 0..n {
+            unsafe { next_code_point(&mut cursor.ptr) };
+        }
+    }
+
+    #[inline]
+    fn back_advance(cursor: &mut StringCursor) -> u32 {
+        cursor.position -= 1;
+        unsafe { next_code_point_reverse(&mut cursor.ptr) }
+    }
+
+    #[inline]
+    fn back_peek(cursor: &StringCursor) -> u32 {
+        let mut ptr = cursor.ptr;
+        unsafe { next_code_point_reverse(&mut ptr) }
+    }
+
+    #[inline]
+    fn back_skip(cursor: &mut StringCursor, n: usize) {
+        cursor.position -= n;
+        for _ in 0..n {
+            unsafe { next_code_point_reverse(&mut cursor.ptr) };
+        }
+    }
+}
+
+/// Decodes the code point starting at `*ptr` (assuming a UTF-8-like
+/// encoding) and advances `*ptr` past it.
+///
+/// # Safety
+///
+/// `*ptr` must point at the start of a code point inside a valid UTF-8-like (UTF-8 or WTF-8) string
+#[inline]
+unsafe fn next_code_point(ptr: &mut *const u8) -> u32 {
+    // Decode UTF-8
+    let x = **ptr;
+    *ptr = ptr.offset(1);
+
+    if x < 128 {
+        return x as u32;
+    }
+
+    // Multibyte case follows
+    // Decode from a byte combination out of: [[[x y] z] w]
+    // NOTE: Performance is sensitive to the exact formulation here
+    let init = utf8_first_byte(x, 2);
+    // SAFETY: `ptr` points into a UTF-8-like string,
+    // so a continuation byte must follow here.
+    let y = **ptr;
+    *ptr = ptr.offset(1);
+    let mut ch = utf8_acc_cont_byte(init, y);
+    if x >= 0xE0 {
+        // [[x y z] w] case
+        // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
+        // SAFETY: `ptr` points into a UTF-8-like string,
+        // so a continuation byte must follow here.
+        let z = **ptr;
+        *ptr = ptr.offset(1);
+        let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z);
+        ch = init << 12 | y_z;
+        if x >= 0xF0 {
+            // [x y z w] case
+            // use only the lower 3 bits of `init`
+            // SAFETY: `ptr` points into a UTF-8-like string,
+            // so a continuation byte must follow here.
+            let w = **ptr;
+            *ptr = ptr.offset(1);
+            ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w);
+        }
+    }
+
+    ch
+}
+
+/// Decodes the code point ending just before `*ptr` (assuming a
+/// UTF-8-like encoding) and moves `*ptr` back to its first byte.
+///
+/// # Safety
+///
+/// `*ptr` must point just past a code point inside a valid UTF-8-like (UTF-8 or WTF-8) string
+#[inline]
+unsafe fn next_code_point_reverse(ptr: &mut *const u8) -> u32 {
+    // Decode UTF-8
+    *ptr = ptr.offset(-1);
+    let w = match **ptr {
+        next_byte if next_byte < 128 => return next_byte as u32,
+        back_byte => back_byte,
+    };
+
+    // Multibyte case follows
+    // Decode from a byte combination out of: [x [y [z w]]]
+    let mut ch;
+    // SAFETY: `ptr` points into a UTF-8-like string,
+    // so a preceding byte must exist here.
+    *ptr = ptr.offset(-1);
+    let z = **ptr;
+    ch = utf8_first_byte(z, 2);
+    if utf8_is_cont_byte(z) {
+        // SAFETY: `ptr` points into a UTF-8-like string,
+        // so a preceding byte must exist here.
+        *ptr = ptr.offset(-1);
+        let y = **ptr;
+        ch = utf8_first_byte(y, 3);
+        if utf8_is_cont_byte(y) {
+            // SAFETY: `ptr` points into a UTF-8-like string,
+            // so a preceding byte must exist here.
+            *ptr = ptr.offset(-1);
+            let x = **ptr;
+            ch = utf8_first_byte(x, 4);
+            ch = utf8_acc_cont_byte(ch, y);
+        }
+        ch = utf8_acc_cont_byte(ch, z);
+    }
+    ch = utf8_acc_cont_byte(ch, w);
+
+    ch
+}
+
+/// Returns the initial codepoint accumulator for the first byte.
+/// The first byte is special, only want bottom 5 bits for width 2, 4 bits
+/// for width 3, and 3 bits for width 4.
+#[inline]
+const fn utf8_first_byte(byte: u8, width: u32) -> u32 {
+    (byte & (0x7F >> width)) as u32
+}
+
+/// Returns the value of `ch` updated with continuation byte `byte`.
+#[inline]
+const fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
+    (ch << 6) | (byte & CONT_MASK) as u32
+}
+
+/// Checks whether the byte is a UTF-8 continuation byte (i.e., starts with the
+/// bits `10`).
+#[inline]
+const fn utf8_is_cont_byte(byte: u8) -> bool {
+    (byte as i8) < -64
+}
+
+/// Mask of the value bits of a continuation byte.
+const CONT_MASK: u8 = 0b0011_1111;
+
+const fn is_py_ascii_whitespace(b: u8) -> bool {
+    matches!(b, b'\t' | b'\n' | b'\x0C' | b'\r' | b' ' | b'\x0B')
+}
+
+#[inline]
+pub(crate) fn is_word(ch: u32) -> bool {
+    ch == '_' as u32
+        || u8::try_from(ch)
+            .map(|x| x.is_ascii_alphanumeric())
+            .unwrap_or(false)
+}
+#[inline]
+pub(crate) fn is_space(ch: u32) -> bool {
+    u8::try_from(ch)
+        .map(is_py_ascii_whitespace)
+        .unwrap_or(false)
+}
+#[inline]
+pub(crate) fn is_digit(ch: u32) -> bool {
+    u8::try_from(ch)
+        .map(|x| x.is_ascii_digit())
+        .unwrap_or(false)
+}
+#[inline]
+pub(crate) fn is_loc_alnum(ch: u32) -> bool {
+    // FIXME: the locale is ignored; only ASCII is handled
+    u8::try_from(ch)
+        .map(|x| x.is_ascii_alphanumeric())
+        .unwrap_or(false)
+}
+#[inline]
+pub(crate) fn is_loc_word(ch: u32) -> bool {
+    ch == '_' as u32 || is_loc_alnum(ch)
+}
+#[inline]
+pub(crate) fn is_linebreak(ch: u32) -> bool {
+    ch == '\n' as u32
+}
+#[inline]
+pub fn lower_ascii(ch: u32) -> u32 {
+    u8::try_from(ch)
+        .map(|x| x.to_ascii_lowercase() as u32)
+        .unwrap_or(ch)
+}
+#[inline]
+pub(crate) fn lower_locate(ch: u32) -> u32 {
+    // FIXME: the locale is ignored; only ASCII is handled
+    lower_ascii(ch)
+}
+#[inline]
+pub(crate) fn upper_locate(ch: u32) -> u32 {
+    // FIXME: the locale is ignored; only ASCII is handled
+    u8::try_from(ch)
+        .map(|x| x.to_ascii_uppercase() as u32)
+        .unwrap_or(ch)
+}
+#[inline]
+pub(crate) fn is_uni_digit(ch: u32) -> bool {
+    // TODO: check with cpython
+    char::try_from(ch)
+        .map(|x| x.is_ascii_digit())
+        .unwrap_or(false)
+}
+#[inline]
+pub(crate) fn is_uni_space(ch: u32) -> bool {
+    // TODO: check with cpython
+    is_space(ch)
+        || matches!(
+            ch,
+            0x0009
+                | 0x000A
+                | 0x000B
+                | 0x000C
+                | 0x000D
+                | 0x001C
+                | 0x001D
+                | 0x001E
+                | 0x001F
+                | 0x0020
+                | 0x0085
+                | 0x00A0
+                | 0x1680
+                | 0x2000
+                | 0x2001
+                | 0x2002
+                | 0x2003
+                | 0x2004
+                | 0x2005
+                | 0x2006
+                | 0x2007
+                | 0x2008
+                | 0x2009
+                | 0x200A
+                | 0x2028
+                | 0x2029
+                | 0x202F
+                | 0x205F
+                | 0x3000
+        )
+}
+#[inline]
+pub(crate) fn is_uni_linebreak(ch: u32) -> bool {
+    matches!(
+        ch,
+        0x000A | 0x000B | 0x000C | 0x000D | 0x001C | 0x001D | 0x001E | 0x0085 | 0x2028 | 0x2029
+    )
+}
+#[inline]
+pub(crate) fn is_uni_alnum(ch: u32) -> bool {
+    // TODO: check with cpython
+    char::try_from(ch)
+        .map(|x| x.is_alphanumeric())
+        .unwrap_or(false)
+}
+#[inline]
+pub(crate) fn is_uni_word(ch: u32) -> bool {
+    ch == '_' as u32 || is_uni_alnum(ch)
+}
+#[inline]
+pub fn lower_unicode(ch: u32) -> u32 {
+    // TODO: check with cpython
+    char::try_from(ch)
+        .map(|x| x.to_lowercase().next().unwrap() as u32)
+        .unwrap_or(ch)
+}
+#[inline]
+pub fn upper_unicode(ch: u32) -> u32 {
+    // TODO: check with cpython
+    char::try_from(ch)
+        .map(|x| x.to_uppercase().next().unwrap() as u32)
+        .unwrap_or(ch)
+}
--- a/tests/tests.rs
+++ b/tests/tests.rs
@@ -1,16 +1,13 @@
-use sre_engine::engine;
+use sre_engine::{Request, State, StrDrive};

 struct Pattern {
    code: &'static [u32],
 }

 impl Pattern {
-    fn state<'a, S: engine::StrDrive>(
-        &self,
-        string: S,
-    ) -> (engine::Request<'a, S>, engine::State) {
-        let req = engine::Request::new(string, 0, usize::MAX, self.code, false);
-        let state = engine::State::default();
+    fn state<'a, S: StrDrive>(&self, string: S) -> (Request<'a, S>, State) {
+        let req = Request::new(string, 0, usize::MAX, self.code, false);
+        let state = State::default();
        (req, state)
    }
 }
@@ -54,7 +51,7 @@ fn test_zerowidth() {
    let (mut req, mut state) = p.state("a:");
    req.must_advance = true;
    assert!(state.search(req));
-    assert_eq!(state.string_position, 1);
+    assert_eq!(state.cursor.position, 1);
 }

 #[test]
@@ -66,7 +63,10 @@ fn test_repeat_context_panic() {
    // END GENERATED
    let (req, mut state) = p.state("axxzaz");
    assert!(state.pymatch(&req));
-    assert_eq!(*state.marks.raw(), vec![Optioned::some(1), Optioned::some(3)]);
+    assert_eq!(
+        *state.marks.raw(),
+        vec![Optioned::some(1), Optioned::some(3)]
+    );
 }

 #[test]
@@ -77,7 +77,7 @@ fn test_double_max_until() {
    // END GENERATED
    let (req, mut state) = p.state("1111");
    assert!(state.pymatch(&req));
-    assert_eq!(state.string_position, 4);
+    assert_eq!(state.cursor.position, 4);
 }

 #[test]
@@ -89,7 +89,7 @@ fn test_info_single() {
    let (req, mut state) = p.state("baaaa");
    assert!(state.search(req));
    assert_eq!(state.start, 1);
-    assert_eq!(state.string_position, 5);
+    assert_eq!(state.cursor.position, 5);
 }

 #[test]
@@ -161,7 +161,7 @@ fn test_bug_20998() {
    let (mut req, mut state) = p.state("ABC");
    req.match_all = true;
    assert!(state.pymatch(&req));
-    assert_eq!(state.string_position, 3);
+    assert_eq!(state.cursor.position, 3);
 }

 #[test]
@@ -172,5 +172,10 @@ fn test_bigcharset() {
    // END GENERATED
    let (req, mut state) = p.state("x ");
    assert!(state.pymatch(&req));
-    assert_eq!(state.string_position, 1);
+    assert_eq!(state.cursor.position, 1);
+}
+
+#[test]
+fn test_search_nonascii() {
+    // pattern p = re.compile('\xe0+')
 }