Improvement: use StringCursor to replace the index-based position

This commit is contained in:
Kangzhi Shi
2024-01-13 16:03:38 +02:00
committed by Steve Shi
parent 118a00c012
commit c93ea30b3b
6 changed files with 562 additions and 380 deletions

View File

@@ -1,6 +1,6 @@
[package]
name = "sre-engine"
version = "0.5.0"
version = "0.6.0"
authors = ["Kangzhi Shi <shikangzhi@gmail.com>", "RustPython Team"]
description = "A low-level implementation of Python's SRE regex engine"
repository = "https://github.com/RustPython/sre-engine"

View File

@@ -3,24 +3,24 @@
extern crate test;
use test::Bencher;
use sre_engine::engine;
use sre_engine::{Request, State, StrDrive};
struct Pattern {
code: &'static [u32],
}
impl Pattern {
fn state<'a, S: engine::StrDrive>(&self, string: S) -> (engine::Request<'a, S>, engine::State) {
fn state<'a, S: StrDrive>(&self, string: S) -> (Request<'a, S>, State) {
self.state_range(string, 0..usize::MAX)
}
fn state_range<'a, S: engine::StrDrive>(
fn state_range<'a, S: StrDrive>(
&self,
string: S,
range: std::ops::Range<usize>,
) -> (engine::Request<'a, S>, engine::State) {
let req = engine::Request::new(string, range.start, range.end, self.code, false);
let state = engine::State::default();
) -> (Request<'a, S>, State) {
let req = Request::new(string, range.start, range.end, self.code, false);
let state = State::default();
(req, state)
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,5 +1,10 @@
pub mod constants;
pub mod engine;
pub mod string;
pub use constants::{SreAtCode, SreCatCode, SreFlag, SreInfo, SreOpcode, SRE_MAGIC};
pub use engine::{Request, SearchIter, State};
pub use string::{StrDrive, StringCursor};
pub const CODESIZE: usize = 4;

381
src/string.rs Normal file
View File

@@ -0,0 +1,381 @@
/// A lightweight cursor into a subject string: a raw byte pointer paired with
/// the logical position that pointer corresponds to.
#[derive(Debug, Clone, Copy)]
pub struct StringCursor {
    /// Raw pointer to the byte the cursor rests on (null for a default cursor).
    pub(crate) ptr: *const u8,
    /// Logical index of the cursor within the string it was created from.
    pub position: usize,
}

impl Default for StringCursor {
    /// Builds a detached cursor: null pointer, position zero.
    fn default() -> Self {
        let ptr: *const u8 = std::ptr::null();
        Self { ptr, position: 0 }
    }
}
/// Abstraction over the kinds of subject strings that can be walked with a
/// [`StringCursor`]; this file implements it for `&[u8]` and `&str`.
///
/// The cursor functions take no `self`: they operate purely on the cursor's
/// raw pointer, and the implementations in this file do not bounds-check it.
pub trait StrDrive: Copy {
/// Number of items in the string (bytes for `&[u8]`, `char`s for `&str`).
fn count(&self) -> usize;
/// Creates a cursor located at item index `n`.
fn create_cursor(&self, n: usize) -> StringCursor;
/// Moves the cursor one item forward and returns an item value as `u32`.
///
/// NOTE(review): the `&str` implementation returns the item that was stepped
/// over; confirm that every implementation agrees on which item is returned.
fn advance(cursor: &mut StringCursor) -> u32;
/// Returns the item under the cursor without moving it.
fn peek(cursor: &StringCursor) -> u32;
/// Moves the cursor `n` items forward.
fn skip(cursor: &mut StringCursor, n: usize);
/// Moves the cursor one item backward and returns the item stepped over.
fn back_advance(cursor: &mut StringCursor) -> u32;
/// Returns the item immediately before the cursor without moving it.
fn back_peek(cursor: &StringCursor) -> u32;
/// Moves the cursor `n` items backward.
fn back_skip(cursor: &mut StringCursor, n: usize);
}
/// Byte-string driver: each item is a single byte, so cursor positions are
/// plain byte offsets and every move is simple pointer arithmetic.
///
/// None of the cursor functions bounds-check; callers must keep the cursor
/// inside the slice it was created from.
impl StrDrive for &[u8] {
    /// Returns the number of bytes in the slice.
    #[inline]
    fn count(&self) -> usize {
        self.len()
    }

    /// Creates a cursor resting on byte offset `n`.
    ///
    /// # Panics
    ///
    /// Panics if `n` is greater than the slice length.
    #[inline]
    fn create_cursor(&self, n: usize) -> StringCursor {
        StringCursor {
            ptr: self[n..].as_ptr(),
            position: n,
        }
    }

    /// Consumes the byte under the cursor and steps one byte forward.
    ///
    /// The byte is read *before* the pointer moves, so the value returned is
    /// the one that was consumed (the same contract as the `&str` driver) and
    /// stepping onto the end of the slice never reads past it.
    #[inline]
    fn advance(cursor: &mut StringCursor) -> u32 {
        // SAFETY: relies on the caller keeping the cursor on a readable byte.
        let consumed = unsafe { *cursor.ptr };
        cursor.position += 1;
        // SAFETY: one step from a byte inside the slice is at most one past
        // its end, which is a valid pointer to form.
        cursor.ptr = unsafe { cursor.ptr.add(1) };
        u32::from(consumed)
    }

    /// Returns the byte under the cursor without moving it.
    #[inline]
    fn peek(cursor: &StringCursor) -> u32 {
        // SAFETY: relies on the caller keeping the cursor on a readable byte.
        u32::from(unsafe { *cursor.ptr })
    }

    /// Moves the cursor `n` bytes forward.
    #[inline]
    fn skip(cursor: &mut StringCursor, n: usize) {
        cursor.position += n;
        // SAFETY: relies on the caller not skipping beyond the end of the slice.
        cursor.ptr = unsafe { cursor.ptr.add(n) };
    }

    /// Steps one byte backward and returns the byte that was stepped over.
    #[inline]
    fn back_advance(cursor: &mut StringCursor) -> u32 {
        cursor.position -= 1;
        // SAFETY: relies on the caller not stepping back past the slice start.
        cursor.ptr = unsafe { cursor.ptr.sub(1) };
        // SAFETY: after the step the pointer rests on a byte of the slice.
        u32::from(unsafe { *cursor.ptr })
    }

    /// Returns the byte immediately before the cursor without moving it.
    #[inline]
    fn back_peek(cursor: &StringCursor) -> u32 {
        // SAFETY: relies on the caller only peeking back when a previous byte
        // exists.
        u32::from(unsafe { *cursor.ptr.sub(1) })
    }

    /// Moves the cursor `n` bytes backward.
    #[inline]
    fn back_skip(cursor: &mut StringCursor, n: usize) {
        cursor.position -= n;
        // SAFETY: relies on the caller not moving back past the slice start.
        cursor.ptr = unsafe { cursor.ptr.sub(n) };
    }
}
/// Text driver: each item is one code point, so moving the cursor means
/// decoding the UTF-8 sequence in front of (or behind) the pointer.
///
/// None of the cursor functions bounds-check; callers must keep the cursor
/// inside the string it was created from.
impl StrDrive for &str {
    /// Returns the number of `char`s in the string (a linear scan).
    #[inline]
    fn count(&self) -> usize {
        self.chars().count()
    }

    /// Creates a cursor resting on the `n`-th code point by walking forward
    /// from the start of the string.
    #[inline]
    fn create_cursor(&self, n: usize) -> StringCursor {
        let mut cursor = StringCursor {
            ptr: self.as_ptr(),
            position: 0,
        };
        <Self as StrDrive>::skip(&mut cursor, n);
        cursor
    }

    /// Consumes the code point under the cursor and returns it.
    #[inline]
    fn advance(cursor: &mut StringCursor) -> u32 {
        cursor.position += 1;
        // SAFETY: relies on the caller keeping the cursor on the first byte of
        // a code point inside the string.
        unsafe { next_code_point(&mut cursor.ptr) }
    }

    /// Returns the code point under the cursor without moving it.
    #[inline]
    fn peek(cursor: &StringCursor) -> u32 {
        // Decode through a throw-away copy so the cursor itself stays put.
        let mut probe = cursor.ptr;
        // SAFETY: same requirement as `advance`.
        unsafe { next_code_point(&mut probe) }
    }

    /// Moves the cursor `n` code points forward.
    #[inline]
    fn skip(cursor: &mut StringCursor, n: usize) {
        cursor.position += n;
        let mut remaining = n;
        while remaining > 0 {
            // SAFETY: relies on the caller not skipping beyond the end of the
            // string.
            unsafe { next_code_point(&mut cursor.ptr) };
            remaining -= 1;
        }
    }

    /// Steps one code point backward and returns the one stepped over.
    #[inline]
    fn back_advance(cursor: &mut StringCursor) -> u32 {
        cursor.position -= 1;
        // SAFETY: relies on the caller only stepping back when a previous code
        // point exists.
        unsafe { next_code_point_reverse(&mut cursor.ptr) }
    }

    /// Returns the code point immediately before the cursor without moving it.
    #[inline]
    fn back_peek(cursor: &StringCursor) -> u32 {
        // Decode through a throw-away copy so the cursor itself stays put.
        let mut probe = cursor.ptr;
        // SAFETY: same requirement as `back_advance`.
        unsafe { next_code_point_reverse(&mut probe) }
    }

    /// Moves the cursor `n` code points backward.
    #[inline]
    fn back_skip(cursor: &mut StringCursor, n: usize) {
        cursor.position -= n;
        let mut remaining = n;
        while remaining > 0 {
            // SAFETY: relies on the caller not moving back past the start of
            // the string.
            unsafe { next_code_point_reverse(&mut cursor.ptr) };
            remaining -= 1;
        }
    }
}
/// Decodes the code point that starts at `*ptr` (assuming a UTF-8-like
/// encoding) and advances `*ptr` past the bytes it consumed.
///
/// # Safety
///
/// `*ptr` must point at the first byte of a code point inside a valid
/// UTF-8-like (UTF-8 or WTF-8) string, and every byte of that code point must
/// be readable: nothing here checks bounds.
#[inline]
unsafe fn next_code_point(ptr: &mut *const u8) -> u32 {
// Decode UTF-8
let x = **ptr;
*ptr = ptr.offset(1);
if x < 128 {
return x as u32;
}
// Multibyte case follows
// Decode from a byte combination out of: [[[x y] z] w]
// NOTE: Performance is sensitive to the exact formulation here
let init = utf8_first_byte(x, 2);
// SAFETY: the string is UTF-8-like, so a non-ASCII lead byte is always
// followed by at least one continuation byte.
let y = **ptr;
*ptr = ptr.offset(1);
let mut ch = utf8_acc_cont_byte(init, y);
if x >= 0xE0 {
// [[x y z] w] case
// 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
// SAFETY: the string is UTF-8-like, so a lead byte >= 0xE0 is followed
// by a second continuation byte.
let z = **ptr;
*ptr = ptr.offset(1);
let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z);
ch = init << 12 | y_z;
if x >= 0xF0 {
// [x y z w] case
// use only the lower 3 bits of `init`
// SAFETY: the string is UTF-8-like, so a lead byte >= 0xF0 is followed
// by a third continuation byte.
let w = **ptr;
*ptr = ptr.offset(1);
ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w);
}
}
ch
}
/// Decodes the code point that ends immediately before `*ptr` (assuming a
/// UTF-8-like encoding) and moves `*ptr` back onto that code point's first
/// byte.
///
/// # Safety
///
/// `*ptr` must point just past the last byte of a code point inside a valid
/// UTF-8-like (UTF-8 or WTF-8) string, and every byte of that code point must
/// be readable: nothing here checks bounds.
#[inline]
unsafe fn next_code_point_reverse(ptr: &mut *const u8) -> u32 {
// Decode UTF-8
*ptr = ptr.offset(-1);
let w = match **ptr {
next_byte if next_byte < 128 => return next_byte as u32,
back_byte => back_byte,
};
// Multibyte case follows
// Decode from a byte combination out of: [x [y [z w]]]
let mut ch;
// SAFETY: the string is UTF-8-like, so a non-ASCII trailing byte is always
// preceded by at least one more byte of the same code point.
*ptr = ptr.offset(-1);
let z = **ptr;
// Assume `z` is the lead byte of a two-byte sequence until proven otherwise.
ch = utf8_first_byte(z, 2);
if utf8_is_cont_byte(z) {
// SAFETY: `z` is a continuation byte, so the code point has a further
// byte before it.
*ptr = ptr.offset(-1);
let y = **ptr;
// Assume `y` is the lead byte of a three-byte sequence until proven otherwise.
ch = utf8_first_byte(y, 3);
if utf8_is_cont_byte(y) {
// SAFETY: `y` is a continuation byte too, so the lead byte of a
// four-byte sequence precedes it.
*ptr = ptr.offset(-1);
let x = **ptr;
ch = utf8_first_byte(x, 4);
ch = utf8_acc_cont_byte(ch, y);
}
ch = utf8_acc_cont_byte(ch, z);
}
ch = utf8_acc_cont_byte(ch, w);
ch
}
/// Extracts the payload bits carried by the leading byte of a multi-byte
/// sequence, producing the initial code point accumulator.
///
/// `width` is the sequence length in bytes: the low 5 bits are kept for
/// width 2, 4 bits for width 3 and 3 bits for width 4.
#[inline]
const fn utf8_first_byte(byte: u8, width: u32) -> u32 {
    let payload_mask: u8 = 0x7F >> width;
    (byte & payload_mask) as u32
}
/// Appends the six payload bits of continuation byte `byte` to the
/// accumulator `ch` and returns the result.
#[inline]
const fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
    let payload = (byte & CONT_MASK) as u32;
    (ch << 6) | payload
}
/// Reports whether `byte` is a UTF-8 continuation byte, i.e. whether its two
/// top bits are `10`.
#[inline]
const fn utf8_is_cont_byte(byte: u8) -> bool {
    (byte & 0b1100_0000) == 0b1000_0000
}
/// Mask of the value bits of a continuation byte: the low six bits, i.e. the
/// `xxxxxx` payload of a `10xxxxxx` byte.
const CONT_MASK: u8 = 0b0011_1111;
/// Reports whether `b` is one of the six ASCII whitespace bytes: tab, line
/// feed, vertical tab, form feed, carriage return or space.
const fn is_py_ascii_whitespace(b: u8) -> bool {
    // Tab through carriage return are the contiguous bytes 0x09..=0x0D.
    matches!(b, b'\t'..=b'\r' | b' ')
}
/// Reports whether `ch` is an ASCII word character: a letter, a digit or an
/// underscore. Anything that does not fit in a byte is never a word character.
#[inline]
pub(crate) fn is_word(ch: u32) -> bool {
    matches!(
        u8::try_from(ch),
        Ok(byte) if byte == b'_' || byte.is_ascii_alphanumeric()
    )
}
#[inline]
pub(crate) fn is_space(ch: u32) -> bool {
u8::try_from(ch)
.map(is_py_ascii_whitespace)
.unwrap_or(false)
}
/// Reports whether `ch` is one of the ASCII digits `0`–`9`.
#[inline]
pub(crate) fn is_digit(ch: u32) -> bool {
    matches!(u8::try_from(ch), Ok(b'0'..=b'9'))
}
/// Locale-flavoured alphanumeric test.
///
/// Locale data is not consulted yet, so only ASCII letters and digits
/// qualify.
#[inline]
pub(crate) fn is_loc_alnum(ch: u32) -> bool {
    // FIXME: Ignore the locales
    match u8::try_from(ch) {
        Ok(byte) => byte.is_ascii_alphanumeric(),
        Err(_) => false,
    }
}
/// Locale-flavoured word-character test: an underscore or anything accepted
/// by `is_loc_alnum`.
#[inline]
pub(crate) fn is_loc_word(ch: u32) -> bool {
    const UNDERSCORE: u32 = '_' as u32;
    ch == UNDERSCORE || is_loc_alnum(ch)
}
/// Reports whether `ch` is the line feed character (`\n`).
#[inline]
pub(crate) fn is_linebreak(ch: u32) -> bool {
    u32::from('\n') == ch
}
/// Lowercases `ch` when it is an ASCII uppercase letter; every other value is
/// returned unchanged.
#[inline]
pub fn lower_ascii(ch: u32) -> u32 {
    match u8::try_from(ch) {
        Ok(byte) => u32::from(byte.to_ascii_lowercase()),
        Err(_) => ch,
    }
}
/// Locale-flavoured lowercase conversion.
///
/// Locale data is not consulted yet (see the FIXME below): the call simply
/// forwards to `lower_ascii`.
#[inline]
pub(crate) fn lower_locate(ch: u32) -> u32 {
// FIXME: Ignore the locales
lower_ascii(ch)
}
/// Locale-flavoured uppercase conversion.
///
/// Locale data is not consulted yet, so only ASCII lowercase letters are
/// changed; every other value is returned unchanged.
#[inline]
pub(crate) fn upper_locate(ch: u32) -> u32 {
    // FIXME: Ignore the locales
    match u8::try_from(ch) {
        Ok(byte) => u32::from(byte.to_ascii_uppercase()),
        Err(_) => ch,
    }
}
/// Unicode-mode digit test.
///
/// Currently only the ASCII digits `0`–`9` qualify; values that are not valid
/// Unicode scalar values are rejected.
#[inline]
pub(crate) fn is_uni_digit(ch: u32) -> bool {
    // TODO: check with cpython
    matches!(char::try_from(ch), Ok(c) if c.is_ascii_digit())
}
/// Unicode-mode whitespace test against a fixed table of code points.
///
/// The table already contains the six ASCII whitespace characters, so no
/// separate ASCII check is needed.
#[inline]
pub(crate) fn is_uni_space(ch: u32) -> bool {
    // TODO: check with cpython
    matches!(
        ch,
        0x0009..=0x000D     // tab, LF, VT, FF, CR
            | 0x001C..=0x0020 // FS, GS, RS, US, space
            | 0x0085
            | 0x00A0
            | 0x1680
            | 0x2000..=0x200A
            | 0x2028
            | 0x2029
            | 0x202F
            | 0x205F
            | 0x3000
    )
}
/// Unicode-mode line break test against a fixed table of code points.
#[inline]
pub(crate) fn is_uni_linebreak(ch: u32) -> bool {
    matches!(
        ch,
        0x000A..=0x000D | 0x001C..=0x001E | 0x0085 | 0x2028 | 0x2029
    )
}
/// Unicode-mode alphanumeric test, delegating to [`char::is_alphanumeric`].
///
/// Values that are not valid Unicode scalar values are rejected.
#[inline]
pub(crate) fn is_uni_alnum(ch: u32) -> bool {
    // TODO: check with cpython
    matches!(char::try_from(ch), Ok(c) if c.is_alphanumeric())
}
/// Unicode-mode word-character test: an underscore or anything accepted by
/// `is_uni_alnum`.
#[inline]
pub(crate) fn is_uni_word(ch: u32) -> bool {
    const UNDERSCORE: u32 = '_' as u32;
    ch == UNDERSCORE || is_uni_alnum(ch)
}
/// Unicode-mode lowercase conversion.
///
/// Returns the first code point of the lowercase mapping produced by
/// [`char::to_lowercase`]; values that are not valid Unicode scalar values
/// are returned unchanged.
#[inline]
pub fn lower_unicode(ch: u32) -> u32 {
    // TODO: check with cpython
    match char::try_from(ch) {
        // `to_lowercase` always yields at least one `char`.
        Ok(c) => c.to_lowercase().next().unwrap() as u32,
        Err(_) => ch,
    }
}
/// Unicode-mode uppercase conversion.
///
/// Returns the first code point of the uppercase mapping produced by
/// [`char::to_uppercase`] (so a multi-character expansion collapses to its
/// first character); values that are not valid Unicode scalar values are
/// returned unchanged.
#[inline]
pub fn upper_unicode(ch: u32) -> u32 {
    // TODO: check with cpython
    match char::try_from(ch) {
        // `to_uppercase` always yields at least one `char`.
        Ok(c) => c.to_uppercase().next().unwrap() as u32,
        Err(_) => ch,
    }
}

View File

@@ -1,16 +1,13 @@
use sre_engine::engine;
use sre_engine::{Request, State, StrDrive};
struct Pattern {
code: &'static [u32],
}
impl Pattern {
fn state<'a, S: engine::StrDrive>(
&self,
string: S,
) -> (engine::Request<'a, S>, engine::State) {
let req = engine::Request::new(string, 0, usize::MAX, self.code, false);
let state = engine::State::default();
fn state<'a, S: StrDrive>(&self, string: S) -> (Request<'a, S>, State) {
let req = Request::new(string, 0, usize::MAX, self.code, false);
let state = State::default();
(req, state)
}
}
@@ -54,7 +51,7 @@ fn test_zerowidth() {
let (mut req, mut state) = p.state("a:");
req.must_advance = true;
assert!(state.search(req));
assert_eq!(state.string_position, 1);
assert_eq!(state.cursor.position, 1);
}
#[test]
@@ -66,7 +63,10 @@ fn test_repeat_context_panic() {
// END GENERATED
let (req, mut state) = p.state("axxzaz");
assert!(state.pymatch(&req));
assert_eq!(*state.marks.raw(), vec![Optioned::some(1), Optioned::some(3)]);
assert_eq!(
*state.marks.raw(),
vec![Optioned::some(1), Optioned::some(3)]
);
}
#[test]
@@ -77,7 +77,7 @@ fn test_double_max_until() {
// END GENERATED
let (req, mut state) = p.state("1111");
assert!(state.pymatch(&req));
assert_eq!(state.string_position, 4);
assert_eq!(state.cursor.position, 4);
}
#[test]
@@ -89,7 +89,7 @@ fn test_info_single() {
let (req, mut state) = p.state("baaaa");
assert!(state.search(req));
assert_eq!(state.start, 1);
assert_eq!(state.string_position, 5);
assert_eq!(state.cursor.position, 5);
}
#[test]
@@ -161,7 +161,7 @@ fn test_bug_20998() {
let (mut req, mut state) = p.state("ABC");
req.match_all = true;
assert!(state.pymatch(&req));
assert_eq!(state.string_position, 3);
assert_eq!(state.cursor.position, 3);
}
#[test]
@@ -172,5 +172,10 @@ fn test_bigcharset() {
// END GENERATED
let (req, mut state) = p.state("x ");
assert!(state.pymatch(&req));
assert_eq!(state.string_position, 1);
assert_eq!(state.cursor.position, 1);
}
#[test]
fn test_search_nonascii() {
// pattern p = re.compile('\xe0+')
}