mirror of
https://github.com/RustPython/RustPython.git
synced 2026-06-02 19:39:49 +09:00
improve use StringCursor replace index based position
This commit is contained in:
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "sre-engine"
|
||||
version = "0.5.0"
|
||||
version = "0.6.0"
|
||||
authors = ["Kangzhi Shi <shikangzhi@gmail.com>", "RustPython Team"]
|
||||
description = "A low-level implementation of Python's SRE regex engine"
|
||||
repository = "https://github.com/RustPython/sre-engine"
|
||||
|
||||
@@ -3,24 +3,24 @@
|
||||
extern crate test;
|
||||
use test::Bencher;
|
||||
|
||||
use sre_engine::engine;
|
||||
use sre_engine::{Request, State, StrDrive};
|
||||
|
||||
struct Pattern {
|
||||
code: &'static [u32],
|
||||
}
|
||||
|
||||
impl Pattern {
|
||||
fn state<'a, S: engine::StrDrive>(&self, string: S) -> (engine::Request<'a, S>, engine::State) {
|
||||
fn state<'a, S: StrDrive>(&self, string: S) -> (Request<'a, S>, State) {
|
||||
self.state_range(string, 0..usize::MAX)
|
||||
}
|
||||
|
||||
fn state_range<'a, S: engine::StrDrive>(
|
||||
fn state_range<'a, S: StrDrive>(
|
||||
&self,
|
||||
string: S,
|
||||
range: std::ops::Range<usize>,
|
||||
) -> (engine::Request<'a, S>, engine::State) {
|
||||
let req = engine::Request::new(string, range.start, range.end, self.code, false);
|
||||
let state = engine::State::default();
|
||||
) -> (Request<'a, S>, State) {
|
||||
let req = Request::new(string, range.start, range.end, self.code, false);
|
||||
let state = State::default();
|
||||
(req, state)
|
||||
}
|
||||
}
|
||||
|
||||
511
src/engine.rs
511
src/engine.rs
File diff suppressed because it is too large
Load Diff
@@ -1,5 +1,10 @@
|
||||
pub mod constants;
|
||||
pub mod engine;
|
||||
pub mod string;
|
||||
|
||||
pub use constants::{SreAtCode, SreCatCode, SreFlag, SreInfo, SreOpcode, SRE_MAGIC};
|
||||
pub use engine::{Request, SearchIter, State};
|
||||
pub use string::{StrDrive, StringCursor};
|
||||
|
||||
pub const CODESIZE: usize = 4;
|
||||
|
||||
|
||||
381
src/string.rs
Normal file
381
src/string.rs
Normal file
@@ -0,0 +1,381 @@
|
||||
/// A cursor into a string that is being scanned through [`StrDrive`].
///
/// `ptr` is the raw address of the cursor inside the string's bytes and
/// `position` is the logical index that goes with it: a byte index for
/// `&[u8]`, and a count of code points for `&str` (where `ptr` moves by whole
/// UTF-8 sequences).
///
/// The cursor stores neither a length nor a lifetime. Every `StrDrive`
/// function that dereferences `ptr` trusts the caller to keep the cursor
/// inside the string it was created from.
#[derive(Debug, Clone, Copy)]
pub struct StringCursor {
    /// Address of the element at `position`; null for a default cursor.
    pub(crate) ptr: *const u8,
    /// Logical index of the cursor within its string.
    pub position: usize,
}

impl Default for StringCursor {
    /// Returns a cursor at position 0 with a null pointer.
    ///
    /// Such a cursor must be replaced by one from
    /// [`StrDrive::create_cursor`] before it is dereferenced.
    fn default() -> Self {
        Self {
            ptr: std::ptr::null(),
            position: 0,
        }
    }
}

/// Cursor-based access to a string-like value.
///
/// The value itself only reports its length and creates cursors. All movement
/// and reading goes through associated functions that take a
/// [`StringCursor`], none of which perform bounds checks.
pub trait StrDrive: Copy {
    /// Number of elements (bytes or code points) in the string.
    fn count(&self) -> usize;
    /// Creates a cursor placed on element `n`.
    fn create_cursor(&self, n: usize) -> StringCursor;
    /// Returns the element under the cursor and moves one element forward.
    fn advance(cursor: &mut StringCursor) -> u32;
    /// Returns the element under the cursor without moving.
    fn peek(cursor: &StringCursor) -> u32;
    /// Moves the cursor `n` elements forward.
    fn skip(cursor: &mut StringCursor, n: usize);
    /// Moves one element backward and returns the element now under the cursor.
    fn back_advance(cursor: &mut StringCursor) -> u32;
    /// Returns the element just before the cursor without moving.
    fn back_peek(cursor: &StringCursor) -> u32;
    /// Moves the cursor `n` elements backward.
    fn back_skip(cursor: &mut StringCursor, n: usize);
}

impl<'a> StrDrive for &'a [u8] {
    #[inline]
    fn count(&self) -> usize {
        self.len()
    }

    /// Panics if `n` is greater than the slice length.
    #[inline]
    fn create_cursor(&self, n: usize) -> StringCursor {
        StringCursor {
            ptr: self[n..].as_ptr(),
            position: n,
        }
    }

    #[inline]
    fn advance(cursor: &mut StringCursor) -> u32 {
        // Read first, then step. Dereferencing after the bump would return the
        // following byte instead of the one stepped over (unlike the `&str`
        // implementation) and would read past the end on the last byte.
        // SAFETY: the caller guarantees the cursor is on a byte of the slice,
        // so that byte is readable and the address one past it is still within
        // (or one past the end of) the same allocation.
        let byte = unsafe { *cursor.ptr };
        cursor.ptr = unsafe { cursor.ptr.add(1) };
        cursor.position += 1;
        u32::from(byte)
    }

    #[inline]
    fn peek(cursor: &StringCursor) -> u32 {
        // SAFETY: the caller guarantees the cursor is on a byte of the slice.
        u32::from(unsafe { *cursor.ptr })
    }

    #[inline]
    fn skip(cursor: &mut StringCursor, n: usize) {
        cursor.position += n;
        // SAFETY: the caller guarantees at least `n` bytes remain after the cursor.
        cursor.ptr = unsafe { cursor.ptr.add(n) };
    }

    #[inline]
    fn back_advance(cursor: &mut StringCursor) -> u32 {
        cursor.position -= 1;
        // SAFETY: the caller guarantees there is a byte before the cursor, so
        // the decremented pointer is inside the slice and readable.
        unsafe {
            cursor.ptr = cursor.ptr.sub(1);
            u32::from(*cursor.ptr)
        }
    }

    #[inline]
    fn back_peek(cursor: &StringCursor) -> u32 {
        // SAFETY: the caller guarantees there is a byte before the cursor.
        u32::from(unsafe { *cursor.ptr.sub(1) })
    }

    #[inline]
    fn back_skip(cursor: &mut StringCursor, n: usize) {
        cursor.position -= n;
        // SAFETY: the caller guarantees at least `n` bytes precede the cursor.
        cursor.ptr = unsafe { cursor.ptr.sub(n) };
    }
}
|
||||
|
||||
impl StrDrive for &str {
|
||||
#[inline]
|
||||
fn count(&self) -> usize {
|
||||
self.chars().count()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn create_cursor(&self, n: usize) -> StringCursor {
|
||||
let mut ptr = self.as_ptr();
|
||||
for _ in 0..n {
|
||||
unsafe { next_code_point(&mut ptr) };
|
||||
}
|
||||
StringCursor { ptr, position: n }
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn advance(cursor: &mut StringCursor) -> u32 {
|
||||
cursor.position += 1;
|
||||
unsafe { next_code_point(&mut cursor.ptr) }
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn peek(cursor: &StringCursor) -> u32 {
|
||||
let mut ptr = cursor.ptr;
|
||||
unsafe { next_code_point(&mut ptr) }
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn skip(cursor: &mut StringCursor, n: usize) {
|
||||
cursor.position += n;
|
||||
for _ in 0..n {
|
||||
unsafe { next_code_point(&mut cursor.ptr) };
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn back_advance(cursor: &mut StringCursor) -> u32 {
|
||||
cursor.position -= 1;
|
||||
unsafe { next_code_point_reverse(&mut cursor.ptr) }
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn back_peek(cursor: &StringCursor) -> u32 {
|
||||
let mut ptr = cursor.ptr;
|
||||
unsafe { next_code_point_reverse(&mut ptr) }
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn back_skip(cursor: &mut StringCursor, n: usize) {
|
||||
cursor.position -= n;
|
||||
for _ in 0..n {
|
||||
unsafe { next_code_point_reverse(&mut cursor.ptr) };
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Reads the next code point out of a byte iterator (assuming a
|
||||
/// UTF-8-like encoding).
|
||||
///
|
||||
/// # Safety
|
||||
///
|
||||
/// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string
|
||||
#[inline]
|
||||
unsafe fn next_code_point(ptr: &mut *const u8) -> u32 {
|
||||
// Decode UTF-8
|
||||
let x = **ptr;
|
||||
*ptr = ptr.offset(1);
|
||||
|
||||
if x < 128 {
|
||||
return x as u32;
|
||||
}
|
||||
|
||||
// Multibyte case follows
|
||||
// Decode from a byte combination out of: [[[x y] z] w]
|
||||
// NOTE: Performance is sensitive to the exact formulation here
|
||||
let init = utf8_first_byte(x, 2);
|
||||
// SAFETY: `bytes` produces an UTF-8-like string,
|
||||
// so the iterator must produce a value here.
|
||||
let y = **ptr;
|
||||
*ptr = ptr.offset(1);
|
||||
let mut ch = utf8_acc_cont_byte(init, y);
|
||||
if x >= 0xE0 {
|
||||
// [[x y z] w] case
|
||||
// 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
|
||||
// SAFETY: `bytes` produces an UTF-8-like string,
|
||||
// so the iterator must produce a value here.
|
||||
let z = **ptr;
|
||||
*ptr = ptr.offset(1);
|
||||
let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z);
|
||||
ch = init << 12 | y_z;
|
||||
if x >= 0xF0 {
|
||||
// [x y z w] case
|
||||
// use only the lower 3 bits of `init`
|
||||
// SAFETY: `bytes` produces an UTF-8-like string,
|
||||
// so the iterator must produce a value here.
|
||||
let w = **ptr;
|
||||
*ptr = ptr.offset(1);
|
||||
ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w);
|
||||
}
|
||||
}
|
||||
|
||||
ch
|
||||
}
|
||||
|
||||
/// Reads the last code point out of a byte iterator (assuming a
|
||||
/// UTF-8-like encoding).
|
||||
///
|
||||
/// # Safety
|
||||
///
|
||||
/// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string
|
||||
#[inline]
|
||||
unsafe fn next_code_point_reverse(ptr: &mut *const u8) -> u32 {
|
||||
// Decode UTF-8
|
||||
*ptr = ptr.offset(-1);
|
||||
let w = match **ptr {
|
||||
next_byte if next_byte < 128 => return next_byte as u32,
|
||||
back_byte => back_byte,
|
||||
};
|
||||
|
||||
// Multibyte case follows
|
||||
// Decode from a byte combination out of: [x [y [z w]]]
|
||||
let mut ch;
|
||||
// SAFETY: `bytes` produces an UTF-8-like string,
|
||||
// so the iterator must produce a value here.
|
||||
*ptr = ptr.offset(-1);
|
||||
let z = **ptr;
|
||||
ch = utf8_first_byte(z, 2);
|
||||
if utf8_is_cont_byte(z) {
|
||||
// SAFETY: `bytes` produces an UTF-8-like string,
|
||||
// so the iterator must produce a value here.
|
||||
*ptr = ptr.offset(-1);
|
||||
let y = **ptr;
|
||||
ch = utf8_first_byte(y, 3);
|
||||
if utf8_is_cont_byte(y) {
|
||||
// SAFETY: `bytes` produces an UTF-8-like string,
|
||||
// so the iterator must produce a value here.
|
||||
*ptr = ptr.offset(-1);
|
||||
let x = **ptr;
|
||||
ch = utf8_first_byte(x, 4);
|
||||
ch = utf8_acc_cont_byte(ch, y);
|
||||
}
|
||||
ch = utf8_acc_cont_byte(ch, z);
|
||||
}
|
||||
ch = utf8_acc_cont_byte(ch, w);
|
||||
|
||||
ch
|
||||
}
|
||||
|
||||
/// Extracts the payload bits of a UTF-8 lead byte as the starting accumulator.
///
/// A lead byte keeps its low 5 bits for `width` 2, 4 bits for `width` 3 and
/// 3 bits for `width` 4; the mask is `0x7F >> width`.
#[inline]
const fn utf8_first_byte(byte: u8, width: u32) -> u32 {
    let payload_mask: u8 = 0x7F >> width;
    (byte & payload_mask) as u32
}
|
||||
|
||||
/// Returns the value of `ch` updated with continuation byte `byte`.
|
||||
#[inline]
|
||||
const fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
|
||||
(ch << 6) | (byte & CONT_MASK) as u32
|
||||
}
|
||||
|
||||
/// Reports whether `byte` is a UTF-8 continuation byte, i.e. its two top bits
/// are `10` (values `0x80..=0xBF`).
#[inline]
const fn utf8_is_cont_byte(byte: u8) -> bool {
    (byte & 0b1100_0000) == 0b1000_0000
}
|
||||
|
||||
/// Selects the six value bits (the low bits) of a UTF-8 continuation byte.
const CONT_MASK: u8 = 0x3F;
|
||||
|
||||
/// Reports whether `b` is one of the six ASCII whitespace bytes accepted
/// here: tab, line feed, vertical tab, form feed, carriage return, or space.
const fn is_py_ascii_whitespace(b: u8) -> bool {
    match b {
        // `\t`, `\n`, `\x0B`, `\x0C` and `\r` are contiguous (0x09..=0x0D).
        b'\t'..=b'\r' | b' ' => true,
        _ => false,
    }
}
|
||||
|
||||
/// ASCII word character test: `_`, `0-9`, `A-Z` or `a-z`.
#[inline]
pub(crate) fn is_word(ch: u32) -> bool {
    if ch == '_' as u32 {
        return true;
    }
    match char::from_u32(ch) {
        Some(c) => c.is_ascii_alphanumeric(),
        None => false,
    }
}
|
||||
#[inline]
|
||||
pub(crate) fn is_space(ch: u32) -> bool {
|
||||
u8::try_from(ch)
|
||||
.map(is_py_ascii_whitespace)
|
||||
.unwrap_or(false)
|
||||
}
|
||||
/// ASCII decimal digit test (`0`-`9` only).
#[inline]
pub(crate) fn is_digit(ch: u32) -> bool {
    matches!(ch, 0x30..=0x39)
}
|
||||
/// "Locale" alphanumeric test.
///
/// FIXME: the locale is ignored; only ASCII letters and digits match.
#[inline]
pub(crate) fn is_loc_alnum(ch: u32) -> bool {
    // The range check makes the narrowing cast lossless.
    ch <= u32::from(u8::MAX) && (ch as u8).is_ascii_alphanumeric()
}
|
||||
#[inline]
|
||||
pub(crate) fn is_loc_word(ch: u32) -> bool {
|
||||
ch == '_' as u32 || is_loc_alnum(ch)
|
||||
}
|
||||
/// Reports whether `ch` is the line feed character (`\n`, U+000A).
#[inline]
pub(crate) fn is_linebreak(ch: u32) -> bool {
    u32::from(b'\n') == ch
}
|
||||
/// Lowercases ASCII `A`-`Z`; every other value is returned unchanged.
#[inline]
pub fn lower_ascii(ch: u32) -> u32 {
    match char::from_u32(ch) {
        // `to_ascii_lowercase` leaves non-ASCII characters untouched.
        Some(c) => u32::from(c.to_ascii_lowercase()),
        None => ch,
    }
}
|
||||
#[inline]
|
||||
pub(crate) fn lower_locate(ch: u32) -> u32 {
|
||||
// FIXME: Ignore the locales
|
||||
lower_ascii(ch)
|
||||
}
|
||||
/// "Locale" uppercase mapping.
///
/// FIXME: the locale is ignored; only ASCII `a`-`z` are uppercased and every
/// other value is returned unchanged.
#[inline]
pub(crate) fn upper_locate(ch: u32) -> u32 {
    // `to_ascii_uppercase` leaves non-ASCII characters untouched.
    char::from_u32(ch).map_or(ch, |c| u32::from(c.to_ascii_uppercase()))
}
|
||||
/// "Unicode" digit test; currently only ASCII `0`-`9` match.
#[inline]
pub(crate) fn is_uni_digit(ch: u32) -> bool {
    // TODO: check with cpython
    matches!(char::from_u32(ch), Some(c) if c.is_ascii_digit())
}
|
||||
/// "Unicode" whitespace test over a fixed table of code points.
///
/// The ASCII whitespace bytes accepted by `is_space` (U+0009..=U+000D and
/// U+0020) are all contained in the table, so one pattern covers both.
#[inline]
pub(crate) fn is_uni_space(ch: u32) -> bool {
    // TODO: check with cpython
    match ch {
        0x0009..=0x000D
        | 0x001C..=0x0020
        | 0x0085
        | 0x00A0
        | 0x1680
        | 0x2000..=0x200A
        | 0x2028
        | 0x2029
        | 0x202F
        | 0x205F
        | 0x3000 => true,
        _ => false,
    }
}
|
||||
/// "Unicode" line-break test over a fixed table: U+000A..=U+000D,
/// U+001C..=U+001E, U+0085, U+2028 and U+2029.
#[inline]
pub(crate) fn is_uni_linebreak(ch: u32) -> bool {
    match ch {
        0x000A..=0x000D | 0x001C..=0x001E => true,
        0x0085 | 0x2028 | 0x2029 => true,
        _ => false,
    }
}
|
||||
/// "Unicode" alphanumeric test, delegating to `char::is_alphanumeric`.
/// Values that are not valid `char`s (surrogates, out of range) are rejected.
#[inline]
pub(crate) fn is_uni_alnum(ch: u32) -> bool {
    // TODO: check with cpython
    match char::from_u32(ch) {
        Some(c) => c.is_alphanumeric(),
        None => false,
    }
}
|
||||
#[inline]
|
||||
pub(crate) fn is_uni_word(ch: u32) -> bool {
|
||||
ch == '_' as u32 || is_uni_alnum(ch)
|
||||
}
|
||||
/// Maps `ch` to the first character of its Unicode lowercase mapping.
///
/// Values that are not valid `char`s are returned unchanged. When the
/// lowercase form expands to several characters, only the first is kept.
#[inline]
pub fn lower_unicode(ch: u32) -> u32 {
    // TODO: check with cpython
    match char::from_u32(ch) {
        // `to_lowercase` always yields at least one character.
        Some(c) => c.to_lowercase().next().unwrap() as u32,
        None => ch,
    }
}
|
||||
/// Maps `ch` to the first character of its Unicode uppercase mapping.
///
/// Values that are not valid `char`s are returned unchanged. When the
/// uppercase form expands to several characters, only the first is kept.
#[inline]
pub fn upper_unicode(ch: u32) -> u32 {
    // TODO: check with cpython
    match char::from_u32(ch) {
        // `to_uppercase` always yields at least one character.
        Some(c) => c.to_uppercase().next().unwrap() as u32,
        None => ch,
    }
}
|
||||
@@ -1,16 +1,13 @@
|
||||
use sre_engine::engine;
|
||||
use sre_engine::{Request, State, StrDrive};
|
||||
|
||||
struct Pattern {
|
||||
code: &'static [u32],
|
||||
}
|
||||
|
||||
impl Pattern {
|
||||
fn state<'a, S: engine::StrDrive>(
|
||||
&self,
|
||||
string: S,
|
||||
) -> (engine::Request<'a, S>, engine::State) {
|
||||
let req = engine::Request::new(string, 0, usize::MAX, self.code, false);
|
||||
let state = engine::State::default();
|
||||
fn state<'a, S: StrDrive>(&self, string: S) -> (Request<'a, S>, State) {
|
||||
let req = Request::new(string, 0, usize::MAX, self.code, false);
|
||||
let state = State::default();
|
||||
(req, state)
|
||||
}
|
||||
}
|
||||
@@ -54,7 +51,7 @@ fn test_zerowidth() {
|
||||
let (mut req, mut state) = p.state("a:");
|
||||
req.must_advance = true;
|
||||
assert!(state.search(req));
|
||||
assert_eq!(state.string_position, 1);
|
||||
assert_eq!(state.cursor.position, 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -66,7 +63,10 @@ fn test_repeat_context_panic() {
|
||||
// END GENERATED
|
||||
let (req, mut state) = p.state("axxzaz");
|
||||
assert!(state.pymatch(&req));
|
||||
assert_eq!(*state.marks.raw(), vec![Optioned::some(1), Optioned::some(3)]);
|
||||
assert_eq!(
|
||||
*state.marks.raw(),
|
||||
vec![Optioned::some(1), Optioned::some(3)]
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -77,7 +77,7 @@ fn test_double_max_until() {
|
||||
// END GENERATED
|
||||
let (req, mut state) = p.state("1111");
|
||||
assert!(state.pymatch(&req));
|
||||
assert_eq!(state.string_position, 4);
|
||||
assert_eq!(state.cursor.position, 4);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -89,7 +89,7 @@ fn test_info_single() {
|
||||
let (req, mut state) = p.state("baaaa");
|
||||
assert!(state.search(req));
|
||||
assert_eq!(state.start, 1);
|
||||
assert_eq!(state.string_position, 5);
|
||||
assert_eq!(state.cursor.position, 5);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -161,7 +161,7 @@ fn test_bug_20998() {
|
||||
let (mut req, mut state) = p.state("ABC");
|
||||
req.match_all = true;
|
||||
assert!(state.pymatch(&req));
|
||||
assert_eq!(state.string_position, 3);
|
||||
assert_eq!(state.cursor.position, 3);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -172,5 +172,10 @@ fn test_bigcharset() {
|
||||
// END GENERATED
|
||||
let (req, mut state) = p.state("x ");
|
||||
assert!(state.pymatch(&req));
|
||||
assert_eq!(state.string_position, 1);
|
||||
assert_eq!(state.cursor.position, 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_search_nonascii() {
|
||||
// pattern p = re.compile('\xe0+')
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user