mirror of
https://github.com/RustPython/RustPython.git
synced 2026-06-09 22:49:57 +09:00
Merge pull request #299 from holygits/fix-unicode-handling
Handle unicode string slicing with graphemes
This commit is contained in:
1
Cargo.lock
generated
1
Cargo.lock
generated
@@ -717,6 +717,7 @@ dependencies = [
|
||||
"serde_derive 1.0.66 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"serde_json 1.0.26 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"statrs 0.10.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"unicode-segmentation 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
||||
33
tests/snippets/unicode_slicing.py
Normal file
33
tests/snippets/unicode_slicing.py
Normal file
@@ -0,0 +1,33 @@
|
||||
def test_slice_bounds(s):
|
||||
# End out of range
|
||||
assert s[0:100] == s
|
||||
assert s[0:-100] == ''
|
||||
# Start out of range
|
||||
assert s[100:1] == ''
|
||||
# Out of range both sides
|
||||
# CPython returns the whole string here; assertion disabled (presumably not yet supported — confirm)
|
||||
# assert s[-100:100] == s
|
||||
|
||||
def expect_index_error(s, index):
|
||||
try:
|
||||
s[index]
|
||||
except IndexError:
|
||||
pass
|
||||
else:
|
||||
assert False
|
||||
|
||||
unicode_str = "∀∂"
|
||||
assert unicode_str[0] == "∀"
|
||||
assert unicode_str[1] == "∂"
|
||||
assert unicode_str[-1] == "∂"
|
||||
|
||||
test_slice_bounds(unicode_str)
|
||||
expect_index_error(unicode_str, 100)
|
||||
expect_index_error(unicode_str, -100)
|
||||
|
||||
ascii_str = "hello world"
|
||||
test_slice_bounds(ascii_str)
|
||||
assert ascii_str[0] == "h"
|
||||
assert ascii_str[1] == "e"
|
||||
assert ascii_str[-1] == "d"
|
||||
|
||||
@@ -17,4 +17,5 @@ serde_json = "1.0.26"
|
||||
byteorder = "1.2.6"
|
||||
regex = "1"
|
||||
statrs = "0.10.0"
|
||||
caseless = "0.2.1"
|
||||
caseless = "0.2.1"
|
||||
unicode-segmentation = "1.2.1"
|
||||
|
||||
@@ -14,7 +14,7 @@ pub trait PySliceableSequence {
|
||||
fn get_pos(&self, p: i32) -> usize {
|
||||
if p < 0 {
|
||||
if -p as usize > self.len() {
|
||||
// return something that is out of bounds so get_item raises an IndexError
|
||||
// return something that is out of bounds so `get_item` raises an IndexError
|
||||
self.len() + 1
|
||||
} else {
|
||||
self.len() - ((-p) as usize)
|
||||
@@ -57,7 +57,7 @@ pub trait PySliceableSequence {
|
||||
}
|
||||
}
|
||||
|
||||
impl PySliceableSequence for Vec<PyObjectRef> {
|
||||
impl<T: Clone> PySliceableSequence for Vec<T> {
|
||||
fn do_slice(&self, start: usize, stop: usize) -> Self {
|
||||
self[start..stop].to_vec()
|
||||
}
|
||||
|
||||
@@ -11,6 +11,9 @@ use num_traits::ToPrimitive;
|
||||
use std::hash::{Hash, Hasher};
|
||||
// rust's builtin to_lowercase isn't sufficient for casefold
|
||||
extern crate caseless;
|
||||
extern crate unicode_segmentation;
|
||||
|
||||
use self::unicode_segmentation::UnicodeSegmentation;
|
||||
|
||||
pub fn init(context: &PyContext) {
|
||||
let ref str_type = context.str_type;
|
||||
@@ -980,22 +983,45 @@ fn str_new(vm: &mut VirtualMachine, args: PyFuncArgs) -> PyResult {
|
||||
|
||||
impl PySliceableSequence for String {
|
||||
fn do_slice(&self, start: usize, stop: usize) -> Self {
|
||||
self[start..stop].to_string()
|
||||
to_graphemes(self)
|
||||
.get(start..stop)
|
||||
.map_or(String::default(), |c| c.join(""))
|
||||
}
|
||||
|
||||
fn do_stepped_slice(&self, start: usize, stop: usize, step: usize) -> Self {
|
||||
self[start..stop].chars().step_by(step).collect()
|
||||
if let Some(s) = to_graphemes(self).get(start..stop) {
|
||||
return s
|
||||
.iter()
|
||||
.cloned()
|
||||
.step_by(step)
|
||||
.collect::<Vec<String>>()
|
||||
.join("");
|
||||
}
|
||||
String::default()
|
||||
}
|
||||
|
||||
fn len(&self) -> usize {
|
||||
self.len()
|
||||
to_graphemes(self).len()
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert a string-like `value` into a vector of its extended grapheme clusters,
|
||||
/// i.e. the string split into user-perceived characters rather than bytes.
|
||||
fn to_graphemes<S: AsRef<str>>(value: S) -> Vec<String> {
|
||||
UnicodeSegmentation::graphemes(value.as_ref(), true)
|
||||
.map(String::from)
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub fn subscript(vm: &mut VirtualMachine, value: &str, b: PyObjectRef) -> PyResult {
|
||||
// let value = a
|
||||
if objtype::isinstance(&b, &vm.ctx.int_type()) {
|
||||
let pos = objint::get_value(&b).to_i32().unwrap();
|
||||
let idx = value.to_string().get_pos(pos);
|
||||
Ok(vm.new_str(value[idx..idx + 1].to_string()))
|
||||
let graphemes = to_graphemes(value);
|
||||
let idx = graphemes.get_pos(pos);
|
||||
graphemes
|
||||
.get(idx)
|
||||
.map(|c| vm.new_str(c.to_string()))
|
||||
.ok_or(vm.new_index_error("string index out of range".to_string()))
|
||||
} else {
|
||||
match &(*b.borrow()).payload {
|
||||
&PyObjectPayload::Slice {
|
||||
|
||||
Reference in New Issue
Block a user