Merge pull request #299 from holygits/fix-unicode-handling

Handle unicode string slicing with graphemes
This commit is contained in:
Windel Bouwman
2019-02-04 13:48:41 +01:00
committed by GitHub
5 changed files with 70 additions and 9 deletions

1
Cargo.lock generated
View File

@@ -717,6 +717,7 @@ dependencies = [
"serde_derive 1.0.66 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_json 1.0.26 (registry+https://github.com/rust-lang/crates.io-index)",
"statrs 0.10.0 (registry+https://github.com/rust-lang/crates.io-index)",
"unicode-segmentation 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]

View File

@@ -0,0 +1,33 @@
def test_slice_bounds(s):
# End out of range
assert s[0:100] == s
assert s[0:-100] == ''
# Start out of range
assert s[100:1] == ''
# Out of range both sides
# This is the behaviour in cpython
# assert s[-100:100] == s
def expect_index_error(s, index):
    """Assert that evaluating `s[index]` raises IndexError.

    Any other exception propagates unchanged; a successful lookup fails the
    assertion.
    """
    try:
        s[index]
    except IndexError:
        pass
    else:
        # Reaching this branch means the lookup succeeded, which is the failure case.
        assert False, "expected IndexError"
# Non-ASCII case: each character is multi-byte in UTF-8, so indexing must
# address whole characters rather than bytes.
unicode_str = "∀∂"
assert unicode_str[0] == "∀"
assert unicode_str[1] == "∂"
assert unicode_str[-1] == "∂"
test_slice_bounds(unicode_str)
expect_index_error(unicode_str, 100)
expect_index_error(unicode_str, -100)

# ASCII case: the same checks on single-byte characters.
ascii_str = "hello world"
test_slice_bounds(ascii_str)
assert ascii_str[0] == "h"
assert ascii_str[1] == "e"
assert ascii_str[-1] == "d"

View File

@@ -17,4 +17,5 @@ serde_json = "1.0.26"
byteorder = "1.2.6"
regex = "1"
statrs = "0.10.0"
caseless = "0.2.1"
caseless = "0.2.1"
unicode-segmentation = "1.2.1"

View File

@@ -14,7 +14,7 @@ pub trait PySliceableSequence {
fn get_pos(&self, p: i32) -> usize {
if p < 0 {
if -p as usize > self.len() {
// return something that is out of bounds so get_item raises an IndexError
// return something that is out of bounds so `get_item` raises an IndexError
self.len() + 1
} else {
self.len() - ((-p) as usize)
@@ -57,7 +57,7 @@ pub trait PySliceableSequence {
}
}
impl PySliceableSequence for Vec<PyObjectRef> {
impl<T: Clone> PySliceableSequence for Vec<T> {
fn do_slice(&self, start: usize, stop: usize) -> Self {
self[start..stop].to_vec()
}

View File

@@ -11,6 +11,9 @@ use num_traits::ToPrimitive;
use std::hash::{Hash, Hasher};
// rust's builtin to_lowercase isn't sufficient for casefold
extern crate caseless;
extern crate unicode_segmentation;
use self::unicode_segmentation::UnicodeSegmentation;
pub fn init(context: &PyContext) {
let ref str_type = context.str_type;
@@ -980,22 +983,45 @@ fn str_new(vm: &mut VirtualMachine, args: PyFuncArgs) -> PyResult {
/// Slicing support for `String`, addressed in grapheme clusters rather than bytes.
///
/// Every position handled here is an index into the list returned by
/// `to_graphemes`, so one user-perceived character occupies exactly one position
/// regardless of how many bytes it takes in UTF-8.
impl PySliceableSequence for String {
    /// Returns the graphemes in `start..stop` joined into a new string.
    ///
    /// Yields an empty string when the range does not fit inside the grapheme
    /// list (`start > stop` or `stop` past the end) instead of panicking.
    fn do_slice(&self, start: usize, stop: usize) -> Self {
        to_graphemes(self)
            .get(start..stop)
            .map_or_else(String::new, |graphemes| graphemes.concat())
    }

    /// Returns every `step`-th grapheme of `start..stop`, joined into a new string.
    ///
    /// Yields an empty string when the range does not fit inside the grapheme list.
    ///
    /// NOTE(review): `step_by` panics when `step` is 0; callers are presumably
    /// expected to reject a zero step before getting here -- confirm.
    fn do_stepped_slice(&self, start: usize, stop: usize, step: usize) -> Self {
        to_graphemes(self)
            .get(start..stop)
            .map_or_else(String::new, |graphemes| {
                // Step first and borrow, so only the selected graphemes are copied
                // into the result.
                graphemes
                    .iter()
                    .step_by(step)
                    .map(String::as_str)
                    .collect()
            })
    }

    /// Number of grapheme clusters in the string (not its length in bytes).
    fn len(&self) -> usize {
        to_graphemes(self).len()
    }
}
/// Convert a string-able `value` to a vec of graphemes
/// represents the string according to user perceived characters
fn to_graphemes<S: AsRef<str>>(value: S) -> Vec<String> {
UnicodeSegmentation::graphemes(value.as_ref(), true)
.map(String::from)
.collect()
}
pub fn subscript(vm: &mut VirtualMachine, value: &str, b: PyObjectRef) -> PyResult {
// let value = a
if objtype::isinstance(&b, &vm.ctx.int_type()) {
let pos = objint::get_value(&b).to_i32().unwrap();
let idx = value.to_string().get_pos(pos);
Ok(vm.new_str(value[idx..idx + 1].to_string()))
let graphemes = to_graphemes(value);
let idx = graphemes.get_pos(pos);
graphemes
.get(idx)
.map(|c| vm.new_str(c.to_string()))
.ok_or(vm.new_index_error("string index out of range".to_string()))
} else {
match &(*b.borrow()).payload {
&PyObjectPayload::Slice {