diff --git a/Cargo.lock b/Cargo.lock index 6f226d279..b1342cd72 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -717,6 +717,7 @@ dependencies = [ "serde_derive 1.0.66 (registry+https://github.com/rust-lang/crates.io-index)", "serde_json 1.0.26 (registry+https://github.com/rust-lang/crates.io-index)", "statrs 0.10.0 (registry+https://github.com/rust-lang/crates.io-index)", + "unicode-segmentation 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] diff --git a/tests/snippets/unicode_slicing.py b/tests/snippets/unicode_slicing.py new file mode 100644 index 000000000..de4184513 --- /dev/null +++ b/tests/snippets/unicode_slicing.py @@ -0,0 +1,33 @@ +def test_slice_bounds(s): + # End out of range + assert s[0:100] == s + assert s[0:-100] == '' + # Start out of range + assert s[100:1] == '' + # Out of range both sides + # This is the behaviour in cpython + # assert s[-100:100] == s + +def expect_index_error(s, index): + try: + s[index] + except IndexError: + pass + else: + assert False + +unicode_str = "∀∂" +assert unicode_str[0] == "∀" +assert unicode_str[1] == "∂" +assert unicode_str[-1] == "∂" + +test_slice_bounds(unicode_str) +expect_index_error(unicode_str, 100) +expect_index_error(unicode_str, -100) + +ascii_str = "hello world" +test_slice_bounds(ascii_str) +assert ascii_str[0] == "h" +assert ascii_str[1] == "e" +assert ascii_str[-1] == "d" + diff --git a/vm/Cargo.toml b/vm/Cargo.toml index 7f1aed7ab..59e10da64 100644 --- a/vm/Cargo.toml +++ b/vm/Cargo.toml @@ -17,4 +17,5 @@ serde_json = "1.0.26" byteorder = "1.2.6" regex = "1" statrs = "0.10.0" -caseless = "0.2.1" \ No newline at end of file +caseless = "0.2.1" +unicode-segmentation = "1.2.1" diff --git a/vm/src/obj/objsequence.rs b/vm/src/obj/objsequence.rs index 65bff64a6..4dbd0026f 100644 --- a/vm/src/obj/objsequence.rs +++ b/vm/src/obj/objsequence.rs @@ -14,7 +14,7 @@ pub trait PySliceableSequence { fn get_pos(&self, p: i32) -> usize { if p < 0 { if -p as usize > self.len() { - // return something that is out of bounds so get_item raises an IndexError + // return something that is out of bounds so `get_item` raises an IndexError self.len() + 1 } else { self.len() - ((-p) as usize) @@ -57,7 +57,7 @@ pub trait PySliceableSequence { } } -impl PySliceableSequence for Vec { +impl PySliceableSequence for Vec { fn do_slice(&self, start: usize, stop: usize) -> Self { self[start..stop].to_vec() } diff --git a/vm/src/obj/objstr.rs b/vm/src/obj/objstr.rs index 3887a3f33..ce55bb940 100644 --- a/vm/src/obj/objstr.rs +++ b/vm/src/obj/objstr.rs @@ -11,6 +11,9 @@ use num_traits::ToPrimitive; use std::hash::{Hash, Hasher}; // rust's builtin to_lowercase isn't sufficient for casefold extern crate caseless; +extern crate unicode_segmentation; + +use self::unicode_segmentation::UnicodeSegmentation; pub fn init(context: &PyContext) { let ref str_type = context.str_type; @@ -980,22 +983,45 @@ fn str_new(vm: &mut VirtualMachine, args: PyFuncArgs) -> PyResult { impl PySliceableSequence for String { fn do_slice(&self, start: usize, stop: usize) -> Self { - self[start..stop].to_string() + to_graphemes(self) + .get(start..stop) + .map_or(String::default(), |c| c.join("")) } + fn do_stepped_slice(&self, start: usize, stop: usize, step: usize) -> Self { - self[start..stop].chars().step_by(step).collect() + if let Some(s) = to_graphemes(self).get(start..stop) { + return s + .iter() + .cloned() + .step_by(step) + .collect::>() + .join(""); + } + String::default() } + fn len(&self) -> usize { - self.len() + to_graphemes(self).len() } } +/// Convert a string-able `value` to a vec of graphemes +/// represents the string according to user perceived characters +fn to_graphemes>(value: S) -> Vec { + UnicodeSegmentation::graphemes(value.as_ref(), true) + .map(String::from) + .collect() +} + pub fn subscript(vm: &mut VirtualMachine, value: &str, b: PyObjectRef) -> PyResult { - // let value = a if objtype::isinstance(&b, &vm.ctx.int_type()) { let pos = objint::get_value(&b).to_i32().unwrap(); - let idx = value.to_string().get_pos(pos); - Ok(vm.new_str(value[idx..idx + 1].to_string())) + let graphemes = to_graphemes(value); + let idx = graphemes.get_pos(pos); + graphemes + .get(idx) + .map(|c| vm.new_str(c.to_string())) + .ok_or(vm.new_index_error("string index out of range".to_string())) } else { match &(*b.borrow()).payload { &PyObjectPayload::Slice {