Merge pull request #299 from holygits/fix-unicode-handling

Handle unicode string slicing with graphemes
2026-06-09 22:49:57 +09:00 · 2019-02-04 13:48:41 +01:00
parent 4242b66992 b7f6db753f
commit 55aa12b68f
5 changed files with 70 additions and 9 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -717,6 +717,7 @@ dependencies = [
 "serde_derive 1.0.66 (registry+https://github.com/rust-lang/crates.io-index)",
 "serde_json 1.0.26 (registry+https://github.com/rust-lang/crates.io-index)",
 "statrs 0.10.0 (registry+https://github.com/rust-lang/crates.io-index)",
+ "unicode-segmentation 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
 ]

 [[package]]
--- a/tests/snippets/unicode_slicing.py
+++ b/tests/snippets/unicode_slicing.py
@@ -0,0 +1,33 @@
+def test_slice_bounds(s):
+    # Asserts the results of slicing `s` with bounds that fall outside it.
+    # Stop far past the end: the whole string is expected back
+    assert s[0:100] == s
+    # Negative stop far before the start: an empty string is expected
+    assert s[0:-100] == '' 
+    # Start far past the end: an empty string is expected
+    assert s[100:1] == ''
+    # Both bounds out of range: CPython returns the whole string here.
+    # NOTE(review): the check is left disabled, presumably because the
+    # interpreter under test does not match CPython for this case yet - confirm.
+    # assert s[-100:100] == s
+
+def expect_index_error(s, index):
+    # Asserts that evaluating `s[index]` raises IndexError.
+    try:
+        s[index]
+    except IndexError:
+        # Expected outcome: nothing more to check
+        pass
+    else:
+        # No IndexError was raised, so fail the test
+        assert False
+
+# Both characters are multi-byte in UTF-8, so these checks only pass when
+# indexing works on characters rather than on raw bytes.
+unicode_str = "∀∂"
+assert unicode_str[0] == "∀"
+assert unicode_str[1] == "∂"
+assert unicode_str[-1] == "∂"
+
+test_slice_bounds(unicode_str)
+# Indices far outside the string, positive and negative, must raise
+expect_index_error(unicode_str, 100)
+expect_index_error(unicode_str, -100)
+
+# Slice-bound and indexing checks on a plain ASCII string
+ascii_str = "hello world"
+test_slice_bounds(ascii_str)
+assert ascii_str[0] == "h"
+assert ascii_str[1] == "e"
+assert ascii_str[-1] == "d"
+
--- a/vm/Cargo.toml
+++ b/vm/Cargo.toml
@@ -17,4 +17,5 @@ serde_json = "1.0.26"
 byteorder = "1.2.6"
 regex = "1"
 statrs = "0.10.0"
-caseless = "0.2.1"
+caseless = "0.2.1"
+unicode-segmentation = "1.2.1"
--- a/vm/src/obj/objsequence.rs
+++ b/vm/src/obj/objsequence.rs
@@ -14,7 +14,7 @@ pub trait PySliceableSequence {
    fn get_pos(&self, p: i32) -> usize {
        if p < 0 {
            if -p as usize > self.len() {
-                // return something that is out of bounds so get_item raises an IndexError
+                // return something that is out of bounds so `get_item` raises an IndexError
                self.len() + 1
            } else {
                self.len() - ((-p) as usize)
@@ -57,7 +57,7 @@ pub trait PySliceableSequence {
    }
 }

-impl PySliceableSequence for Vec<PyObjectRef> {
+impl<T: Clone> PySliceableSequence for Vec<T> {
    fn do_slice(&self, start: usize, stop: usize) -> Self {
        self[start..stop].to_vec()
    }
--- a/vm/src/obj/objstr.rs
+++ b/vm/src/obj/objstr.rs
@@ -11,6 +11,9 @@ use num_traits::ToPrimitive;
 use std::hash::{Hash, Hasher};
 // rust's builtin to_lowercase isn't sufficient for casefold
 extern crate caseless;
+extern crate unicode_segmentation;
+
+use self::unicode_segmentation::UnicodeSegmentation;

 pub fn init(context: &PyContext) {
    let ref str_type = context.str_type;
@@ -980,22 +983,45 @@ fn str_new(vm: &mut VirtualMachine, args: PyFuncArgs) -> PyResult {

 impl PySliceableSequence for String {
+    /// Return the graphemes in `start..stop` joined into a new string.
+    ///
+    /// Positions count graphemes, not bytes. An empty string is returned when
+    /// the range does not fit inside the grapheme sequence.
    fn do_slice(&self, start: usize, stop: usize) -> Self {
-        self[start..stop].to_string()
+        to_graphemes(self)
+            .get(start..stop)
+            .map_or(String::default(), |c| c.join(""))
    }
+
+    /// Return every `step`-th grapheme of `start..stop` joined into a new string.
+    ///
+    /// Positions count graphemes, not bytes. An empty string is returned when
+    /// the range does not fit inside the grapheme sequence.
    fn do_stepped_slice(&self, start: usize, stop: usize, step: usize) -> Self {
-        self[start..stop].chars().step_by(step).collect()
+        to_graphemes(self)
+            .get(start..stop)
+            .map_or(String::default(), |s| {
+                // Step before copying so only the selected graphemes are
+                // appended, with no intermediate `Vec<String>`.
+                s.iter().step_by(step).map(String::as_str).collect()
+            })
    }
+
+    /// Return the length of the string in graphemes (not bytes).
    fn len(&self) -> usize {
-        self.len()
+        // Count clusters straight off the iterator instead of allocating a
+        // `String` per grapheme just to take the length of the vector.
+        UnicodeSegmentation::graphemes(self.as_str(), true).count()
    }
 }

+/// Split `value` into its extended grapheme clusters, i.e. the units a user
+/// perceives as single characters.
+///
+/// Accepts anything that can be viewed as a `str`. Each cluster is copied into
+/// its own `String`, and the clusters are returned in order of appearance.
+fn to_graphemes<S: AsRef<str>>(value: S) -> Vec<String> {
+    // `true` selects extended (rather than legacy) grapheme clusters
+    UnicodeSegmentation::graphemes(value.as_ref(), true)
+        .map(String::from)
+        .collect()
+}
+
 pub fn subscript(vm: &mut VirtualMachine, value: &str, b: PyObjectRef) -> PyResult {
-    // let value = a
    if objtype::isinstance(&b, &vm.ctx.int_type()) {
        let pos = objint::get_value(&b).to_i32().unwrap();
-        let idx = value.to_string().get_pos(pos);
-        Ok(vm.new_str(value[idx..idx + 1].to_string()))
+        let graphemes = to_graphemes(value);
+        let idx = graphemes.get_pos(pos);
+        graphemes
+            .get(idx)
+            .map(|c| vm.new_str(c.to_string()))
+            .ok_or(vm.new_index_error("string index out of range".to_string()))
    } else {
        match &(*b.borrow()).payload {
            &PyObjectPayload::Slice {