From ec00d727ddb76c45824bb65e36b7468ad6badd93 Mon Sep 17 00:00:00 2001 From: Jeong YunWon Date: Tue, 28 Apr 2020 00:06:08 +0900 Subject: [PATCH 1/3] Fix str.center and all pad methods into pystr --- Lib/test/test_unicode.py | 4 -- vm/src/obj/objbyteinner.rs | 118 ++++++++++++++------------------ vm/src/obj/objmemory.rs | 2 +- vm/src/obj/objstr.rs | 133 +++++++++++++++++++------------------ vm/src/obj/pystr.rs | 32 +++++++-- 5 files changed, 144 insertions(+), 145 deletions(-) diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index 88843dedc1..60e8a3f1d1 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -896,8 +896,6 @@ class UnicodeTest(string_tests.CommonTest, self.assertEqual('ß'.swapcase(), 'SS') self.assertEqual('\u1fd2'.swapcase(), '\u0399\u0308\u0300') - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_center(self): string_tests.CommonTest.test_center(self) self.assertEqual('x'.center(2, '\U0010FFFF'), @@ -960,8 +958,6 @@ class UnicodeTest(string_tests.CommonTest, self.assertNotIn(delim * 2, fill) self.assertIn(delim * 2, fill + delim * 2) - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_issue18183(self): '\U00010000\U00100000'.lower() '\U00010000\U00100000'.casefold() diff --git a/vm/src/obj/objbyteinner.rs b/vm/src/obj/objbyteinner.rs index f3a34a5390..7fbc5424ec 100644 --- a/vm/src/obj/objbyteinner.rs +++ b/vm/src/obj/objbyteinner.rs @@ -1,6 +1,5 @@ use bstr::ByteSlice; use num_bigint::{BigInt, ToBigInt}; -use num_integer::Integer; use num_traits::{One, Signed, ToPrimitive, Zero}; use std::convert::TryFrom; use std::ops::Range; @@ -179,25 +178,17 @@ impl ByteInnerFindOptions { #[derive(FromArgs)] pub struct ByteInnerPaddingOptions { #[pyarg(positional_only, optional = false)] - width: PyIntRef, + width: isize, #[pyarg(positional_only, optional = true)] - fillbyte: OptionalArg, + fillchar: OptionalArg, } + impl ByteInnerPaddingOptions { - fn get_value(self, fn_name: &str, len: usize, vm: &VirtualMachine) -> PyResult<(u8, usize)> { - let fillbyte = if let OptionalArg::Present(v) = &self.fillbyte { - match try_as_byte(&v) { - Some(x) => { - if x.len() == 1 { - x[0] - } else { - return Err(vm.new_type_error(format!( - "{}() argument 2 must be a byte string of length 1, not {}", - fn_name, &v - ))); - } - } - None => { + fn get_value(self, fn_name: &str, vm: &VirtualMachine) -> PyResult<(isize, u8)> { + let fillchar = if let OptionalArg::Present(v) = self.fillchar { + match try_as_byte(v.clone()) { + Some(x) if x.len() == 1 => x[0], + _ => { return Err(vm.new_type_error(format!( "{}() argument 2 must be a byte string of length 1, not {}", fn_name, &v @@ -208,20 +199,7 @@ impl ByteInnerPaddingOptions { b' ' // default is space }; - // <0 = no change - let width = if let Some(x) = self.width.as_bigint().to_usize() { - if x <= len { - 0 - } else { - x - } - } else { - 0 - }; - - let diff: usize = if width != 0 { width - len } else { 0 }; - - Ok((fillbyte, diff)) + Ok((self.width, fillchar)) } } @@ -724,30 +702,27 @@ impl PyByteInner { .collect::>()) } + #[inline] + fn pad( + &self, + options: ByteInnerPaddingOptions, + pad: fn(&[u8], usize, u8) -> Vec, + vm: &VirtualMachine, + ) -> PyResult> { + let (width, fillchar) = options.get_value("center", vm)?; + Ok(if self.len() as isize >= width { + Vec::from(&self.elements[..]) + } else { + pad(&self.elements, width as usize, fillchar) + }) + } + pub fn center( &self, options: ByteInnerPaddingOptions, vm: &VirtualMachine, ) -> PyResult> { - let (fillbyte, diff) = options.get_value("center", self.len(), vm)?; - - let mut ln: usize = diff / 2; - let mut rn: usize = ln; - - if diff.is_odd() && self.len() % 2 == 0 { - ln += 1 - } - - if diff.is_odd() && self.len() % 2 != 0 { - rn += 1 - } - - // merge all - let mut res = vec![fillbyte; ln]; - res.extend_from_slice(&self.elements[..]); - res.extend_from_slice(&vec![fillbyte; rn][..]); - - Ok(res) + self.pad(options, PyCommonString::::py_center, vm) } pub fn ljust( @@ -755,14 +730,7 @@ impl PyByteInner { options: ByteInnerPaddingOptions, vm: &VirtualMachine, ) -> PyResult> { - let (fillbyte, diff) = options.get_value("ljust", self.len(), vm)?; - - // merge all - let mut res = vec![]; - res.extend_from_slice(&self.elements[..]); - res.extend_from_slice(&vec![fillbyte; diff][..]); - - Ok(res) + self.pad(options, PyCommonString::::py_ljust, vm) } pub fn rjust( @@ -770,13 +738,7 @@ impl PyByteInner { options: ByteInnerPaddingOptions, vm: &VirtualMachine, ) -> PyResult> { - let (fillbyte, diff) = options.get_value("rjust", self.len(), vm)?; - - // merge all - let mut res = vec![fillbyte; diff]; - res.extend_from_slice(&self.elements[..]); - - Ok(res) + self.pad(options, PyCommonString::::py_rjust, vm) } pub fn count(&self, options: ByteInnerFindOptions, vm: &VirtualMachine) -> PyResult { @@ -1259,8 +1221,8 @@ impl PyByteInner { } } -pub fn try_as_byte(obj: &PyObjectRef) -> Option> { - match_class!(match obj.clone() { +pub fn try_as_byte(obj: PyObjectRef) -> Option> { + match_class!(match obj { i @ PyBytes => Some(i.get_value().to_vec()), j @ PyByteArray => Some(j.borrow_value().elements.to_vec()), _ => None, @@ -1350,7 +1312,13 @@ impl PyCommonStringWrapper<[u8]> for PyByteInner { const ASCII_WHITESPACES: [u8; 6] = [0x20, 0x09, 0x0a, 0x0c, 0x0d, 0x0b]; impl PyCommonString for [u8] { - fn get_slice(&self, range: std::ops::Range) -> &Self { + type Container = Vec; + + fn with_capacity(capacity: usize) -> Self::Container { + Vec::with_capacity(capacity) + } + + fn get_bytes<'a>(&'a self, range: std::ops::Range) -> &'a Self { &self[range] } @@ -1358,7 +1326,11 @@ impl PyCommonString for [u8] { Self::is_empty(self) } - fn len(&self) -> usize { + fn bytes_len(&self) -> usize { + Self::len(self) + } + + fn chars_len(&self) -> usize { Self::len(self) } @@ -1407,4 +1379,12 @@ impl PyCommonString for [u8] { } splited } + + fn py_pad(&self, left: usize, right: usize, fill: u8) -> Self::Container { + let mut u = Vec::with_capacity(left + self.len() + right); + u.extend(std::iter::repeat(fill).take(left)); + u.extend_from_slice(self); + u.extend(std::iter::repeat(fill).take(right)); + u + } } diff --git a/vm/src/obj/objmemory.rs b/vm/src/obj/objmemory.rs index 6ae3507de6..4add6e329f 100644 --- a/vm/src/obj/objmemory.rs +++ b/vm/src/obj/objmemory.rs @@ -18,7 +18,7 @@ pub type PyMemoryViewRef = PyRef; #[pyimpl] impl PyMemoryView { pub fn try_value(&self) -> Option> { - try_as_byte(&self.obj_ref) + try_as_byte(self.obj_ref.clone()) } #[pyslot] diff --git a/vm/src/obj/objstr.rs b/vm/src/obj/objstr.rs index 5268571773..474f20ca26 100644 --- a/vm/src/obj/objstr.rs +++ b/vm/src/obj/objstr.rs @@ -305,6 +305,7 @@ impl PyString { } #[pymethod(name = "__len__")] + #[inline] fn len(&self) -> usize { self.value.chars().count() } @@ -965,85 +966,69 @@ impl PyString { } } - fn get_fill_char<'a>( - rep: &'a OptionalArg, - vm: &VirtualMachine, - ) -> PyResult<&'a str> { - let rep_str = match rep { - OptionalArg::Present(ref st) => &st.value, - OptionalArg::Missing => " ", - }; - if rep_str.len() == 1 { - Ok(rep_str) - } else { - Err(vm - .new_type_error("The fill character must be exactly one character long".to_owned())) + fn get_fill_char(fillchar: OptionalArg, vm: &VirtualMachine) -> PyResult { + match fillchar { + OptionalArg::Present(ref s) => { + let mut chars = s.value.chars(); + let first = chars.next(); + let second = chars.next(); + if first.is_some() && second.is_none() { + // safeness is guaranteed by upper first.is_some() + Ok(first.unwrap_or_else(|| unsafe { std::hint::unreachable_unchecked() })) + } else { + Err(vm.new_type_error( + "The fill character must be exactly one character long".to_owned(), + )) + } + } + OptionalArg::Missing => Ok(' '), } } - #[pymethod] - fn ljust( + #[inline] + fn pad( &self, - len: usize, - rep: OptionalArg, + width: isize, + fillchar: OptionalArg, + pad: fn(&str, usize, char) -> String, vm: &VirtualMachine, ) -> PyResult { - let value = &self.value; - let rep_char = Self::get_fill_char(&rep, vm)?; - if len <= value.len() { - Ok(value.to_owned()) + let fillchar = Self::get_fill_char(fillchar, vm)?; + Ok(if self.len() as isize >= width { + String::from(&self.value) } else { - Ok(format!("{}{}", value, rep_char.repeat(len - value.len()))) - } - } - - #[pymethod] - fn rjust( - &self, - len: usize, - rep: OptionalArg, - vm: &VirtualMachine, - ) -> PyResult { - let value = &self.value; - let rep_char = Self::get_fill_char(&rep, vm)?; - if len <= value.len() { - Ok(value.to_owned()) - } else { - Ok(format!("{}{}", rep_char.repeat(len - value.len()), value)) - } + pad(&self.value, width as usize, fillchar) + }) } #[pymethod] fn center( &self, - len: usize, - rep: OptionalArg, + width: isize, + fillchar: OptionalArg, vm: &VirtualMachine, ) -> PyResult { - let value = &self.value; - let rep_char = Self::get_fill_char(&rep, vm)?; - let value_len = self.value.chars().count(); + self.pad(width, fillchar, PyCommonString::::py_center, vm) + } - if len <= value_len { - return Ok(value.to_owned()); - } - let diff: usize = len - value_len; - let mut left_buff: usize = diff / 2; - let mut right_buff: usize = left_buff; + #[pymethod] + fn ljust( + &self, + width: isize, + fillchar: OptionalArg, + vm: &VirtualMachine, + ) -> PyResult { + self.pad(width, fillchar, PyCommonString::::py_ljust, vm) + } - if diff % 2 != 0 && value_len % 2 == 0 { - left_buff += 1 - } - - if diff % 2 != 0 && value_len % 2 != 0 { - right_buff += 1 - } - Ok(format!( - "{}{}{}", - rep_char.repeat(left_buff), - value, - rep_char.repeat(right_buff) - )) + #[pymethod] + fn rjust( + &self, + width: isize, + fillchar: OptionalArg, + vm: &VirtualMachine, + ) -> PyResult { + self.pad(width, fillchar, PyCommonString::::py_rjust, vm) } #[pymethod] @@ -1750,7 +1735,13 @@ impl PyCommonStringWrapper for PyStringRef { } impl PyCommonString for str { - fn get_slice(&self, range: std::ops::Range) -> &Self { + type Container = String; + + fn with_capacity(capacity: usize) -> Self::Container { + String::with_capacity(capacity) + } + + fn get_bytes<'a>(&'a self, range: std::ops::Range) -> &'a Self { &self[range] } @@ -1758,10 +1749,14 @@ impl PyCommonString for str { Self::is_empty(self) } - fn len(&self) -> usize { + fn bytes_len(&self) -> usize { Self::len(self) } + fn chars_len(&self) -> usize { + self.chars().count() + } + fn py_split_whitespace(&self, maxsplit: isize, convert: F) -> Vec where F: Fn(&Self) -> PyObjectRef, @@ -1813,4 +1808,12 @@ impl PyCommonString for str { } splited } + + fn py_pad(&self, left: usize, right: usize, fill: char) -> Self::Container { + let mut u = String::with_capacity((left + right) * fill.len_utf8() + self.len()); + u.extend(std::iter::repeat(fill).take(left)); + u.push_str(self); + u.extend(std::iter::repeat(fill).take(right)); + u + } } diff --git a/vm/src/obj/pystr.rs b/vm/src/obj/pystr.rs index 8ae01e3969..2b9f05cc3e 100644 --- a/vm/src/obj/pystr.rs +++ b/vm/src/obj/pystr.rs @@ -114,8 +114,12 @@ where } pub trait PyCommonString { - fn get_slice(&self, range: std::ops::Range) -> &Self; - fn len(&self) -> usize; + type Container; + + fn with_capacity(capacity: usize) -> Self::Container; + fn get_bytes<'a>(&'a self, range: std::ops::Range) -> &'a Self; + fn bytes_len(&self) -> usize; + fn chars_len(&self) -> usize; fn is_empty(&self) -> bool; fn py_split( @@ -164,9 +168,9 @@ pub trait PyCommonString { T: TryFromObject, F: Fn(&Self, &T) -> bool, { - let (affix, range) = args.get_value(self.len()); + let (affix, range) = args.get_value(self.bytes_len()); if range.is_normal() { - let value = self.get_slice(range); + let value = self.get_bytes(range); single_or_tuple_any( affix, |s: &T| Ok(func(value, s)), @@ -212,7 +216,7 @@ pub trait PyCommonString { { if range.is_normal() { let start = range.start; - if let Some(index) = find(self.get_slice(range), &needle) { + if let Some(index) = find(self.get_bytes(range), &needle) { return Some(start + index); } } @@ -225,9 +229,25 @@ pub trait PyCommonString { F: Fn(&Self, &Self) -> usize, { if range.is_normal() { - count(self.get_slice(range), &needle) + count(self.get_bytes(range), &needle) } else { 0 } } + + fn py_pad(&self, left: usize, right: usize, fillchar: E) -> Self::Container; + + fn py_center(&self, width: usize, fillchar: E) -> Self::Container { + let marg = width - self.chars_len(); + let left = marg / 2 + (marg & width & 1); + self.py_pad(left, marg - left, fillchar) + } + + fn py_ljust(&self, width: usize, fillchar: E) -> Self::Container { + self.py_pad(0, width - self.chars_len(), fillchar) + } + + fn py_rjust(&self, width: usize, fillchar: E) -> Self::Container { + self.py_pad(width - self.chars_len(), 0, fillchar) + } } From 5899348bdb0f6c2f377ca7867c94a5f1c5075c42 Mon Sep 17 00:00:00 2001 From: Jeong YunWon Date: Tue, 28 Apr 2020 01:01:58 +0900 Subject: [PATCH 2/3] fix replace in debug build --- vm/src/obj/objbyteinner.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vm/src/obj/objbyteinner.rs b/vm/src/obj/objbyteinner.rs index 7fbc5424ec..e27089697a 100644 --- a/vm/src/obj/objbyteinner.rs +++ b/vm/src/obj/objbyteinner.rs @@ -1092,7 +1092,9 @@ impl PyByteInner { { return Err(vm.new_overflow_error("replace bytes is too long".to_owned())); } - let result_len = self.elements.len() + count * (to.len() - from.len()); + let result_len = (self.elements.len() as isize + + count as isize * (to.len() as isize - from.len() as isize)) + as usize; let mut result = Vec::with_capacity(result_len); let mut last_end = 0; From a3eb9a33da3dc87ffcfce8d6f8bfb84b8245a805 Mon Sep 17 00:00:00 2001 From: Jeong YunWon Date: Wed, 29 Apr 2020 03:03:58 +0900 Subject: [PATCH 3/3] pystr::get_chars --- vm/src/obj/objbyteinner.rs | 4 ++++ vm/src/obj/objstr.rs | 13 +++++++++++++ vm/src/obj/pystr.rs | 6 ++++-- 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/vm/src/obj/objbyteinner.rs b/vm/src/obj/objbyteinner.rs index e27089697a..e684e0fdc9 100644 --- a/vm/src/obj/objbyteinner.rs +++ b/vm/src/obj/objbyteinner.rs @@ -1324,6 +1324,10 @@ impl PyCommonString for [u8] { &self[range] } + fn get_chars<'a>(&'a self, range: std::ops::Range) -> &'a Self { + &self[range] + } + fn is_empty(&self) -> bool { Self::is_empty(self) } diff --git a/vm/src/obj/objstr.rs b/vm/src/obj/objstr.rs index 474f20ca26..2fd535d591 100644 --- a/vm/src/obj/objstr.rs +++ b/vm/src/obj/objstr.rs @@ -1745,6 +1745,19 @@ impl PyCommonString for str { &self[range] } + fn get_chars<'a>(&'a self, range: std::ops::Range) -> &'a Self { + let mut chars = self.chars(); + for _ in 0..range.start { + let _ = chars.next(); + } + let start = chars.as_str(); + for _ in range { + let _ = chars.next(); + } + let end = chars.as_str(); + &start[..start.len() - end.len()] + } + fn is_empty(&self) -> bool { Self::is_empty(self) } diff --git a/vm/src/obj/pystr.rs b/vm/src/obj/pystr.rs index 2b9f05cc3e..fba47558ae 100644 --- a/vm/src/obj/pystr.rs +++ b/vm/src/obj/pystr.rs @@ -118,6 +118,8 @@ pub trait PyCommonString { fn with_capacity(capacity: usize) -> Self::Container; fn get_bytes<'a>(&'a self, range: std::ops::Range) -> &'a Self; + // FIXME: get_chars is expensive for str + fn get_chars<'a>(&'a self, range: std::ops::Range) -> &'a Self; fn bytes_len(&self) -> usize; fn chars_len(&self) -> usize; fn is_empty(&self) -> bool; @@ -216,7 +218,7 @@ pub trait PyCommonString { { if range.is_normal() { let start = range.start; - if let Some(index) = find(self.get_bytes(range), &needle) { + if let Some(index) = find(self.get_chars(range), &needle) { return Some(start + index); } } @@ -229,7 +231,7 @@ pub trait PyCommonString { F: Fn(&Self, &Self) -> usize, { if range.is_normal() { - count(self.get_bytes(range), &needle) + count(self.get_chars(range), &needle) } else { 0 }