forked from Rust-related/RustPython
Merge pull request #4709 from r3m0t/str-splitlines
Fix str.splitlines to recognise unicode line terminators
This commit is contained in:
33
Lib/test/test_codecs.py
vendored
33
Lib/test/test_codecs.py
vendored
@@ -149,8 +149,6 @@ class ReadTest(MixInCheckStateHandling):
|
||||
"".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
|
||||
)
|
||||
|
||||
# TODO: RUSTPYTHON
|
||||
@unittest.expectedFailure
|
||||
def test_readline(self):
|
||||
def getreader(input):
|
||||
stream = io.BytesIO(input.encode(self.encoding))
|
||||
@@ -463,6 +461,12 @@ class UTF32Test(ReadTest, unittest.TestCase):
|
||||
b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
|
||||
b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')
|
||||
|
||||
# TODO: RUSTPYTHON
|
||||
@unittest.expectedFailure
|
||||
def test_readline(self): # TODO: RUSTPYTHON, remove when this passes
|
||||
super().test_readline() # TODO: RUSTPYTHON, remove when this passes
|
||||
|
||||
|
||||
# TODO: RUSTPYTHON
|
||||
@unittest.expectedFailure
|
||||
def test_only_one_bom(self):
|
||||
@@ -593,6 +597,11 @@ class UTF32LETest(ReadTest, unittest.TestCase):
|
||||
encoding = "utf-32-le"
|
||||
ill_formed_sequence = b"\x80\xdc\x00\x00"
|
||||
|
||||
# TODO: RUSTPYTHON
|
||||
@unittest.expectedFailure
|
||||
def test_readline(self): # TODO: RUSTPYTHON, remove when this passes
|
||||
super().test_readline() # TODO: RUSTPYTHON, remove when this passes
|
||||
|
||||
# TODO: RUSTPYTHON
|
||||
@unittest.expectedFailure
|
||||
def test_partial(self):
|
||||
@@ -677,6 +686,11 @@ class UTF32BETest(ReadTest, unittest.TestCase):
|
||||
encoding = "utf-32-be"
|
||||
ill_formed_sequence = b"\x00\x00\xdc\x80"
|
||||
|
||||
# TODO: RUSTPYTHON
|
||||
@unittest.expectedFailure
|
||||
def test_readline(self): # TODO: RUSTPYTHON, remove when this passes
|
||||
super().test_readline() # TODO: RUSTPYTHON, remove when this passes
|
||||
|
||||
# TODO: RUSTPYTHON
|
||||
@unittest.expectedFailure
|
||||
def test_partial(self):
|
||||
@@ -1048,6 +1062,11 @@ class UTF8Test(ReadTest, unittest.TestCase):
|
||||
class UTF7Test(ReadTest, unittest.TestCase):
|
||||
encoding = "utf-7"
|
||||
|
||||
# TODO: RUSTPYTHON
|
||||
@unittest.expectedFailure
|
||||
def test_readline(self): # TODO: RUSTPYTHON, remove when this passes
|
||||
super().test_readline() # TODO: RUSTPYTHON, remove when this passes
|
||||
|
||||
# TODO: RUSTPYTHON
|
||||
@unittest.expectedFailure
|
||||
def test_ascii(self):
|
||||
@@ -2546,6 +2565,11 @@ class UnicodeEscapeTest(ReadTest, unittest.TestCase):
|
||||
def test_incremental_surrogatepass(self): # TODO: RUSTPYTHON, remove when this passes
|
||||
super().test_incremental_surrogatepass() # TODO: RUSTPYTHON, remove when this passes
|
||||
|
||||
# TODO: RUSTPYTHON
|
||||
@unittest.expectedFailure
|
||||
def test_readline(self): # TODO: RUSTPYTHON, remove when this passes
|
||||
super().test_readline() # TODO: RUSTPYTHON, remove when this passes
|
||||
|
||||
def test_empty(self):
|
||||
self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
|
||||
self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))
|
||||
@@ -2683,6 +2707,11 @@ class RawUnicodeEscapeTest(ReadTest, unittest.TestCase):
|
||||
def test_incremental_surrogatepass(self): # TODO: RUSTPYTHON, remove when this passes
|
||||
super().test_incremental_surrogatepass() # TODO: RUSTPYTHON, remove when this passes
|
||||
|
||||
# TODO: RUSTPYTHON
|
||||
@unittest.expectedFailure
|
||||
def test_readline(self): # TODO: RUSTPYTHON, remove when this passes
|
||||
super().test_readline() # TODO: RUSTPYTHON, remove when this passes
|
||||
|
||||
def test_empty(self):
|
||||
self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
|
||||
self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))
|
||||
|
||||
@@ -171,6 +171,8 @@ assert 'hello\nhallo\nHallo'.splitlines() == ['hello', 'hallo', 'Hallo']
|
||||
assert 'hello\nhallo\nHallo\n'.splitlines() == ['hello', 'hallo', 'Hallo']
|
||||
assert 'hello\nhallo\nHallo'.splitlines(keepends=True) == ['hello\n', 'hallo\n', 'Hallo']
|
||||
assert 'hello\nhallo\nHallo\n'.splitlines(keepends=True) == ['hello\n', 'hallo\n', 'Hallo\n']
|
||||
assert 'hello\vhallo\x0cHallo\x1cHELLO\x1dhoho\x1ehaha\x85another\u2028yetanother\u2029last\r\n.'.splitlines() == ['hello', 'hallo', 'Hallo', 'HELLO', 'hoho', 'haha', 'another', 'yetanother', 'last', '.']
|
||||
assert 'hello\vhallo\x0cHallo\x1cHELLO\x1dhoho\x1ehaha\x85another\u2028yetanother\u2029last\r\n.'.splitlines(keepends=True) == ['hello\x0b', 'hallo\x0c', 'Hallo\x1c', 'HELLO\x1d', 'hoho\x1e', 'haha\x85', 'another\u2028', 'yetanother\u2029', 'last\r\n', '.']
|
||||
# Default tab size is 8: 'abc' is padded to column 8, '12345' to column 16.
assert 'abc\t12345\txyz'.expandtabs() == 'abc     12345   xyz'
|
||||
assert '-'.join(['1', '2', '3']) == '1-2-3'
|
||||
assert 'HALLO'.isupper()
|
||||
|
||||
@@ -376,7 +376,9 @@ pub trait AnyStr {
|
||||
}
|
||||
}
|
||||
|
||||
fn py_splitlines<FW, W>(&self, options: SplitLinesArgs, into_wrapper: FW) -> Vec<W>
|
||||
// TODO: remove this function from anystr.
|
||||
// See https://github.com/RustPython/RustPython/pull/4709/files#r1141013993
|
||||
fn py_bytes_splitlines<FW, W>(&self, options: SplitLinesArgs, into_wrapper: FW) -> Vec<W>
|
||||
where
|
||||
FW: Fn(&Self) -> W,
|
||||
{
|
||||
|
||||
@@ -894,8 +894,41 @@ impl PyStr {
|
||||
|
||||
#[pymethod]
|
||||
/// Split the string at line boundaries, following Python's `str.splitlines`.
///
/// Recognised terminators are `\n`, `\r`, `\r\n` (treated as one terminator),
/// `\x0b`, `\x0c`, `\x1c`, `\x1d`, `\x1e`, `\u{0085}`, `\u{2028}` and
/// `\u{2029}`. When `args.keepends` is set, every returned line retains its
/// terminator. A terminator at the very end of the string does not produce a
/// trailing empty line.
fn splitlines(&self, args: anystr::SplitLinesArgs, vm: &VirtualMachine) -> Vec<PyObjectRef> {
    // Converts one slice of the underlying string into the returned object type.
    let into_wrapper = |s: &str| self.new_substr(s.to_owned()).to_pyobject(vm);
    let self_str = self.as_str();
    let mut elements = Vec::new();
    // Byte offset at which the line currently being scanned starts.
    let mut last_i = 0;
    let mut enumerated = self_str.char_indices().peekable();
    while let Some((i, ch)) = enumerated.next() {
        // Byte length of the terminator that starts at `i`; any other
        // character is simply part of the current line.
        let end_len = match ch {
            '\n' => 1,
            '\r' => {
                // "\r\n" is a single terminator: swallow the '\n' too so it
                // is not reported as a second, empty line.
                if enumerated.next_if(|&(_, next)| next == '\n').is_some() {
                    2
                } else {
                    1
                }
            }
            '\x0b' | '\x0c' | '\x1c' | '\x1d' | '\x1e' | '\u{0085}' | '\u{2028}'
            | '\u{2029}' => ch.len_utf8(),
            _ => continue,
        };
        // Offsets are byte indices from `char_indices`, so slicing always
        // lands on character boundaries.
        let line_end = if args.keepends { i + end_len } else { i };
        elements.push(into_wrapper(&self_str[last_i..line_end]));
        last_i = i + end_len;
    }
    // Whatever follows the last terminator (or the whole string when there
    // was none) is the final line.
    if last_i != self_str.len() {
        elements.push(into_wrapper(&self_str[last_i..]));
    }
    elements
}
|
||||
|
||||
#[pymethod]
|
||||
|
||||
@@ -716,7 +716,7 @@ impl PyBytesInner {
|
||||
where
|
||||
FW: Fn(&[u8]) -> W,
|
||||
{
|
||||
self.elements.py_splitlines(options, into_wrapper)
|
||||
self.elements.py_bytes_splitlines(options, into_wrapper)
|
||||
}
|
||||
|
||||
pub fn zfill(&self, width: isize) -> Vec<u8> {
|
||||
|
||||
Reference in New Issue
Block a user