diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index ae9449f11e..3a9c6d2741 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -149,8 +149,6 @@ class ReadTest(MixInCheckStateHandling): "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding)) ) - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_readline(self): def getreader(input): stream = io.BytesIO(input.encode(self.encoding)) @@ -463,6 +461,12 @@ class UTF32Test(ReadTest, unittest.TestCase): b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m' b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m') + # TODO: RUSTPYTHON + @unittest.expectedFailure + def test_readline(self): # TODO: RUSTPYTHON, remove when this passes + super().test_readline() # TODO: RUSTPYTHON, remove when this passes + + # TODO: RUSTPYTHON @unittest.expectedFailure def test_only_one_bom(self): @@ -593,6 +597,11 @@ class UTF32LETest(ReadTest, unittest.TestCase): encoding = "utf-32-le" ill_formed_sequence = b"\x80\xdc\x00\x00" + # TODO: RUSTPYTHON + @unittest.expectedFailure + def test_readline(self): # TODO: RUSTPYTHON, remove when this passes + super().test_readline() # TODO: RUSTPYTHON, remove when this passes + # TODO: RUSTPYTHON @unittest.expectedFailure def test_partial(self): @@ -677,6 +686,11 @@ class UTF32BETest(ReadTest, unittest.TestCase): encoding = "utf-32-be" ill_formed_sequence = b"\x00\x00\xdc\x80" + # TODO: RUSTPYTHON + @unittest.expectedFailure + def test_readline(self): # TODO: RUSTPYTHON, remove when this passes + super().test_readline() # TODO: RUSTPYTHON, remove when this passes + # TODO: RUSTPYTHON @unittest.expectedFailure def test_partial(self): @@ -1048,6 +1062,11 @@ class UTF8Test(ReadTest, unittest.TestCase): class UTF7Test(ReadTest, unittest.TestCase): encoding = "utf-7" + # TODO: RUSTPYTHON + @unittest.expectedFailure + def test_readline(self): # TODO: RUSTPYTHON, remove when this passes + super().test_readline() # TODO: RUSTPYTHON, remove when this passes + # TODO: RUSTPYTHON @unittest.expectedFailure def test_ascii(self): @@ -2546,6 +2565,11 @@ class UnicodeEscapeTest(ReadTest, unittest.TestCase): def test_incremental_surrogatepass(self): # TODO: RUSTPYTHON, remove when this passes super().test_incremental_surrogatepass() # TODO: RUSTPYTHON, remove when this passes + # TODO: RUSTPYTHON + @unittest.expectedFailure + def test_readline(self): # TODO: RUSTPYTHON, remove when this passes + super().test_readline() # TODO: RUSTPYTHON, remove when this passes + def test_empty(self): self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0)) self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0)) @@ -2683,6 +2707,11 @@ class RawUnicodeEscapeTest(ReadTest, unittest.TestCase): def test_incremental_surrogatepass(self): # TODO: RUSTPYTHON, remove when this passes super().test_incremental_surrogatepass() # TODO: RUSTPYTHON, remove when this passes + # TODO: RUSTPYTHON + @unittest.expectedFailure + def test_readline(self): # TODO: RUSTPYTHON, remove when this passes + super().test_readline() # TODO: RUSTPYTHON, remove when this passes + def test_empty(self): self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0)) self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0)) diff --git a/extra_tests/snippets/builtin_str.py b/extra_tests/snippets/builtin_str.py index 12060fc40a..8adf97b731 100644 --- a/extra_tests/snippets/builtin_str.py +++ b/extra_tests/snippets/builtin_str.py @@ -171,6 +171,8 @@ assert 'hello\nhallo\nHallo'.splitlines() == ['hello', 'hallo', 'Hallo'] assert 'hello\nhallo\nHallo\n'.splitlines() == ['hello', 'hallo', 'Hallo'] assert 'hello\nhallo\nHallo'.splitlines(keepends=True) == ['hello\n', 'hallo\n', 'Hallo'] assert 'hello\nhallo\nHallo\n'.splitlines(keepends=True) == ['hello\n', 'hallo\n', 'Hallo\n'] +assert 'hello\vhallo\x0cHallo\x1cHELLO\x1dhoho\x1ehaha\x85another\u2028yetanother\u2029last\r\n.'.splitlines() == ['hello', 'hallo', 'Hallo', 'HELLO', 'hoho', 'haha', 'another', 'yetanother', 'last', '.'] +assert 'hello\vhallo\x0cHallo\x1cHELLO\x1dhoho\x1ehaha\x85another\u2028yetanother\u2029last\r\n.'.splitlines(keepends=True) == ['hello\x0b', 'hallo\x0c', 'Hallo\x1c', 'HELLO\x1d', 'hoho\x1e', 'haha\x85', 'another\u2028', 'yetanother\u2029', 'last\r\n', '.'] assert 'abc\t12345\txyz'.expandtabs() == 'abc 12345 xyz' assert '-'.join(['1', '2', '3']) == '1-2-3' assert 'HALLO'.isupper() diff --git a/vm/src/anystr.rs b/vm/src/anystr.rs index 1becc7d71c..5479023127 100644 --- a/vm/src/anystr.rs +++ b/vm/src/anystr.rs @@ -376,7 +376,7 @@ pub trait AnyStr { } } - fn py_splitlines(&self, options: SplitLinesArgs, into_wrapper: FW) -> Vec + fn py_bytes_splitlines(&self, options: SplitLinesArgs, into_wrapper: FW) -> Vec where FW: Fn(&Self) -> W, { diff --git a/vm/src/builtins/str.rs b/vm/src/builtins/str.rs index a2f85479e7..21b7f23a37 100644 --- a/vm/src/builtins/str.rs +++ b/vm/src/builtins/str.rs @@ -893,8 +893,41 @@ impl PyStr { #[pymethod] fn splitlines(&self, args: anystr::SplitLinesArgs, vm: &VirtualMachine) -> Vec { - self.as_str() - .py_splitlines(args, |s| self.new_substr(s.to_owned()).to_pyobject(vm)) + let into_wrapper = |s: &str| self.new_substr(s.to_owned()).to_pyobject(vm); + let mut elements = Vec::new(); + let mut last_i = 0; + let self_str = self.as_str(); + let mut enumerated = self_str.char_indices().peekable(); + while let Some((i, ch)) = enumerated.next() { + let end_len = match ch { + '\n' => 1, + '\r' => { + let is_rn = enumerated.peek().map_or(false, |(_, ch)| *ch == '\n'); + if is_rn { + let _ = enumerated.next(); + 2 + } else { + 1 + } + } + '\x0b' | '\x0c' | '\x1c' | '\x1d' | '\x1e' | '\u{0085}' | '\u{2028}' + | '\u{2029}' => ch.len_utf8(), + _ => { + continue; + } + }; + let range = if args.keepends { + last_i..i + end_len + } else { + last_i..i + }; + last_i = i + end_len; + elements.push(into_wrapper(&self_str[range])); + } + if last_i != self_str.len() { + elements.push(into_wrapper(&self_str[last_i..])); + } + elements } #[pymethod] diff --git a/vm/src/bytesinner.rs b/vm/src/bytesinner.rs index cf282be29f..f2a0a7f359 100644 --- a/vm/src/bytesinner.rs +++ b/vm/src/bytesinner.rs @@ -716,7 +716,7 @@ impl PyBytesInner { where FW: Fn(&[u8]) -> W, { - self.elements.py_splitlines(options, into_wrapper) + self.elements.py_bytes_splitlines(options, into_wrapper) } pub fn zfill(&self, width: isize) -> Vec {