From 83002d73690c0a0a38968dcb74e65efe3c61059d Mon Sep 17 00:00:00 2001 From: Changjoon Date: Mon, 4 May 2026 10:28:43 +0900 Subject: [PATCH] Tighten CPython parity for str format spec, %-format, and str() constructor (#7769) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Five related CPython parity gaps in `str` formatting and construction: 1. **`str(bytes, errors=...)` triggers decode mode.** Previously, only `encoding=` triggered decode; passing only `errors=` fell back to `repr()`. CPython's behavior: presence of `encoding` OR `errors` triggers decode mode (default UTF-8 when only `errors` is given). 2. **`'{...}'.format() IndexError wording.** Generic Rust "tuple index out of range" replaced with CPython's "Replacement index N out of range for positional args tuple". 3. **`{0:3.2s}.format('abc')` → 'ab '.** String format spec applied precision after width padding; CPython truncates BEFORE padding. Reorder the operations. 4. **`%x` / `%o` / `%X` / `%c` accept `__index__` objects.** Previously only `PyInt` downcast was attempted. Mirror CPython's PyNumber_Index dispatch via `try_index_opt`. 5. **`%d` / `%u` / `%i` error wording.** "a number is required" → "a real number is required" (matches CPython). Also adds `not ` suffix to `%c` error messages so the type is visible in TypeError text (matches CPython structure even without fully-qualified names). Verified byte-identical with CPython 3.14.4 across 25+ probes covering the format/spec/constructor combinations. Unmasks `test_str.test_constructor_keyword_args` and `test_str.test_constructor_defaults`. test_str/test_bytes/test_format/ test_codecs/test_io/test_unicode_identifiers — 1,429 tests pass, 0 regressions. All 188 `extra_tests/snippets/*.py` pass under the CI feature set. `test_str.test_format` and `test_str.test_formatting` markers retained: `test_format` still trips on `'{0:08s}'.format('result')` (numeric zero-pad treated as fill+left-align by CPython for str type — separate format-spec parser concern). `test_formatting` still trips on `%c` error message expecting fully qualified `module.qualname` (RP returns bare class name — separate broader concern). --- Lib/test/test_descr.py | 1 - Lib/test/test_str.py | 6 +-- crates/common/src/format.rs | 23 +++++++---- crates/vm/src/builtins/str.rs | 36 ++++++++++++---- crates/vm/src/cformat.rs | 78 +++++++++++++++++++++++++++-------- crates/vm/src/format.rs | 21 +++++----- 6 files changed, 117 insertions(+), 48 deletions(-) diff --git a/Lib/test/test_descr.py b/Lib/test/test_descr.py index d19789537..a15ab20e6 100644 --- a/Lib/test/test_descr.py +++ b/Lib/test/test_descr.py @@ -3120,7 +3120,6 @@ class ClassPropertiesAndMethods(unittest.TestCase): ## pass ## os_helper.unlink(os_helper.TESTFN) - @unittest.expectedFailure # TODO: RUSTPYTHON def test_keywords(self): # Testing keyword args to basic type constructors ... with self.assertRaisesRegex(TypeError, 'keyword argument'): diff --git a/Lib/test/test_str.py b/Lib/test/test_str.py index 116ae93b4..6d0e935c1 100644 --- a/Lib/test/test_str.py +++ b/Lib/test/test_str.py @@ -1074,7 +1074,7 @@ class StrTest(string_tests.StringLikeTest, '\U00100000'.ljust(3, '\U00010000') '\U00100000'.rjust(3, '\U00010000') - @unittest.expectedFailure # TODO: RUSTPYTHON; ? + + @unittest.expectedFailure # TODO: RUSTPYTHON; '{0:08s}'.format('result') misalign — '0' fill treated as numeric zero-pad for str type def test_format(self): self.assertEqual(''.format(), '') self.assertEqual('a'.format(), 'a') @@ -1503,7 +1503,7 @@ class StrTest(string_tests.StringLikeTest, self.assertEqual('{:{f}}{g}{}'.format(1, 3, g='g', f=2), ' 1g3') self.assertEqual('{f:{}}{}{g}'.format(2, 4, f=1, g='g'), ' 14g') - @unittest.expectedFailure # TODO: RUSTPYTHON; TypeError: %x format: an integer is required, not PseudoInt + @unittest.expectedFailure # TODO: RUSTPYTHON; %c error wording uses bare class name; CPython uses fully qualified module.qualname (e.g. test.test_str.PseudoFloat) def test_formatting(self): string_tests.StringLikeTest.test_formatting(self) # Testing Unicode formatting strings... @@ -1752,7 +1752,6 @@ class StrTest(string_tests.StringLikeTest, 'character buffers are decoded to unicode' ) - @unittest.expectedFailure # TODO: RUSTPYTHON; Pass various keyword argument combinations to the constructor. def test_constructor_keyword_args(self): """Pass various keyword argument combinations to the constructor.""" # The object argument can be passed as a keyword. @@ -1762,7 +1761,6 @@ class StrTest(string_tests.StringLikeTest, self.assertEqual(str(b'foo', errors='strict'), 'foo') # not "b'foo'" self.assertEqual(str(object=b'foo', errors='strict'), 'foo') - @unittest.expectedFailure # TODO: RUSTPYTHON; Check the constructor argument defaults. def test_constructor_defaults(self): """Check the constructor argument defaults.""" # The object argument defaults to '' or b''. diff --git a/crates/common/src/format.rs b/crates/common/src/format.rs index 8c69e7001..1eee5ba67 100644 --- a/crates/common/src/format.rs +++ b/crates/common/src/format.rs @@ -876,14 +876,15 @@ impl FormatSpec { { self.validate_format(FormatType::String)?; match self.format_type { - Some(FormatType::String) | None => self - .format_sign_and_align(s, "", FormatAlign::Left) - .map(|mut value| { - if let Some(precision) = self.precision { - value.truncate(precision); - } - value - }), + Some(FormatType::String) | None => { + // CPython parity: precision truncates BEFORE width pads. + // `'{:3.2s}'.format('abc')` -> 'ab ' (truncate to 'ab', pad to 3). + let truncated: String = match self.precision { + Some(p) => s.deref().chars().take(p).collect(), + None => s.deref().to_owned(), + }; + self.format_sign_and_align(&truncated, "", FormatAlign::Left) + } _ => { let ch = char::from(self.format_type.as_ref().unwrap()); Err(FormatSpecError::UnknownFormatCode(ch, "str")) @@ -1081,6 +1082,12 @@ impl CharLen for AsciiStr<'_> { } } +impl CharLen for String { + fn char_len(&self) -> usize { + self.chars().count() + } +} + impl Deref for AsciiStr<'_> { type Target = str; diff --git a/crates/vm/src/builtins/str.rs b/crates/vm/src/builtins/str.rs index fc509f2df..402cde304 100644 --- a/crates/vm/src/builtins/str.rs +++ b/crates/vm/src/builtins/str.rs @@ -411,8 +411,12 @@ impl Constructor for PyStr { // result as-is so any str subclass type the user returned is preserved // (matches unicode_new_impl which only invokes unicode_subtype_new when // type != &PyUnicode_Type). + // CPython parity: `errors` without `encoding` also triggers decode + // mode (with default UTF-8). The fast-path repr only applies when + // BOTH `encoding` and `errors` are missing. if cls.is(vm.ctx.types.str_type) && args.encoding.is_missing() + && args.errors.is_missing() && let OptionalArg::Present(input) = &args.object { return Ok(input.str(vm)?.into()); @@ -425,13 +429,31 @@ impl Constructor for PyStr { fn py_new(_cls: &Py, args: Self::Args, vm: &VirtualMachine) -> PyResult { match args.object { OptionalArg::Present(input) => { - if let OptionalArg::Present(enc) = args.encoding { - let s = vm.state.codec_registry.decode_text( - input, - enc.as_str(), - args.errors.into_option(), - vm, - )?; + let encoding = args.encoding.into_option(); + let errors = args.errors.into_option(); + // CPython parity: presence of `encoding` OR `errors` triggers + // decode mode. When `errors` is given alone, the encoding + // defaults to UTF-8. + if encoding.is_some() || errors.is_some() { + // CPython rejects str / non-bytes-like input early with + // specific TypeError wording (unicode_new_impl). + if input.fast_isinstance(vm.ctx.types.str_type) { + return Err(vm.new_type_error("decoding str is not supported")); + } + if !input.fast_isinstance(vm.ctx.types.bytes_type) + && !input.fast_isinstance(vm.ctx.types.bytearray_type) + && crate::protocol::PyBuffer::try_from_borrowed_object(vm, &input).is_err() + { + return Err(vm.new_type_error(format!( + "decoding to str: need a bytes-like object, {} found", + input.class().name() + ))); + } + let enc_str = encoding.as_ref().map(|e| e.as_str()).unwrap_or("utf-8"); + let s = vm + .state + .codec_registry + .decode_text(input, enc_str, errors, vm)?; Ok(Self::from(s.as_wtf8().to_owned())) } else { let s = input.str(vm)?; diff --git a/crates/vm/src/cformat.rs b/crates/vm/src/cformat.rs index 87b943f89..6bf6062c8 100644 --- a/crates/vm/src/cformat.rs +++ b/crates/vm/src/cformat.rs @@ -64,6 +64,13 @@ fn spec_format_bytes( Ok(spec.format_number(&bigint).into_bytes()) } obj => { + // CPython parity: `%d` / `%i` / `%u` accept any object + // with `__index__` (preferred) or `__int__`. + if let Some(int_result) = obj.try_index_opt(vm) { + let i = int_result?; + check_int_to_str_digits(i.as_bigint(), vm)?; + return Ok(spec.format_number(i.as_bigint()).into_bytes()); + } if let Some(method) = vm.get_method(obj.clone(), identifier!(vm, __int__)) { let result = method?.call((), vm)?; if let Some(i) = result.downcast_ref::() { @@ -72,7 +79,7 @@ fn spec_format_bytes( } } Err(vm.new_type_error(format!( - "%{} format: a number is required, not {}", + "%{} format: a real number is required, not {}", spec.format_type.to_char(), obj.class().name() ))) @@ -80,8 +87,13 @@ fn spec_format_bytes( }) } _ => { + // CPython parity: `%x` / `%o` / `%X` accept any object with + // `__index__`, not just PyInt. Mirrors PyNumber_Index dispatch. if let Some(i) = obj.downcast_ref::() { Ok(spec.format_number(i.as_bigint()).into_bytes()) + } else if let Some(int_result) = obj.try_index_opt(vm) { + let i = int_result?; + Ok(spec.format_number(i.as_bigint()).into_bytes()) } else { Err(vm.new_type_error(format!( "%{} format: an integer is required, not {}", @@ -105,12 +117,8 @@ fn spec_format_bytes( Ok(spec.format_float(value.into()).into_bytes()) } CFormatType::Character(CCharacterType::Character) => { - if let Some(i) = obj.downcast_ref::() { - let ch = i - .try_to_primitive::(vm) - .map_err(|_| vm.new_overflow_error("%c arg not in range(256)"))?; - return Ok(spec.format_char(ch)); - } + // CPython parity: bytes `%c` accepts a single byte or any object + // with `__index__` in range(256). if let Some(b) = obj.downcast_ref::() { if b.len() == 1 { return Ok(spec.format_char(b.as_bytes()[0])); @@ -121,7 +129,20 @@ fn spec_format_bytes( return Ok(spec.format_char(buf[0])); } } - Err(vm.new_type_error("%c requires an integer in range(256) or a single byte")) + let int = if let Some(i) = obj.downcast_ref::() { + i.to_owned() + } else if let Some(int_result) = obj.try_index_opt(vm) { + int_result? + } else { + return Err(vm.new_type_error(format!( + "%c requires an integer in range(256) or a single byte, not {}", + obj.class().name() + ))); + }; + let ch = int + .try_to_primitive::(vm) + .map_err(|_| vm.new_overflow_error("%c arg not in range(256)"))?; + Ok(spec.format_char(ch)) } } } @@ -161,6 +182,13 @@ fn spec_format_string( Ok(spec.format_number(&bigint).into()) } obj => { + // CPython parity: `%d` / `%i` / `%u` accept any object + // with `__index__` (preferred) or `__int__`. + if let Some(int_result) = obj.try_index_opt(vm) { + let i = int_result?; + check_int_to_str_digits(i.as_bigint(), vm)?; + return Ok(spec.format_number(i.as_bigint()).into()); + } if let Some(method) = vm.get_method(obj.clone(), identifier!(vm, __int__)) { let result = method?.call((), vm)?; if let Some(i) = result.downcast_ref::() { @@ -169,7 +197,7 @@ fn spec_format_string( } } Err(vm.new_type_error(format!( - "%{} format: a number is required, not {}", + "%{} format: a real number is required, not {}", spec.format_type.to_char(), obj.class().name() ))) @@ -177,8 +205,13 @@ fn spec_format_string( }) } _ => { + // CPython parity: `%x` / `%o` / `%X` accept any object with + // `__index__`, not just PyInt. Mirrors PyNumber_Index dispatch. if let Some(i) = obj.downcast_ref::() { Ok(spec.format_number(i.as_bigint()).into()) + } else if let Some(int_result) = obj.try_index_opt(vm) { + let i = int_result?; + Ok(spec.format_number(i.as_bigint()).into()) } else { Err(vm.new_type_error(format!( "%{} format: an integer is required, not {}", @@ -193,20 +226,29 @@ fn spec_format_string( Ok(spec.format_float(value.into()).into()) } CFormatType::Character(CCharacterType::Character) => { - if let Some(i) = obj.downcast_ref::() { - let ch = i - .as_bigint() - .to_u32() - .and_then(CodePoint::from_u32) - .ok_or_else(|| vm.new_overflow_error("%c arg not in range(0x110000)"))?; - return Ok(spec.format_char(ch)); - } + // CPython parity: `%c` accepts a single-char str or any object with + // `__index__` (the latter via PyNumber_Index dispatch). if let Some(s) = obj.downcast_ref::() && let Ok(ch) = s.as_wtf8().code_points().exactly_one() { return Ok(spec.format_char(ch)); } - Err(vm.new_type_error("%c requires int or char")) + let int = if let Some(i) = obj.downcast_ref::() { + i.to_owned() + } else if let Some(int_result) = obj.try_index_opt(vm) { + int_result? + } else { + return Err(vm.new_type_error(format!( + "%c requires an int or a unicode character, not {}", + obj.class().name() + ))); + }; + let ch = int + .as_bigint() + .to_u32() + .and_then(CodePoint::from_u32) + .ok_or_else(|| vm.new_overflow_error("%c arg not in range(0x110000)"))?; + Ok(spec.format_char(ch)) } } } diff --git a/crates/vm/src/format.rs b/crates/vm/src/format.rs index e9db87fd1..77322d062 100644 --- a/crates/vm/src/format.rs +++ b/crates/vm/src/format.rs @@ -198,11 +198,12 @@ pub(crate) fn format( )); } auto_argument_index += 1; - arguments - .args - .get(auto_argument_index - 1) - .cloned() - .ok_or_else(|| vm.new_index_error("tuple index out of range")) + let idx = auto_argument_index - 1; + arguments.args.get(idx).cloned().ok_or_else(|| { + vm.new_index_error(format!( + "Replacement index {idx} out of range for positional args tuple" + )) + }) } FieldType::Index(index) => { if auto_argument_index != 0 { @@ -211,11 +212,11 @@ pub(crate) fn format( )); } seen_index = true; - arguments - .args - .get(index) - .cloned() - .ok_or_else(|| vm.new_index_error("tuple index out of range")) + arguments.args.get(index).cloned().ok_or_else(|| { + vm.new_index_error(format!( + "Replacement index {index} out of range for positional args tuple" + )) + }) } FieldType::Keyword(keyword) => keyword .as_str()