Tighten CPython parity for str format spec, %-format, and str() constructor (#7769)

Five related CPython parity gaps in `str` formatting and construction:

1. **`str(bytes, errors=...)` triggers decode mode.** Previously, only
   `encoding=` triggered decode; passing only `errors=` fell back to
   `repr()`. CPython's behavior: presence of `encoding` OR `errors`
   triggers decode mode (default UTF-8 when only `errors` is given).

2. **`'{...}'.format() IndexError wording.** Generic Rust "tuple index
   out of range" replaced with CPython's "Replacement index N out of
   range for positional args tuple".

3. **`{0:3.2s}.format('abc')` → 'ab '.** String format spec applied
   precision after width padding; CPython truncates BEFORE padding.
   Reorder the operations.

4. **`%x` / `%o` / `%X` / `%c` accept `__index__` objects.** Previously
   only `PyInt` downcast was attempted. Mirror CPython's
   PyNumber_Index dispatch via `try_index_opt`.

5. **`%d` / `%u` / `%i` error wording.** "a number is required" →
   "a real number is required" (matches CPython).

Also adds `not <type>` suffix to `%c` error messages so the type is
visible in TypeError text (matches CPython structure even without
fully-qualified names).

Verified byte-identical with CPython 3.14.4 across 25+ probes covering
the format/spec/constructor combinations. Unmasks
`test_str.test_constructor_keyword_args` and
`test_str.test_constructor_defaults`. test_str/test_bytes/test_format/
test_codecs/test_io/test_unicode_identifiers — 1,429 tests pass, 0
regressions. All 188 `extra_tests/snippets/*.py` pass under the CI
feature set.

`test_str.test_format` and `test_str.test_formatting` markers retained:
`test_format` still trips on `'{0:08s}'.format('result')` (numeric
zero-pad treated as fill+left-align by CPython for str type — separate
format-spec parser concern). `test_formatting` still trips on
`%c` error message expecting fully qualified `module.qualname` (RP
returns bare class name — separate broader concern).
This commit is contained in:
Changjoon
2026-05-04 10:28:43 +09:00
committed by GitHub
parent 90cc888f4b
commit 83002d7369
6 changed files with 117 additions and 48 deletions

View File

@@ -3120,7 +3120,6 @@ class ClassPropertiesAndMethods(unittest.TestCase):
## pass
## os_helper.unlink(os_helper.TESTFN)
@unittest.expectedFailure # TODO: RUSTPYTHON
def test_keywords(self):
# Testing keyword args to basic type constructors ...
with self.assertRaisesRegex(TypeError, 'keyword argument'):

View File

@@ -1074,7 +1074,7 @@ class StrTest(string_tests.StringLikeTest,
'\U00100000'.ljust(3, '\U00010000')
'\U00100000'.rjust(3, '\U00010000')
@unittest.expectedFailure # TODO: RUSTPYTHON; ? +
@unittest.expectedFailure # TODO: RUSTPYTHON; '{0:08s}'.format('result') misalign — '0' fill treated as numeric zero-pad for str type
def test_format(self):
self.assertEqual(''.format(), '')
self.assertEqual('a'.format(), 'a')
@@ -1503,7 +1503,7 @@ class StrTest(string_tests.StringLikeTest,
self.assertEqual('{:{f}}{g}{}'.format(1, 3, g='g', f=2), ' 1g3')
self.assertEqual('{f:{}}{}{g}'.format(2, 4, f=1, g='g'), ' 14g')
@unittest.expectedFailure # TODO: RUSTPYTHON; TypeError: %x format: an integer is required, not PseudoInt
@unittest.expectedFailure # TODO: RUSTPYTHON; %c error wording uses bare class name; CPython uses fully qualified module.qualname (e.g. test.test_str.PseudoFloat)
def test_formatting(self):
string_tests.StringLikeTest.test_formatting(self)
# Testing Unicode formatting strings...
@@ -1752,7 +1752,6 @@ class StrTest(string_tests.StringLikeTest,
'character buffers are decoded to unicode'
)
@unittest.expectedFailure # TODO: RUSTPYTHON; Pass various keyword argument combinations to the constructor.
def test_constructor_keyword_args(self):
"""Pass various keyword argument combinations to the constructor."""
# The object argument can be passed as a keyword.
@@ -1762,7 +1761,6 @@ class StrTest(string_tests.StringLikeTest,
self.assertEqual(str(b'foo', errors='strict'), 'foo') # not "b'foo'"
self.assertEqual(str(object=b'foo', errors='strict'), 'foo')
@unittest.expectedFailure # TODO: RUSTPYTHON; Check the constructor argument defaults.
def test_constructor_defaults(self):
"""Check the constructor argument defaults."""
# The object argument defaults to '' or b''.

View File

@@ -876,14 +876,15 @@ impl FormatSpec {
{
self.validate_format(FormatType::String)?;
match self.format_type {
Some(FormatType::String) | None => self
.format_sign_and_align(s, "", FormatAlign::Left)
.map(|mut value| {
if let Some(precision) = self.precision {
value.truncate(precision);
}
value
}),
Some(FormatType::String) | None => {
// CPython parity: precision truncates BEFORE width pads.
// `'{:3.2s}'.format('abc')` -> 'ab ' (truncate to 'ab', pad to 3).
let truncated: String = match self.precision {
Some(p) => s.deref().chars().take(p).collect(),
None => s.deref().to_owned(),
};
self.format_sign_and_align(&truncated, "", FormatAlign::Left)
}
_ => {
let ch = char::from(self.format_type.as_ref().unwrap());
Err(FormatSpecError::UnknownFormatCode(ch, "str"))
@@ -1081,6 +1082,12 @@ impl CharLen for AsciiStr<'_> {
}
}
impl CharLen for String {
fn char_len(&self) -> usize {
self.chars().count()
}
}
impl Deref for AsciiStr<'_> {
type Target = str;

View File

@@ -411,8 +411,12 @@ impl Constructor for PyStr {
// result as-is so any str subclass type the user returned is preserved
// (matches unicode_new_impl which only invokes unicode_subtype_new when
// type != &PyUnicode_Type).
// CPython parity: `errors` without `encoding` also triggers decode
// mode (with default UTF-8). The fast-path repr only applies when
// BOTH `encoding` and `errors` are missing.
if cls.is(vm.ctx.types.str_type)
&& args.encoding.is_missing()
&& args.errors.is_missing()
&& let OptionalArg::Present(input) = &args.object
{
return Ok(input.str(vm)?.into());
@@ -425,13 +429,31 @@ impl Constructor for PyStr {
fn py_new(_cls: &Py<PyType>, args: Self::Args, vm: &VirtualMachine) -> PyResult<Self> {
match args.object {
OptionalArg::Present(input) => {
if let OptionalArg::Present(enc) = args.encoding {
let s = vm.state.codec_registry.decode_text(
input,
enc.as_str(),
args.errors.into_option(),
vm,
)?;
let encoding = args.encoding.into_option();
let errors = args.errors.into_option();
// CPython parity: presence of `encoding` OR `errors` triggers
// decode mode. When `errors` is given alone, the encoding
// defaults to UTF-8.
if encoding.is_some() || errors.is_some() {
// CPython rejects str / non-bytes-like input early with
// specific TypeError wording (unicode_new_impl).
if input.fast_isinstance(vm.ctx.types.str_type) {
return Err(vm.new_type_error("decoding str is not supported"));
}
if !input.fast_isinstance(vm.ctx.types.bytes_type)
&& !input.fast_isinstance(vm.ctx.types.bytearray_type)
&& crate::protocol::PyBuffer::try_from_borrowed_object(vm, &input).is_err()
{
return Err(vm.new_type_error(format!(
"decoding to str: need a bytes-like object, {} found",
input.class().name()
)));
}
let enc_str = encoding.as_ref().map(|e| e.as_str()).unwrap_or("utf-8");
let s = vm
.state
.codec_registry
.decode_text(input, enc_str, errors, vm)?;
Ok(Self::from(s.as_wtf8().to_owned()))
} else {
let s = input.str(vm)?;

View File

@@ -64,6 +64,13 @@ fn spec_format_bytes(
Ok(spec.format_number(&bigint).into_bytes())
}
obj => {
// CPython parity: `%d` / `%i` / `%u` accept any object
// with `__index__` (preferred) or `__int__`.
if let Some(int_result) = obj.try_index_opt(vm) {
let i = int_result?;
check_int_to_str_digits(i.as_bigint(), vm)?;
return Ok(spec.format_number(i.as_bigint()).into_bytes());
}
if let Some(method) = vm.get_method(obj.clone(), identifier!(vm, __int__)) {
let result = method?.call((), vm)?;
if let Some(i) = result.downcast_ref::<PyInt>() {
@@ -72,7 +79,7 @@ fn spec_format_bytes(
}
}
Err(vm.new_type_error(format!(
"%{} format: a number is required, not {}",
"%{} format: a real number is required, not {}",
spec.format_type.to_char(),
obj.class().name()
)))
@@ -80,8 +87,13 @@ fn spec_format_bytes(
})
}
_ => {
// CPython parity: `%x` / `%o` / `%X` accept any object with
// `__index__`, not just PyInt. Mirrors PyNumber_Index dispatch.
if let Some(i) = obj.downcast_ref::<PyInt>() {
Ok(spec.format_number(i.as_bigint()).into_bytes())
} else if let Some(int_result) = obj.try_index_opt(vm) {
let i = int_result?;
Ok(spec.format_number(i.as_bigint()).into_bytes())
} else {
Err(vm.new_type_error(format!(
"%{} format: an integer is required, not {}",
@@ -105,12 +117,8 @@ fn spec_format_bytes(
Ok(spec.format_float(value.into()).into_bytes())
}
CFormatType::Character(CCharacterType::Character) => {
if let Some(i) = obj.downcast_ref::<PyInt>() {
let ch = i
.try_to_primitive::<u8>(vm)
.map_err(|_| vm.new_overflow_error("%c arg not in range(256)"))?;
return Ok(spec.format_char(ch));
}
// CPython parity: bytes `%c` accepts a single byte or any object
// with `__index__` in range(256).
if let Some(b) = obj.downcast_ref::<PyBytes>() {
if b.len() == 1 {
return Ok(spec.format_char(b.as_bytes()[0]));
@@ -121,7 +129,20 @@ fn spec_format_bytes(
return Ok(spec.format_char(buf[0]));
}
}
Err(vm.new_type_error("%c requires an integer in range(256) or a single byte"))
let int = if let Some(i) = obj.downcast_ref::<PyInt>() {
i.to_owned()
} else if let Some(int_result) = obj.try_index_opt(vm) {
int_result?
} else {
return Err(vm.new_type_error(format!(
"%c requires an integer in range(256) or a single byte, not {}",
obj.class().name()
)));
};
let ch = int
.try_to_primitive::<u8>(vm)
.map_err(|_| vm.new_overflow_error("%c arg not in range(256)"))?;
Ok(spec.format_char(ch))
}
}
}
@@ -161,6 +182,13 @@ fn spec_format_string(
Ok(spec.format_number(&bigint).into())
}
obj => {
// CPython parity: `%d` / `%i` / `%u` accept any object
// with `__index__` (preferred) or `__int__`.
if let Some(int_result) = obj.try_index_opt(vm) {
let i = int_result?;
check_int_to_str_digits(i.as_bigint(), vm)?;
return Ok(spec.format_number(i.as_bigint()).into());
}
if let Some(method) = vm.get_method(obj.clone(), identifier!(vm, __int__)) {
let result = method?.call((), vm)?;
if let Some(i) = result.downcast_ref::<PyInt>() {
@@ -169,7 +197,7 @@ fn spec_format_string(
}
}
Err(vm.new_type_error(format!(
"%{} format: a number is required, not {}",
"%{} format: a real number is required, not {}",
spec.format_type.to_char(),
obj.class().name()
)))
@@ -177,8 +205,13 @@ fn spec_format_string(
})
}
_ => {
// CPython parity: `%x` / `%o` / `%X` accept any object with
// `__index__`, not just PyInt. Mirrors PyNumber_Index dispatch.
if let Some(i) = obj.downcast_ref::<PyInt>() {
Ok(spec.format_number(i.as_bigint()).into())
} else if let Some(int_result) = obj.try_index_opt(vm) {
let i = int_result?;
Ok(spec.format_number(i.as_bigint()).into())
} else {
Err(vm.new_type_error(format!(
"%{} format: an integer is required, not {}",
@@ -193,20 +226,29 @@ fn spec_format_string(
Ok(spec.format_float(value.into()).into())
}
CFormatType::Character(CCharacterType::Character) => {
if let Some(i) = obj.downcast_ref::<PyInt>() {
let ch = i
.as_bigint()
.to_u32()
.and_then(CodePoint::from_u32)
.ok_or_else(|| vm.new_overflow_error("%c arg not in range(0x110000)"))?;
return Ok(spec.format_char(ch));
}
// CPython parity: `%c` accepts a single-char str or any object with
// `__index__` (the latter via PyNumber_Index dispatch).
if let Some(s) = obj.downcast_ref::<PyStr>()
&& let Ok(ch) = s.as_wtf8().code_points().exactly_one()
{
return Ok(spec.format_char(ch));
}
Err(vm.new_type_error("%c requires int or char"))
let int = if let Some(i) = obj.downcast_ref::<PyInt>() {
i.to_owned()
} else if let Some(int_result) = obj.try_index_opt(vm) {
int_result?
} else {
return Err(vm.new_type_error(format!(
"%c requires an int or a unicode character, not {}",
obj.class().name()
)));
};
let ch = int
.as_bigint()
.to_u32()
.and_then(CodePoint::from_u32)
.ok_or_else(|| vm.new_overflow_error("%c arg not in range(0x110000)"))?;
Ok(spec.format_char(ch))
}
}
}

View File

@@ -198,11 +198,12 @@ pub(crate) fn format(
));
}
auto_argument_index += 1;
arguments
.args
.get(auto_argument_index - 1)
.cloned()
.ok_or_else(|| vm.new_index_error("tuple index out of range"))
let idx = auto_argument_index - 1;
arguments.args.get(idx).cloned().ok_or_else(|| {
vm.new_index_error(format!(
"Replacement index {idx} out of range for positional args tuple"
))
})
}
FieldType::Index(index) => {
if auto_argument_index != 0 {
@@ -211,11 +212,11 @@ pub(crate) fn format(
));
}
seen_index = true;
arguments
.args
.get(index)
.cloned()
.ok_or_else(|| vm.new_index_error("tuple index out of range"))
arguments.args.get(index).cloned().ok_or_else(|| {
vm.new_index_error(format!(
"Replacement index {index} out of range for positional args tuple"
))
})
}
FieldType::Keyword(keyword) => keyword
.as_str()