Merge pull request #7318 from youknowone/utf8source

Update test_utf8source from v3.14.3 and implement it
This commit is contained in:
Jeong, YunWon
2026-03-03 22:29:14 +09:00
committed by GitHub
4 changed files with 142 additions and 14 deletions

View File

@@ -62,14 +62,12 @@ class EncodingTest:
self.run_test(self.source_line.encode('utf-8'))
# [encoding first line]
@unittest.expectedFailure # TODO: RUSTPYTHON; UnicodeDecodeError: invalid utf-8 sequence of 1 bytes from index 17
def test_encoding_on_first_line(self):
encoding = 'Latin-1'
source = self.create_source(encoding)
self.run_test(source)
# [encoding second line]
@unittest.expectedFailure # TODO: RUSTPYTHON; UnicodeDecodeError: invalid utf-8 sequence of 1 bytes from index 34
def test_encoding_on_second_line(self):
source = b"#/usr/bin/python\n" + self.create_source('Latin-1')
self.run_test(source)
@@ -84,7 +82,6 @@ class EncodingTest:
self.run_test(source)
# [BOM conflict]
@unittest.expectedFailure # TODO: RUSTPYTHON; UnicodeDecodeError: invalid utf-8 sequence of 1 bytes from index 20
def test_bom_conflict(self):
source = codecs.BOM_UTF8 + self.create_source('latin-1')
with self.assertRaises(SyntaxError):

View File

@@ -752,7 +752,6 @@ class RunPathTestCase(unittest.TestCase, CodeExecutionMixin):
with infinite_recursion(25):
self.assertRaises(RecursionError, run_path, zip_name)
@unittest.expectedFailure # TODO: RUSTPYTHON; detect encoding comments in files
def test_encoding(self):
with temp_dir() as script_dir:
filename = os.path.join(script_dir, 'script.py')

View File

@@ -1,5 +1,3 @@
# This file is marked as binary in the CVS, to prevent MacCVS from recoding it.
import unittest
class PEP3120Test(unittest.TestCase):
@@ -14,11 +12,9 @@ class PEP3120Test(unittest.TestCase):
b'\\\xd0\x9f'
)
# TODO: RUSTPYTHON
@unittest.expectedFailure
def test_badsyntax(self):
try:
import test.badsyntax_pep3120
import test.tokenizedata.badsyntax_pep3120 # noqa: F401
except SyntaxError as msg:
msg = str(msg).lower()
self.assertTrue('utf-8' in msg)
@@ -28,8 +24,6 @@ class PEP3120Test(unittest.TestCase):
class BuiltinCompileTests(unittest.TestCase):
# TODO: RUSTPYTHON
@unittest.expectedFailure
# Issue 3574.
def test_latin1(self):
# Allow compile() to read Latin-1 source.

View File

@@ -111,6 +111,145 @@ mod builtins {
_feature_version: OptionalArg<i32>,
}
/// Detect PEP 263 encoding cookie from source bytes.
/// Checks first two lines for `# coding[:=] <encoding>` pattern.
/// Returns the encoding name if found, or None for default (UTF-8).
#[cfg(feature = "parser")]
fn detect_source_encoding(source: &[u8]) -> Option<String> {
    // Bytes that may precede '#' on a cookie line, and that make a first line
    // count as "blank". CR is tolerated so CRLF input behaves like LF input.
    fn is_leading_blank(b: u8) -> bool {
        matches!(b, b' ' | b'\t' | b'\x0c' | b'\r')
    }

    // Bytes allowed in an encoding name: the ASCII class `[-\w.]`.
    fn is_name_byte(b: u8) -> bool {
        b.is_ascii_alphanumeric() || matches!(b, b'-' | b'_' | b'.')
    }

    // Try to parse `coding[:=][ \t]*<name>` anchored at the very start of `text`.
    fn cookie_at(text: &[u8]) -> Option<String> {
        let after_keyword = text.strip_prefix(b"coding")?;
        let (&sep, tail) = after_keyword.split_first()?;
        if !matches!(sep, b':' | b'=') {
            return None;
        }
        // Skip blanks between the separator and the name; an all-blank tail
        // means there is no name here.
        let start = tail.iter().position(|&b| !matches!(b, b' ' | b'\t'))?;
        let name: String = tail[start..]
            .iter()
            .copied()
            .take_while(|&b| is_name_byte(b))
            .map(char::from)
            .collect();
        (!name.is_empty()).then_some(name)
    }

    // Extract the encoding declared by a single line, if that line is a
    // comment carrying a PEP 263 cookie.
    fn find_encoding_in_line(line: &[u8]) -> Option<String> {
        // PEP 263: '#' must be preceded only by whitespace/formfeed.
        let hash_pos = line.iter().position(|&b| b == b'#')?;
        if !line[..hash_pos].iter().copied().all(is_leading_blank) {
            return None;
        }
        let comment = &line[hash_pos..];
        // Examine *every* occurrence of "coding", not just the first one: a
        // comment such as `# coding style notes; coding: latin-1` contains an
        // earlier "coding" that is not a cookie, and the PEP 263 pattern
        // (`.*?coding[:=]...`) still matches the later one.
        (0..comment.len()).find_map(|pos| cookie_at(&comment[pos..]))
    }

    // Only the first two lines may carry the cookie.
    let mut lines = source.splitn(3, |&b| b == b'\n');
    // `splitn` always yields at least one (possibly empty) line.
    let first = lines.next()?;
    // A UTF-8 BOM may precede the cookie comment on the first line.
    let first = first.strip_prefix(b"\xef\xbb\xbf").unwrap_or(first);
    if let Some(enc) = find_encoding_in_line(first) {
        return Some(enc);
    }
    // The second line is consulted only when the first line is blank or a
    // comment; any other first line ends the search.
    match first.iter().copied().find(|&b| !is_leading_blank(b)) {
        None | Some(b'#') => lines.next().and_then(find_encoding_in_line),
        Some(_) => None,
    }
}
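// NOTE: the following description belongs to `decode_source_bytes` (defined
// further below), not to `is_utf8_encoding`:
// Decode source bytes to a string, handling PEP 263 encoding declarations
// and BOM. Raises SyntaxError for invalid UTF-8 without an encoding
// declaration (matching CPython behavior).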
/// Check if an encoding name is a UTF-8 variant after normalization.
/// Matches: utf-8, utf_8, utf8, UTF-8, etc.
#[cfg(feature = "parser")]
fn is_utf8_encoding(name: &str) -> bool {
    // Drop '-' and '_' separators, fold ASCII case, and compare what is left
    // against "utf8". Working on bytes is safe here: the separators are ASCII
    // and never occur inside a multi-byte UTF-8 sequence.
    name.bytes()
        .filter(|b| !matches!(b, b'-' | b'_'))
        .map(|b| b.to_ascii_lowercase())
        .eq("utf8".bytes())
}
/// Decode Python source bytes into a `String`.
///
/// The encoding is taken from `detect_source_encoding`; a missing cookie or
/// one accepted by `is_utf8_encoding` selects UTF-8, anything else is decoded
/// through the VM's codec registry.
///
/// # Errors
///
/// Returns a `SyntaxError` when:
/// - the source starts with a UTF-8 BOM but declares a non-UTF-8 encoding,
/// - the source is treated as UTF-8 but contains an invalid byte sequence,
/// - the codec registry reports a `LookupError` for the declared encoding.
///
/// Any other error raised by the codec is returned unchanged.
#[cfg(feature = "parser")]
fn decode_source_bytes(source: &[u8], filename: &str, vm: &VirtualMachine) -> PyResult<String> {
    const UTF8_BOM: &[u8] = b"\xef\xbb\xbf";

    // All failures produced here are reported as SyntaxError.
    let syntax_error = |msg: String| {
        vm.new_exception_msg(vm.ctx.exceptions.syntax_error.to_owned(), msg.into())
    };

    let declared = detect_source_encoding(source);
    // `Some(name)` only when a cookie names something other than UTF-8.
    let foreign = declared.as_deref().filter(|name| !is_utf8_encoding(name));

    if let Some(enc) = foreign {
        // A BOM marks the file as UTF-8, so it cannot be combined with a
        // cookie naming a different encoding.
        if source.starts_with(UTF8_BOM) {
            return Err(syntax_error(format!(
                "encoding problem for '{filename}': utf-8"
            )));
        }
        // Use codec registry for non-UTF-8 encodings.
        let raw = vm.ctx.new_bytes(source.to_vec());
        let text = vm
            .state
            .codec_registry
            .decode_text(raw.into(), enc, None, vm)
            .map_err(|exc| {
                // An unknown codec name surfaces as LookupError; report it as
                // a SyntaxError instead and pass every other error through.
                if exc.fast_isinstance(vm.ctx.exceptions.lookup_error) {
                    syntax_error(format!("unknown encoding for '{filename}': {enc}"))
                } else {
                    exc
                }
            })?;
        return Ok(text.to_string_lossy().into_owned());
    }

    // UTF-8 path: drop the BOM (if any) and validate the remainder.
    let text_bytes = source.strip_prefix(UTF8_BOM).unwrap_or(source);
    let utf8_err = match core::str::from_utf8(text_bytes) {
        Ok(text) => return Ok(text.to_owned()),
        Err(err) => err,
    };

    // Report the first offending byte and the 1-based line it sits on.
    let offset = utf8_err.valid_up_to();
    let bad_byte = text_bytes[offset];
    let line = 1 + text_bytes[..offset]
        .iter()
        .filter(|&&b| b == b'\n')
        .count();
    Err(syntax_error(format!(
        "Non-UTF-8 code starting with '\\x{bad_byte:02x}' \
         on line {line}, but no encoding declared; \
         see https://peps.python.org/pep-0263/ for details \
         ({filename}, line {line})"
    )))
}
#[cfg(any(feature = "parser", feature = "compiler"))]
#[pyfunction]
fn compile(args: CompileArgs, vm: &VirtualMachine) -> PyResult {
@@ -203,9 +342,8 @@ mod builtins {
let source = ArgStrOrBytesLike::try_from_object(vm, args.source)?;
let source = source.borrow_bytes();
// TODO: compiler::compile should probably get bytes
let source = core::str::from_utf8(&source)
.map_err(|e| vm.new_unicode_decode_error(e.to_string()))?;
let source = decode_source_bytes(&source, &args.filename.to_string_lossy(), vm)?;
let source = source.as_str();
let flags = args.flags.map_or(Ok(0), |v| v.try_to_primitive(vm))?;