mirror of
https://github.com/RustPython/RustPython.git
synced 2026-06-02 19:39:49 +09:00
Implement latin_1 in Rust
This implementation is patterned off of the ascii codec.
This commit is contained in:
@@ -172,6 +172,82 @@ pub mod utf8 {
|
||||
}
|
||||
}
|
||||
|
||||
pub mod latin_1 {
|
||||
use super::*;
|
||||
|
||||
pub const ENCODING_NAME: &str = "latin-1";
|
||||
|
||||
const ERR_REASON: &str = "ordinal not in range(256)";
|
||||
|
||||
#[inline]
|
||||
pub fn encode<E: ErrorHandler>(s: &str, errors: &E) -> Result<Vec<u8>, E::Error> {
|
||||
let full_data = s;
|
||||
let mut data = s;
|
||||
let mut char_data_index = 0;
|
||||
let mut out = Vec::<u8>::new();
|
||||
loop {
|
||||
match data
|
||||
.char_indices()
|
||||
.enumerate()
|
||||
.find(|(_, (_, c))| (*c as u32) > 255)
|
||||
{
|
||||
None => {
|
||||
out.extend_from_slice(data.as_bytes());
|
||||
break;
|
||||
}
|
||||
Some((char_i, (byte_i, _))) => {
|
||||
out.extend_from_slice(&data.as_bytes()[..byte_i]);
|
||||
let char_start = char_data_index + char_i;
|
||||
// number of non-latin_1 chars between the first non-latin_1 char and the next latin_1 char
|
||||
let non_latin_1_run_length = data[byte_i..]
|
||||
.chars()
|
||||
.take_while(|c| (*c as u32) > 255)
|
||||
.count();
|
||||
let char_range = char_start..char_start + non_latin_1_run_length;
|
||||
let (replace, char_restart) =
|
||||
errors.handle_encode_error(full_data, char_range.clone(), ERR_REASON)?;
|
||||
match replace {
|
||||
EncodeReplace::Str(s) => {
|
||||
if s.as_ref().chars().any(|c| (c as u32) > 255) {
|
||||
return Err(
|
||||
errors.error_encoding(full_data, char_range, ERR_REASON)
|
||||
);
|
||||
}
|
||||
out.extend_from_slice(s.as_ref().as_bytes());
|
||||
}
|
||||
EncodeReplace::Bytes(b) => {
|
||||
out.extend_from_slice(b.as_ref());
|
||||
}
|
||||
}
|
||||
data = crate::str::try_get_chars(full_data, char_restart..)
|
||||
.ok_or_else(|| errors.error_oob_restart(char_restart))?;
|
||||
char_data_index = char_restart;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(out)
|
||||
}
|
||||
|
||||
pub fn decode<E: ErrorHandler>(data: &[u8], errors: &E) -> Result<(String, usize), E::Error> {
|
||||
decode_utf8_compatible(
|
||||
data,
|
||||
errors,
|
||||
|v| {
|
||||
std::str::from_utf8(v).map_err(|e| {
|
||||
// SAFETY: as specified in valid_up_to's documentation, input[..e.valid_up_to()]
|
||||
// is valid ascii & therefore valid utf8
|
||||
unsafe { make_decode_err(v, e.valid_up_to(), e.error_len()) }
|
||||
})
|
||||
},
|
||||
|_rest, err_len| HandleResult::Error {
|
||||
err_len,
|
||||
reason: ERR_REASON,
|
||||
},
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
pub mod ascii {
|
||||
use super::*;
|
||||
use ::ascii::AsciiStr;
|
||||
|
||||
@@ -315,6 +315,19 @@ mod _codecs {
|
||||
do_codec!(utf8::decode, args, vm)
|
||||
}
|
||||
|
||||
#[pyfunction]
|
||||
fn latin_1_encode(args: EncodeArgs, vm: &VirtualMachine) -> EncodeResult {
|
||||
if args.s.as_ref().chars().all(|c| (c as u32) < 256) {
|
||||
return Ok((args.s.as_str().as_bytes().to_vec(), args.s.byte_len()));
|
||||
}
|
||||
do_codec!(latin_1::encode, args, vm)
|
||||
}
|
||||
|
||||
#[pyfunction]
|
||||
fn latin_1_decode(args: DecodeArgsNoFinal, vm: &VirtualMachine) -> DecodeResult {
|
||||
do_codec!(latin_1::decode, args, vm)
|
||||
}
|
||||
|
||||
#[pyfunction]
|
||||
fn ascii_encode(args: EncodeArgs, vm: &VirtualMachine) -> EncodeResult {
|
||||
if args.s.is_ascii() {
|
||||
@@ -353,14 +366,6 @@ mod _codecs {
|
||||
}};
|
||||
}
|
||||
|
||||
#[pyfunction]
|
||||
fn latin_1_encode(args: FuncArgs, vm: &VirtualMachine) -> PyResult {
|
||||
delegate_pycodecs!(latin_1_encode, args, vm)
|
||||
}
|
||||
#[pyfunction]
|
||||
fn latin_1_decode(args: FuncArgs, vm: &VirtualMachine) -> PyResult {
|
||||
delegate_pycodecs!(latin_1_decode, args, vm)
|
||||
}
|
||||
#[pyfunction]
|
||||
fn mbcs_encode(args: FuncArgs, vm: &VirtualMachine) -> PyResult {
|
||||
delegate_pycodecs!(mbcs_encode, args, vm)
|
||||
|
||||
Reference in New Issue
Block a user