Implement latin_1 in Rust

This implementation is patterned off of the ascii codec.
This commit is contained in:
Padraic Fanning
2021-09-23 21:34:33 -04:00
parent 0bb0946a5e
commit 0f889ce92b
2 changed files with 89 additions and 8 deletions

View File

@@ -172,6 +172,82 @@ pub mod utf8 {
}
}
pub mod latin_1 {
use super::*;
pub const ENCODING_NAME: &str = "latin-1";
const ERR_REASON: &str = "ordinal not in range(256)";
#[inline]
pub fn encode<E: ErrorHandler>(s: &str, errors: &E) -> Result<Vec<u8>, E::Error> {
let full_data = s;
let mut data = s;
let mut char_data_index = 0;
let mut out = Vec::<u8>::new();
loop {
match data
.char_indices()
.enumerate()
.find(|(_, (_, c))| (*c as u32) > 255)
{
None => {
out.extend_from_slice(data.as_bytes());
break;
}
Some((char_i, (byte_i, _))) => {
out.extend_from_slice(&data.as_bytes()[..byte_i]);
let char_start = char_data_index + char_i;
// number of non-latin_1 chars between the first non-latin_1 char and the next latin_1 char
let non_latin_1_run_length = data[byte_i..]
.chars()
.take_while(|c| (*c as u32) > 255)
.count();
let char_range = char_start..char_start + non_latin_1_run_length;
let (replace, char_restart) =
errors.handle_encode_error(full_data, char_range.clone(), ERR_REASON)?;
match replace {
EncodeReplace::Str(s) => {
if s.as_ref().chars().any(|c| (c as u32) > 255) {
return Err(
errors.error_encoding(full_data, char_range, ERR_REASON)
);
}
out.extend_from_slice(s.as_ref().as_bytes());
}
EncodeReplace::Bytes(b) => {
out.extend_from_slice(b.as_ref());
}
}
data = crate::str::try_get_chars(full_data, char_restart..)
.ok_or_else(|| errors.error_oob_restart(char_restart))?;
char_data_index = char_restart;
continue;
}
}
}
Ok(out)
}
pub fn decode<E: ErrorHandler>(data: &[u8], errors: &E) -> Result<(String, usize), E::Error> {
decode_utf8_compatible(
data,
errors,
|v| {
std::str::from_utf8(v).map_err(|e| {
// SAFETY: as specified in valid_up_to's documentation, input[..e.valid_up_to()]
// is valid ascii & therefore valid utf8
unsafe { make_decode_err(v, e.valid_up_to(), e.error_len()) }
})
},
|_rest, err_len| HandleResult::Error {
err_len,
reason: ERR_REASON,
},
)
}
}
pub mod ascii {
use super::*;
use ::ascii::AsciiStr;

View File

@@ -315,6 +315,19 @@ mod _codecs {
do_codec!(utf8::decode, args, vm)
}
// Encodes `args.s` as Latin-1.
//
// Fast path: when every character is at most U+00FF the string is converted
// directly, one byte per character. The UTF-8 bytes of the string must NOT be
// copied here: U+0080..=U+00FF take two bytes in UTF-8 but one in Latin-1.
// Anything else is routed through `latin_1::encode` via `do_codec!`.
//
// (Plain `//` comments on purpose, so that no `#[doc]` attribute is attached
// to the `#[pyfunction]` item.)
#[pyfunction]
fn latin_1_encode(args: EncodeArgs, vm: &VirtualMachine) -> EncodeResult {
    // `collect` into `Option` short-circuits to `None` on the first character
    // that does not fit in one byte.
    let fast_path: Option<Vec<u8>> = args
        .s
        .as_str()
        .chars()
        .map(|c| ((c as u32) < 256).then(|| c as u8))
        .collect();
    if let Some(encoded) = fast_path {
        // One output byte per input character, so this equals the number of
        // characters consumed.
        let consumed = encoded.len();
        return Ok((encoded, consumed));
    }
    do_codec!(latin_1::encode, args, vm)
}
// Latin-1 decoding entry point: passes `latin_1::decode` together with the
// call arguments and the VM to the `do_codec!` macro.
// NOTE(review): `do_codec!` is defined outside this view; presumably it pulls
// the input and the error handler out of `args` — confirm against the macro.
#[pyfunction]
fn latin_1_decode(args: DecodeArgsNoFinal, vm: &VirtualMachine) -> DecodeResult {
do_codec!(latin_1::decode, args, vm)
}
#[pyfunction]
fn ascii_encode(args: EncodeArgs, vm: &VirtualMachine) -> EncodeResult {
if args.s.is_ascii() {
@@ -353,14 +366,6 @@ mod _codecs {
}};
}
// Forwards the call unchanged to `delegate_pycodecs!` under the name
// `latin_1_encode`.
// NOTE(review): a `latin_1_encode` taking `EncodeArgs` also appears earlier in
// this view; this `FuncArgs` variant looks like the pre-change side of the
// diff — confirm which definition is live.
#[pyfunction]
fn latin_1_encode(args: FuncArgs, vm: &VirtualMachine) -> PyResult {
delegate_pycodecs!(latin_1_encode, args, vm)
}
// Forwards the call unchanged to `delegate_pycodecs!` under the name
// `latin_1_decode`.
// NOTE(review): a `latin_1_decode` taking `DecodeArgsNoFinal` also appears
// earlier in this view; this `FuncArgs` variant looks like the pre-change side
// of the diff — confirm which definition is live.
#[pyfunction]
fn latin_1_decode(args: FuncArgs, vm: &VirtualMachine) -> PyResult {
delegate_pycodecs!(latin_1_decode, args, vm)
}
#[pyfunction]
fn mbcs_encode(args: FuncArgs, vm: &VirtualMachine) -> PyResult {
delegate_pycodecs!(mbcs_encode, args, vm)