From 0f889ce92b51cd93f056652fa9d625f16a668ecb Mon Sep 17 00:00:00 2001
From: Padraic Fanning
Date: Thu, 23 Sep 2021 21:34:33 -0400
Subject: [PATCH] Implement latin_1 in Rust

This implementation is patterned off of the ascii codec.
---
 common/src/encodings.rs | 90 +++++++++++++++++++++++++++++++++++++++++
 vm/src/stdlib/codecs.rs | 24 +++++++----
 2 files changed, 106 insertions(+), 8 deletions(-)

diff --git a/common/src/encodings.rs b/common/src/encodings.rs
index 7a6cd1606..ca77c529f 100644
--- a/common/src/encodings.rs
+++ b/common/src/encodings.rs
@@ -172,6 +172,96 @@ pub mod utf8 {
     }
 }
 
+pub mod latin_1 {
+    use super::*;
+
+    pub const ENCODING_NAME: &str = "latin-1";
+
+    const ERR_REASON: &str = "ordinal not in range(256)";
+
+    /// Appends the Latin-1 encoding of `s` to `out`, one byte per char.
+    ///
+    /// Every char of `s` must be <= U+00FF; the `as u8` cast would truncate anything larger.
+    #[inline]
+    fn push_latin_1(out: &mut Vec<u8>, s: &str) {
+        out.extend(s.chars().map(|c| c as u8));
+    }
+
+    /// Encodes `s` as Latin-1: each char in U+0000..=U+00FF becomes the single byte of the
+    /// same value (not its UTF-8 form, which is two bytes from U+0080 upwards).
+    ///
+    /// Each run of consecutive chars above U+00FF is passed to `errors.handle_encode_error`
+    /// as a range of char indices into `s`. The handler yields a replacement (a string that
+    /// must itself fit in Latin-1, or raw bytes) and the char index at which to resume.
+    ///
+    /// # Errors
+    ///
+    /// Propagates the handler's error, returns `errors.error_encoding` when a replacement
+    /// string contains a char above U+00FF, and `errors.error_oob_restart` when
+    /// `crate::str::try_get_chars` cannot slice `s` from the restart index.
+    #[inline]
+    pub fn encode<E: ErrorHandler>(s: &str, errors: &E) -> Result<Vec<u8>, E::Error> {
+        let full_data = s;
+        let mut data = s;
+        let mut char_data_index = 0;
+        let mut out = Vec::<u8>::new();
+        loop {
+            match data
+                .char_indices()
+                .enumerate()
+                .find(|(_, (_, c))| (*c as u32) > 255)
+            {
+                None => {
+                    push_latin_1(&mut out, data);
+                    break;
+                }
+                Some((char_i, (byte_i, _))) => {
+                    // `byte_i` comes from `char_indices`, so it is a valid char boundary.
+                    push_latin_1(&mut out, &data[..byte_i]);
+                    let char_start = char_data_index + char_i;
+                    // number of non-latin_1 chars between the first non-latin_1 char and the
+                    // next latin_1 char
+                    let non_latin_1_run_length = data[byte_i..]
+                        .chars()
+                        .take_while(|c| (*c as u32) > 255)
+                        .count();
+                    let char_range = char_start..char_start + non_latin_1_run_length;
+                    let (replace, char_restart) =
+                        errors.handle_encode_error(full_data, char_range.clone(), ERR_REASON)?;
+                    match replace {
+                        EncodeReplace::Str(s) => {
+                            if s.as_ref().chars().any(|c| (c as u32) > 255) {
+                                return Err(
+                                    errors.error_encoding(full_data, char_range, ERR_REASON)
+                                );
+                            }
+                            push_latin_1(&mut out, s.as_ref());
+                        }
+                        EncodeReplace::Bytes(b) => {
+                            out.extend_from_slice(b.as_ref());
+                        }
+                    }
+                    data = crate::str::try_get_chars(full_data, char_restart..)
+                        .ok_or_else(|| errors.error_oob_restart(char_restart))?;
+                    char_data_index = char_restart;
+                    continue;
+                }
+            }
+        }
+        Ok(out)
+    }
+
+    /// Decodes Latin-1 bytes: every byte becomes the char with the same code point value.
+    ///
+    /// All 256 byte values are valid Latin-1, so this never fails and `_errors` is never
+    /// consulted; the parameter is kept so the public signature is unchanged. The second
+    /// tuple element is `data.len()`, i.e. the whole input is reported as consumed.
+    pub fn decode<E: ErrorHandler>(data: &[u8], _errors: &E) -> Result<(String, usize), E::Error> {
+        let out: String = data.iter().copied().map(char::from).collect();
+        Ok((out, data.len()))
+    }
+}
+
 pub mod ascii {
     use super::*;
     use ::ascii::AsciiStr;
diff --git a/vm/src/stdlib/codecs.rs b/vm/src/stdlib/codecs.rs
index ac63e4e1d..ede48947f 100644
--- a/vm/src/stdlib/codecs.rs
+++ b/vm/src/stdlib/codecs.rs
@@ -315,6 +315,22 @@ mod _codecs {
         do_codec!(utf8::decode, args, vm)
     }
 
+    #[pyfunction]
+    fn latin_1_encode(args: EncodeArgs, vm: &VirtualMachine) -> EncodeResult {
+        // Fast path only for pure ASCII: there the string's UTF-8 bytes are already its
+        // Latin-1 encoding, and byte length and char count coincide. Chars in
+        // U+0080..=U+00FF take two bytes in UTF-8, so they must go through the codec.
+        if args.s.is_ascii() {
+            return Ok((args.s.as_str().as_bytes().to_vec(), args.s.byte_len()));
+        }
+        do_codec!(latin_1::encode, args, vm)
+    }
+
+    #[pyfunction]
+    fn latin_1_decode(args: DecodeArgsNoFinal, vm: &VirtualMachine) -> DecodeResult {
+        do_codec!(latin_1::decode, args, vm)
+    }
+
     #[pyfunction]
     fn ascii_encode(args: EncodeArgs, vm: &VirtualMachine) -> EncodeResult {
         if args.s.is_ascii() {
@@ -353,14 +369,6 @@ mod _codecs {
         }};
     }
 
-    #[pyfunction]
-    fn latin_1_encode(args: FuncArgs, vm: &VirtualMachine) -> PyResult {
-        delegate_pycodecs!(latin_1_encode, args, vm)
-    }
-    #[pyfunction]
-    fn latin_1_decode(args: FuncArgs, vm: &VirtualMachine) -> PyResult {
-        delegate_pycodecs!(latin_1_decode, args, vm)
-    }
     #[pyfunction]
     fn mbcs_encode(args: FuncArgs, vm: &VirtualMachine) -> PyResult {
         delegate_pycodecs!(mbcs_encode, args, vm)