Use a Utf8size struct to represent the {bytes,chars} pairs in textio

This commit is contained in:
Noah
2021-06-08 22:48:11 -05:00
parent 27af9e5401
commit 1f0e7beefe

View File

@@ -1946,6 +1946,58 @@ mod _io {
}
}
/// A length of or index into a UTF-8 string, measured in both chars and bytes
#[derive(Debug, Default, Copy, Clone)]
struct Utf8size {
bytes: usize,
chars: usize,
}
impl Utf8size {
fn len_pystr(s: &PyStr) -> Self {
Utf8size {
bytes: s.byte_len(),
chars: s.char_len(),
}
}
fn len_str(s: &str) -> Self {
Utf8size {
bytes: s.len(),
chars: s.chars().count(),
}
}
}
impl std::ops::Add for Utf8size {
type Output = Self;
#[inline]
fn add(mut self, rhs: Self) -> Self {
self += rhs;
self
}
}
impl std::ops::AddAssign for Utf8size {
#[inline]
fn add_assign(&mut self, rhs: Self) {
self.bytes += rhs.bytes;
self.chars += rhs.chars;
}
}
impl std::ops::Sub for Utf8size {
type Output = Self;
#[inline]
fn sub(mut self, rhs: Self) -> Self {
self -= rhs;
self
}
}
impl std::ops::SubAssign for Utf8size {
#[inline]
fn sub_assign(&mut self, rhs: Self) {
self.bytes -= rhs.bytes;
self.chars -= rhs.chars;
}
}
// TODO: implement legit fast-paths for other encodings
type EncodeFunc = fn(PyStrRef) -> PendingWrite;
fn textio_encode_utf8(s: PyStrRef) -> PendingWrite {
@@ -1970,10 +2022,8 @@ mod _io {
telling: bool,
snapshot: Option<(i32, PyBytesRef)>,
decoded_chars: Option<PyStrRef>,
// number of characters we've consumed from decoded_chars in codepoints
num_decoded_chars: usize,
// same as above, but in bytes
decoded_chars_pos: usize,
// number of characters we've consumed from decoded_chars
decoded_chars_used: Utf8size,
b2cratio: f64,
}
@@ -2113,6 +2163,16 @@ mod _io {
}
Ok(())
}
fn num_to_skip(&self) -> Utf8size {
Utf8size {
bytes: self.bytes_to_skip as usize,
chars: self.chars_to_skip as usize,
}
}
fn set_num_to_skip(&mut self, num: Utf8size) {
self.bytes_to_skip = num.bytes as i32;
self.chars_to_skip = num.chars as i32;
}
}
#[pyattr]
@@ -2214,8 +2274,7 @@ mod _io {
telling: seekable,
snapshot: None,
decoded_chars: None,
decoded_chars_pos: 0,
num_decoded_chars: 0,
decoded_chars_used: Utf8size::default(),
b2cratio: 0.0,
});
@@ -2364,8 +2423,7 @@ mod _io {
if !pos_is_valid {
return Err(vm.new_os_error("can't restore logical file position".to_owned()));
}
textio.decoded_chars_pos = cookie.bytes_to_skip as usize;
textio.num_decoded_chars = cookie.chars_to_skip as usize;
textio.decoded_chars_used = cookie.num_to_skip();
} else {
textio.snapshot = Some((cookie.dec_flags, PyBytes::from(vec![]).into_ref(vm)))
}
@@ -2403,7 +2461,7 @@ mod _io {
dec_flags: *dec_flags,
..Default::default()
};
if textio.decoded_chars_pos == 0 {
if textio.decoded_chars_used.bytes == 0 {
return Ok(cookie.build().into_pyobject(vm));
}
let decoder_getstate = || {
@@ -2413,23 +2471,21 @@ mod _io {
let decoder_decode = |b: &[u8]| {
let decoded = vm.call_method(decoder, "decode", (vm.ctx.new_bytes(b.to_vec()),))?;
let decoded = check_decoded(decoded, vm)?;
Ok((decoded.byte_len(), decoded.char_len()))
Ok(Utf8size::len_pystr(&decoded))
};
let saved_state = vm.call_method(decoder, "getstate", ())?;
let mut chars_to_skip = textio.num_decoded_chars;
let mut bytes_to_skip = textio.decoded_chars_pos;
let mut skip_bytes = (textio.b2cratio * chars_to_skip as f64) as isize;
let mut num_to_skip = textio.decoded_chars_used;
let mut skip_bytes = (textio.b2cratio * num_to_skip.chars as f64) as isize;
let mut skip_back = 1;
while skip_bytes > 0 {
cookie.set_decoder_state(decoder, vm)?;
let input = &next_input.as_bytes()[..skip_bytes as usize];
let (bytes_decoded, chars_decoded) = decoder_decode(input)?;
if chars_decoded <= chars_to_skip {
let ndecoded = decoder_decode(input)?;
if ndecoded.chars <= num_to_skip.chars {
let (dec_buffer, dec_flags) = decoder_getstate()?;
if dec_buffer.is_empty() {
cookie.dec_flags = dec_flags;
chars_to_skip -= chars_decoded;
bytes_to_skip -= bytes_decoded;
num_to_skip -= ndecoded;
break;
}
skip_bytes -= dec_buffer.len() as isize;
@@ -2446,31 +2502,26 @@ mod _io {
let skip_bytes = skip_bytes as usize;
cookie.start_pos += skip_bytes as Offset;
cookie.chars_to_skip = chars_to_skip as i32;
cookie.bytes_to_skip = bytes_to_skip as i32;
cookie.set_num_to_skip(num_to_skip);
if chars_to_skip != 0 {
let mut chars_decoded = 0;
let mut bytes_decoded = 0;
if num_to_skip.chars != 0 {
let mut ndecoded = Utf8size::default();
let mut input = next_input.as_bytes();
input = &input[skip_bytes..];
while !input.is_empty() {
let (byte1, rest) = input.split_at(1);
let (b_n, n) = decoder_decode(byte1)?;
chars_decoded += n;
bytes_decoded += b_n;
let n = decoder_decode(byte1)?;
ndecoded += n;
cookie.bytes_to_feed += 1;
let (dec_buffer, dec_flags) = decoder_getstate()?;
if dec_buffer.is_empty() && chars_decoded < chars_to_skip {
if dec_buffer.is_empty() && ndecoded.chars < num_to_skip.chars {
cookie.start_pos += cookie.bytes_to_feed as Offset;
chars_to_skip -= chars_decoded;
bytes_to_skip -= bytes_decoded;
num_to_skip -= ndecoded;
cookie.dec_flags = dec_flags;
cookie.bytes_to_feed = 0;
chars_decoded = 0;
bytes_decoded = 0;
ndecoded = Utf8size::default();
}
if chars_decoded >= chars_to_skip {
if ndecoded.chars >= num_to_skip.chars {
break;
}
input = rest;
@@ -2479,9 +2530,9 @@ mod _io {
let decoded =
vm.call_method(decoder, "decode", (vm.ctx.new_bytes(vec![]), true))?;
let decoded = check_decoded(decoded, vm)?;
chars_decoded += decoded.char_len();
let final_decoded_chars = ndecoded.chars + decoded.char_len();
cookie.need_eof = true;
if chars_decoded < chars_to_skip {
if final_decoded_chars < num_to_skip.chars {
return Err(
vm.new_os_error("can't reconstruct logical file position".to_owned())
);
@@ -2489,8 +2540,7 @@ mod _io {
}
}
vm.call_method(decoder, "setstate", (saved_state,))?;
cookie.chars_to_skip = chars_to_skip as i32;
cookie.bytes_to_skip = bytes_to_skip as i32;
cookie.set_num_to_skip(num_to_skip);
Ok(cookie.build().into_pyobject(vm))
}
@@ -2688,13 +2738,18 @@ mod _io {
PyStr::from(self.slice()).into_ref(vm)
}
}
fn utf8_len(&self) -> Utf8size {
Utf8size {
bytes: self.byte_len(),
chars: self.char_len(),
}
}
}
let mut start;
let mut endpos;
let mut offset_to_buffer;
let mut chunked = 0;
let mut chunked_chars = 0;
let mut chunked = Utf8size::default();
let mut remaining: Option<SlicedStr> = None;
let mut chunks = Vec::new();
@@ -2708,21 +2763,21 @@ mod _io {
if eof {
textio.set_decoded_chars(None);
textio.snapshot = None;
start = 0;
endpos = 0;
offset_to_buffer = 0;
start = Utf8size::default();
endpos = Utf8size::default();
offset_to_buffer = Utf8size::default();
break 'outer None;
}
};
let line = match remaining.take() {
None => {
start = textio.decoded_chars_pos;
offset_to_buffer = 0;
start = textio.decoded_chars_used;
offset_to_buffer = Utf8size::default();
decoded_chars.clone()
}
Some(remaining) => {
assert_eq!(textio.decoded_chars_pos, 0);
offset_to_buffer = remaining.byte_len();
assert_eq!(textio.decoded_chars_used.bytes, 0);
offset_to_buffer = remaining.utf8_len();
let decoded_chars = decoded_chars.as_str();
let line = if remaining.is_full_slice() {
let mut line = remaining.0;
@@ -2736,26 +2791,27 @@ mod _io {
s.push_str(decoded_chars);
PyStr::from(s).into_ref(vm)
};
start = 0;
start = Utf8size::default();
line
}
};
let line_from_start = &line.as_str()[start..];
let line_from_start = &line.as_str()[start.bytes..];
let nl_res = textio.newline.find_newline(line_from_start);
match nl_res {
Ok(p) | Err(p) => {
endpos = start + p;
endpos = start + Utf8size::len_str(&line_from_start[..p]);
if let Some(limit) = limit {
// TODO: track char positions in variables as well as bytes
// original CPython logic: endpos = start + limit - chunked
let line_chars = line.as_str()[..endpos].chars().count();
if chunked_chars + line_chars >= limit {
if chunked.chars + endpos.chars >= limit {
endpos = start
+ crate::common::str::char_range_end(
line_from_start,
limit - chunked_chars,
)
.unwrap();
+ Utf8size {
chars: limit - chunked.chars,
bytes: crate::common::str::char_range_end(
line_from_start,
limit - chunked.chars,
)
.unwrap(),
};
break Some(line);
}
}
@@ -2764,28 +2820,25 @@ mod _io {
if nl_res.is_ok() {
break Some(line);
}
if endpos > start {
let chunk = SlicedStr(line.clone(), start..endpos);
chunked += chunk.byte_len();
chunked_chars += chunk.char_len();
if endpos.bytes > start.bytes {
let chunk = SlicedStr(line.clone(), start.bytes..endpos.bytes);
chunked += chunk.utf8_len();
chunks.push(chunk);
}
let line_len = line.byte_len();
if endpos < line_len {
remaining = Some(SlicedStr(line, endpos..line_len));
if endpos.bytes < line_len {
remaining = Some(SlicedStr(line, endpos.bytes..line_len));
}
textio.set_decoded_chars(None);
};
let cur_line = cur_line.map(|line| {
let orig_decoded_chars = &line.as_str()[offset_to_buffer..endpos];
textio.decoded_chars_pos = orig_decoded_chars.len();
// TODO: variables that are siblings to endpos/offset_to_buffer, measured in chars rather than bytes?
textio.num_decoded_chars = orig_decoded_chars.chars().count();
SlicedStr(line, start..endpos)
textio.decoded_chars_used = endpos - offset_to_buffer;
SlicedStr(line, start.bytes..endpos.bytes)
});
// don't need to care about chunked.chars anymore
let mut chunked = chunked.bytes;
if let Some(remaining) = remaining {
// don't need to care about chunked_chars anymore
chunked += remaining.byte_len();
chunks.push(remaining);
}
@@ -2931,13 +2984,13 @@ mod _io {
return None;
}
let decoded_chars = self.decoded_chars.as_ref()?;
let avail = &decoded_chars.as_str()[self.decoded_chars_pos..];
let avail = &decoded_chars.as_str()[self.decoded_chars_used.bytes..];
if avail.is_empty() {
return None;
}
let avail_chars = decoded_chars.char_len() - self.num_decoded_chars;
let avail_chars = decoded_chars.char_len() - self.decoded_chars_used.chars;
let (chars, chars_used) = if n >= avail_chars {
if self.decoded_chars_pos == 0 {
if self.decoded_chars_used.bytes == 0 {
(decoded_chars.clone(), avail_chars)
} else {
(PyStr::from(avail).into_ref(vm), avail_chars)
@@ -2946,14 +2999,15 @@ mod _io {
let s = crate::common::str::get_chars(avail, 0..n);
(PyStr::from(s).into_ref(vm), n)
};
self.num_decoded_chars += chars_used;
self.decoded_chars_pos += chars.byte_len();
self.decoded_chars_used += Utf8size {
bytes: chars.byte_len(),
chars: chars_used,
};
Some((chars, chars_used))
}
fn set_decoded_chars(&mut self, s: Option<PyStrRef>) {
self.decoded_chars = s;
self.num_decoded_chars = 0;
self.decoded_chars_pos = 0;
self.decoded_chars_used = Utf8size::default();
}
fn take_decoded_chars(
&mut self,
@@ -2961,8 +3015,7 @@ mod _io {
vm: &VirtualMachine,
) -> PyStrRef {
let empty_str = || PyStr::from("").into_ref(vm);
let chars_pos = std::mem::replace(&mut self.decoded_chars_pos, 0);
self.num_decoded_chars = 0;
let chars_pos = std::mem::take(&mut self.decoded_chars_used).bytes;
let decoded_chars = match std::mem::take(&mut self.decoded_chars) {
None => return append.unwrap_or_else(empty_str),
Some(s) if s.is_empty() => return append.unwrap_or_else(empty_str),