From 1f0e7beefeade8b6d23ebf7b29babd1ea847ef9e Mon Sep 17 00:00:00 2001
From: Noah <33094578+coolreader18@users.noreply.github.com>
Date: Tue, 8 Jun 2021 22:48:11 -0500
Subject: [PATCH] Use a Utf8size struct to represent the {bytes,chars} pairs in
 textio

---
 vm/src/stdlib/io.rs | 205 ++++++++++++++++++++++++++++----------------
 1 file changed, 129 insertions(+), 76 deletions(-)
diff --git a/vm/src/stdlib/io.rs b/vm/src/stdlib/io.rs
index 116f75a8b..a1b86fd5d 100644
--- a/vm/src/stdlib/io.rs
+++ b/vm/src/stdlib/io.rs
@@ -1946,6 +1946,58 @@ mod _io {
         }
     }
 
+    /// A length of, or an index into, a UTF-8 string, measured in both chars and bytes
+    #[derive(Debug, Default, Copy, Clone)]
+    struct Utf8size {
+        bytes: usize, // size in UTF-8 encoded bytes (the unit of `str::len`)
+        chars: usize, // size in `char`s (the unit of `str::chars().count()`)
+    }
+    impl Utf8size {
+        fn len_pystr(s: &PyStr) -> Self {
+            // both measurements are read from the PyStr's own accessors
+            let bytes = s.byte_len();
+            let chars = s.char_len();
+            Self { bytes, chars }
+        }
+
+        fn len_str(s: &str) -> Self {
+            // `len` is the byte length; the char count comes from iterating the str
+            let bytes = s.len();
+            let chars = s.chars().count();
+            Self { bytes, chars }
+        }
+    }
+    impl std::ops::Add for Utf8size {
+        type Output = Self;
+        #[inline]
+        fn add(self, rhs: Self) -> Self {
+            let (bytes, chars) = (self.bytes + rhs.bytes, self.chars + rhs.chars);
+            Self { bytes, chars } // component-wise sum, bytes evaluated before chars
+        }
+    }
+    impl std::ops::AddAssign for Utf8size {
+        #[inline]
+        fn add_assign(&mut self, rhs: Self) {
+            self.bytes += rhs.bytes; // the two units accumulate independently
+            self.chars += rhs.chars;
+        }
+    }
+    impl std::ops::Sub for Utf8size {
+        type Output = Self;
+        #[inline]
+        fn sub(self, rhs: Self) -> Self {
+            let (bytes, chars) = (self.bytes - rhs.bytes, self.chars - rhs.chars);
+            Self { bytes, chars } // component-wise difference, bytes evaluated before chars
+        }
+    }
+    impl std::ops::SubAssign for Utf8size {
+        #[inline]
+        fn sub_assign(&mut self, rhs: Self) {
+            self.bytes -= rhs.bytes; // NOTE: usize underflow if rhs exceeds self in either unit
+            self.chars -= rhs.chars;
+        }
+    }
+
     // TODO: implement legit fast-paths for other encodings
     type EncodeFunc = fn(PyStrRef) -> PendingWrite;
     fn textio_encode_utf8(s: PyStrRef) -> PendingWrite {
@@ -1970,10 +2022,8 @@ mod _io {
         telling: bool,
         snapshot: Option<(i32, PyBytesRef)>,
         decoded_chars: Option<PyStrRef>,
-        // number of characters we've consumed from decoded_chars in codepoints
-        num_decoded_chars: usize,
-        // same as above, but in bytes
-        decoded_chars_pos: usize,
+        // how much of decoded_chars we've consumed so far, in both bytes and chars
+        decoded_chars_used: Utf8size,
         b2cratio: f64,
     }
 
@@ -2113,6 +2163,16 @@ mod _io {
             }
             Ok(())
         }
+        fn num_to_skip(&self) -> Utf8size {
+            // cast the cookie's i32 skip counters to usize and pair them up
+            let bytes = self.bytes_to_skip as usize;
+            let chars = self.chars_to_skip as usize;
+            Utf8size { bytes, chars }
+        }
+        fn set_num_to_skip(&mut self, num: Utf8size) {
+            self.bytes_to_skip = num.bytes as i32; // NOTE: `as` truncates values above i32::MAX
+            self.chars_to_skip = num.chars as i32; // same truncating cast for the char count
+        }
     }
 
     #[pyattr]
@@ -2214,8 +2274,7 @@ mod _io {
                 telling: seekable,
                 snapshot: None,
                 decoded_chars: None,
-                decoded_chars_pos: 0,
-                num_decoded_chars: 0,
+                decoded_chars_used: Utf8size::default(),
                 b2cratio: 0.0,
             });
 
@@ -2364,8 +2423,7 @@ mod _io {
                 if !pos_is_valid {
                     return Err(vm.new_os_error("can't restore logical file position".to_owned()));
                 }
-                textio.decoded_chars_pos = cookie.bytes_to_skip as usize;
-                textio.num_decoded_chars = cookie.chars_to_skip as usize;
+                textio.decoded_chars_used = cookie.num_to_skip();
             } else {
                 textio.snapshot = Some((cookie.dec_flags, PyBytes::from(vec![]).into_ref(vm)))
             }
@@ -2403,7 +2461,7 @@ mod _io {
                 dec_flags: *dec_flags,
                 ..Default::default()
             };
-            if textio.decoded_chars_pos == 0 {
+            if textio.decoded_chars_used.bytes == 0 {
                 return Ok(cookie.build().into_pyobject(vm));
             }
             let decoder_getstate = || {
@@ -2413,23 +2471,21 @@ mod _io {
             let decoder_decode = |b: &[u8]| {
                 let decoded = vm.call_method(decoder, "decode", (vm.ctx.new_bytes(b.to_vec()),))?;
                 let decoded = check_decoded(decoded, vm)?;
-                Ok((decoded.byte_len(), decoded.char_len()))
+                Ok(Utf8size::len_pystr(&decoded))
             };
             let saved_state = vm.call_method(decoder, "getstate", ())?;
-            let mut chars_to_skip = textio.num_decoded_chars;
-            let mut bytes_to_skip = textio.decoded_chars_pos;
-            let mut skip_bytes = (textio.b2cratio * chars_to_skip as f64) as isize;
+            let mut num_to_skip = textio.decoded_chars_used;
+            let mut skip_bytes = (textio.b2cratio * num_to_skip.chars as f64) as isize;
             let mut skip_back = 1;
             while skip_bytes > 0 {
                 cookie.set_decoder_state(decoder, vm)?;
                 let input = &next_input.as_bytes()[..skip_bytes as usize];
-                let (bytes_decoded, chars_decoded) = decoder_decode(input)?;
-                if chars_decoded <= chars_to_skip {
+                let ndecoded = decoder_decode(input)?;
+                if ndecoded.chars <= num_to_skip.chars {
                     let (dec_buffer, dec_flags) = decoder_getstate()?;
                     if dec_buffer.is_empty() {
                         cookie.dec_flags = dec_flags;
-                        chars_to_skip -= chars_decoded;
-                        bytes_to_skip -= bytes_decoded;
+                        num_to_skip -= ndecoded;
                         break;
                     }
                     skip_bytes -= dec_buffer.len() as isize;
@@ -2446,31 +2502,26 @@ mod _io {
             let skip_bytes = skip_bytes as usize;
 
             cookie.start_pos += skip_bytes as Offset;
-            cookie.chars_to_skip = chars_to_skip as i32;
-            cookie.bytes_to_skip = bytes_to_skip as i32;
+            cookie.set_num_to_skip(num_to_skip);
 
-            if chars_to_skip != 0 {
-                let mut chars_decoded = 0;
-                let mut bytes_decoded = 0;
+            if num_to_skip.chars != 0 {
+                let mut ndecoded = Utf8size::default();
                 let mut input = next_input.as_bytes();
                 input = &input[skip_bytes..];
                 while !input.is_empty() {
                     let (byte1, rest) = input.split_at(1);
-                    let (b_n, n) = decoder_decode(byte1)?;
-                    chars_decoded += n;
-                    bytes_decoded += b_n;
+                    let n = decoder_decode(byte1)?;
+                    ndecoded += n;
                     cookie.bytes_to_feed += 1;
                     let (dec_buffer, dec_flags) = decoder_getstate()?;
-                    if dec_buffer.is_empty() && chars_decoded < chars_to_skip {
+                    if dec_buffer.is_empty() && ndecoded.chars < num_to_skip.chars {
                         cookie.start_pos += cookie.bytes_to_feed as Offset;
-                        chars_to_skip -= chars_decoded;
-                        bytes_to_skip -= bytes_decoded;
+                        num_to_skip -= ndecoded;
                         cookie.dec_flags = dec_flags;
                         cookie.bytes_to_feed = 0;
-                        chars_decoded = 0;
-                        bytes_decoded = 0;
+                        ndecoded = Utf8size::default();
                     }
-                    if chars_decoded >= chars_to_skip {
+                    if ndecoded.chars >= num_to_skip.chars {
                         break;
                     }
                     input = rest;
@@ -2479,9 +2530,9 @@ mod _io {
                     let decoded =
                         vm.call_method(decoder, "decode", (vm.ctx.new_bytes(vec![]), true))?;
                     let decoded = check_decoded(decoded, vm)?;
-                    chars_decoded += decoded.char_len();
+                    let final_decoded_chars = ndecoded.chars + decoded.char_len();
                     cookie.need_eof = true;
-                    if chars_decoded < chars_to_skip {
+                    if final_decoded_chars < num_to_skip.chars {
                         return Err(
                             vm.new_os_error("can't reconstruct logical file position".to_owned())
                         );
@@ -2489,8 +2540,7 @@ mod _io {
                 }
             }
             vm.call_method(decoder, "setstate", (saved_state,))?;
-            cookie.chars_to_skip = chars_to_skip as i32;
-            cookie.bytes_to_skip = bytes_to_skip as i32;
+            cookie.set_num_to_skip(num_to_skip);
             Ok(cookie.build().into_pyobject(vm))
         }
 
@@ -2688,13 +2738,18 @@ mod _io {
                         PyStr::from(self.slice()).into_ref(vm)
                     }
                 }
+                fn utf8_len(&self) -> Utf8size {
+                    // size of this slice in both units, from its own length accessors
+                    let bytes = self.byte_len();
+                    let chars = self.char_len();
+                    Utf8size { bytes, chars }
+                }
             }
 
             let mut start;
             let mut endpos;
             let mut offset_to_buffer;
-            let mut chunked = 0;
-            let mut chunked_chars = 0;
+            let mut chunked = Utf8size::default();
             let mut remaining: Option<SlicedStr> = None;
             let mut chunks = Vec::new();
 
@@ -2708,21 +2763,21 @@ mod _io {
                     if eof {
                         textio.set_decoded_chars(None);
                         textio.snapshot = None;
-                        start = 0;
-                        endpos = 0;
-                        offset_to_buffer = 0;
+                        start = Utf8size::default();
+                        endpos = Utf8size::default();
+                        offset_to_buffer = Utf8size::default();
                         break 'outer None;
                     }
                 };
                 let line = match remaining.take() {
                     None => {
-                        start = textio.decoded_chars_pos;
-                        offset_to_buffer = 0;
+                        start = textio.decoded_chars_used;
+                        offset_to_buffer = Utf8size::default();
                         decoded_chars.clone()
                     }
                     Some(remaining) => {
-                        assert_eq!(textio.decoded_chars_pos, 0);
-                        offset_to_buffer = remaining.byte_len();
+                        assert_eq!(textio.decoded_chars_used.bytes, 0);
+                        offset_to_buffer = remaining.utf8_len();
                         let decoded_chars = decoded_chars.as_str();
                         let line = if remaining.is_full_slice() {
                             let mut line = remaining.0;
@@ -2736,26 +2791,27 @@ mod _io {
                             s.push_str(decoded_chars);
                             PyStr::from(s).into_ref(vm)
                         };
-                        start = 0;
+                        start = Utf8size::default();
                         line
                     }
                 };
-                let line_from_start = &line.as_str()[start..];
+                let line_from_start = &line.as_str()[start.bytes..];
                 let nl_res = textio.newline.find_newline(line_from_start);
                 match nl_res {
                     Ok(p) | Err(p) => {
-                        endpos = start + p;
+                        endpos = start + Utf8size::len_str(&line_from_start[..p]);
                         if let Some(limit) = limit {
-                            // TODO: track char positions in variables as well as bytes
                             // original CPython logic: endpos = start + limit - chunked
-                            let line_chars = line.as_str()[..endpos].chars().count();
-                            if chunked_chars + line_chars >= limit {
+                            if chunked.chars + endpos.chars >= limit {
                                 endpos = start
-                                    + crate::common::str::char_range_end(
-                                        line_from_start,
-                                        limit - chunked_chars,
-                                    )
-                                    .unwrap();
+                                    + Utf8size {
+                                        chars: limit - chunked.chars,
+                                        bytes: crate::common::str::char_range_end(
+                                            line_from_start,
+                                            limit - chunked.chars,
+                                        )
+                                        .unwrap(),
+                                    };
                                 break Some(line);
                             }
                         }
@@ -2764,28 +2820,25 @@ mod _io {
                 if nl_res.is_ok() {
                     break Some(line);
                 }
-                if endpos > start {
-                    let chunk = SlicedStr(line.clone(), start..endpos);
-                    chunked += chunk.byte_len();
-                    chunked_chars += chunk.char_len();
+                if endpos.bytes > start.bytes {
+                    let chunk = SlicedStr(line.clone(), start.bytes..endpos.bytes);
+                    chunked += chunk.utf8_len();
                     chunks.push(chunk);
                 }
                 let line_len = line.byte_len();
-                if endpos < line_len {
-                    remaining = Some(SlicedStr(line, endpos..line_len));
+                if endpos.bytes < line_len {
+                    remaining = Some(SlicedStr(line, endpos.bytes..line_len));
                 }
                 textio.set_decoded_chars(None);
             };
 
             let cur_line = cur_line.map(|line| {
-                let orig_decoded_chars = &line.as_str()[offset_to_buffer..endpos];
-                textio.decoded_chars_pos = orig_decoded_chars.len();
-                // TODO: variables that are siblings to endpos/offset_to_buffer, measured in chars rather than bytes?
-                textio.num_decoded_chars = orig_decoded_chars.chars().count();
-                SlicedStr(line, start..endpos)
+                textio.decoded_chars_used = endpos - offset_to_buffer;
+                SlicedStr(line, start.bytes..endpos.bytes)
             });
+            // don't need to care about chunked.chars anymore
+            let mut chunked = chunked.bytes;
             if let Some(remaining) = remaining {
-                // don't need to care about chunked_chars anymore
                 chunked += remaining.byte_len();
                 chunks.push(remaining);
             }
@@ -2931,13 +2984,13 @@ mod _io {
                 return None;
             }
             let decoded_chars = self.decoded_chars.as_ref()?;
-            let avail = &decoded_chars.as_str()[self.decoded_chars_pos..];
+            let avail = &decoded_chars.as_str()[self.decoded_chars_used.bytes..];
             if avail.is_empty() {
                 return None;
             }
-            let avail_chars = decoded_chars.char_len() - self.num_decoded_chars;
+            let avail_chars = decoded_chars.char_len() - self.decoded_chars_used.chars;
             let (chars, chars_used) = if n >= avail_chars {
-                if self.decoded_chars_pos == 0 {
+                if self.decoded_chars_used.bytes == 0 {
                     (decoded_chars.clone(), avail_chars)
                 } else {
                     (PyStr::from(avail).into_ref(vm), avail_chars)
@@ -2946,14 +2999,15 @@ mod _io {
                 let s = crate::common::str::get_chars(avail, 0..n);
                 (PyStr::from(s).into_ref(vm), n)
             };
-            self.num_decoded_chars += chars_used;
-            self.decoded_chars_pos += chars.byte_len();
+            self.decoded_chars_used += Utf8size {
+                bytes: chars.byte_len(),
+                chars: chars_used,
+            };
             Some((chars, chars_used))
         }
         fn set_decoded_chars(&mut self, s: Option<PyStrRef>) {
             self.decoded_chars = s;
-            self.num_decoded_chars = 0;
-            self.decoded_chars_pos = 0;
+            self.decoded_chars_used = Utf8size::default(); // reset: 0 bytes / 0 chars consumed
         }
         fn take_decoded_chars(
             &mut self,
@@ -2961,8 +3015,7 @@ mod _io {
             vm: &VirtualMachine,
         ) -> PyStrRef {
             let empty_str = || PyStr::from("").into_ref(vm);
-            let chars_pos = std::mem::replace(&mut self.decoded_chars_pos, 0);
-            self.num_decoded_chars = 0;
+            let chars_pos = std::mem::take(&mut self.decoded_chars_used).bytes;
             let decoded_chars = match std::mem::take(&mut self.decoded_chars) {
                 None => return append.unwrap_or_else(empty_str),
                 Some(s) if s.is_empty() => return append.unwrap_or_else(empty_str),