From 9c57ae4046c588633bc2c7bcede52822c6cc8804 Mon Sep 17 00:00:00 2001
From: Jimmy Girardet <ijkl@netc.fr>
Date: Wed, 17 Apr 2019 15:28:14 +0200
Subject: [PATCH 1/2] support bytes creation from hex and ascii

---
 parser/src/lexer.rs     | 104 +++++++++++++++++++++++++++++++++++++++-
 tests/snippets/bytes.py |   4 ++
 2 files changed, 107 insertions(+), 1 deletion(-)
diff --git a/parser/src/lexer.rs b/parser/src/lexer.rs
index dfc9ecd51..b9e39e7df 100644
--- a/parser/src/lexer.rs
+++ b/parser/src/lexer.rs
@@ -542,7 +542,7 @@ where
         let tok = if is_bytes {
             if string_content.is_ascii() {
                 Tok::Bytes {
-                    value: string_content.as_bytes().to_vec(),
+                    value: self.lex_byte(string_content)?,
                 }
             } else {
                 return Err(LexicalError::StringError);
@@ -1105,6 +1105,84 @@ where
         let tok_end = self.get_pos();
         Ok((tok_start, ty, tok_end))
     }
+
+    fn lex_byte(&self, s: String) -> Result<Vec<u8>, LexicalError> {
+        let mut res = vec![];
+        let mut escape = false; //flag if previous was \
+        let mut hex_on = false; // hex mode on or off
+        let mut hex_value = String::new();
+
+        for c in s.chars() {
+            match c {
+                '\\' => {
+                    if escape {
+                        res.push(92);
+                        escape = false;
+                    } else {
+                        escape = true;
+                    }
+                }
+
+                'x' => {
+                    if escape {
+                        hex_on = true;
+                    } else {
+                        res.push(120);
+                    }
+                    escape = false;
+                }
+                't' => {
+                    if escape {
+                        res.push(9);
+                    } else {
+                        res.push(116);
+                    }
+                    escape = false;
+                }
+                'n' => {
+                    if escape {
+                        res.push(10);
+                    } else {
+                        res.push(110)
+                    }
+                    escape = false;
+                }
+                'r' => {
+                    if escape {
+                        res.push(13);
+                    } else {
+                        res.push(114)
+                    }
+                    escape = false;
+                }
+                x => {
+                    if hex_on {
+                        if x.is_ascii_hexdigit() {
+                            if hex_value.is_empty() {
+                                hex_value.push(x);
+                                continue;
+                            } else {
+                                hex_value.push(x);
+                                res.push(u8::from_str_radix(&hex_value, 16).unwrap());
+                                hex_on = false;
+                                hex_value.clear();
+                            }
+                        } else {
+                            return Err(LexicalError::StringError);
+                        }
+                    } else {
+                        if escape {
+                            res.push(92);
+                        }
+                        res.push(x as u8);
+                    }
+                    escape = false;
+                }
+            }
+        }
+
+        Ok(res)
+    }
 }
 
 /* Implement iterator pattern for the get_tok function.
@@ -1520,4 +1598,28 @@ mod tests {
         test_string_continuation_mac_eol: MAC_EOL,
         test_string_continuation_unix_eol: UNIX_EOL,
     }
+
+    #[test]
+    fn test_byte() {
+        // single quote
+        let all = r##"b'\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff'"##;
+        let source = String::from(all);
+        let tokens = lex_source(&source);
+        let res = (0..=255).collect::<Vec<u8>>();
+        assert_eq!(tokens, vec![Tok::Bytes { value: res }]);
+
+        // double quote
+        let all = r##"b"\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff""##;
+        let source = String::from(all);
+        let tokens = lex_source(&source);
+        let res = (0..=255).collect::<Vec<u8>>();
+        assert_eq!(tokens, vec![Tok::Bytes { value: res }]);
+
+        // a backslash before an unknown escape character (here `X`) is kept literally
+        let all = r##"b"omkmok\Xaa""##;
+        let source = String::from(all);
+        let tokens = lex_source(&source);
+        let res = vec![111, 109, 107, 109, 111, 107, 92, 88, 97, 97];
+        assert_eq!(tokens, vec![Tok::Bytes { value: res }]);
+    }
 }
diff --git a/tests/snippets/bytes.py b/tests/snippets/bytes.py
index d2a60943c..c496bfeff 100644
--- a/tests/snippets/bytes.py
+++ b/tests/snippets/bytes.py
@@ -10,6 +10,10 @@ assert bytes("bla", "utf8")
 with assertRaises(TypeError):
     bytes("bla")
 
+assert b"\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff" == bytes(range(0,256))
+assert b'\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff' == bytes(range(0,256))
+assert b"omkmok\Xaa" == bytes([111, 109, 107, 109, 111, 107, 92, 88, 97, 97])
+
 
 a = b"abcd"
 b = b"ab"

From eb2d0b01eec8a5ce1d73ce31dda798715d3dedd5 Mon Sep 17 00:00:00 2001
From: jgirardet <ijkl@netc.fr>
Date: Thu, 18 Apr 2019 08:06:06 +0200
Subject: [PATCH 2/2] refactor lex byte

---
 parser/src/lexer.rs | 128 +++++++++++++++++---------------------------
 1 file changed, 49 insertions(+), 79 deletions(-)

diff --git a/parser/src/lexer.rs b/parser/src/lexer.rs
index b9e39e7df..11d30cbee 100644
--- a/parser/src/lexer.rs
+++ b/parser/src/lexer.rs
@@ -542,7 +542,7 @@ where
         let tok = if is_bytes {
             if string_content.is_ascii() {
                 Tok::Bytes {
-                    value: self.lex_byte(string_content)?,
+                    value: lex_byte(string_content)?,
                 }
             } else {
                 return Err(LexicalError::StringError);
@@ -1105,84 +1105,6 @@ where
         let tok_end = self.get_pos();
         Ok((tok_start, ty, tok_end))
     }
-
-    fn lex_byte(&self, s: String) -> Result<Vec<u8>, LexicalError> {
-        let mut res = vec![];
-        let mut escape = false; //flag if previous was \
-        let mut hex_on = false; // hex mode on or off
-        let mut hex_value = String::new();
-
-        for c in s.chars() {
-            match c {
-                '\\' => {
-                    if escape {
-                        res.push(92);
-                        escape = false;
-                    } else {
-                        escape = true;
-                    }
-                }
-
-                'x' => {
-                    if escape {
-                        hex_on = true;
-                    } else {
-                        res.push(120);
-                    }
-                    escape = false;
-                }
-                't' => {
-                    if escape {
-                        res.push(9);
-                    } else {
-                        res.push(116);
-                    }
-                    escape = false;
-                }
-                'n' => {
-                    if escape {
-                        res.push(10);
-                    } else {
-                        res.push(110)
-                    }
-                    escape = false;
-                }
-                'r' => {
-                    if escape {
-                        res.push(13);
-                    } else {
-                        res.push(114)
-                    }
-                    escape = false;
-                }
-                x => {
-                    if hex_on {
-                        if x.is_ascii_hexdigit() {
-                            if hex_value.is_empty() {
-                                hex_value.push(x);
-                                continue;
-                            } else {
-                                hex_value.push(x);
-                                res.push(u8::from_str_radix(&hex_value, 16).unwrap());
-                                hex_on = false;
-                                hex_value.clear();
-                            }
-                        } else {
-                            return Err(LexicalError::StringError);
-                        }
-                    } else {
-                        if escape {
-                            res.push(92);
-                        }
-                        res.push(x as u8);
-                    }
-                    escape = false;
-                }
-            }
-        }
-
-        Ok(res)
-    }
 }
 
 /* Implement iterator pattern for the get_tok function.
@@ -1211,6 +1133,54 @@ where
     }
 }
 
+/// Decodes the escape sequences of a bytes literal body into raw bytes.
+///
+/// Handles `\\`, `\t`, `\n`, `\r` and two-digit `\xHH`; any other escape is
+/// kept verbatim as a backslash followed by the character.
+///
+/// The caller is expected to have checked that `s` is ASCII, since every
+/// character is narrowed to a single byte.
+///
+/// # Errors
+///
+/// Returns `LexicalError::StringError` when `\x` is not followed by exactly
+/// two hexadecimal digits, including when the input ends mid-escape.
+fn lex_byte(s: String) -> Result<Vec<u8>, LexicalError> {
+    let mut res = Vec::with_capacity(s.len());
+    let mut chars = s.chars();
+
+    while let Some(c) = chars.next() {
+        if c != '\\' {
+            res.push(c as u8);
+            continue;
+        }
+        match chars.next() {
+            Some('\\') => res.push(b'\\'),
+            Some('t') => res.push(b'\t'),
+            Some('n') => res.push(b'\n'),
+            Some('r') => res.push(b'\r'),
+            Some('x') => {
+                let hi = chars.next().and_then(|d| d.to_digit(16));
+                let lo = chars.next().and_then(|d| d.to_digit(16));
+                match (hi, lo) {
+                    // Both digits are < 16, so the sum always fits in a byte.
+                    (Some(hi), Some(lo)) => res.push((hi * 16 + lo) as u8),
+                    // Missing or non-hex digit, or input ended mid-escape.
+                    _ => return Err(LexicalError::StringError),
+                }
+            }
+            // Unknown escape: keep the backslash and the character as-is.
+            Some(other) => {
+                res.push(b'\\');
+                res.push(other as u8);
+            }
+            // A lone trailing backslash is kept literally, not dropped.
+            None => res.push(b'\\'),
+        }
+    }
+    Ok(res)
+}
+
 #[cfg(test)]
 mod tests {
     use super::{make_tokenizer, NewlineHandler, Tok};