From 9c57ae4046c588633bc2c7bcede52822c6cc8804 Mon Sep 17 00:00:00 2001 From: Jimmy Girardet Date: Wed, 17 Apr 2019 15:28:14 +0200 Subject: [PATCH 1/2] support bytes creation from hex and ascii --- parser/src/lexer.rs | 104 +++++++++++++++++++++++++++++++++++++++- tests/snippets/bytes.py | 4 ++ 2 files changed, 107 insertions(+), 1 deletion(-) diff --git a/parser/src/lexer.rs b/parser/src/lexer.rs index dfc9ecd51..b9e39e7df 100644 --- a/parser/src/lexer.rs +++ b/parser/src/lexer.rs @@ -542,7 +542,7 @@ where let tok = if is_bytes { if string_content.is_ascii() { Tok::Bytes { - value: string_content.as_bytes().to_vec(), + value: self.lex_byte(string_content)?, } } else { return Err(LexicalError::StringError); @@ -1105,6 +1105,84 @@ where let tok_end = self.get_pos(); Ok((tok_start, ty, tok_end)) } + + fn lex_byte(&self, s: String) -> Result, LexicalError> { + let mut res = vec![]; + let mut escape = false; //flag if previous was \ + let mut hex_on = false; // hex mode on or off + let mut hex_value = String::new(); + + for c in s.chars() { + match c { + '\\' => { + if escape { + res.push(92); + escape = false; + } else { + escape = true; + } + } + + 'x' => { + if escape { + hex_on = true; + } else { + res.push(120); + } + escape = false; + } + 't' => { + if escape { + res.push(9); + } else { + res.push(116); + } + escape = false; + } + 'n' => { + if escape { + res.push(10); + } else { + res.push(110) + } + escape = false; + } + 'r' => { + if escape { + res.push(13); + } else { + res.push(114) + } + escape = false; + } + x => { + if hex_on { + if x.is_ascii_hexdigit() { + if hex_value.is_empty() { + hex_value.push(x); + continue; + } else { + hex_value.push(x); + res.push(u8::from_str_radix(&hex_value, 16).unwrap()); + hex_on = false; + hex_value.clear(); + } + } else { + return Err(LexicalError::StringError); + } + } else { + if escape { + res.push(92); + } + res.push(x as u8); + } + escape = false; + } + } + } + + Ok(res) + } } /* Implement iterator pattern for the get_tok function. @@ -1520,4 +1598,28 @@ mod tests { test_string_continuation_mac_eol: MAC_EOL, test_string_continuation_unix_eol: UNIX_EOL, } + + #[test] + fn test_byte() { + // single quote + let all = r##"b'\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff'"##; + let source = String::from(all); + let tokens = lex_source(&source); + let res = (0..=255).collect::>(); + assert_eq!(tokens, vec![Tok::Bytes { value: res }]); + + // double quote + let all = r##"b"\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff""##; + let source = String::from(all); + let tokens = lex_source(&source); + let res = (0..=255).collect::>(); + assert_eq!(tokens, vec![Tok::Bytes { value: res }]); + + // backslash doesnt escape + let all = r##"b"omkmok\Xaa""##; + let source = String::from(all); + let tokens = lex_source(&source); + let res = vec![111, 109, 107, 109, 111, 107, 92, 88, 97, 97]; + assert_eq!(tokens, vec![Tok::Bytes { value: res }]); + } } diff --git a/tests/snippets/bytes.py b/tests/snippets/bytes.py index d2a60943c..c496bfeff 100644 --- a/tests/snippets/bytes.py +++ b/tests/snippets/bytes.py @@ -10,6 +10,10 @@ assert bytes("bla", "utf8") with assertRaises(TypeError): bytes("bla") +assert b"\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff" == bytes(range(0,256)) +assert b'\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff' == bytes(range(0,256)) +assert b"omkmok\Xaa" == bytes([111, 109, 107, 109, 111, 107, 92, 88, 97, 97]) + a = b"abcd" b = b"ab" From eb2d0b01eec8a5ce1d73ce31dda798715d3dedd5 Mon Sep 17 00:00:00 2001 From: jgirardet Date: Thu, 18 Apr 2019 08:06:06 +0200 Subject: [PATCH 2/2] refactor lex byte --- parser/src/lexer.rs | 128 +++++++++++++++++--------------------------- 1 file changed, 49 insertions(+), 79 deletions(-) diff --git a/parser/src/lexer.rs b/parser/src/lexer.rs index b9e39e7df..11d30cbee 100644 --- a/parser/src/lexer.rs +++ b/parser/src/lexer.rs @@ -542,7 +542,7 @@ where let tok = if is_bytes { if string_content.is_ascii() { Tok::Bytes { - value: self.lex_byte(string_content)?, + value: lex_byte(string_content)?, } } else { return Err(LexicalError::StringError); @@ -1105,84 +1105,6 @@ where let tok_end = self.get_pos(); Ok((tok_start, ty, tok_end)) } - - fn lex_byte(&self, s: String) -> Result, LexicalError> { - let mut res = vec![]; - let mut escape = false; //flag if previous was \ - let mut hex_on = false; // hex mode on or off - let mut hex_value = String::new(); - - for c in s.chars() { - match c { - '\\' => { - if escape { - res.push(92); - escape = false; - } else { - escape = true; - } - } - - 'x' => { - if escape { - hex_on = true; - } else { - res.push(120); - } - escape = false; - } - 't' => { - if escape { - res.push(9); - } else { - res.push(116); - } - escape = false; - } - 'n' => { - if escape { - res.push(10); - } else { - res.push(110) - } - escape = false; - } - 'r' => { - if escape { - res.push(13); - } else { - res.push(114) - } - escape = false; - } - x => { - if hex_on { - if x.is_ascii_hexdigit() { - if hex_value.is_empty() { - hex_value.push(x); - continue; - } else { - hex_value.push(x); - res.push(u8::from_str_radix(&hex_value, 16).unwrap()); - hex_on = false; - hex_value.clear(); - } - } else { - return Err(LexicalError::StringError); - } - } else { - if escape { - res.push(92); - } - res.push(x as u8); - } - escape = false; - } - } - } - - Ok(res) - } } /* Implement iterator pattern for the get_tok function. @@ -1211,6 +1133,54 @@ where } } +fn lex_byte(s: String) -> Result, LexicalError> { + let mut res = vec![]; + let mut escape = false; //flag if previous was \ + let mut hex_on = false; // hex mode on or off + let mut hex_value = String::new(); + + for c in s.chars() { + if hex_on { + if c.is_ascii_hexdigit() { + if hex_value.is_empty() { + hex_value.push(c); + continue; + } else { + hex_value.push(c); + res.push(u8::from_str_radix(&hex_value, 16).unwrap()); + hex_on = false; + hex_value.clear(); + } + } else { + return Err(LexicalError::StringError); + } + } else { + match (c, escape) { + ('\\', true) => res.push(b'\\'), + ('\\', false) => { + escape = true; + continue; + } + ('x', true) => hex_on = true, + ('x', false) => res.push(b'x'), + ('t', true) => res.push(b'\t'), + ('t', false) => res.push(b't'), + ('n', true) => res.push(b'\n'), + ('n', false) => res.push(b'n'), + ('r', true) => res.push(b'\r'), + ('r', false) => res.push(b'r'), + (x, true) => { + res.push(b'\\'); + res.push(x as u8); + } + (x, false) => res.push(x as u8), + } + escape = false; + } + } + Ok(res) +} + #[cfg(test)] mod tests { use super::{make_tokenizer, NewlineHandler, Tok};