From a400f6631cee18ab03c2f42174d7a789f9c5fe30 Mon Sep 17 00:00:00 2001 From: Windel Bouwman Date: Sun, 11 Aug 2019 09:29:21 +0200 Subject: [PATCH 1/2] Improve lexing of numbers with underscores. --- examples/parse_folder.rs | 53 ++++++++++++++----- parser/src/lexer.rs | 107 ++++++++++++++++++++------------------ parser/src/python.lalrpop | 6 +-- tests/snippets/numbers.py | 11 ++++ 4 files changed, 110 insertions(+), 67 deletions(-) diff --git a/examples/parse_folder.rs b/examples/parse_folder.rs index 513333b95..33530708d 100644 --- a/examples/parse_folder.rs +++ b/examples/parse_folder.rs @@ -15,7 +15,7 @@ use clap::{App, Arg}; use rustpython_parser::{ast, parser}; use std::path::{Path, PathBuf}; -use std::time::Instant; +use std::time::{Duration, Instant}; fn main() { env_logger::init(); @@ -61,30 +61,45 @@ fn parse_folder(path: &Path) -> std::io::Result> { } if metadata.is_file() && path.extension().and_then(|s| s.to_str()) == Some("py") { - let result = parse_python_file(&path); - match &result { + let parsed_file = parse_python_file(&path); + match &parsed_file.result { Ok(_) => {} Err(y) => error!("Erreur in file {:?} {:?}", path, y), } - res.push(ParsedFile { - filename: Box::new(path), - result, - }); + + res.push(parsed_file); } } Ok(res) } -fn parse_python_file(filename: &Path) -> ParseResult { +fn parse_python_file(filename: &Path) -> ParsedFile { info!("Parsing file {:?}", filename); - let source = std::fs::read_to_string(filename).map_err(|e| e.to_string())?; - parser::parse_program(&source).map_err(|e| e.to_string()) + match std::fs::read_to_string(filename) { + Err(e) => ParsedFile { + filename: Box::new(filename.to_path_buf()), + code: "".to_string(), + num_lines: 0, + result: Err(e.to_string()), + }, + Ok(source) => { + let num_lines = source.to_string().lines().count(); + let result = parser::parse_program(&source).map_err(|e| e.to_string()); + ParsedFile { + filename: Box::new(filename.to_path_buf()), + code: source.to_string(), + num_lines, + result, + } + } + } } fn statistics(results: ScanResult) { // println!("Processed {:?} files", res.len()); println!("Scanned a total of {} files", results.parsed_files.len()); - let total = results.parsed_files.len(); + let total: usize = results.parsed_files.len(); + let total_lines: usize = results.parsed_files.iter().map(|p| p.num_lines).sum(); let failed = results .parsed_files .iter() @@ -103,9 +118,19 @@ fn statistics(results: ScanResult) { let duration = results.t2 - results.t1; println!("Total time spend: {:?}", duration); println!( - "File processing rate: {} files/second", - (total * 1_000_000) as f64 / duration.as_micros() as f64 + "Processed {} files. That's {} files/second", + total, + rate(total, duration) ); + println!( + "Processed {} lines of python code. That's {} lines/second", + total_lines, + rate(total_lines, duration) + ); +} + +fn rate(counter: usize, duration: Duration) -> f64 { + (counter * 1_000_000) as f64 / duration.as_micros() as f64 } struct ScanResult { @@ -116,6 +141,8 @@ struct ScanResult { struct ParsedFile { filename: Box, + code: String, + num_lines: usize, result: ParseResult, } diff --git a/parser/src/lexer.rs b/parser/src/lexer.rs index ae83a1d8d..405162104 100644 --- a/parser/src/lexer.rs +++ b/parser/src/lexer.rs @@ -340,18 +340,7 @@ where /// Lex a hex/octal/decimal/binary number without a decimal point. fn lex_number_radix(&mut self, start_pos: Location, radix: u32) -> LexResult { - let mut value_text = String::new(); - - loop { - if let Some(c) = self.take_number(radix) { - value_text.push(c); - } else if self.chr0 == Some('_') { - self.next_char(); - } else { - break; - } - } - + let value_text = self.radix_run(radix); let end_pos = self.get_pos(); let value = BigInt::from_str_radix(&value_text, radix).map_err(|e| LexicalError { error: LexicalErrorType::OtherError(format!("{:?}", e)), @@ -360,24 +349,19 @@ where Ok((start_pos, Tok::Int { value }, end_pos)) } + /// Lex a normal number, that is, no octal, hex or binary number. fn lex_normal_number(&mut self) -> LexResult { let start_pos = self.get_pos(); - let mut value_text = String::new(); - // Normal number: - while let Some(c) = self.take_number(10) { - value_text.push(c); - } + let mut value_text = self.radix_run(10); // If float: if self.chr0 == Some('.') || self.at_exponent() { // Take '.': if self.chr0 == Some('.') { value_text.push(self.next_char().unwrap()); - while let Some(c) = self.take_number(10) { - value_text.push(c); - } + value_text.push_str(&self.radix_run(10)); } // 1e6 for example: @@ -389,9 +373,7 @@ where value_text.push(self.next_char().unwrap()); } - while let Some(c) = self.take_number(10) { - value_text.push(c); - } + value_text.push_str(&self.radix_run(10)); } let value = f64::from_str(&value_text).unwrap(); @@ -426,6 +408,57 @@ where } } + /// Consume a sequence of numbers with the given radix, + /// the digits can be decorated with underscores + /// like this: '1_2_3_4' == '1234' + fn radix_run(&mut self, radix: u32) -> String { + let mut value_text = String::new(); + loop { + if let Some(c) = self.take_number(radix) { + value_text.push(c); + } else if self.chr0 == Some('_') && Lexer::::is_digit_of_radix(&self.chr1, radix) { + self.next_char(); + } else { + break; + } + } + value_text + } + + /// Consume a single character with the given radix. + fn take_number(&mut self, radix: u32) -> Option { + let take_char = Lexer::::is_digit_of_radix(&self.chr0, radix); + + if take_char { + Some(self.next_char().unwrap()) + } else { + None + } + } + + /// Test if a digit is of a certain radix. + fn is_digit_of_radix(c: &Option, radix: u32) -> bool { + match radix { + 2 => match c { + Some('0'..='1') => true, + _ => false, + }, + 8 => match c { + Some('0'..='7') => true, + _ => false, + }, + 10 => match c { + Some('0'..='9') => true, + _ => false, + }, + 16 => match c { + Some('0'..='9') | Some('a'..='f') | Some('A'..='F') => true, + _ => false, + }, + x => unimplemented!("Radix not implemented: {}", x), + } + } + /// Test if we face '[eE][-+]?[0-9]+' fn at_exponent(&self) -> bool { match self.chr0 { @@ -626,34 +659,6 @@ where } } - fn take_number(&mut self, radix: u32) -> Option { - let take_char = match radix { - 2 => match self.chr0 { - Some('0'..='1') => true, - _ => false, - }, - 8 => match self.chr0 { - Some('0'..='7') => true, - _ => false, - }, - 10 => match self.chr0 { - Some('0'..='9') => true, - _ => false, - }, - 16 => match self.chr0 { - Some('0'..='9') | Some('a'..='f') | Some('A'..='F') => true, - _ => false, - }, - x => unimplemented!("Radix not implemented: {}", x), - }; - - if take_char { - Some(self.next_char().unwrap()) - } else { - None - } - } - /// This is the main entry point. Call this function to retrieve the next token. /// This function is used by the iterator implementation. fn inner_next(&mut self) -> LexResult { diff --git a/parser/src/python.lalrpop b/parser/src/python.lalrpop index 6af9fea61..8929dc542 100644 --- a/parser/src/python.lalrpop +++ b/parser/src/python.lalrpop @@ -247,7 +247,7 @@ ImportDots: usize = { ImportAsNames: Vec = { >> => i, - "(" >> ")" => i, + "(" >> ","? ")" => i, "*" => { // Star import all vec![ast::ImportSymbol { symbol: "*".to_string(), alias: None }] @@ -952,11 +952,11 @@ Atom: ast::Expression = { }; ListLiteralValues: Vec = { - > <_trailing_comma:","?> => e, + > ","? => e, }; DictLiteralValues: Vec<(Option, ast::Expression)> = { - > <_trailing_comma:","?> => elements, + > ","? => elements, }; DictEntry: (ast::Expression, ast::Expression) = { diff --git a/tests/snippets/numbers.py b/tests/snippets/numbers.py index c36602ee1..b90168d41 100644 --- a/tests/snippets/numbers.py +++ b/tests/snippets/numbers.py @@ -1,3 +1,5 @@ +from testutils import assertRaises + x = 5 x.__init__(6) assert x == 5 @@ -42,3 +44,12 @@ assert int(0).__rxor__(1) == 1 assert int(1).__rxor__(1) == 0 assert int(3).__rxor__(-3) == -2 assert int(3).__rxor__(4) == 7 + +# Test underscores in numbers: +assert 1_2 == 12 +assert 1_2_3 == 123 +assert 1_2.3_4 == 12.34 +assert 1_2.3_4e0_0 == 12.34 + +with assertRaises(SyntaxError): + eval('1__2') From dbc562c61311b1d99ccfd8f17125089e2b400867 Mon Sep 17 00:00:00 2001 From: Windel Bouwman Date: Sun, 11 Aug 2019 09:57:40 +0200 Subject: [PATCH 2/2] Implement clippy hint. --- examples/parse_folder.rs | 14 +++++++------- parser/src/lexer.rs | 6 +++--- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/examples/parse_folder.rs b/examples/parse_folder.rs index 33530708d..ad0c5f859 100644 --- a/examples/parse_folder.rs +++ b/examples/parse_folder.rs @@ -14,7 +14,7 @@ extern crate log; use clap::{App, Arg}; use rustpython_parser::{ast, parser}; -use std::path::{Path, PathBuf}; +use std::path::Path; use std::time::{Duration, Instant}; fn main() { @@ -77,8 +77,8 @@ fn parse_python_file(filename: &Path) -> ParsedFile { info!("Parsing file {:?}", filename); match std::fs::read_to_string(filename) { Err(e) => ParsedFile { - filename: Box::new(filename.to_path_buf()), - code: "".to_string(), + // filename: Box::new(filename.to_path_buf()), + // code: "".to_string(), num_lines: 0, result: Err(e.to_string()), }, @@ -86,8 +86,8 @@ fn parse_python_file(filename: &Path) -> ParsedFile { let num_lines = source.to_string().lines().count(); let result = parser::parse_program(&source).map_err(|e| e.to_string()); ParsedFile { - filename: Box::new(filename.to_path_buf()), - code: source.to_string(), + // filename: Box::new(filename.to_path_buf()), + // code: source.to_string(), num_lines, result, } @@ -140,8 +140,8 @@ struct ScanResult { } struct ParsedFile { - filename: Box, - code: String, + // filename: Box, + // code: String, num_lines: usize, result: ParseResult, } diff --git a/parser/src/lexer.rs b/parser/src/lexer.rs index 405162104..0f2e28bd2 100644 --- a/parser/src/lexer.rs +++ b/parser/src/lexer.rs @@ -416,7 +416,7 @@ where loop { if let Some(c) = self.take_number(radix) { value_text.push(c); - } else if self.chr0 == Some('_') && Lexer::::is_digit_of_radix(&self.chr1, radix) { + } else if self.chr0 == Some('_') && Lexer::::is_digit_of_radix(self.chr1, radix) { self.next_char(); } else { break; @@ -427,7 +427,7 @@ where /// Consume a single character with the given radix. fn take_number(&mut self, radix: u32) -> Option { - let take_char = Lexer::::is_digit_of_radix(&self.chr0, radix); + let take_char = Lexer::::is_digit_of_radix(self.chr0, radix); if take_char { Some(self.next_char().unwrap()) @@ -437,7 +437,7 @@ where } /// Test if a digit is of a certain radix. - fn is_digit_of_radix(c: &Option, radix: u32) -> bool { + fn is_digit_of_radix(c: Option, radix: u32) -> bool { match radix { 2 => match c { Some('0'..='1') => true,