//! This module takes care of lexing python source text. This means source //! code is translated into seperate tokens. pub use super::token::Tok; use num_bigint::BigInt; use num_traits::Num; use std::cmp::Ordering; use std::collections::HashMap; use std::str::FromStr; #[derive(Clone, Copy, PartialEq, Debug)] struct IndentationLevel { tabs: usize, spaces: usize, } impl IndentationLevel { fn new() -> IndentationLevel { IndentationLevel { tabs: 0, spaces: 0 } } fn compare_strict(&self, other: &IndentationLevel) -> Option { // We only know for sure that we're smaller or bigger if tabs // and spaces both differ in the same direction. Otherwise we're // dependent on the size of tabs. if self.tabs < other.tabs { if self.spaces <= other.spaces { Some(Ordering::Less) } else { None } } else if self.tabs > other.tabs { if self.spaces >= other.spaces { Some(Ordering::Greater) } else { None } } else { Some(self.spaces.cmp(&other.spaces)) } } } pub struct Lexer> { chars: T, at_begin_of_line: bool, nesting: usize, // Amount of parenthesis indentation_stack: Vec, pending: Vec>, chr0: Option, chr1: Option, location: Location, } #[derive(Debug)] pub enum LexicalError { StringError, } #[derive(Clone, Debug, Default, PartialEq)] pub struct Location { row: usize, column: usize, } impl Location { pub fn new(row: usize, column: usize) -> Self { Location { row: row, column: column, } } pub fn get_row(&self) -> usize { self.row } pub fn get_column(&self) -> usize { self.column } } pub fn get_keywords() -> HashMap { let mut keywords: HashMap = HashMap::new(); // Alphabetical keywords: keywords.insert(String::from("..."), Tok::Ellipsis); keywords.insert(String::from("False"), Tok::False); keywords.insert(String::from("None"), Tok::None); keywords.insert(String::from("True"), Tok::True); keywords.insert(String::from("and"), Tok::And); keywords.insert(String::from("as"), Tok::As); keywords.insert(String::from("assert"), Tok::Assert); keywords.insert(String::from("break"), Tok::Break); keywords.insert(String::from("class"), Tok::Class); keywords.insert(String::from("continue"), Tok::Continue); keywords.insert(String::from("def"), Tok::Def); keywords.insert(String::from("del"), Tok::Del); keywords.insert(String::from("elif"), Tok::Elif); keywords.insert(String::from("else"), Tok::Else); keywords.insert(String::from("except"), Tok::Except); keywords.insert(String::from("finally"), Tok::Finally); keywords.insert(String::from("for"), Tok::For); keywords.insert(String::from("from"), Tok::From); keywords.insert(String::from("global"), Tok::Global); keywords.insert(String::from("if"), Tok::If); keywords.insert(String::from("import"), Tok::Import); keywords.insert(String::from("in"), Tok::In); keywords.insert(String::from("is"), Tok::Is); keywords.insert(String::from("lambda"), Tok::Lambda); keywords.insert(String::from("nonlocal"), Tok::Nonlocal); keywords.insert(String::from("not"), Tok::Not); keywords.insert(String::from("or"), Tok::Or); keywords.insert(String::from("pass"), Tok::Pass); keywords.insert(String::from("raise"), Tok::Raise); keywords.insert(String::from("return"), Tok::Return); keywords.insert(String::from("try"), Tok::Try); keywords.insert(String::from("while"), Tok::While); keywords.insert(String::from("with"), Tok::With); keywords.insert(String::from("yield"), Tok::Yield); keywords } pub type Spanned = Result<(Location, Tok, Location), LexicalError>; pub fn make_tokenizer<'a>(source: &'a str) -> impl Iterator> + 'a { let nlh = NewlineHandler::new(source.chars()); let lch = LineContinationHandler::new(nlh); let lexer = Lexer::new(lch); lexer } // The newline handler is an iterator which collapses different newline // types into \n always. pub struct NewlineHandler> { source: T, chr0: Option, chr1: Option, } impl NewlineHandler where T: Iterator, { pub fn new(source: T) -> Self { let mut nlh = NewlineHandler { source: source, chr0: None, chr1: None, }; nlh.shift(); nlh.shift(); nlh } fn shift(&mut self) -> Option { let result = self.chr0; self.chr0 = self.chr1; self.chr1 = self.source.next(); result } } impl Iterator for NewlineHandler where T: Iterator, { type Item = char; fn next(&mut self) -> Option { // Collapse \r\n into \n loop { if self.chr0 == Some('\r') { if self.chr1 == Some('\n') { // Transform windows EOL into \n self.shift(); } else { // Transform MAC EOL into \n self.chr0 = Some('\n') } } else { break; } } self.shift() } } // Glues \ and \n into a single line: pub struct LineContinationHandler> { source: T, chr0: Option, chr1: Option, } impl LineContinationHandler where T: Iterator, { pub fn new(source: T) -> Self { let mut nlh = LineContinationHandler { source: source, chr0: None, chr1: None, }; nlh.shift(); nlh.shift(); nlh } fn shift(&mut self) -> Option { let result = self.chr0; self.chr0 = self.chr1; self.chr1 = self.source.next(); result } } impl Iterator for LineContinationHandler where T: Iterator, { type Item = char; fn next(&mut self) -> Option { // Collapse \r\n into \n loop { if self.chr0 == Some('\\') && self.chr1 == Some('\n') { // Skip backslash and newline self.shift(); self.shift(); // Idea: insert trailing newline here: // } else if self.chr0 != Some('\n') && self.chr1.is_none() { // self.chr1 = Some('\n'); } else { break; } } self.shift() } } impl Lexer where T: Iterator, { pub fn new(input: T) -> Self { let mut lxr = Lexer { chars: input, at_begin_of_line: true, nesting: 0, indentation_stack: vec![IndentationLevel::new()], pending: Vec::new(), chr0: None, location: Location::new(0, 0), chr1: None, }; lxr.next_char(); lxr.next_char(); // Start at top row (=1) left column (=1) lxr.location.row = 1; lxr.location.column = 1; lxr } // Lexer helper functions: fn lex_identifier(&mut self) -> Spanned { let mut name = String::new(); let start_pos = self.get_pos(); // Detect potential string like rb'' b'' f'' u'' r'' let mut saw_b = false; let mut saw_r = false; let mut saw_u = false; let mut saw_f = false; loop { // Detect r"", f"", b"" and u"" // TODO: handle f-strings if !(saw_b || saw_u || saw_f) && (self.chr0 == Some('b') || self.chr0 == Some('B')) { saw_b = true; } else if !(saw_b || saw_r || saw_u || saw_f) && (self.chr0 == Some('u') || self.chr0 == Some('U')) { saw_u = true; } else if !(saw_r || saw_u) && (self.chr0 == Some('r') || self.chr0 == Some('R')) { saw_r = true; } else if !(saw_b || saw_u || saw_f) && (self.chr0 == Some('f') || self.chr0 == Some('F')) { saw_f = true; } else { break; } // Take up char into name: name.push(self.next_char().unwrap()); // Check if we have a string: if self.chr0 == Some('"') || self.chr0 == Some('\'') { return self.lex_string(saw_b, saw_r, saw_u, saw_f); } } while self.is_char() { name.push(self.next_char().unwrap()); } let end_pos = self.get_pos(); let mut keywords = get_keywords(); if keywords.contains_key(&name) { Ok((start_pos, keywords.remove(&name).unwrap(), end_pos)) } else { Ok((start_pos, Tok::Name { name: name }, end_pos)) } } fn lex_number(&mut self) -> Spanned { let start_pos = self.get_pos(); if self.chr0 == Some('0') { if self.chr1 == Some('x') || self.chr1 == Some('X') { // Hex! self.next_char(); self.next_char(); self.lex_number_radix(start_pos, 16) } else if self.chr1 == Some('o') || self.chr1 == Some('O') { // Octal style! self.next_char(); self.next_char(); self.lex_number_radix(start_pos, 8) } else if self.chr1 == Some('b') || self.chr1 == Some('B') { // Binary! self.next_char(); self.next_char(); self.lex_number_radix(start_pos, 2) } else { self.lex_normal_number() } } else { self.lex_normal_number() } } fn lex_number_radix(&mut self, start_pos: Location, radix: u32) -> Spanned { let mut value_text = String::new(); loop { if self.is_number(radix) { value_text.push(self.next_char().unwrap()); } else if self.chr0 == Some('_') { self.next_char(); } else { break; } } let end_pos = self.get_pos(); let value = BigInt::from_str_radix(&value_text, radix).unwrap(); Ok((start_pos, Tok::Int { value: value }, end_pos)) } fn lex_normal_number(&mut self) -> Spanned { let start_pos = self.get_pos(); let mut value_text = String::new(); // Normal number: while self.is_number(10) { value_text.push(self.next_char().unwrap()); } // If float: if self.chr0 == Some('.') || self.chr0 == Some('e') { // Take '.': if self.chr0 == Some('.') { value_text.push(self.next_char().unwrap()); while self.is_number(10) { value_text.push(self.next_char().unwrap()); } } // 1e6 for example: if self.chr0 == Some('e') { value_text.push(self.next_char().unwrap()); // Optional +/- if self.chr0 == Some('-') || self.chr0 == Some('+') { value_text.push(self.next_char().unwrap()); } while self.is_number(10) { value_text.push(self.next_char().unwrap()); } } let value = f64::from_str(&value_text).unwrap(); // Parse trailing 'j': if self.chr0 == Some('j') { self.next_char(); let end_pos = self.get_pos(); Ok(( start_pos, Tok::Complex { real: 0.0, imag: value, }, end_pos, )) } else { let end_pos = self.get_pos(); Ok((start_pos, Tok::Float { value: value }, end_pos)) } } else { // Parse trailing 'j': if self.chr0 == Some('j') { self.next_char(); let end_pos = self.get_pos(); let imag = f64::from_str(&value_text).unwrap(); Ok(( start_pos, Tok::Complex { real: 0.0, imag: imag, }, end_pos, )) } else { let end_pos = self.get_pos(); let value = value_text.parse::().unwrap(); Ok((start_pos, Tok::Int { value: value }, end_pos)) } } } fn lex_comment(&mut self) { // Skip everything until end of line self.next_char(); loop { match self.chr0 { Some('\n') => { return; } Some(_) => {} None => return, } self.next_char(); } } fn lex_string( &mut self, is_bytes: bool, is_raw: bool, _is_unicode: bool, _is_fstring: bool, ) -> Spanned { let quote_char = self.next_char().unwrap(); let mut string_content = String::new(); let start_pos = self.get_pos(); // If the next two characters are also the quote character, then we have a triple-quoted // string; consume those two characters and ensure that we require a triple-quote to close let triple_quoted = if self.chr0 == Some(quote_char) && self.chr1 == Some(quote_char) { self.next_char(); self.next_char(); true } else { false }; loop { match self.next_char() { Some('\\') => { if is_raw { string_content.push('\\'); } else { match self.next_char() { Some('\\') => { string_content.push('\\'); } Some('\'') => string_content.push('\''), Some('\"') => string_content.push('\"'), Some('\n') => { // Ignore Unix EOL character } Some('a') => string_content.push('\x07'), Some('b') => string_content.push('\x08'), Some('f') => string_content.push('\x0c'), Some('n') => { string_content.push('\n'); } Some('r') => string_content.push('\r'), Some('t') => { string_content.push('\t'); } Some('v') => string_content.push('\x0b'), Some(c) => { string_content.push('\\'); string_content.push(c); } None => { return Err(LexicalError::StringError); } } } } Some(c) => { if c == quote_char { if triple_quoted { // Look ahead at the next two characters; if we have two more // quote_chars, it's the end of the string; consume the remaining // closing quotes and break the loop if self.chr0 == Some(quote_char) && self.chr1 == Some(quote_char) { self.next_char(); self.next_char(); break; } string_content.push(c); } else { break; } } else { if c == '\n' { if !triple_quoted { return Err(LexicalError::StringError); } self.new_line(); } string_content.push(c); } } None => { return Err(LexicalError::StringError); } } } let end_pos = self.get_pos(); let tok = if is_bytes { Tok::Bytes { value: string_content.as_bytes().to_vec(), } } else { Tok::String { value: string_content, } }; return Ok((start_pos, tok, end_pos)); } fn is_char(&self) -> bool { match self.chr0 { Some('a'...'z') | Some('A'...'Z') | Some('_') | Some('0'...'9') => return true, _ => return false, } } fn is_number(&self, radix: u32) -> bool { match radix { 2 => match self.chr0 { Some('0'...'1') => return true, _ => return false, }, 8 => match self.chr0 { Some('0'...'7') => return true, _ => return false, }, 10 => match self.chr0 { Some('0'...'9') => return true, _ => return false, }, 16 => match self.chr0 { Some('0'...'9') | Some('a'...'f') | Some('A'...'F') => return true, _ => return false, }, x => unimplemented!("Radix not implemented: {}", x), } } fn next_char(&mut self) -> Option { let c = self.chr0; let nxt = self.chars.next(); self.chr0 = self.chr1; self.chr1 = nxt; self.location.column += 1; c } fn get_pos(&self) -> Location { self.location.clone() } fn new_line(&mut self) { self.location.row += 1; self.location.column = 1; } fn inner_next(&mut self) -> Option> { if !self.pending.is_empty() { return Some(self.pending.remove(0)); } 'top_loop: loop { // Detect indentation levels if self.at_begin_of_line { self.at_begin_of_line = false; // Determine indentation: let mut spaces: usize = 0; let mut tabs: usize = 0; loop { match self.chr0 { Some(' ') => { self.next_char(); spaces += 1; } Some('\t') => { if spaces != 0 { // Don't allow tabs after spaces as part of indentation. // This is technically stricter than python3 but spaces before // tabs is even more insane than mixing spaces and tabs. panic!("Tabs not allowed as part of indentation after spaces"); } self.next_char(); tabs += 1; } Some('#') => { self.lex_comment(); self.at_begin_of_line = true; continue 'top_loop; } Some('\n') => { // Empty line! self.next_char(); self.at_begin_of_line = true; self.new_line(); continue 'top_loop; } _ => { break; } } } let indentation_level = IndentationLevel { spaces, tabs }; if self.nesting == 0 { // Determine indent or dedent: let current_indentation = *self.indentation_stack.last().unwrap(); let ordering = indentation_level.compare_strict(¤t_indentation); match ordering { Some(Ordering::Equal) => { // Same same } Some(Ordering::Greater) => { // New indentation level: self.indentation_stack.push(indentation_level); let tok_start = self.get_pos(); let tok_end = tok_start.clone(); return Some(Ok((tok_start, Tok::Indent, tok_end))); } Some(Ordering::Less) => { // One or more dedentations // Pop off other levels until col is found: loop { let ordering = indentation_level .compare_strict(self.indentation_stack.last().unwrap()); match ordering { Some(Ordering::Less) => { self.indentation_stack.pop(); let tok_start = self.get_pos(); let tok_end = tok_start.clone(); self.pending.push(Ok((tok_start, Tok::Dedent, tok_end))); } None => { panic!("inconsistent use of tabs and spaces in indentation") } _ => { break; } }; } if indentation_level != *self.indentation_stack.last().unwrap() { // TODO: handle wrong indentations panic!("Non matching indentation levels!"); } return Some(self.pending.remove(0)); } None => panic!("inconsistent use of tabs and spaces in indentation"), } } } match self.chr0 { Some('0'...'9') => return Some(self.lex_number()), Some('_') | Some('a'...'z') | Some('A'...'Z') => return Some(self.lex_identifier()), Some('#') => { self.lex_comment(); continue; } Some('"') => { return Some(self.lex_string(false, false, false, false)); } Some('\'') => { return Some(self.lex_string(false, false, false, false)); } Some('=') => { let tok_start = self.get_pos(); self.next_char(); match self.chr0 { Some('=') => { self.next_char(); let tok_end = self.get_pos(); return Some(Ok((tok_start, Tok::EqEqual, tok_end))); } _ => { let tok_end = self.get_pos(); return Some(Ok((tok_start, Tok::Equal, tok_end))); } } } Some('+') => { let tok_start = self.get_pos(); self.next_char(); match self.chr0 { Some('=') => { self.next_char(); let tok_end = self.get_pos(); return Some(Ok((tok_start, Tok::PlusEqual, tok_end))); } _ => { let tok_end = self.get_pos(); return Some(Ok((tok_start, Tok::Plus, tok_end))); } } } Some('*') => { let tok_start = self.get_pos(); self.next_char(); match self.chr0 { Some('=') => { self.next_char(); let tok_end = self.get_pos(); return Some(Ok((tok_start, Tok::StarEqual, tok_end))); } Some('*') => { self.next_char(); match self.chr0 { Some('=') => { self.next_char(); let tok_end = self.get_pos(); return Some(Ok((tok_start, Tok::DoubleStarEqual, tok_end))); } _ => { let tok_end = self.get_pos(); return Some(Ok((tok_start, Tok::DoubleStar, tok_end))); } } } _ => { let tok_end = self.get_pos(); return Some(Ok((tok_start, Tok::Star, tok_end))); } } } Some('/') => { let tok_start = self.get_pos(); self.next_char(); match self.chr0 { Some('=') => { self.next_char(); let tok_end = self.get_pos(); return Some(Ok((tok_start, Tok::SlashEqual, tok_end))); } Some('/') => { self.next_char(); match self.chr0 { Some('=') => { self.next_char(); let tok_end = self.get_pos(); return Some(Ok((tok_start, Tok::DoubleSlashEqual, tok_end))); } _ => { let tok_end = self.get_pos(); return Some(Ok((tok_start, Tok::DoubleSlash, tok_end))); } } } _ => { let tok_end = self.get_pos(); return Some(Ok((tok_start, Tok::Slash, tok_end))); } } } Some('%') => { let tok_start = self.get_pos(); self.next_char(); match self.chr0 { Some('=') => { self.next_char(); let tok_end = self.get_pos(); return Some(Ok((tok_start, Tok::PercentEqual, tok_end))); } _ => { let tok_end = self.get_pos(); return Some(Ok((tok_start, Tok::Percent, tok_end))); } } } Some('|') => { let tok_start = self.get_pos(); self.next_char(); match self.chr0 { Some('=') => { self.next_char(); let tok_end = self.get_pos(); return Some(Ok((tok_start, Tok::VbarEqual, tok_end))); } _ => { let tok_end = self.get_pos(); return Some(Ok((tok_start, Tok::Vbar, tok_end))); } } } Some('^') => { let tok_start = self.get_pos(); self.next_char(); match self.chr0 { Some('=') => { self.next_char(); let tok_end = self.get_pos(); return Some(Ok((tok_start, Tok::CircumflexEqual, tok_end))); } _ => { let tok_end = self.get_pos(); return Some(Ok((tok_start, Tok::CircumFlex, tok_end))); } } } Some('&') => { let tok_start = self.get_pos(); self.next_char(); match self.chr0 { Some('=') => { self.next_char(); let tok_end = self.get_pos(); return Some(Ok((tok_start, Tok::AmperEqual, tok_end))); } _ => { let tok_end = self.get_pos(); return Some(Ok((tok_start, Tok::Amper, tok_end))); } } } Some('-') => { let tok_start = self.get_pos(); self.next_char(); match self.chr0 { Some('=') => { self.next_char(); let tok_end = self.get_pos(); return Some(Ok((tok_start, Tok::MinusEqual, tok_end))); } Some('>') => { self.next_char(); let tok_end = self.get_pos(); return Some(Ok((tok_start, Tok::Rarrow, tok_end))); } _ => { let tok_end = self.get_pos(); return Some(Ok((tok_start, Tok::Minus, tok_end))); } } } Some('@') => { let tok_start = self.get_pos(); self.next_char(); match self.chr0 { Some('=') => { self.next_char(); let tok_end = self.get_pos(); return Some(Ok((tok_start, Tok::AtEqual, tok_end))); } _ => { let tok_end = self.get_pos(); return Some(Ok((tok_start, Tok::At, tok_end))); } } } Some('!') => { let tok_start = self.get_pos(); self.next_char(); match self.chr0 { Some('=') => { self.next_char(); let tok_end = self.get_pos(); return Some(Ok((tok_start, Tok::NotEqual, tok_end))); } _ => panic!("Invalid token '!'"), } } Some('~') => { return Some(self.eat_single_char(Tok::Tilde)); } Some('(') => { let result = self.eat_single_char(Tok::Lpar); self.nesting += 1; return Some(result); } Some(')') => { let result = self.eat_single_char(Tok::Rpar); self.nesting -= 1; return Some(result); } Some('[') => { let result = self.eat_single_char(Tok::Lsqb); self.nesting += 1; return Some(result); } Some(']') => { let result = self.eat_single_char(Tok::Rsqb); self.nesting -= 1; return Some(result); } Some('{') => { let result = self.eat_single_char(Tok::Lbrace); self.nesting += 1; return Some(result); } Some('}') => { let result = self.eat_single_char(Tok::Rbrace); self.nesting -= 1; return Some(result); } Some(':') => { return Some(self.eat_single_char(Tok::Colon)); } Some(';') => { return Some(self.eat_single_char(Tok::Semi)); } Some('<') => { let tok_start = self.get_pos(); self.next_char(); match self.chr0 { Some('<') => { self.next_char(); match self.chr0 { Some('=') => { self.next_char(); let tok_end = self.get_pos(); return Some(Ok((tok_start, Tok::LeftShiftEqual, tok_end))); } _ => { let tok_end = self.get_pos(); return Some(Ok((tok_start, Tok::LeftShift, tok_end))); } } } Some('=') => { self.next_char(); let tok_end = self.get_pos(); return Some(Ok((tok_start, Tok::LessEqual, tok_end))); } _ => { let tok_end = self.get_pos(); return Some(Ok((tok_start, Tok::Less, tok_end))); } } } Some('>') => { let tok_start = self.get_pos(); self.next_char(); match self.chr0 { Some('>') => { self.next_char(); match self.chr0 { Some('=') => { self.next_char(); let tok_end = self.get_pos(); return Some(Ok((tok_start, Tok::RightShiftEqual, tok_end))); } _ => { let tok_end = self.get_pos(); return Some(Ok((tok_start, Tok::RightShift, tok_end))); } } } Some('=') => { self.next_char(); let tok_end = self.get_pos(); return Some(Ok((tok_start, Tok::GreaterEqual, tok_end))); } _ => { let tok_end = self.get_pos(); return Some(Ok((tok_start, Tok::Greater, tok_end))); } } } Some(',') => { let tok_start = self.get_pos(); self.next_char(); let tok_end = self.get_pos(); return Some(Ok((tok_start, Tok::Comma, tok_end))); } Some('.') => { let tok_start = self.get_pos(); self.next_char(); let tok_end = self.get_pos(); return Some(Ok((tok_start, Tok::Dot, tok_end))); } Some('\n') => { let tok_start = self.get_pos(); self.next_char(); let tok_end = self.get_pos(); self.new_line(); // Depending on the nesting level, we emit newline or not: if self.nesting == 0 { self.at_begin_of_line = true; return Some(Ok((tok_start, Tok::Newline, tok_end))); } else { continue; } } Some(' ') => { // Skip whitespaces self.next_char(); continue; } None => return None, _ => { let c = self.next_char(); panic!("Not impl {:?}", c) } // Ignore all the rest.. } } } fn eat_single_char(&mut self, ty: Tok) -> Spanned { let tok_start = self.get_pos(); self.next_char(); let tok_end = self.get_pos(); Ok((tok_start, ty, tok_end)) } } /* Implement iterator pattern for the get_tok function. Calling the next element in the iterator will yield the next lexical token. */ impl Iterator for Lexer where T: Iterator, { type Item = Spanned; fn next(&mut self) -> Option { // Idea: create some sort of hash map for single char tokens: // let mut X = HashMap::new(); // X.insert('=', Tok::Equal); let token = self.inner_next(); trace!( "Lex token {:?}, nesting={:?}, indent stack: {:?}", token, self.nesting, self.indentation_stack ); token } } #[cfg(test)] mod tests { use super::{make_tokenizer, NewlineHandler, Tok}; use num_bigint::BigInt; use std::iter::FromIterator; use std::iter::Iterator; const WINDOWS_EOL: &str = "\r\n"; const MAC_EOL: &str = "\r"; const UNIX_EOL: &str = "\n"; pub fn lex_source(source: &String) -> Vec { let lexer = make_tokenizer(source); Vec::from_iter(lexer.map(|x| x.unwrap().1)) } #[test] fn test_newline_processor() { // Escape \ followed by \n (by removal): let src = "b\\\r\n"; assert_eq!(4, src.len()); let nlh = NewlineHandler::new(src.chars()); let x: Vec = nlh.collect(); assert_eq!(vec!['b', '\\', '\n'], x); } #[test] fn test_raw_string() { let source = String::from("r\"\\\\\" \"\\\\\""); let tokens = lex_source(&source); assert_eq!( tokens, vec![ Tok::String { value: "\\\\".to_string(), }, Tok::String { value: "\\".to_string(), } ] ); } #[test] fn test_numbers() { let source = String::from("0x2f 0b1101 0 123 0.2 2j 2.2j"); let tokens = lex_source(&source); assert_eq!( tokens, vec![ Tok::Int { value: BigInt::from(47), }, Tok::Int { value: BigInt::from(13), }, Tok::Int { value: BigInt::from(0), }, Tok::Int { value: BigInt::from(123), }, Tok::Float { value: 0.2 }, Tok::Complex { real: 0.0, imag: 2.0, }, Tok::Complex { real: 0.0, imag: 2.2, }, ] ); } macro_rules! test_line_comment { ($($name:ident: $eol:expr,)*) => { $( #[test] fn $name() { let source = String::from(format!(r"99232 # {}", $eol)); let tokens = lex_source(&source); assert_eq!(tokens, vec![Tok::Int { value: BigInt::from(99232) }]); } )* } } test_line_comment! { test_line_comment_long: " foo", test_line_comment_whitespace: " ", test_line_comment_single_whitespace: " ", test_line_comment_empty: "", } macro_rules! test_comment_until_eol { ($($name:ident: $eol:expr,)*) => { $( #[test] fn $name() { let source = String::from(format!("123 # Foo{}456", $eol)); let tokens = lex_source(&source); assert_eq!( tokens, vec![ Tok::Int { value: BigInt::from(123) }, Tok::Newline, Tok::Int { value: BigInt::from(456) }, ] ) } )* } } test_comment_until_eol! { test_comment_until_windows_eol: WINDOWS_EOL, test_comment_until_mac_eol: MAC_EOL, test_comment_until_unix_eol: UNIX_EOL, } #[test] fn test_assignment() { let source = String::from(r"avariable = 99 + 2-0"); let tokens = lex_source(&source); assert_eq!( tokens, vec![ Tok::Name { name: String::from("avariable"), }, Tok::Equal, Tok::Int { value: BigInt::from(99) }, Tok::Plus, Tok::Int { value: BigInt::from(2) }, Tok::Minus, Tok::Int { value: BigInt::from(0) }, ] ); } macro_rules! test_indentation_with_eol { ($($name:ident: $eol:expr,)*) => { $( #[test] fn $name() { let source = String::from(format!("def foo():{} return 99{}{}", $eol, $eol, $eol)); let tokens = lex_source(&source); assert_eq!( tokens, vec![ Tok::Def, Tok::Name { name: String::from("foo"), }, Tok::Lpar, Tok::Rpar, Tok::Colon, Tok::Newline, Tok::Indent, Tok::Return, Tok::Int { value: BigInt::from(99) }, Tok::Newline, Tok::Dedent, ] ); } )* }; } test_indentation_with_eol! { test_indentation_windows_eol: WINDOWS_EOL, test_indentation_mac_eol: MAC_EOL, test_indentation_unix_eol: UNIX_EOL, } macro_rules! test_double_dedent_with_eol { ($($name:ident: $eol:expr,)*) => { $( #[test] fn $name() { let source = String::from(format!("def foo():{} if x:{}{} return 99{}{}", $eol, $eol, $eol, $eol, $eol)); let tokens = lex_source(&source); assert_eq!( tokens, vec![ Tok::Def, Tok::Name { name: String::from("foo"), }, Tok::Lpar, Tok::Rpar, Tok::Colon, Tok::Newline, Tok::Indent, Tok::If, Tok::Name { name: String::from("x"), }, Tok::Colon, Tok::Newline, Tok::Indent, Tok::Return, Tok::Int { value: BigInt::from(99) }, Tok::Newline, Tok::Dedent, Tok::Dedent, ] ); } )* } } macro_rules! test_double_dedent_with_tabs { ($($name:ident: $eol:expr,)*) => { $( #[test] fn $name() { let source = String::from(format!("def foo():{}\tif x:{}{}\t return 99{}{}", $eol, $eol, $eol, $eol, $eol)); let tokens = lex_source(&source); assert_eq!( tokens, vec![ Tok::Def, Tok::Name { name: String::from("foo"), }, Tok::Lpar, Tok::Rpar, Tok::Colon, Tok::Newline, Tok::Indent, Tok::If, Tok::Name { name: String::from("x"), }, Tok::Colon, Tok::Newline, Tok::Indent, Tok::Return, Tok::Int { value: BigInt::from(99) }, Tok::Newline, Tok::Dedent, Tok::Dedent, ] ); } )* } } test_double_dedent_with_eol! { test_double_dedent_windows_eol: WINDOWS_EOL, test_double_dedent_mac_eol: MAC_EOL, test_double_dedent_unix_eol: UNIX_EOL, } test_double_dedent_with_tabs! { test_double_dedent_tabs_windows_eol: WINDOWS_EOL, test_double_dedent_tabs_mac_eol: MAC_EOL, test_double_dedent_tabs_unix_eol: UNIX_EOL, } macro_rules! test_newline_in_brackets { ($($name:ident: $eol:expr,)*) => { $( #[test] fn $name() { let source = String::from(format!("x = [{} 1,2{}]{}", $eol, $eol, $eol)); let tokens = lex_source(&source); assert_eq!( tokens, vec![ Tok::Name { name: String::from("x"), }, Tok::Equal, Tok::Lsqb, Tok::Int { value: BigInt::from(1) }, Tok::Comma, Tok::Int { value: BigInt::from(2) }, Tok::Rsqb, Tok::Newline, ] ); } )* }; } test_newline_in_brackets! { test_newline_in_brackets_windows_eol: WINDOWS_EOL, test_newline_in_brackets_mac_eol: MAC_EOL, test_newline_in_brackets_unix_eol: UNIX_EOL, } #[test] fn test_operators() { let source = String::from("//////=/ /"); let tokens = lex_source(&source); assert_eq!( tokens, vec![ Tok::DoubleSlash, Tok::DoubleSlash, Tok::DoubleSlashEqual, Tok::Slash, Tok::Slash, ] ); } #[test] fn test_string() { let source = String::from(r#""double" 'single' 'can\'t' "\\\"" '\t\r\n' '\g'"#); let tokens = lex_source(&source); assert_eq!( tokens, vec![ Tok::String { value: String::from("double"), }, Tok::String { value: String::from("single"), }, Tok::String { value: String::from("can't"), }, Tok::String { value: String::from("\\\""), }, Tok::String { value: String::from("\t\r\n"), }, Tok::String { value: String::from("\\g"), }, ] ); } macro_rules! test_string_continuation { ($($name:ident: $eol:expr,)*) => { $( #[test] fn $name() { let source = String::from(format!("\"abc\\{}def\"", $eol)); let tokens = lex_source(&source); assert_eq!( tokens, vec![ Tok::String { value: String::from("abcdef"), }, ] ) } )* } } test_string_continuation! { test_string_continuation_windows_eol: WINDOWS_EOL, test_string_continuation_mac_eol: MAC_EOL, test_string_continuation_unix_eol: UNIX_EOL, } }