Files
RustPython/parser/src/lexer.rs
2018-12-16 17:40:25 -05:00

1454 lines
49 KiB
Rust

//! This module takes care of lexing python source text. This means source
//! code is translated into seperate tokens.
pub use super::token::Tok;
use num_bigint::BigInt;
use num_traits::Num;
use std::cmp::Ordering;
use std::collections::HashMap;
use std::str::FromStr;
#[derive(Clone, Copy, PartialEq, Debug)]
struct IndentationLevel {
tabs: usize,
spaces: usize,
}
impl IndentationLevel {
fn new() -> IndentationLevel {
IndentationLevel { tabs: 0, spaces: 0 }
}
fn compare_strict(&self, other: &IndentationLevel) -> Option<Ordering> {
// We only know for sure that we're smaller or bigger if tabs
// and spaces both differ in the same direction. Otherwise we're
// dependent on the size of tabs.
if self.tabs < other.tabs {
if self.spaces <= other.spaces {
Some(Ordering::Less)
} else {
None
}
} else if self.tabs > other.tabs {
if self.spaces >= other.spaces {
Some(Ordering::Greater)
} else {
None
}
} else {
Some(self.spaces.cmp(&other.spaces))
}
}
}
pub struct Lexer<T: Iterator<Item = char>> {
chars: T,
at_begin_of_line: bool,
nesting: usize, // Amount of parenthesis
indentation_stack: Vec<IndentationLevel>,
pending: Vec<Spanned<Tok>>,
chr0: Option<char>,
chr1: Option<char>,
location: Location,
}
#[derive(Debug)]
pub enum LexicalError {
StringError,
}
#[derive(Clone, Debug, Default, PartialEq)]
pub struct Location {
row: usize,
column: usize,
}
impl Location {
pub fn new(row: usize, column: usize) -> Self {
Location {
row: row,
column: column,
}
}
pub fn get_row(&self) -> usize {
self.row
}
pub fn get_column(&self) -> usize {
self.column
}
}
pub fn get_keywords() -> HashMap<String, Tok> {
let mut keywords: HashMap<String, Tok> = HashMap::new();
// Alphabetical keywords:
keywords.insert(String::from("..."), Tok::Ellipsis);
keywords.insert(String::from("False"), Tok::False);
keywords.insert(String::from("None"), Tok::None);
keywords.insert(String::from("True"), Tok::True);
keywords.insert(String::from("and"), Tok::And);
keywords.insert(String::from("as"), Tok::As);
keywords.insert(String::from("assert"), Tok::Assert);
keywords.insert(String::from("break"), Tok::Break);
keywords.insert(String::from("class"), Tok::Class);
keywords.insert(String::from("continue"), Tok::Continue);
keywords.insert(String::from("def"), Tok::Def);
keywords.insert(String::from("del"), Tok::Del);
keywords.insert(String::from("elif"), Tok::Elif);
keywords.insert(String::from("else"), Tok::Else);
keywords.insert(String::from("except"), Tok::Except);
keywords.insert(String::from("finally"), Tok::Finally);
keywords.insert(String::from("for"), Tok::For);
keywords.insert(String::from("from"), Tok::From);
keywords.insert(String::from("global"), Tok::Global);
keywords.insert(String::from("if"), Tok::If);
keywords.insert(String::from("import"), Tok::Import);
keywords.insert(String::from("in"), Tok::In);
keywords.insert(String::from("is"), Tok::Is);
keywords.insert(String::from("lambda"), Tok::Lambda);
keywords.insert(String::from("nonlocal"), Tok::Nonlocal);
keywords.insert(String::from("not"), Tok::Not);
keywords.insert(String::from("or"), Tok::Or);
keywords.insert(String::from("pass"), Tok::Pass);
keywords.insert(String::from("raise"), Tok::Raise);
keywords.insert(String::from("return"), Tok::Return);
keywords.insert(String::from("try"), Tok::Try);
keywords.insert(String::from("while"), Tok::While);
keywords.insert(String::from("with"), Tok::With);
keywords.insert(String::from("yield"), Tok::Yield);
keywords
}
pub type Spanned<Tok> = Result<(Location, Tok, Location), LexicalError>;
pub fn make_tokenizer<'a>(source: &'a str) -> impl Iterator<Item = Spanned<Tok>> + 'a {
let nlh = NewlineHandler::new(source.chars());
let lch = LineContinationHandler::new(nlh);
let lexer = Lexer::new(lch);
lexer
}
// The newline handler is an iterator which collapses different newline
// types into \n always.
pub struct NewlineHandler<T: Iterator<Item = char>> {
source: T,
chr0: Option<char>,
chr1: Option<char>,
}
impl<T> NewlineHandler<T>
where
T: Iterator<Item = char>,
{
pub fn new(source: T) -> Self {
let mut nlh = NewlineHandler {
source: source,
chr0: None,
chr1: None,
};
nlh.shift();
nlh.shift();
nlh
}
fn shift(&mut self) -> Option<char> {
let result = self.chr0;
self.chr0 = self.chr1;
self.chr1 = self.source.next();
result
}
}
impl<T> Iterator for NewlineHandler<T>
where
T: Iterator<Item = char>,
{
type Item = char;
fn next(&mut self) -> Option<Self::Item> {
// Collapse \r\n into \n
loop {
if self.chr0 == Some('\r') {
if self.chr1 == Some('\n') {
// Transform windows EOL into \n
self.shift();
} else {
// Transform MAC EOL into \n
self.chr0 = Some('\n')
}
} else {
break;
}
}
self.shift()
}
}
// Glues \ and \n into a single line:
pub struct LineContinationHandler<T: Iterator<Item = char>> {
source: T,
chr0: Option<char>,
chr1: Option<char>,
}
impl<T> LineContinationHandler<T>
where
T: Iterator<Item = char>,
{
pub fn new(source: T) -> Self {
let mut nlh = LineContinationHandler {
source: source,
chr0: None,
chr1: None,
};
nlh.shift();
nlh.shift();
nlh
}
fn shift(&mut self) -> Option<char> {
let result = self.chr0;
self.chr0 = self.chr1;
self.chr1 = self.source.next();
result
}
}
impl<T> Iterator for LineContinationHandler<T>
where
T: Iterator<Item = char>,
{
type Item = char;
fn next(&mut self) -> Option<Self::Item> {
// Collapse \r\n into \n
loop {
if self.chr0 == Some('\\') && self.chr1 == Some('\n') {
// Skip backslash and newline
self.shift();
self.shift();
// Idea: insert trailing newline here:
// } else if self.chr0 != Some('\n') && self.chr1.is_none() {
// self.chr1 = Some('\n');
} else {
break;
}
}
self.shift()
}
}
impl<T> Lexer<T>
where
T: Iterator<Item = char>,
{
pub fn new(input: T) -> Self {
let mut lxr = Lexer {
chars: input,
at_begin_of_line: true,
nesting: 0,
indentation_stack: vec![IndentationLevel::new()],
pending: Vec::new(),
chr0: None,
location: Location::new(0, 0),
chr1: None,
};
lxr.next_char();
lxr.next_char();
// Start at top row (=1) left column (=1)
lxr.location.row = 1;
lxr.location.column = 1;
lxr
}
// Lexer helper functions:
fn lex_identifier(&mut self) -> Spanned<Tok> {
let mut name = String::new();
let start_pos = self.get_pos();
// Detect potential string like rb'' b'' f'' u'' r''
let mut saw_b = false;
let mut saw_r = false;
let mut saw_u = false;
let mut saw_f = false;
loop {
// Detect r"", f"", b"" and u""
// TODO: handle f-strings
if !(saw_b || saw_u || saw_f) && (self.chr0 == Some('b') || self.chr0 == Some('B')) {
saw_b = true;
} else if !(saw_b || saw_r || saw_u || saw_f)
&& (self.chr0 == Some('u') || self.chr0 == Some('U'))
{
saw_u = true;
} else if !(saw_r || saw_u) && (self.chr0 == Some('r') || self.chr0 == Some('R')) {
saw_r = true;
} else if !(saw_b || saw_u || saw_f)
&& (self.chr0 == Some('f') || self.chr0 == Some('F'))
{
saw_f = true;
} else {
break;
}
// Take up char into name:
name.push(self.next_char().unwrap());
// Check if we have a string:
if self.chr0 == Some('"') || self.chr0 == Some('\'') {
return self.lex_string(saw_b, saw_r, saw_u, saw_f);
}
}
while self.is_char() {
name.push(self.next_char().unwrap());
}
let end_pos = self.get_pos();
let mut keywords = get_keywords();
if keywords.contains_key(&name) {
Ok((start_pos, keywords.remove(&name).unwrap(), end_pos))
} else {
Ok((start_pos, Tok::Name { name: name }, end_pos))
}
}
fn lex_number(&mut self) -> Spanned<Tok> {
let start_pos = self.get_pos();
if self.chr0 == Some('0') {
if self.chr1 == Some('x') || self.chr1 == Some('X') {
// Hex!
self.next_char();
self.next_char();
self.lex_number_radix(start_pos, 16)
} else if self.chr1 == Some('o') || self.chr1 == Some('O') {
// Octal style!
self.next_char();
self.next_char();
self.lex_number_radix(start_pos, 8)
} else if self.chr1 == Some('b') || self.chr1 == Some('B') {
// Binary!
self.next_char();
self.next_char();
self.lex_number_radix(start_pos, 2)
} else {
self.lex_normal_number()
}
} else {
self.lex_normal_number()
}
}
fn lex_number_radix(&mut self, start_pos: Location, radix: u32) -> Spanned<Tok> {
let mut value_text = String::new();
loop {
if self.is_number(radix) {
value_text.push(self.next_char().unwrap());
} else if self.chr0 == Some('_') {
self.next_char();
} else {
break;
}
}
let end_pos = self.get_pos();
let value = BigInt::from_str_radix(&value_text, radix).unwrap();
Ok((start_pos, Tok::Int { value: value }, end_pos))
}
fn lex_normal_number(&mut self) -> Spanned<Tok> {
let start_pos = self.get_pos();
let mut value_text = String::new();
// Normal number:
while self.is_number(10) {
value_text.push(self.next_char().unwrap());
}
// If float:
if self.chr0 == Some('.') || self.chr0 == Some('e') {
// Take '.':
if self.chr0 == Some('.') {
value_text.push(self.next_char().unwrap());
while self.is_number(10) {
value_text.push(self.next_char().unwrap());
}
}
// 1e6 for example:
if self.chr0 == Some('e') {
value_text.push(self.next_char().unwrap());
// Optional +/-
if self.chr0 == Some('-') || self.chr0 == Some('+') {
value_text.push(self.next_char().unwrap());
}
while self.is_number(10) {
value_text.push(self.next_char().unwrap());
}
}
let value = f64::from_str(&value_text).unwrap();
// Parse trailing 'j':
if self.chr0 == Some('j') {
self.next_char();
let end_pos = self.get_pos();
Ok((
start_pos,
Tok::Complex {
real: 0.0,
imag: value,
},
end_pos,
))
} else {
let end_pos = self.get_pos();
Ok((start_pos, Tok::Float { value: value }, end_pos))
}
} else {
// Parse trailing 'j':
if self.chr0 == Some('j') {
self.next_char();
let end_pos = self.get_pos();
let imag = f64::from_str(&value_text).unwrap();
Ok((
start_pos,
Tok::Complex {
real: 0.0,
imag: imag,
},
end_pos,
))
} else {
let end_pos = self.get_pos();
let value = value_text.parse::<BigInt>().unwrap();
Ok((start_pos, Tok::Int { value: value }, end_pos))
}
}
}
fn lex_comment(&mut self) {
// Skip everything until end of line
self.next_char();
loop {
match self.chr0 {
Some('\n') => {
return;
}
Some(_) => {}
None => return,
}
self.next_char();
}
}
fn lex_string(
&mut self,
is_bytes: bool,
is_raw: bool,
_is_unicode: bool,
_is_fstring: bool,
) -> Spanned<Tok> {
let quote_char = self.next_char().unwrap();
let mut string_content = String::new();
let start_pos = self.get_pos();
// If the next two characters are also the quote character, then we have a triple-quoted
// string; consume those two characters and ensure that we require a triple-quote to close
let triple_quoted = if self.chr0 == Some(quote_char) && self.chr1 == Some(quote_char) {
self.next_char();
self.next_char();
true
} else {
false
};
loop {
match self.next_char() {
Some('\\') => {
if is_raw {
string_content.push('\\');
} else {
match self.next_char() {
Some('\\') => {
string_content.push('\\');
}
Some('\'') => string_content.push('\''),
Some('\"') => string_content.push('\"'),
Some('\n') => {
// Ignore Unix EOL character
}
Some('a') => string_content.push('\x07'),
Some('b') => string_content.push('\x08'),
Some('f') => string_content.push('\x0c'),
Some('n') => {
string_content.push('\n');
}
Some('r') => string_content.push('\r'),
Some('t') => {
string_content.push('\t');
}
Some('v') => string_content.push('\x0b'),
Some(c) => {
string_content.push('\\');
string_content.push(c);
}
None => {
return Err(LexicalError::StringError);
}
}
}
}
Some(c) => {
if c == quote_char {
if triple_quoted {
// Look ahead at the next two characters; if we have two more
// quote_chars, it's the end of the string; consume the remaining
// closing quotes and break the loop
if self.chr0 == Some(quote_char) && self.chr1 == Some(quote_char) {
self.next_char();
self.next_char();
break;
}
string_content.push(c);
} else {
break;
}
} else {
if c == '\n' {
if !triple_quoted {
return Err(LexicalError::StringError);
}
self.new_line();
}
string_content.push(c);
}
}
None => {
return Err(LexicalError::StringError);
}
}
}
let end_pos = self.get_pos();
let tok = if is_bytes {
Tok::Bytes {
value: string_content.as_bytes().to_vec(),
}
} else {
Tok::String {
value: string_content,
}
};
return Ok((start_pos, tok, end_pos));
}
fn is_char(&self) -> bool {
match self.chr0 {
Some('a'...'z') | Some('A'...'Z') | Some('_') | Some('0'...'9') => return true,
_ => return false,
}
}
fn is_number(&self, radix: u32) -> bool {
match radix {
2 => match self.chr0 {
Some('0'...'1') => return true,
_ => return false,
},
8 => match self.chr0 {
Some('0'...'7') => return true,
_ => return false,
},
10 => match self.chr0 {
Some('0'...'9') => return true,
_ => return false,
},
16 => match self.chr0 {
Some('0'...'9') | Some('a'...'f') | Some('A'...'F') => return true,
_ => return false,
},
x => unimplemented!("Radix not implemented: {}", x),
}
}
fn next_char(&mut self) -> Option<char> {
let c = self.chr0;
let nxt = self.chars.next();
self.chr0 = self.chr1;
self.chr1 = nxt;
self.location.column += 1;
c
}
fn get_pos(&self) -> Location {
self.location.clone()
}
fn new_line(&mut self) {
self.location.row += 1;
self.location.column = 1;
}
fn inner_next(&mut self) -> Option<Spanned<Tok>> {
if !self.pending.is_empty() {
return Some(self.pending.remove(0));
}
'top_loop: loop {
// Detect indentation levels
if self.at_begin_of_line {
self.at_begin_of_line = false;
// Determine indentation:
let mut spaces: usize = 0;
let mut tabs: usize = 0;
loop {
match self.chr0 {
Some(' ') => {
self.next_char();
spaces += 1;
}
Some('\t') => {
if spaces != 0 {
// Don't allow tabs after spaces as part of indentation.
// This is technically stricter than python3 but spaces before
// tabs is even more insane than mixing spaces and tabs.
panic!("Tabs not allowed as part of indentation after spaces");
}
self.next_char();
tabs += 1;
}
Some('#') => {
self.lex_comment();
self.at_begin_of_line = true;
continue 'top_loop;
}
Some('\n') => {
// Empty line!
self.next_char();
self.at_begin_of_line = true;
self.new_line();
continue 'top_loop;
}
_ => {
break;
}
}
}
let indentation_level = IndentationLevel { spaces, tabs };
if self.nesting == 0 {
// Determine indent or dedent:
let current_indentation = *self.indentation_stack.last().unwrap();
let ordering = indentation_level.compare_strict(&current_indentation);
match ordering {
Some(Ordering::Equal) => {
// Same same
}
Some(Ordering::Greater) => {
// New indentation level:
self.indentation_stack.push(indentation_level);
let tok_start = self.get_pos();
let tok_end = tok_start.clone();
return Some(Ok((tok_start, Tok::Indent, tok_end)));
}
Some(Ordering::Less) => {
// One or more dedentations
// Pop off other levels until col is found:
loop {
let ordering = indentation_level
.compare_strict(self.indentation_stack.last().unwrap());
match ordering {
Some(Ordering::Less) => {
self.indentation_stack.pop();
let tok_start = self.get_pos();
let tok_end = tok_start.clone();
self.pending.push(Ok((tok_start, Tok::Dedent, tok_end)));
}
None => {
panic!("inconsistent use of tabs and spaces in indentation")
}
_ => {
break;
}
};
}
if indentation_level != *self.indentation_stack.last().unwrap() {
// TODO: handle wrong indentations
panic!("Non matching indentation levels!");
}
return Some(self.pending.remove(0));
}
None => panic!("inconsistent use of tabs and spaces in indentation"),
}
}
}
match self.chr0 {
Some('0'...'9') => return Some(self.lex_number()),
Some('_') | Some('a'...'z') | Some('A'...'Z') => return Some(self.lex_identifier()),
Some('#') => {
self.lex_comment();
continue;
}
Some('"') => {
return Some(self.lex_string(false, false, false, false));
}
Some('\'') => {
return Some(self.lex_string(false, false, false, false));
}
Some('=') => {
let tok_start = self.get_pos();
self.next_char();
match self.chr0 {
Some('=') => {
self.next_char();
let tok_end = self.get_pos();
return Some(Ok((tok_start, Tok::EqEqual, tok_end)));
}
_ => {
let tok_end = self.get_pos();
return Some(Ok((tok_start, Tok::Equal, tok_end)));
}
}
}
Some('+') => {
let tok_start = self.get_pos();
self.next_char();
match self.chr0 {
Some('=') => {
self.next_char();
let tok_end = self.get_pos();
return Some(Ok((tok_start, Tok::PlusEqual, tok_end)));
}
_ => {
let tok_end = self.get_pos();
return Some(Ok((tok_start, Tok::Plus, tok_end)));
}
}
}
Some('*') => {
let tok_start = self.get_pos();
self.next_char();
match self.chr0 {
Some('=') => {
self.next_char();
let tok_end = self.get_pos();
return Some(Ok((tok_start, Tok::StarEqual, tok_end)));
}
Some('*') => {
self.next_char();
match self.chr0 {
Some('=') => {
self.next_char();
let tok_end = self.get_pos();
return Some(Ok((tok_start, Tok::DoubleStarEqual, tok_end)));
}
_ => {
let tok_end = self.get_pos();
return Some(Ok((tok_start, Tok::DoubleStar, tok_end)));
}
}
}
_ => {
let tok_end = self.get_pos();
return Some(Ok((tok_start, Tok::Star, tok_end)));
}
}
}
Some('/') => {
let tok_start = self.get_pos();
self.next_char();
match self.chr0 {
Some('=') => {
self.next_char();
let tok_end = self.get_pos();
return Some(Ok((tok_start, Tok::SlashEqual, tok_end)));
}
Some('/') => {
self.next_char();
match self.chr0 {
Some('=') => {
self.next_char();
let tok_end = self.get_pos();
return Some(Ok((tok_start, Tok::DoubleSlashEqual, tok_end)));
}
_ => {
let tok_end = self.get_pos();
return Some(Ok((tok_start, Tok::DoubleSlash, tok_end)));
}
}
}
_ => {
let tok_end = self.get_pos();
return Some(Ok((tok_start, Tok::Slash, tok_end)));
}
}
}
Some('%') => {
let tok_start = self.get_pos();
self.next_char();
match self.chr0 {
Some('=') => {
self.next_char();
let tok_end = self.get_pos();
return Some(Ok((tok_start, Tok::PercentEqual, tok_end)));
}
_ => {
let tok_end = self.get_pos();
return Some(Ok((tok_start, Tok::Percent, tok_end)));
}
}
}
Some('|') => {
let tok_start = self.get_pos();
self.next_char();
match self.chr0 {
Some('=') => {
self.next_char();
let tok_end = self.get_pos();
return Some(Ok((tok_start, Tok::VbarEqual, tok_end)));
}
_ => {
let tok_end = self.get_pos();
return Some(Ok((tok_start, Tok::Vbar, tok_end)));
}
}
}
Some('^') => {
let tok_start = self.get_pos();
self.next_char();
match self.chr0 {
Some('=') => {
self.next_char();
let tok_end = self.get_pos();
return Some(Ok((tok_start, Tok::CircumflexEqual, tok_end)));
}
_ => {
let tok_end = self.get_pos();
return Some(Ok((tok_start, Tok::CircumFlex, tok_end)));
}
}
}
Some('&') => {
let tok_start = self.get_pos();
self.next_char();
match self.chr0 {
Some('=') => {
self.next_char();
let tok_end = self.get_pos();
return Some(Ok((tok_start, Tok::AmperEqual, tok_end)));
}
_ => {
let tok_end = self.get_pos();
return Some(Ok((tok_start, Tok::Amper, tok_end)));
}
}
}
Some('-') => {
let tok_start = self.get_pos();
self.next_char();
match self.chr0 {
Some('=') => {
self.next_char();
let tok_end = self.get_pos();
return Some(Ok((tok_start, Tok::MinusEqual, tok_end)));
}
Some('>') => {
self.next_char();
let tok_end = self.get_pos();
return Some(Ok((tok_start, Tok::Rarrow, tok_end)));
}
_ => {
let tok_end = self.get_pos();
return Some(Ok((tok_start, Tok::Minus, tok_end)));
}
}
}
Some('@') => {
let tok_start = self.get_pos();
self.next_char();
match self.chr0 {
Some('=') => {
self.next_char();
let tok_end = self.get_pos();
return Some(Ok((tok_start, Tok::AtEqual, tok_end)));
}
_ => {
let tok_end = self.get_pos();
return Some(Ok((tok_start, Tok::At, tok_end)));
}
}
}
Some('!') => {
let tok_start = self.get_pos();
self.next_char();
match self.chr0 {
Some('=') => {
self.next_char();
let tok_end = self.get_pos();
return Some(Ok((tok_start, Tok::NotEqual, tok_end)));
}
_ => panic!("Invalid token '!'"),
}
}
Some('~') => {
return Some(self.eat_single_char(Tok::Tilde));
}
Some('(') => {
let result = self.eat_single_char(Tok::Lpar);
self.nesting += 1;
return Some(result);
}
Some(')') => {
let result = self.eat_single_char(Tok::Rpar);
self.nesting -= 1;
return Some(result);
}
Some('[') => {
let result = self.eat_single_char(Tok::Lsqb);
self.nesting += 1;
return Some(result);
}
Some(']') => {
let result = self.eat_single_char(Tok::Rsqb);
self.nesting -= 1;
return Some(result);
}
Some('{') => {
let result = self.eat_single_char(Tok::Lbrace);
self.nesting += 1;
return Some(result);
}
Some('}') => {
let result = self.eat_single_char(Tok::Rbrace);
self.nesting -= 1;
return Some(result);
}
Some(':') => {
return Some(self.eat_single_char(Tok::Colon));
}
Some(';') => {
return Some(self.eat_single_char(Tok::Semi));
}
Some('<') => {
let tok_start = self.get_pos();
self.next_char();
match self.chr0 {
Some('<') => {
self.next_char();
match self.chr0 {
Some('=') => {
self.next_char();
let tok_end = self.get_pos();
return Some(Ok((tok_start, Tok::LeftShiftEqual, tok_end)));
}
_ => {
let tok_end = self.get_pos();
return Some(Ok((tok_start, Tok::LeftShift, tok_end)));
}
}
}
Some('=') => {
self.next_char();
let tok_end = self.get_pos();
return Some(Ok((tok_start, Tok::LessEqual, tok_end)));
}
_ => {
let tok_end = self.get_pos();
return Some(Ok((tok_start, Tok::Less, tok_end)));
}
}
}
Some('>') => {
let tok_start = self.get_pos();
self.next_char();
match self.chr0 {
Some('>') => {
self.next_char();
match self.chr0 {
Some('=') => {
self.next_char();
let tok_end = self.get_pos();
return Some(Ok((tok_start, Tok::RightShiftEqual, tok_end)));
}
_ => {
let tok_end = self.get_pos();
return Some(Ok((tok_start, Tok::RightShift, tok_end)));
}
}
}
Some('=') => {
self.next_char();
let tok_end = self.get_pos();
return Some(Ok((tok_start, Tok::GreaterEqual, tok_end)));
}
_ => {
let tok_end = self.get_pos();
return Some(Ok((tok_start, Tok::Greater, tok_end)));
}
}
}
Some(',') => {
let tok_start = self.get_pos();
self.next_char();
let tok_end = self.get_pos();
return Some(Ok((tok_start, Tok::Comma, tok_end)));
}
Some('.') => {
let tok_start = self.get_pos();
self.next_char();
let tok_end = self.get_pos();
return Some(Ok((tok_start, Tok::Dot, tok_end)));
}
Some('\n') => {
let tok_start = self.get_pos();
self.next_char();
let tok_end = self.get_pos();
self.new_line();
// Depending on the nesting level, we emit newline or not:
if self.nesting == 0 {
self.at_begin_of_line = true;
return Some(Ok((tok_start, Tok::Newline, tok_end)));
} else {
continue;
}
}
Some(' ') => {
// Skip whitespaces
self.next_char();
continue;
}
None => return None,
_ => {
let c = self.next_char();
panic!("Not impl {:?}", c)
} // Ignore all the rest..
}
}
}
fn eat_single_char(&mut self, ty: Tok) -> Spanned<Tok> {
let tok_start = self.get_pos();
self.next_char();
let tok_end = self.get_pos();
Ok((tok_start, ty, tok_end))
}
}
/* Implement iterator pattern for the get_tok function.
Calling the next element in the iterator will yield the next lexical
token.
*/
impl<T> Iterator for Lexer<T>
where
T: Iterator<Item = char>,
{
type Item = Spanned<Tok>;
fn next(&mut self) -> Option<Self::Item> {
// Idea: create some sort of hash map for single char tokens:
// let mut X = HashMap::new();
// X.insert('=', Tok::Equal);
let token = self.inner_next();
trace!(
"Lex token {:?}, nesting={:?}, indent stack: {:?}",
token,
self.nesting,
self.indentation_stack
);
token
}
}
#[cfg(test)]
mod tests {
use super::{make_tokenizer, NewlineHandler, Tok};
use num_bigint::BigInt;
use std::iter::FromIterator;
use std::iter::Iterator;
const WINDOWS_EOL: &str = "\r\n";
const MAC_EOL: &str = "\r";
const UNIX_EOL: &str = "\n";
pub fn lex_source(source: &String) -> Vec<Tok> {
let lexer = make_tokenizer(source);
Vec::from_iter(lexer.map(|x| x.unwrap().1))
}
#[test]
fn test_newline_processor() {
// Escape \ followed by \n (by removal):
let src = "b\\\r\n";
assert_eq!(4, src.len());
let nlh = NewlineHandler::new(src.chars());
let x: Vec<char> = nlh.collect();
assert_eq!(vec!['b', '\\', '\n'], x);
}
#[test]
fn test_raw_string() {
let source = String::from("r\"\\\\\" \"\\\\\"");
let tokens = lex_source(&source);
assert_eq!(
tokens,
vec![
Tok::String {
value: "\\\\".to_string(),
},
Tok::String {
value: "\\".to_string(),
}
]
);
}
#[test]
fn test_numbers() {
let source = String::from("0x2f 0b1101 0 123 0.2 2j 2.2j");
let tokens = lex_source(&source);
assert_eq!(
tokens,
vec![
Tok::Int {
value: BigInt::from(47),
},
Tok::Int {
value: BigInt::from(13),
},
Tok::Int {
value: BigInt::from(0),
},
Tok::Int {
value: BigInt::from(123),
},
Tok::Float { value: 0.2 },
Tok::Complex {
real: 0.0,
imag: 2.0,
},
Tok::Complex {
real: 0.0,
imag: 2.2,
},
]
);
}
macro_rules! test_line_comment {
($($name:ident: $eol:expr,)*) => {
$(
#[test]
fn $name() {
let source = String::from(format!(r"99232 # {}", $eol));
let tokens = lex_source(&source);
assert_eq!(tokens, vec![Tok::Int { value: BigInt::from(99232) }]);
}
)*
}
}
test_line_comment! {
test_line_comment_long: " foo",
test_line_comment_whitespace: " ",
test_line_comment_single_whitespace: " ",
test_line_comment_empty: "",
}
macro_rules! test_comment_until_eol {
($($name:ident: $eol:expr,)*) => {
$(
#[test]
fn $name() {
let source = String::from(format!("123 # Foo{}456", $eol));
let tokens = lex_source(&source);
assert_eq!(
tokens,
vec![
Tok::Int { value: BigInt::from(123) },
Tok::Newline,
Tok::Int { value: BigInt::from(456) },
]
)
}
)*
}
}
test_comment_until_eol! {
test_comment_until_windows_eol: WINDOWS_EOL,
test_comment_until_mac_eol: MAC_EOL,
test_comment_until_unix_eol: UNIX_EOL,
}
#[test]
fn test_assignment() {
let source = String::from(r"avariable = 99 + 2-0");
let tokens = lex_source(&source);
assert_eq!(
tokens,
vec![
Tok::Name {
name: String::from("avariable"),
},
Tok::Equal,
Tok::Int {
value: BigInt::from(99)
},
Tok::Plus,
Tok::Int {
value: BigInt::from(2)
},
Tok::Minus,
Tok::Int {
value: BigInt::from(0)
},
]
);
}
macro_rules! test_indentation_with_eol {
($($name:ident: $eol:expr,)*) => {
$(
#[test]
fn $name() {
let source = String::from(format!("def foo():{} return 99{}{}", $eol, $eol, $eol));
let tokens = lex_source(&source);
assert_eq!(
tokens,
vec![
Tok::Def,
Tok::Name {
name: String::from("foo"),
},
Tok::Lpar,
Tok::Rpar,
Tok::Colon,
Tok::Newline,
Tok::Indent,
Tok::Return,
Tok::Int { value: BigInt::from(99) },
Tok::Newline,
Tok::Dedent,
]
);
}
)*
};
}
test_indentation_with_eol! {
test_indentation_windows_eol: WINDOWS_EOL,
test_indentation_mac_eol: MAC_EOL,
test_indentation_unix_eol: UNIX_EOL,
}
macro_rules! test_double_dedent_with_eol {
($($name:ident: $eol:expr,)*) => {
$(
#[test]
fn $name() {
let source = String::from(format!("def foo():{} if x:{}{} return 99{}{}", $eol, $eol, $eol, $eol, $eol));
let tokens = lex_source(&source);
assert_eq!(
tokens,
vec![
Tok::Def,
Tok::Name {
name: String::from("foo"),
},
Tok::Lpar,
Tok::Rpar,
Tok::Colon,
Tok::Newline,
Tok::Indent,
Tok::If,
Tok::Name {
name: String::from("x"),
},
Tok::Colon,
Tok::Newline,
Tok::Indent,
Tok::Return,
Tok::Int { value: BigInt::from(99) },
Tok::Newline,
Tok::Dedent,
Tok::Dedent,
]
);
}
)*
}
}
macro_rules! test_double_dedent_with_tabs {
($($name:ident: $eol:expr,)*) => {
$(
#[test]
fn $name() {
let source = String::from(format!("def foo():{}\tif x:{}{}\t return 99{}{}", $eol, $eol, $eol, $eol, $eol));
let tokens = lex_source(&source);
assert_eq!(
tokens,
vec![
Tok::Def,
Tok::Name {
name: String::from("foo"),
},
Tok::Lpar,
Tok::Rpar,
Tok::Colon,
Tok::Newline,
Tok::Indent,
Tok::If,
Tok::Name {
name: String::from("x"),
},
Tok::Colon,
Tok::Newline,
Tok::Indent,
Tok::Return,
Tok::Int { value: BigInt::from(99) },
Tok::Newline,
Tok::Dedent,
Tok::Dedent,
]
);
}
)*
}
}
test_double_dedent_with_eol! {
test_double_dedent_windows_eol: WINDOWS_EOL,
test_double_dedent_mac_eol: MAC_EOL,
test_double_dedent_unix_eol: UNIX_EOL,
}
test_double_dedent_with_tabs! {
test_double_dedent_tabs_windows_eol: WINDOWS_EOL,
test_double_dedent_tabs_mac_eol: MAC_EOL,
test_double_dedent_tabs_unix_eol: UNIX_EOL,
}
macro_rules! test_newline_in_brackets {
($($name:ident: $eol:expr,)*) => {
$(
#[test]
fn $name() {
let source = String::from(format!("x = [{} 1,2{}]{}", $eol, $eol, $eol));
let tokens = lex_source(&source);
assert_eq!(
tokens,
vec![
Tok::Name {
name: String::from("x"),
},
Tok::Equal,
Tok::Lsqb,
Tok::Int { value: BigInt::from(1) },
Tok::Comma,
Tok::Int { value: BigInt::from(2) },
Tok::Rsqb,
Tok::Newline,
]
);
}
)*
};
}
test_newline_in_brackets! {
test_newline_in_brackets_windows_eol: WINDOWS_EOL,
test_newline_in_brackets_mac_eol: MAC_EOL,
test_newline_in_brackets_unix_eol: UNIX_EOL,
}
#[test]
fn test_operators() {
let source = String::from("//////=/ /");
let tokens = lex_source(&source);
assert_eq!(
tokens,
vec![
Tok::DoubleSlash,
Tok::DoubleSlash,
Tok::DoubleSlashEqual,
Tok::Slash,
Tok::Slash,
]
);
}
#[test]
fn test_string() {
let source = String::from(r#""double" 'single' 'can\'t' "\\\"" '\t\r\n' '\g'"#);
let tokens = lex_source(&source);
assert_eq!(
tokens,
vec![
Tok::String {
value: String::from("double"),
},
Tok::String {
value: String::from("single"),
},
Tok::String {
value: String::from("can't"),
},
Tok::String {
value: String::from("\\\""),
},
Tok::String {
value: String::from("\t\r\n"),
},
Tok::String {
value: String::from("\\g"),
},
]
);
}
macro_rules! test_string_continuation {
($($name:ident: $eol:expr,)*) => {
$(
#[test]
fn $name() {
let source = String::from(format!("\"abc\\{}def\"", $eol));
let tokens = lex_source(&source);
assert_eq!(
tokens,
vec![
Tok::String {
value: String::from("abcdef"),
},
]
)
}
)*
}
}
test_string_continuation! {
test_string_continuation_windows_eol: WINDOWS_EOL,
test_string_continuation_mac_eol: MAC_EOL,
test_string_continuation_unix_eol: UNIX_EOL,
}
}