// zern/src/tokenizer.rs

use std::{cmp::Ordering, error::Error, fmt};
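
/// Every kind of token the scanner can emit, including the synthetic
/// `Indent`/`Dedent` pair used for whitespace-delimited blocks.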
#[derive(Debug, Clone, PartialEq)]
pub enum TokenType {
    LeftParen,
    RightParen,
    LeftBracket,
    RightBracket,
    Comma,
    Plus,
    Minus,
    Star,
    Slash,
    Mod,
    Xor,
    Bang,
    Colon,
    And,
    Or,
    Equal,
    DoubleEqual,
    NotEqual,
    Greater,
    GreaterEqual,
    Less,
    LessEqual,
    Identifier,
    String,
    Number,
    KeywordLet,
    KeywordIf,
    KeywordElse,
    KeywordWhile,
    Indent,
    Dedent,
    Eof,
}
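
/// A tokenizer error tagged with the source location where it occurred.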
#[derive(Debug)]
pub struct MotError {
    pub loc: Loc,
    pub message: String,
}

impl fmt::Display for MotError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "{} \x1b[91mERROR\x1b[0m: {}", self.loc, self.message)
    }
}

impl Error for MotError {}
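
/// Builds an `Err(Box<MotError>)` from a location and message; the location
/// is cloned so callers can pass `self.loc` directly.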
macro_rules! error {
    ($loc:expr, $msg:expr) => {
        Err(Box::new(MotError {
            loc: $loc.clone(),
            message: $msg.into(),
        }))
    };
}
pub(crate) use error;
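
/// A position in a source file, displayed as `filename:line:column`.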
#[derive(Debug, Clone, PartialEq)]
pub struct Loc {
    pub filename: String,
    pub line: usize,
    pub column: usize,
}

impl fmt::Display for Loc {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "{}:{}:{}", self.filename, self.line, self.column)
    }
}
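
/// A single token: its kind, the raw text it was scanned from, and a source
/// location (recorded at the point the token was emitted).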
#[derive(Debug, Clone, PartialEq)]
pub struct Token {
    pub token_type: TokenType,
    pub lexeme: String,
    pub loc: Loc,
}
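
/// The scanner state. `source` is the input as chars; `start`/`current` are
/// indices delimiting the lexeme in progress; `indent_stack` records the
/// open indentation levels (the bottom entry is always 0) so dedents can be
/// paired with earlier indents.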
pub struct Tokenizer {
    source: Vec<char>,
    tokens: Vec<Token>,
    indent_stack: Vec<usize>,
    current_indent: usize,
    start: usize,
    current: usize,
    loc: Loc,
}

impl Tokenizer {
    pub fn new(filename: String, source: String) -> Tokenizer {
        Tokenizer {
            source: source.chars().collect(),
            tokens: vec![],
            indent_stack: vec![0],
            current_indent: 0,
            start: 0,
            current: 0,
            loc: Loc {
                filename,
                line: 1,
                column: 1,
            },
        }
    }
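
    /// Consumes the tokenizer, scans the entire source, and appends a final
    /// `Eof` token. Returns the first lexical error encountered, if any.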
    pub fn tokenize(mut self) -> Result<Vec<Token>, Box<dyn Error>> {
        while !self.eof() {
            self.start = self.current;
            self.scan_token()?;
        }
        self.tokens.push(Token {
            token_type: TokenType::Eof,
            lexeme: String::new(),
            loc: self.loc.clone(),
        });
        Ok(self.tokens)
    }
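
    /// Scans one token; `tokenize` sets `self.start` to its first character
    /// before each call.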
    fn scan_token(&mut self) -> Result<(), Box<dyn Error>> {
        match self.advance() {
            '(' => self.add_token(TokenType::LeftParen),
            ')' => self.add_token(TokenType::RightParen),
            '[' => self.add_token(TokenType::LeftBracket),
            ']' => self.add_token(TokenType::RightBracket),
            '+' => self.add_token(TokenType::Plus),
            '*' => self.add_token(TokenType::Star),
            ',' => self.add_token(TokenType::Comma),
            '-' => self.add_token(TokenType::Minus),
            '%' => self.add_token(TokenType::Mod),
            '^' => self.add_token(TokenType::Xor),
            ':' => self.add_token(TokenType::Colon),
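            // A second '/' starts a line comment, which is skipped; a lone
            // '/' is the division operator.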
            '/' => {
                if self.match_char('/') {
                    while !self.eof() && self.peek() != '\n' {
                        self.advance();
                    }
                } else {
                    self.add_token(TokenType::Slash)
                }
            }
            '&' => {
                if self.match_char('&') {
                    self.add_token(TokenType::And);
                } else {
                    return error!(self.loc, "expected '&' after '&'");
                }
            }
            '|' => {
                if self.match_char('|') {
                    self.add_token(TokenType::Or);
                } else {
                    return error!(self.loc, "expected '|' after '|'");
                }
            }
            '!' => {
                if self.match_char('=') {
                    self.add_token(TokenType::NotEqual)
                } else {
                    self.add_token(TokenType::Bang)
                }
            }
            '=' => {
                if self.match_char('=') {
                    self.add_token(TokenType::DoubleEqual)
                } else {
                    self.add_token(TokenType::Equal)
                }
            }
            '>' => {
                if self.match_char('=') {
                    self.add_token(TokenType::GreaterEqual)
                } else {
                    self.add_token(TokenType::Greater)
                }
            }
            '<' => {
                if self.match_char('=') {
                    self.add_token(TokenType::LessEqual)
                } else {
                    self.add_token(TokenType::Less)
                }
            }
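            // Strings may span multiple lines and have no escape sequences.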
'"' => {
while !self.eof() && self.peek() != '"' {
if self.peek() == '\n' {
self.loc.line += 1;
self.loc.column = 1;
}
self.advance();
}
if self.eof() {
return error!(self.loc, "unterminated string");
}
self.advance();
self.add_token(TokenType::String);
}
            ' ' | '\t' | '\r' => {}
            '\n' => {
                self.loc.line += 1;
                self.loc.column = 1;
                self.handle_indentation()?;
            }
            '0'..='9' => self.scan_number(),
            'A'..='Z' | 'a'..='z' | '_' => self.scan_identifier(),
            _ => return error!(self.loc, "unexpected character"),
        }
        Ok(())
    }
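
    /// Runs after each newline (skipping completely empty lines). A deeper
    /// indent pushes a level and emits `Indent`; a shallower one pops levels,
    /// emitting one `Dedent` each, and must land exactly on an enclosing
    /// level or the indentation is rejected.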
    fn handle_indentation(&mut self) -> Result<(), Box<dyn Error>> {
        if self.peek() == '\n' {
            return Ok(());
        }
        let new_indent = self.count_indentation();
        match new_indent.cmp(&self.current_indent) {
            Ordering::Greater => {
                self.indent_stack.push(new_indent);
                self.tokens.push(Token {
                    token_type: TokenType::Indent,
                    lexeme: String::new(),
                    loc: self.loc.clone(),
                });
            }
            Ordering::Less => {
                while !self.indent_stack.is_empty()
                    && *self.indent_stack.last().unwrap() > new_indent
                {
                    self.indent_stack.pop();
                    self.tokens.push(Token {
                        token_type: TokenType::Dedent,
                        lexeme: String::new(),
                        loc: self.loc.clone(),
                    });
                }
                if self.indent_stack.is_empty() || *self.indent_stack.last().unwrap() != new_indent
                {
                    return error!(self.loc, "invalid indentation");
                }
            }
            Ordering::Equal => {}
        }
        self.current_indent = new_indent;
        Ok(())
    }
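
    /// Measures leading whitespace; a tab counts as four spaces.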
    fn count_indentation(&mut self) -> usize {
        let mut count = 0;
        while self.peek() == ' ' || self.peek() == '\t' {
            if self.peek() == ' ' {
                count += 1;
            }
            if self.peek() == '\t' {
                count += 4;
            }
            self.advance();
        }
        count
    }
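
    /// Scans an integer or decimal literal. A `.` directly after the digits
    /// is consumed even when no fractional digits follow (e.g. `3.`).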
    fn scan_number(&mut self) {
        while self.peek().is_ascii_digit() {
            self.advance();
        }
        if self.peek() == '.' {
            self.advance();
            while self.peek().is_ascii_digit() {
                self.advance();
            }
        }
        self.add_token(TokenType::Number);
    }
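
    /// Scans an identifier or keyword; `.` is accepted inside identifiers,
    /// presumably to support dotted names.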
    fn scan_identifier(&mut self) {
        while self.peek().is_alphanumeric() || self.peek() == '_' || self.peek() == '.' {
            self.advance();
        }
        let lexeme: String = self.source[self.start..self.current].iter().collect();
        self.add_token(match lexeme.as_str() {
            "let" => TokenType::KeywordLet,
            "if" => TokenType::KeywordIf,
            "else" => TokenType::KeywordElse,
            "while" => TokenType::KeywordWhile,
            _ => TokenType::Identifier,
        })
    }
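
    /// Consumes the next character only if it matches `expected`.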
    fn match_char(&mut self, expected: char) -> bool {
        if self.eof() || self.peek() != expected {
            false
        } else {
            // Go through advance() so loc.column stays in sync.
            self.advance();
            true
        }
    }
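
    /// Pushes a token whose lexeme is `source[start..current]`.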
    fn add_token(&mut self, token_type: TokenType) {
        let lexeme: String = self.source[self.start..self.current].iter().collect();
        self.tokens.push(Token {
            token_type,
            lexeme,
            loc: self.loc.clone(),
        });
    }
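
    /// Consumes the current character, advancing both the index and the
    /// column counter.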
    fn advance(&mut self) -> char {
        let c = self.source[self.current];
        self.current += 1;
        self.loc.column += 1;
        c
    }

    fn peek(&self) -> char {
        if self.eof() {
            '\0'
        } else {
            self.source[self.current]
        }
    }

    fn eof(&self) -> bool {
        self.current >= self.source.len()
    }
}
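
// A minimal usage sketch: run a one-line source through the tokenizer and
// check the emitted token kinds. The filename "test.zn" is an arbitrary
// placeholder, not something the tokenizer interprets.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn tokenizes_let_binding() {
        let tokens = Tokenizer::new("test.zn".into(), "let x = 1 + 2\n".into())
            .tokenize()
            .unwrap();
        let types: Vec<TokenType> = tokens.into_iter().map(|t| t.token_type).collect();
        assert_eq!(
            types,
            vec![
                TokenType::KeywordLet,
                TokenType::Identifier,
                TokenType::Equal,
                TokenType::Number,
                TokenType::Plus,
                TokenType::Number,
                TokenType::Eof,
            ]
        );
    }
}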