// commit 0fbc964d374611004f4d14eac363169ce26c7cd6 — Author: Toni — Thu May 29 18:07:57 2025 +0200
// Message: tokenizer
//
// NOTE(review): this document was a `git show` dump with all `<...>` generic
// parameters stripped by the extraction (e.g. `Vec<char>` became `Vec`,
// `Box<dyn Error>` became `Box`). Reconstructed below as one compilable file.
// Non-code files introduced by the same diff, preserved for reference:
//   .gitignore -> /target, /*.py, /*.mot   (no trailing newline)
//   Cargo.lock -> auto-generated, version 4, single package "mot" 0.1.0
//   Cargo.toml -> [package] name = "mot", version = "0.1.0", edition = "2024"; no dependencies

use std::{env, error::Error, fs};

mod tokenizer {
    use std::fmt;

    /// Kinds of lexemes produced by [`Tokenizer`].
    #[derive(Debug, Clone, PartialEq)]
    pub enum TokenType {
        LeftParen,
        RightParen,
        LeftBracket,
        RightBracket,
        Comma,
        Plus,
        Minus,
        Star,
        Slash,
        Mod,
        Xor,
        Bang,
        Colon,
        And,
        Or,

        Equal,
        DoubleEqual,
        NotEqual,
        Greater,
        GreaterEqual,
        Less,
        LessEqual,

        Identifier,
        String,
        Number,

        Eof,
    }

    /// A lexing error carrying the source location it occurred at.
    #[derive(Debug)]
    pub struct MotError {
        pub loc: Loc,
        pub message: String,
    }

    impl fmt::Display for MotError {
        fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
            // "\x1b[91m…\x1b[0m" renders ERROR in bright red on ANSI terminals.
            write!(f, "{} \x1b[91mERROR\x1b[0m: {}", self.loc, self.message)
        }
    }

    impl std::error::Error for MotError {}

    /// Builds an `Err(MotError)` from a location (cloned) and a message.
    macro_rules! error {
        ($loc:expr, $msg:expr) => {
            Err(MotError {
                loc: $loc.clone(),
                message: $msg.into(),
            })
        };
    }

    pub(crate) use error;

    /// A `filename:line:column` source position (1-based line and column).
    #[derive(Debug, Clone, PartialEq)]
    pub struct Loc {
        pub filename: String,
        pub line: usize,
        pub column: usize,
    }

    impl fmt::Display for Loc {
        fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
            write!(f, "{}:{}:{}", self.filename, self.line, self.column)
        }
    }

    /// One token: its kind, its raw source text, and a source position.
    #[derive(Debug, Clone, PartialEq)]
    pub struct Token {
        pub token_type: TokenType,
        pub lexeme: String,
        // NOTE(review): `loc` is the position just *after* the lexeme, not its
        // start — `add_token` runs once the token is fully consumed. Kept as-is.
        pub loc: Loc,
    }

    /// Single-pass scanner over a source string.
    pub struct Tokenizer {
        source: Vec<char>, // source decoded to chars for O(1) indexed access
        tokens: Vec<Token>,
        start: usize,   // index of the first char of the current lexeme
        current: usize, // index of the next char to consume
        loc: Loc,       // current line/column, updated as chars are consumed
    }

    impl Tokenizer {
        pub fn new(filename: String, source: String) -> Tokenizer {
            Tokenizer {
                source: source.chars().collect(),
                tokens: vec![],
                start: 0,
                current: 0,
                loc: Loc {
                    filename,
                    line: 1,
                    column: 1,
                },
            }
        }

        /// Consumes the tokenizer and returns all tokens (terminated by an
        /// `Eof` token), or the first lexing error encountered.
        pub fn tokenize(mut self) -> Result<Vec<Token>, MotError> {
            while !self.eof() {
                self.start = self.current;
                self.scan_token()?;
            }
            self.tokens.push(Token {
                token_type: TokenType::Eof,
                lexeme: String::new(),
                loc: self.loc.clone(),
            });

            Ok(self.tokens)
        }

        /// Scans exactly one token, or skips whitespace / a `//` line comment.
        fn scan_token(&mut self) -> Result<(), MotError> {
            match self.advance() {
                '(' => self.add_token(TokenType::LeftParen),
                ')' => self.add_token(TokenType::RightParen),
                '[' => self.add_token(TokenType::LeftBracket),
                ']' => self.add_token(TokenType::RightBracket),
                '+' => self.add_token(TokenType::Plus),
                '*' => self.add_token(TokenType::Star),
                ',' => self.add_token(TokenType::Comma),
                '-' => self.add_token(TokenType::Minus),
                '%' => self.add_token(TokenType::Mod),
                '^' => self.add_token(TokenType::Xor),
                ':' => self.add_token(TokenType::Colon),
                '/' => {
                    if self.match_char('/') {
                        // `//` comment: discard everything to end of line.
                        while !self.eof() && self.peek() != '\n' {
                            self.advance();
                        }
                    } else {
                        self.add_token(TokenType::Slash)
                    }
                }
                '&' => {
                    if self.match_char('&') {
                        self.add_token(TokenType::And);
                    } else {
                        // Single '&' is not a token in this language.
                        return error!(self.loc, "expected '&' after '&'");
                    }
                }
                '|' => {
                    if self.match_char('|') {
                        self.add_token(TokenType::Or);
                    } else {
                        return error!(self.loc, "expected '|' after '|'");
                    }
                }
                '!' => {
                    if self.match_char('=') {
                        self.add_token(TokenType::NotEqual)
                    } else {
                        self.add_token(TokenType::Bang)
                    }
                }
                '=' => {
                    if self.match_char('=') {
                        self.add_token(TokenType::DoubleEqual)
                    } else {
                        self.add_token(TokenType::Equal)
                    }
                }
                '>' => {
                    if self.match_char('=') {
                        self.add_token(TokenType::GreaterEqual)
                    } else {
                        self.add_token(TokenType::Greater)
                    }
                }
                '<' => {
                    if self.match_char('=') {
                        self.add_token(TokenType::LessEqual)
                    } else {
                        self.add_token(TokenType::Less)
                    }
                }
                '"' => {
                    // Strings may span lines; keep line/column in sync.
                    while !self.eof() && self.peek() != '"' {
                        if self.peek() == '\n' {
                            self.loc.line += 1;
                            // 0, because advance() below bumps it to 1.
                            self.loc.column = 0;
                        }
                        self.advance();
                    }

                    if self.eof() {
                        return error!(self.loc, "unterminated string");
                    }

                    self.advance(); // consume the closing quote
                    // Lexeme keeps the surrounding quotes.
                    self.add_token(TokenType::String);
                }
                ' ' | '\t' | '\r' => {}
                '\n' => {
                    self.loc.line += 1;
                    self.loc.column = 0; // advance() of the next char makes it 1
                }
                '0'..='9' => self.scan_number(),
                // BUG FIX: was `'A'..='z'`, which also matched '\\', '`' (and
                // '_' only by accident) — stray backslash/backtick silently
                // became identifiers instead of raising an error.
                'a'..='z' | 'A'..='Z' | '_' => self.scan_identifier(),
                _ => return error!(self.loc, "unexpected character"),
            }
            Ok(())
        }

        /// Scans the remainder of a number literal (first digit consumed).
        fn scan_number(&mut self) {
            while self.peek().is_ascii_digit() {
                self.advance();
            }

            // NOTE(review): a trailing '.' is accepted, so "1." lexes as a
            // Number; rejecting it would need a peek-ahead. Kept as-is.
            if self.peek() == '.' {
                self.advance();
                while self.peek().is_ascii_digit() {
                    self.advance();
                }
            }

            self.add_token(TokenType::Number);
        }

        /// Scans the remainder of an identifier (first char consumed).
        /// '.' is allowed inside identifiers (dotted names).
        fn scan_identifier(&mut self) {
            while self.peek().is_alphanumeric() || self.peek() == '_' || self.peek() == '.' {
                self.advance();
            }

            let lexeme: String = self.source[self.start..self.current].iter().collect();
            // Keyword lookup slots in here once the language grows keywords.
            match lexeme.as_str() {
                _ => self.add_token(TokenType::Identifier),
            }
        }

        /// Consumes the next char iff it equals `expected`.
        fn match_char(&mut self, expected: char) -> bool {
            if self.eof() || self.peek() != expected {
                false
            } else {
                self.current += 1;
                // BUG FIX: the consumed char must advance the column too,
                // otherwise every two-char operator ("==", "&&", "<=", …)
                // left all following column reports one short.
                self.loc.column += 1;
                true
            }
        }

        /// Pushes a token whose lexeme is `source[start..current]`.
        fn add_token(&mut self, token_type: TokenType) {
            let lexeme: String = self.source[self.start..self.current].iter().collect();
            self.tokens.push(Token {
                token_type,
                lexeme,
                loc: self.loc.clone(),
            });
        }

        /// Consumes and returns the next char, advancing the column.
        fn advance(&mut self) -> char {
            let c = self.source[self.current];
            self.current += 1;
            self.loc.column += 1;
            c
        }

        /// Returns the next char without consuming it; '\0' at end of input.
        fn peek(&self) -> char {
            if self.eof() {
                '\0'
            } else {
                self.source[self.current]
            }
        }

        fn eof(&self) -> bool {
            self.current >= self.source.len()
        }
    }
}

/// Reads the file named by the first CLI argument, tokenizes it, and
/// pretty-prints the token stream.
fn main() -> Result<(), Box<dyn Error>> {
    let mut args = env::args();
    // IMPROVED: was a bare unwrap(); user-facing failure now says why.
    let path = args.nth(1).expect("usage: mot <file>");

    // IMPROVED: borrow instead of the previous needless path.clone().
    let source = fs::read_to_string(&path)?;

    // TODO: basename
    let tokenizer = tokenizer::Tokenizer::new(path, source);
    println!("{:#?}", tokenizer.tokenize()?);

    Ok(())
}