From 270386da952e18ac4e1c677f014da4422c424a89 Mon Sep 17 00:00:00 2001
From: Toni
Date: Thu, 25 Dec 2025 21:04:34 +0100
Subject: [PATCH] rewrite zern tokenizer in zern :)

---
 examples/tokenizer.zr | 266 ++++++++++++++++++++++++++++++++++++++++++
 src/std.zr            |   8 ++
 src/tokenizer.rs      |  10 --
 3 files changed, 274 insertions(+), 10 deletions(-)
 create mode 100644 examples/tokenizer.zr

diff --git a/examples/tokenizer.zr b/examples/tokenizer.zr
new file mode 100644
index 0000000..ded1bac
--- /dev/null
+++ b/examples/tokenizer.zr
@@ -0,0 +1,266 @@
+func eof[current: i64, source_len: i64] : bool
+    return current >= source_len
+
+func peek[current: i64, source: str, source_len: i64] : u8
+    if eof(current, source_len)
+        return 0
+    return source[current]
+
+func advance[current: ptr, column: ptr, source: str, source_len: i64] : u8
+    if eof(mem.read64(current), source_len)
+        return 0
+    let c: u8 = source[mem.read64(current)]
+    mem.write64(current, mem.read64(current) + 1)
+    mem.write64(column, mem.read64(column) + 1)
+    return c
+
+func match_char[expected: u8, current: ptr, column: ptr, source: str, source_len: i64] : bool
+    if eof(mem.read64(current), source_len)
+        return false
+    if source[mem.read64(current)] != expected
+        return false
+    mem.write64(current, mem.read64(current) + 1)
+    mem.write64(column, mem.read64(column) + 1)
+    return true
+
+func zern_error[filename: str, line: i64, column: i64, message: str] : void
+    io.print(filename)
+    io.print(":")
+    io.print_i64(line)
+    io.print(":")
+    io.print_i64(column)
+    io.print(" ERROR: ")
+    io.println(message)
+    os.exit(1)
+
+func count_indentation[current: ptr, column: ptr, source: str, source_len: i64] : i64
+    let count = 0
+    while peek(mem.read64(current), source, source_len) == ' '
+        count = count + 1
+        advance(current, column, source, source_len)
+    return count
+
+func handle_indentation[tokens: array, current: ptr, column: ptr, line: i64, source: str, source_len: i64, indent_stack: array, current_indent: ptr, filename: str] : void
+    let new_indent: i64 = count_indentation(current, column, source, source_len)
+
+    if peek(mem.read64(current), source, source_len) == 10 | eof(mem.read64(current), source_len) // \n or eof: blank line, no indent tokens
+        return
+
+    if new_indent > mem.read64(current_indent)
+        array.push(indent_stack, new_indent)
+        add_token_with_lexeme("Indent", tokens, "", line, mem.read64(column))
+    else if new_indent < mem.read64(current_indent)
+        while array.size(indent_stack) > 1 & array.nth(indent_stack, array.size(indent_stack) - 1) > new_indent
+            array.pop(indent_stack)
+            add_token_with_lexeme("Dedent", tokens, "", line, mem.read64(column))
+
+        if array.size(indent_stack) == 0 | array.nth(indent_stack, array.size(indent_stack) - 1) != new_indent
+            zern_error(filename, line, mem.read64(column), "invalid indentation")
+
+    mem.write64(current_indent, new_indent)
+
+func add_token[type: str, tokens: array, source: str, start: i64, current: i64, line: i64, column: i64] : void
+    let len: i64 = current - start
+    let lexeme: str = mem.alloc(len + 1)
+    for i in 0..len
+        str.set(lexeme, i, source[start + i])
+    str.set(lexeme, len, 0)
+    array.push(tokens, [type, lexeme, line, column])
+
+func add_token_with_lexeme[type: str, tokens: array, lexeme: str, line: i64, column: i64] : void
+    array.push(tokens, [type, lexeme, line, column])
+
+func scan_number[current: ptr, column: ptr, source: str, source_len: i64] : void
+    if match_char('x', current, column, source, source_len)
+        while str.is_hex_digit(peek(mem.read64(current), source, source_len))
+            advance(current, column, source, source_len)
+    else if match_char('o', current, column, source, source_len)
+        while peek(mem.read64(current), source, source_len) >= '0' & peek(mem.read64(current), source, source_len) <= '7'
+            advance(current, column, source, source_len)
+    else
+        while str.is_digit(peek(mem.read64(current), source, source_len))
+            advance(current, column, source, source_len)
+
+func scan_identifier[tokens: array, current: ptr, column: ptr, start: i64, line: i64, source: str, source_len: i64] : void
+    while str.is_alphanumeric(peek(mem.read64(current), source, source_len)) | peek(mem.read64(current), source, source_len) == '_' | peek(mem.read64(current), source, source_len) == '.'
+        advance(current, column, source, source_len)
+
+    let len: i64 = mem.read64(current) - start
+    let lexeme: str = mem.alloc(len + 1)
+    for i in 0..len
+        str.set(lexeme, i, source[start + i])
+    str.set(lexeme, len, 0)
+
+    let type: str = "Identifier"
+    if str.equal(lexeme, "let")
+        type = "KeywordLet"
+    if str.equal(lexeme, "const")
+        type = "KeywordConst"
+    if str.equal(lexeme, "if")
+        type = "KeywordIf"
+    if str.equal(lexeme, "else")
+        type = "KeywordElse"
+    if str.equal(lexeme, "while")
+        type = "KeywordWhile"
+    if str.equal(lexeme, "for")
+        type = "KeywordFor"
+    if str.equal(lexeme, "in")
+        type = "KeywordIn"
+    if str.equal(lexeme, "func")
+        type = "KeywordFunc"
+    if str.equal(lexeme, "return")
+        type = "KeywordReturn"
+    if str.equal(lexeme, "break")
+        type = "KeywordBreak"
+    if str.equal(lexeme, "continue")
+        type = "KeywordContinue"
+    if str.equal(lexeme, "extern")
+        type = "KeywordExtern"
+    if str.equal(lexeme, "export")
+        type = "KeywordExport"
+    if str.equal(lexeme, "true")
+        type = "True"
+    if str.equal(lexeme, "false")
+        type = "False"
+
+    add_token_with_lexeme(type, tokens, lexeme, line, mem.read64(column))
+
+func scan_token[tokens: array, current: ptr, line: ptr, column: ptr, source: str, source_len: i64, filename: str, indent_stack: array, current_indent: ptr] : void
+    let start: i64 = mem.read64(current)
+    let c: u8 = advance(current, column, source, source_len)
+
+    if c == '('
+        add_token("LeftParen", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
+    else if c == ')'
+        add_token("RightParen", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
+    else if c == '['
+        add_token("LeftBracket", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
+    else if c == ']'
+        add_token("RightBracket", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
+    else if c == ','
+        add_token("Comma", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
+    else if c == '+'
+        add_token("Plus", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
+    else if c == '-'
+        add_token("Minus", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
+    else if c == '*'
+        add_token("Star", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
+    else if c == '%'
+        add_token("Mod", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
+    else if c == '^'
+        add_token("Xor", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
+    else if c == ':'
+        add_token("Colon", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
+    else if c == '@'
+        add_token("At", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
+    else if c == '.'
+        if match_char('.', current, column, source, source_len)
+            add_token("DoubleDot", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
+        else
+            zern_error(filename, mem.read64(line), mem.read64(column), "expected '.' after '.'")
+    else if c == '/'
+        if match_char('/', current, column, source, source_len)
+            while !eof(mem.read64(current), source_len)
+                if peek(mem.read64(current), source, source_len) == 10 // \n
+                    break
+                advance(current, column, source, source_len)
+        else
+            add_token("Slash", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
+    else if c == '&'
+        add_token("BitAnd", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
+    else if c == '|'
+        if match_char('>', current, column, source, source_len)
+            add_token("Pipe", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
+        else
+            add_token("BitOr", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
+    else if c == '!'
+        if match_char('=', current, column, source, source_len)
+            add_token("NotEqual", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
+        else
+            add_token("Bang", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
+    else if c == '='
+        if match_char('=', current, column, source, source_len)
+            add_token("DoubleEqual", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
+        else
+            add_token("Equal", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
+    else if c == '>'
+        if match_char('>', current, column, source, source_len)
+            add_token("ShiftRight", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
+        else if match_char('=', current, column, source, source_len)
+            add_token("GreaterEqual", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
+        else
+            add_token("Greater", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
+    else if c == '<'
+        if match_char('<', current, column, source, source_len)
+            add_token("ShiftLeft", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
+        else if match_char('=', current, column, source, source_len)
+            add_token("LessEqual", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
+        else
+            add_token("Less", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
+    else if c == 39 // '
+        if eof(mem.read64(current), source_len)
+            zern_error(filename, mem.read64(line), mem.read64(column), "unterminated char literal")
+        advance(current, column, source, source_len)
+        if !match_char(39, current, column, source, source_len)
+            zern_error(filename, mem.read64(line), mem.read64(column), "expected ' after char literal")
+        add_token("Char", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
+    else if c == 34 // "
+        while !eof(mem.read64(current), source_len)
+            if peek(mem.read64(current), source, source_len) == 34
+                break
+            if peek(mem.read64(current), source, source_len) == 10 // \n
+                mem.write64(line, mem.read64(line) + 1)
+                mem.write64(column, 1)
+            advance(current, column, source, source_len)
+        if eof(mem.read64(current), source_len)
+            zern_error(filename, mem.read64(line), mem.read64(column), "unterminated string")
+        advance(current, column, source, source_len)
+        add_token("String", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
+    else if c == ' ' | c == 13 // \r
+        return
+    else if c == 10 // \n
+        mem.write64(line, mem.read64(line) + 1)
+        mem.write64(column, 1)
+        handle_indentation(tokens, current, column, mem.read64(line), source, source_len, indent_stack, current_indent, filename)
+    else if str.is_digit(c)
+        scan_number(current, column, source, source_len)
+        add_token("Number", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
+    else if str.is_letter(c) | c == '_'
+        scan_identifier(tokens, current, column, start, mem.read64(line), source, source_len)
+    else
+        zern_error(filename, mem.read64(line), mem.read64(column), "unexpected character")
+
+func tokenize[source: str, filename: str] : array
+    let source_len: i64 = str.len(source)
+    let current = 0
+    let line = 1
+    let column = 1
+    let indent_stack: array = [0]
+    let current_indent = 0
+    let tokens: array = []
+
+    while !eof(current, source_len)
+        scan_token(tokens, @current, @line, @column, source, source_len, filename, indent_stack, @current_indent)
+
+    add_token_with_lexeme("Eof", tokens, "", line, column)
+    return tokens
+
+func main[argc: i64, argv: ptr] : i64
+    if argc < 2
+        dbg.panic("expected an argument")
+
+    let path: str = mem.read64(argv + 8)
+    let source: str = io.read_file(path)
+    let tokens: array = tokenize(source, path)
+
+    for i in 0..array.size(tokens)
+        let token: array = array.nth(tokens, i)
+        io.print(array.nth(token, 0))
+        io.print(" ")
+        io.print(array.nth(token, 1))
+        io.print(" ")
+        io.print_i64(array.nth(token, 2))
+        io.print(" ")
+        io.print_i64(array.nth(token, 3))
+        io.println("")
+    return 0
\ No newline at end of file
diff --git a/src/std.zr b/src/std.zr
index b95f992..47851ef 100644
--- a/src/std.zr
+++ b/src/std.zr
@@ -424,6 +424,14 @@ func array.free[xs: array] : void
         mem.free(data)
     mem.free(xs)
 
+func array.pop[xs: array] : i64
+    let size: i64 = array.size(xs)
+    if size == 0
+        dbg.panic("array.pop on empty array")
+    let x: i64 = array.nth(xs, size - 1)
+    mem.write64(xs + 16, size - 1)
+    return x
+
 func array.slice[xs: array, start: i64, length: i64] : array
     if start < 0 | length < 0 | start + length > array.size(xs)
         dbg.panic("array.slice out of bounds")
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index a96874b..6e9091c 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -314,16 +314,6 @@ impl Tokenizer {
                 while self.peek().is_ascii_digit() {
                     self.advance();
                 }
-
-                if self.peek() == '.'
-                    && self.current + 1 < self.source.len()
-                    && self.source[self.current + 1].is_ascii_digit()
-                {
-                    self.advance();
-                    while self.peek().is_ascii_digit() {
-                        self.advance();
-                    }
-                }
             }
 
         self.add_token(TokenType::Number);