rewrite zern tokenizer in zern :)
This commit is contained in:
265
examples/tokenizer.zr
Normal file
265
examples/tokenizer.zr
Normal file
@@ -0,0 +1,265 @@
|
|||||||
|
// Report whether the scan cursor has run off the end of the input.
func eof[current: i64, source_len: i64] : bool
    if current >= source_len
        return true
    return false
||||||
|
// Look at the byte under the cursor without consuming it.
// Yields 0 (NUL) once the input is exhausted.
func peek[current: i64, source: str, source_len: i64] : u8
    if !eof(current, source_len)
        return source[current]
    return 0
|
// Consume one byte: return source[*current], then bump both *current and
// *column. `current` and `column` are pointers to i64 counters owned by the
// caller (written back via mem.write64). Returns 0 when already at end.
func advance[current: ptr, column: ptr, source: str, source_len: i64] : u8
    if eof(mem.read64(current), source_len)
        return 0
    // Read the byte BEFORE moving the cursor.
    let c: u8 = source[mem.read64(current)]
    mem.write64(current, mem.read64(current) + 1)
    mem.write64(column, mem.read64(column) + 1)
    return c
|
// Conditionally consume one byte: if the byte under the cursor equals
// `expected`, advance past it (updating *current and *column) and return
// true; otherwise leave the cursor untouched and return false.
func match_char[expected: u8, current: ptr, column: ptr, source: str, source_len: i64] : bool
    if eof(mem.read64(current), source_len)
        return false
    if source[mem.read64(current)] != expected
        return false
    mem.write64(current, mem.read64(current) + 1)
    mem.write64(column, mem.read64(column) + 1)
    return true
|
// Report a fatal lexing error as "filename:line:column ERROR: message"
// and terminate the process with exit code 1. Never returns.
func zern_error[filename: str, line: i64, column: i64, message: str] : void
    io.print(filename)
    io.print(":")
    io.print_i64(line)
    io.print(":")
    io.print_i64(column)
    io.print(" ERROR: ")
    io.println(message)
    os.exit(1)
|
// Count and consume the run of space characters at the cursor, returning
// the number of spaces. Used to measure a line's indentation width.
// NOTE(review): only ' ' counts — tab characters are not treated as
// indentation here; confirm tabs are rejected or normalized elsewhere.
func count_indentation[current: ptr, column: ptr, source: str, source_len: i64] : i64
    let count = 0
    while peek(mem.read64(current), source, source_len) == ' '
        count = count + 1
        advance(current, column, source, source_len)
    return count
|
// Called right after a newline: measure the new line's indentation and emit
// Indent/Dedent tokens against `indent_stack` (a stack of the active
// indentation widths, base level 0). `current_indent` is a pointer to the
// width of the level currently in effect.
func handle_indentation[tokens: array, current: ptr, column: ptr, line: i64, source: str, source_len: i64, indent_stack: array, current_indent: ptr, filename: str] : void
    // A line that is immediately another newline is blank — it carries no
    // indentation information, so skip it entirely.
    // NOTE(review): `return 0` from a `: void` function — confirm zern
    // accepts this; other void funcs here simply fall off the end.
    if peek(mem.read64(current), source, source_len) == 10 // \n
        return 0

    let new_indent: i64 = count_indentation(current, column, source, source_len)

    if new_indent > mem.read64(current_indent)
        // Deeper than before: open exactly one new level.
        array.push(indent_stack, new_indent)
        add_token_with_lexeme("Indent", tokens, "", line, mem.read64(column))
    else if new_indent < mem.read64(current_indent)
        // Shallower: pop (and emit a Dedent for) every level deeper than
        // the new width; the base level (size 1) is never popped.
        while array.size(indent_stack) > 1 & array.nth(indent_stack, array.size(indent_stack) - 1) > new_indent
            array.pop(indent_stack)
            add_token_with_lexeme("Dedent", tokens, "", line, mem.read64(column))

        // After popping, the top of the stack must match exactly —
        // otherwise the line dedented to a width that was never opened.
        if array.size(indent_stack) == 0 | array.nth(indent_stack, array.size(indent_stack) - 1) != new_indent
            zern_error(filename, line, mem.read64(column), "invalid indentation")

    mem.write64(current_indent, new_indent)
|
// Append a token of kind `type` whose lexeme is source[start..current):
// the slice is copied into a fresh NUL-terminated buffer and pushed as a
// [type, lexeme, line, column] record.
// FIX: `type` was declared `i64` but every call site in this file passes a
// string literal ("LeftParen", "Number", ...), and scan_identifier builds
// `let type: str` — declare it `str` to match actual use.
func add_token[type: str, tokens: array, source: str, start: i64, current: i64, line: i64, column: i64] : void
    let len: i64 = current - start
    // +1 for the trailing NUL terminator.
    let lexeme: str = mem.alloc(len + 1)
    for i in 0..len
        str.set(lexeme, i, source[start + i])
    str.set(lexeme, len, 0)
    array.push(tokens, [type, lexeme, line, column])
|
// Append a token with an explicitly supplied lexeme (used for synthetic
// tokens like Indent/Dedent/Eof and for identifiers already copied out).
// FIX: `type` was declared `i64` but all callers pass string literals
// ("Indent", "Eof", ...) or scan_identifier's `let type: str` — declare str.
func add_token_with_lexeme[type: str, tokens: array, lexeme: str, line: i64, column: i64] : void
    array.push(tokens, [type, lexeme, line, column])
|
// Consume the remainder of a numeric literal. The first digit has already
// been consumed by scan_token before this is called.
// NOTE(review): a bare "0x"/"0o" prefix with no digits after it is accepted,
// and any leading digit (e.g. "5x1F") enables the hex/octal branch — confirm
// the parser rejects these forms if they are meant to be invalid.
func scan_number[current: ptr, column: ptr, source: str, source_len: i64] : void
    if match_char('x', current, column, source, source_len)
        // Hexadecimal body.
        while str.is_hex_digit(peek(mem.read64(current), source, source_len))
            advance(current, column, source, source_len)
    else if match_char('o', current, column, source, source_len)
        // Octal body: digits '0'..'7' only.
        while peek(mem.read64(current), source, source_len) >= '0' & peek(mem.read64(current), source, source_len) <= '7'
            advance(current, column, source, source_len)
    else
        // Plain decimal body.
        while str.is_digit(peek(mem.read64(current), source, source_len))
            advance(current, column, source, source_len)
|
// Scan an identifier or keyword whose first character was already consumed
// at position `start`, then push the resulting token.
func scan_identifier[tokens: array, current: ptr, column: ptr, start: i64, line: i64, source: str, source_len: i64] : void
    // Consume the rest of the name. '.' is allowed inside identifiers, so
    // dotted names like "mem.read64" lex as a single Identifier token.
    while str.is_alphanumeric(peek(mem.read64(current), source, source_len)) | peek(mem.read64(current), source, source_len) == '_' | peek(mem.read64(current), source, source_len) == '.'
        advance(current, column, source, source_len)

    // Copy the lexeme into a fresh NUL-terminated buffer.
    let len: i64 = mem.read64(current) - start
    let lexeme: str = mem.alloc(len + 1)
    for i in 0..len
        str.set(lexeme, i, source[start + i])
    str.set(lexeme, len, 0)

    // Keyword recognition: default to Identifier, override on exact match.
    let type: str = "Identifier"
    if str.equal(lexeme, "let")
        type = "KeywordLet"
    if str.equal(lexeme, "const")
        type = "KeywordConst"
    if str.equal(lexeme, "if")
        type = "KeywordIf"
    if str.equal(lexeme, "else")
        type = "KeywordElse"
    if str.equal(lexeme, "while")
        type = "KeywordWhile"
    if str.equal(lexeme, "for")
        type = "KeywordFor"
    if str.equal(lexeme, "in")
        type = "KeywordIn"
    if str.equal(lexeme, "func")
        type = "KeywordFunc"
    if str.equal(lexeme, "return")
        type = "KeywordReturn"
    if str.equal(lexeme, "break")
        type = "KeywordBreak"
    if str.equal(lexeme, "continue")
        type = "KeywordContinue"
    if str.equal(lexeme, "extern")
        type = "KeywordExtern"
    if str.equal(lexeme, "export")
        type = "KeywordExport"
    if str.equal(lexeme, "true")
        type = "True"
    if str.equal(lexeme, "false")
        type = "False"

    add_token_with_lexeme(type, tokens, lexeme, line, mem.read64(column))
|
// Scan exactly one token (or one skippable span: whitespace, comment,
// newline) starting at the cursor, appending any produced token to `tokens`.
// `current`, `line`, `column`, `current_indent` are pointers so this can
// mutate the caller's scan state.
// NOTE(review): tokens record mem.read64(column) AFTER the token was
// consumed, i.e. the column one past its end — confirm downstream consumers
// expect end columns rather than start columns.
func scan_token[tokens: array, current: ptr, line: ptr, column: ptr, source: str, source_len: i64, filename: str, indent_stack: array, current_indent: ptr] : void
    let start: i64 = mem.read64(current)
    let c: u8 = advance(current, column, source, source_len)

    // --- single-character tokens ---
    if c == '('
        add_token("LeftParen", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
    else if c == ')'
        add_token("RightParen", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
    else if c == '['
        add_token("LeftBracket", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
    else if c == ']'
        add_token("RightBracket", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
    else if c == ','
        add_token("Comma", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
    else if c == '+'
        add_token("Plus", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
    else if c == '-'
        add_token("Minus", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
    else if c == '*'
        add_token("Star", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
    else if c == '%'
        add_token("Mod", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
    else if c == '^'
        add_token("Xor", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
    else if c == ':'
        add_token("Colon", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
    else if c == '@'
        add_token("At", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
    // --- '.' only exists as the range operator ".." ---
    else if c == '.'
        if match_char('.', current, column, source, source_len)
            add_token("DoubleDot", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
        else
            zern_error(filename, mem.read64(line), mem.read64(column), "expected '.' after '.'")
    // --- '/' is either a "//" line comment or the Slash operator ---
    else if c == '/'
        if match_char('/', current, column, source, source_len)
            // Skip to (but not past) the newline so indentation handling
            // still sees the line break.
            while !eof(mem.read64(current), source_len)
                if peek(mem.read64(current), source, source_len) == 10 // \n
                    break
                advance(current, column, source, source_len)
        else
            add_token("Slash", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
    else if c == '&'
        add_token("BitAnd", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
    // --- two-character lookahead operators ---
    else if c == '|'
        if match_char('>', current, column, source, source_len)
            add_token("Pipe", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
        else
            add_token("BitOr", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
    else if c == '!'
        if match_char('=', current, column, source, source_len)
            add_token("NotEqual", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
        else
            add_token("Bang", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
    else if c == '='
        if match_char('=', current, column, source, source_len)
            add_token("DoubleEqual", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
        else
            add_token("Equal", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
    else if c == '>'
        if match_char('>', current, column, source, source_len)
            add_token("ShiftRight", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
        else if match_char('=', current, column, source, source_len)
            add_token("GreaterEqual", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
        else
            add_token("Greater", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
    else if c == '<'
        if match_char('<', current, column, source, source_len)
            add_token("ShiftLeft", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
        else if match_char('=', current, column, source, source_len)
            add_token("LessEqual", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
        else
            add_token("Less", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
    // --- character literal: 'x' (exactly one byte, no escape sequences) ---
    else if c == 39 // '
        if eof(mem.read64(current), source_len)
            zern_error(filename, mem.read64(line), mem.read64(column), "unterminated char literal")
        advance(current, column, source, source_len)
        if !match_char(39, current, column, source, source_len)
            zern_error(filename, mem.read64(line), mem.read64(column), "expected ' after char literal")
        add_token("Char", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
    // --- string literal: "..." (multi-line allowed; line/column tracked) ---
    else if c == 34 // "
        while !eof(mem.read64(current), source_len)
            if peek(mem.read64(current), source, source_len) == 34
                break
            if peek(mem.read64(current), source, source_len) == 10 // \n
                mem.write64(line, mem.read64(line) + 1)
                mem.write64(column, 1)
            advance(current, column, source, source_len)
        if eof(mem.read64(current), source_len)
            zern_error(filename, mem.read64(line), mem.read64(column), "unterminated string")
        // Consume the closing quote; lexeme includes both quotes.
        advance(current, column, source, source_len)
        add_token("String", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
    // --- insignificant whitespace ---
    else if c == ' ' | c == 13 // \r
        return 0
    // --- newline: advance line counter, then process indentation ---
    else if c == 10 // \n
        mem.write64(line, mem.read64(line) + 1)
        mem.write64(column, 1)
        handle_indentation(tokens, current, column, mem.read64(line), source, source_len, indent_stack, current_indent, filename)
    // --- numbers and names ---
    else if str.is_digit(c)
        scan_number(current, column, source, source_len)
        add_token("Number", tokens, source, start, mem.read64(current), mem.read64(line), mem.read64(column))
    else if str.is_letter(c) | c == '_'
        scan_identifier(tokens, current, column, start, mem.read64(line), source, source_len)
    else
        zern_error(filename, mem.read64(line), mem.read64(column), "unexpected character")
|
// Tokenize `source` (with `filename` used only for error messages),
// returning an array of [type, lexeme, line, column] token records
// terminated by an Eof token.
func tokenize[source: str, filename: str] : array
    let source_len: i64 = str.len(source)
    let current = 0
    // Positions are 1-based for both line and column.
    let line = 1
    let column = 1
    // Stack of open indentation widths; base level 0 is always present.
    let indent_stack: array = [0]
    let current_indent = 0
    let tokens: array = []

    while !eof(current, source_len)
        scan_token(tokens, @current, @line, @column, source, source_len, filename, indent_stack, @current_indent)

    // NOTE(review): indentation levels still open at end of input are not
    // closed — no trailing Dedent tokens are emitted before Eof. Confirm the
    // parser tolerates an unbalanced Indent/Dedent stream at EOF.
    add_token_with_lexeme("Eof", tokens, "", line, column)
    return tokens
|
// Entry point: tokenize the file named by the first CLI argument and print
// one "type lexeme line column" line per token.
// NOTE(review): declared to return i64 but falls off the end with no return
// statement — confirm zern supplies an implicit 0 exit status.
func main[argc: i64, argv: ptr] : i64
    if argc < 2
        dbg.panic("expected an argument")

    // argv is an array of 8-byte pointers; argv + 8 is argv[1].
    let path: str = mem.read64(argv + 8)
    let source: str = io.read_file(path)
    let tokens: array = tokenize(source, path)

    // Dump every token: fields 0..3 are type, lexeme, line, column.
    for i in 0..array.size(tokens)
        let token: array = array.nth(tokens, i)
        io.print(array.nth(token, 0))
        io.print(" ")
        io.print(array.nth(token, 1))
        io.print(" ")
        io.print_i64(array.nth(token, 2))
        io.print(" ")
        io.print_i64(array.nth(token, 3))
        io.println("")
@@ -424,6 +424,14 @@ func array.free[xs: array] : void
|
|||||||
mem.free(data)
|
mem.free(data)
|
||||||
mem.free(xs)
|
mem.free(xs)
|
||||||
|
|
||||||
|
// Remove and return the last element of `xs`; panics on an empty array.
func array.pop[xs: array] : i64
    let size: i64 = array.size(xs)
    if size == 0
        dbg.panic("array.pop on empty array")
    let x: i64 = array.nth(xs, size - 1)
    // Shrink by rewriting the length word at offset 16 of the array header
    // (presumably the size field read by array.size — confirm layout).
    mem.write64(xs + 16, size - 1)
    return x
||||||
func array.slice[xs: array, start: i64, length: i64] : array
|
func array.slice[xs: array, start: i64, length: i64] : array
|
||||||
if start < 0 | length < 0 | start + length > array.size(xs)
|
if start < 0 | length < 0 | start + length > array.size(xs)
|
||||||
dbg.panic("array.slice out of bounds")
|
dbg.panic("array.slice out of bounds")
|
||||||
|
|||||||
@@ -314,16 +314,6 @@ impl Tokenizer {
|
|||||||
while self.peek().is_ascii_digit() {
|
while self.peek().is_ascii_digit() {
|
||||||
self.advance();
|
self.advance();
|
||||||
}
|
}
|
||||||
|
|
||||||
if self.peek() == '.'
|
|
||||||
&& self.current + 1 < self.source.len()
|
|
||||||
&& self.source[self.current + 1].is_ascii_digit()
|
|
||||||
{
|
|
||||||
self.advance();
|
|
||||||
while self.peek().is_ascii_digit() {
|
|
||||||
self.advance();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
self.add_token(TokenType::Number);
|
self.add_token(TokenType::Number);
|
||||||
|
|||||||
Reference in New Issue
Block a user