tal/build/lexer.lua

872 lines
16 KiB
Lua
Raw Normal View History

2025-02-06 00:30:52 +00:00
-- Token kind tags; each token table stores one of these in its `type` field.
local TOK_ID = 1; -- identifier
local TOK_OP = 2; -- operator / punctuation (its `val` is a code from `operators`)
local TOK_STR = 3; -- string literal
local TOK_NUM = 4; -- numeric literal
--- Numeric codes stored in the `val` field of operator tokens.
--- Codes are grouped by tens; every code must be unique.
local operators = {
    -- logical
    AND = 1,
    OR = 2,
    NOT = 3,
    -- arithmetic / string
    CONCAT = 10,
    ADD = 11,
    SUB = 12,
    MUL = 13,
    DIV = 14,
    IDIV = 15,
    MOD = 16,
    -- Referenced by the operator table for "^" and "**"; without this entry
    -- both spellings were rejected as unexpected characters.
    POW = 17,
    -- bitwise
    B_AND = 20,
    B_OR = 21,
    B_XOR = 22,
    B_LSH = 24,
    B_RSH = 25,
    RSH = 26,
    -- comparison
    EQ = 30,
    NEQ = 31,
    LEQ = 32,
    GEQ = 33,
    LESS = 34,
    GR = 35,
    -- brackets
    PAREN_OPEN = 40,
    PAREN_CLOSE = 41,
    BRACKET_OPEN = 42,
    BRACKET_CLOSE = 43,
    BRACE_OPEN = 44,
    BRACE_CLOSE = 45,
    -- punctuation
    SEMICOLON = 50,
    COLON = 51,
    COMMA = 52,
    DOT = 53,
    SPREAD = 54,
    -- QUESTION = 55,
    -- assignment
    ASSIGN = 60,
    ASSIGN_OR = 75,
};
-- Operator lookup trie, keyed by character.
-- Each node is a table whose [1] entry is the operator code produced when
-- matching stops at that node, and whose string keys are the characters that
-- may extend the match (e.g. ">" -> ">>" -> ">>>").
-- The tokenizer walks this table one character at a time and takes [1] of the
-- last node it reached; a node without [1] is reported as an unexpected char.
-- An entry built from a constant that `operators` does not define (such as
-- QUESTION, which is commented out there) therefore has no [1].
local op_map = {
["+"] = { operators.ADD },
["-"] = { operators.SUB },
["*"] = {
operators.MUL,
["*"] = { operators.POW },
},
["/"] = {
operators.DIV,
["/"] = { operators.IDIV },
},
["%"] = { operators.MOD },
["&"] = { operators.B_AND },
["|"] = {operators.B_OR },
["^"] = { operators.POW },
["~"] = {
operators.B_XOR,
["="] = { operators.NEQ },
},
[">"] = {
operators.GR,
[">"] = {
operators.RSH,
[">"] = { operators.B_RSH },
},
},
["<"] = {
operators.LESS,
["<"] = { operators.B_LSH },
},
["="] = {
operators.ASSIGN,
["="] = { operators.EQ },
},
[","] = { operators.COMMA },
-- NOTE: the tokenizer dispatches "." to its own routine before consulting
-- this table, so this entry is not reached from there.
["."] = { operators.DOT },
[";"] = { operators.SEMICOLON },
[":"] = { operators.COLON },
["?"] = { operators.QUESTION },
["("] = { operators.PAREN_OPEN },
[")"] = { operators.PAREN_CLOSE },
["["] = { operators.BRACKET_OPEN },
["]"] = { operators.BRACKET_CLOSE },
["{"] = { operators.BRACE_OPEN },
["}"] = { operators.BRACE_CLOSE },
};
-- Local alias for string.byte, used for the character-range checks below.
local to_byte = string.byte;
--- @class tok_base
--- @field loc string
--- @field end_loc string
--- @field comments string[]
--- @field raw string
--- @class tok_id: tok_base
--- @field type 1
--- @field val string
--- @class tok_op: tok_base
--- @field type 2
--- @field val integer
--- @class tok_str: tok_base
--- @field type 3
--- @field val string
--- @class tok_num: tok_base
--- @field type 4
--- @field val number
--- Builds an identifier token.
--- `loc`, `end_loc`, `comments` and `raw` are inherited from `base`, so the
--- source text recorded on `base` is not lost when the new token is created.
---@param base tok_base? token (or partial token) to inherit from; defaults to an empty location
---@param id string identifier text
---@return tok_id
local function tok_id(base, id)
    base = base or { loc = "", comments = {} };
    return {
        loc = base.loc,
        end_loc = base.end_loc,
        comments = base.comments,
        raw = base.raw,
        type = TOK_ID,
        val = id,
    }
end
--- Builds an operator token.
--- `loc`, `end_loc`, `comments` and `raw` are inherited from `base`.
---@param base tok_base? token (or partial token) to inherit from; defaults to an empty location
---@param op integer operator code (one of the values in `operators`)
---@return tok_op
local function tok_op(base, op)
    base = base or { loc = "", comments = {} };
    return {
        loc = base.loc,
        end_loc = base.end_loc,
        comments = base.comments,
        raw = base.raw,
        type = TOK_OP,
        val = op,
    }
end
--- Builds a string-literal token.
--- `loc`, `end_loc`, `comments` and `raw` are inherited from `base`.
---@param base tok_base? token (or partial token) to inherit from; defaults to an empty location
---@param data string decoded string contents
---@return tok_str
local function tok_str(base, data)
    base = base or { loc = "", comments = {} };
    return {
        loc = base.loc,
        end_loc = base.end_loc,
        comments = base.comments,
        raw = base.raw,
        type = TOK_STR,
        val = data,
    };
end
--- Builds a numeric-literal token.
--- `loc`, `end_loc`, `comments` and `raw` are inherited from `base`.
---@param base tok_base? token (or partial token) to inherit from; defaults to an empty location
---@param num number numeric value of the literal
---@return tok_num
local function tok_num(base, num)
    base = base or { loc = "", comments = {} };
    return {
        loc = base.loc,
        end_loc = base.end_loc,
        comments = base.comments,
        raw = base.raw,
        type = TOK_NUM,
        val = num,
    };
end
--- @alias token
--- | tok_id
--- | tok_op
--- | tok_str
--- | tok_num
--- Turns a source into a one-character-at-a-time reader.
--- A string source is read directly. A function source is called for
--- successive chunks; a `nil`, `false` or empty chunk ends the stream for good.
---@param loader string | fun(): string
---@return fun(): string?
local function char_supplier(loader)
    if type(loader) ~= "string" then
        local chunk;   -- current chunk; becomes "" once the loader is exhausted
        local pos = 0;
        return function ()
            if chunk == "" then return nil end
            pos = pos + 1;
            if chunk == nil or pos > #chunk then
                chunk = loader();
                if not chunk or chunk == "" then
                    chunk = "";
                    return nil;
                end
                pos = 1;
            end
            return string.sub(chunk, pos, pos);
        end
    end

    local pos = 0;
    return function ()
        pos = pos + 1;
        local ch = string.sub(loader, pos, pos);
        if #ch == 1 then return ch end
    end
end
---@param filename string
---@param chars fun(): string | nil
---@return fun(): token?
local function token_supplier(filename, chars, get_comments)
-- Current position in the input: `line` is the 1-based line number and
-- `start` the column counter; both are updated as characters are consumed.
local line, start = 1, 1;
local _chars = chars;
-- Wrap the raw reader so every consumed character advances the position.
chars = function ()
    local c = _chars();
    if c ~= "\n" then
        start = start + 1;
    else
        line, start = line + 1, 1;
    end
    return c;
end
--- Pushes one character back so the next `chars()` call returns it again.
--- Works by temporarily replacing `chars` with a one-shot reader that restores
--- the previous reader; several pushed-back characters are replayed in
--- last-in, first-out order. A `nil` argument (end of input) is ignored.
local function unconsume(c)
    if c ~= nil then
        local resume = chars;
        start = start - 1;
        chars = function ()
            start = start + 1;
            chars = resume;
            return c;
        end
    end
end
local consume_white;
if get_comments then
--- Reads the body of a comment whose leading "--" has already been consumed.
--- A body starting with "[[" is a block comment that runs until "]]--";
--- anything else runs to the end of the line (the newline is consumed but not
--- included in the returned text).
---@return string? text comment text without its delimiters
---@return string? err set when a block comment is never closed
local function consume_comment()
    local data = {};

    -- Collects characters, starting with `c`, up to the end of the line.
    local function singleline(c)
        while c ~= "\n" and c ~= nil do
            data[#data + 1] = c;
            c = chars();
        end
        return table.concat(data);
    end

    -- Collects characters until the last four read are "]]--". Comparing
    -- against what was already collected means no character is skipped, so a
    -- stray "]" inside the comment cannot hide the real terminator.
    local function multiline()
        local n = 0;
        while true do
            local c = chars();
            if c == nil then
                return nil, "Missing ]]";
            end
            n = n + 1;
            data[n] = c;
            if
                c == "-" and n >= 4 and
                data[n - 1] == "-" and
                data[n - 2] == "]" and
                data[n - 3] == "]"
            then
                -- Drop the terminator from the returned text.
                return table.concat(data, "", 1, n - 4);
            end
        end
    end

    local c = chars();
    if c ~= "[" then
        return singleline(c);
    end
    c = chars();
    if c == "[" then
        return multiline();
    end
    -- A lone "[" is ordinary comment text; keep it.
    data[1] = "[";
    return singleline(c);
end
--- Skips whitespace, a "#!" line at the very start of the input, and "--"
--- comments, collecting the text of every comment that was skipped.
--- Stops in front of the first character that starts a token.
---@return false | string[]? comments collected comment texts, or nil on error
---@return string? err error message when a comment is malformed
function consume_white()
    local comments = {};
    while true do
        local c = chars();
        if c == nil then
            -- End of input: make every later read report end of input too.
            chars = function () return nil end
            break;
        elseif start == 2 and line == 1 and c == "#" then
            -- "#" as the very first character of the input.
            local c2 = chars();
            if c2 == "!" then
                -- Skip the rest of the line; also stop at end of input so a
                -- file without a trailing newline does not loop forever.
                repeat
                    c = chars();
                until c == "\n" or c == nil
            else
                -- Not a "#!" line: hand both characters back and stop here.
                -- Continuing the loop would read the same "#" again forever.
                unconsume(c2);
                unconsume(c);
                break;
            end
        elseif c == "-" then
            local c2 = chars()
            if c2 == "-" then
                local res, err = consume_comment();
                if res == nil then return nil, err end
                comments[#comments + 1] = res;
            else
                -- A single "-" is an operator, not a comment.
                unconsume(c2)
                unconsume(c)
                break
            end
        elseif not string.find(c, "%s") then
            unconsume(c);
            break;
        end
    end
    return comments;
end
else
--- Skips the body of a comment whose leading "#" has already been consumed.
--- A body starting with "[" is a block comment that runs until "]#";
--- anything else runs to the end of the line.
---@return string? err "Unclosed comment" when a block comment never ends, otherwise nil
local function consume_comment()
    local c = chars();
    if c == "[" then
        -- Remember the previous character instead of reading ahead after a
        -- "]": reading ahead threw that character away, so "]]#" was missed.
        local prev;
        while true do
            c = chars();
            if c == nil then
                return "Unclosed comment";
            elseif prev == "]" and c == "#" then
                break;
            end
            prev = c;
        end
    else
        while c ~= "\n" and c ~= nil do
            c = chars();
        end
    end
end
--- Skips whitespace and "#" comments without keeping any comment text.
--- Stops in front of the first character that starts a token.
---@return false | string[]?, string?
function consume_white()
    local c = chars();
    while c ~= nil do
        if c == "#" then
            local err = consume_comment();
            if err ~= nil then return nil, err end
        elseif not string.find(c, "%s") then
            unconsume(c);
            return false;
        end
        c = chars();
    end
    -- End of input: make every later read report end of input too.
    chars = function () return nil end
    return false;
end
end
--- Converts one hexadecimal digit character to its value (0-15).
--- Returns -1 when the character is not a hex digit.
local function hex_one(c)
    local code = to_byte(c)
    if code >= 48 and code <= 57 then return code - 48 end       -- 0-9
    if code >= 97 and code <= 102 then return code - 97 + 10 end -- a-f
    if code >= 65 and code <= 70 then return code - 65 + 10 end  -- A-F
    return -1
end
--- Reads the digits of a hex literal (the "0x" prefix is already consumed).
--- The first non-hex character is pushed back. Returns a number token, or
--- nothing when no hex digit was found.
local function hex(base)
    local value, seen = 0, false
    local c = chars()
    while c ~= nil do
        local digit = hex_one(c)
        if digit == -1 then
            unconsume(c)
            break
        end
        value = value * 16 + digit
        seen = true
        c = chars()
    end
    if seen then
        return tok_num(base, value)
    end
end
--- Reads a run of decimal digits and accumulates their value.
--- `res` seeds the value: a number is used as-is, a one-character string must
--- be a digit (otherwise nil is returned immediately, without reading), and
--- nil means "no digit seen yet".
--- When `float` is truthy the digits read are treated as fractional digits
--- (tenths, hundredths, ...); otherwise they extend an integer.
--- `mult` scales the result (defaults to 1; callers pass -1 for a negative
--- exponent).
--- Returns the value (nil when no digit was seen) followed by the character
--- that ended the run. That character has been consumed, so the caller is
--- responsible for pushing it back; it is nil at end of input.
local function decimal(res, float, mult)
local any = true
-- Weight of the next fractional digit.
local fract_mult = .1
if type(res) == "string" then
local b = to_byte(res)
if b >= 48 and b <= 57 then
res = b - 48
else
return nil
end
elseif res == nil then
res, any = 0, false
end
if mult == nil then mult = 1 end
local c
while true do
c = chars()
if c == nil then break end
local b = to_byte(c)
if b >= 48 and b <= 57 then
any = true
if float then
res = res + (b - 48) * fract_mult
fract_mult = fract_mult * .1
else
res = res * 10 + (b - 48)
end
else
break
end
end
if any then
return mult * res, c
else
return nil, c
end
end
--- Reads a decimal literal: integer digits, an optional ".digits" fraction
--- and an optional "e" exponent with an optional "-" sign.
--- `res` is the value of the digit(s) already consumed by the caller.
--- Returns a number token, or nil plus a message when "e" has no digits.
local function number(base, res)
    local whole, stop = decimal(res, false)

    local fract = 0
    if stop == "." then
        local digits
        digits, stop = decimal(nil, true)
        if digits ~= nil then fract = digits end
    end

    local scale = 1
    if stop == "e" then
        local exp
        local c = chars()
        if c == "-" then
            exp, stop = decimal(nil, false, -1)
        else
            exp, stop = decimal(c)
        end
        if exp == nil then
            return nil, "Expected number after 'e'"
        end
        scale = 10 ^ exp
    end

    -- The character that ended the literal belongs to the next token.
    unconsume(stop)
    return tok_num(base, (whole + fract) * scale)
end
--- Reads a literal that starts with "0" (already consumed).
--- "0x" introduces a hex literal; anything else is pushed back and the
--- literal is read as a decimal number with a leading zero.
--- Returns a number token, or nil plus a message for "0x" without digits.
local function zero(base)
    local c = chars()
    if c == nil then return tok_num(base, 0) end
    if c == "x" then
        local res = hex(base)
        if res == nil then
            return nil, "Expected a hex literal"
        end
        return res
    end
    unconsume(c)
    return number(base, 0)
end
--- Reads an identifier whose first character `c` is already consumed.
--- Continues over letters, digits and "_" and pushes back the first character
--- that does not belong to the identifier.
--- The identifier text is stored as `raw` on the returned token itself; the
--- token constructor builds a new table, so setting it on `base` alone would
--- not reach the caller.
---@return tok_id
local function id(base, c)
    local parts = { c }
    while true do
        c = chars()
        if c == nil then break end
        local b = to_byte(c)
        if
            (b >= 65 and b <= 90) or  -- A-Z
            (b >= 97 and b <= 122) or -- a-z
            (b >= 48 and b <= 57) or  -- 0-9
            c == "_"
        then
            parts[#parts + 1] = c
        else
            unconsume(c)
            break
        end
    end
    local name = table.concat(parts)
    base.raw = name
    local tok = tok_id(base, name)
    tok.raw = name
    return tok
end
--- Reads a token that starts with "." (already consumed).
--- Digits right after the dot make it a fractional number literal (with an
--- optional "e" exponent); otherwise it is one of the operators ".", ".."
--- or "...".
--- Returns the token, or nil plus a message when "e" has no digits.
local function dot(base)
    -- Builds an operator token and records its source text as `raw`.
    local function op(code, text)
        base.raw = text
        local tok = tok_op(base, code)
        tok.raw = text
        return tok
    end

    local fract, stop = decimal(nil, true)
    if fract == nil then
        if stop ~= "." then
            unconsume(stop)
            return op(operators.DOT, ".")
        end
        -- Second "." seen; a third one makes it a spread.
        local c = chars()
        if c == "." then
            return op(operators.SPREAD, "...")
        end
        unconsume(c)
        return op(operators.CONCAT, "..")
    end

    local scale = 1
    if stop == "e" then
        local exp
        local c = chars()
        if c == "-" then
            exp, stop = decimal(nil, false, -1)
        else
            exp, stop = decimal(c)
        end
        if exp == nil then
            return nil, "Expected number after 'e'"
        end
        scale = 10 ^ exp
    end

    -- The character that ended the literal belongs to the next token.
    unconsume(stop)
    return tok_num(base, fract * scale)
end
-- Single-letter escapes that map directly to one character.
local simple_escapes = {
    a = "\a", b = "\b", f = "\f", n = "\n",
    r = "\r", t = "\t", v = "\v",
}

--- Decodes one character of a quoted literal, resolving backslash escapes.
--- `c` is the character just read; further characters are read as needed.
--- Supported escapes: \a \b \f \n \r \t \v, \xHH and \z (skip the whitespace
--- that follows). Any other escaped character stands for itself.
--- `allow_newline` is accepted for compatibility and is not used.
---@return string? decoded decoded text ("" for \z), or nil at end of input / on error
---@return string? err message for a malformed \x escape
local function char(c, allow_newline)
    -- Plain characters, and nil at end of input, pass straight through.
    if c ~= "\\" then return c end

    c = chars()
    local mapped = simple_escapes[c]
    if mapped ~= nil then
        return mapped
    elseif c == "z" then
        -- Skip whitespace, stopping at end of input as well. The first
        -- non-whitespace character is handed back so the caller sees it as
        -- ordinary input (it may be the closing quote).
        repeat
            c = chars()
        until c == nil or not string.find(c, "%s")
        unconsume(c)
        return ""
    elseif c == "x" then
        local ca, cb = chars(), chars()
        if ca == nil or cb == nil then return nil, "Expected a hex number" end
        local a, b = hex_one(ca), hex_one(cb)
        if a == -1 or b == -1 then return nil, "Expected a hex number" end
        return string.char(a * 16 + b)
    end
    return c
end
--- Reads a string literal up to the closing quote `first` (the opening quote
--- is already consumed) and returns a string token with the decoded contents.
--- Returns nil plus a message when the literal is unterminated or contains a
--- malformed escape.
local function quote_str(base, first)
    local pieces = {};
    local c = chars();
    while c ~= first do
        if c == nil then return nil, "Unterminated string literal" end
        local piece, err = char(c);
        if piece == nil then
            return nil, err or "Unterminated string literal";
        end
        pieces[#pieces + 1] = piece;
        c = chars();
    end
    return tok_str(base, table.concat(pieces));
end
--- Reads a character literal up to the closing quote `first` (the opening
--- quote is already consumed) and returns a number token. The value is built
--- from the decoded bytes, each new byte shifting the previous ones up by 256.
--- Returns nil plus a message when the literal is unterminated or contains a
--- malformed escape.
local function quote_char(base, first)
    local value = 0;
    local c = chars();
    while c ~= first do
        if c == nil then return nil, "Unterminated string literal" end
        local decoded, err = char(c);
        if decoded == nil then
            return nil, err or "Unterminated string literal";
        end
        for i = 1, #decoded do
            value = value * 256 + string.byte(decoded, i);
        end
        c = chars();
    end
    return tok_num(base, value);
end
--- Reads and returns the next token, or nil at end of input.
--- Leading whitespace and comments are skipped first. The token's `loc` is
--- the position where it starts and `end_loc` the position right after it,
--- both formatted as "filename:line:column".
--- Raises an error (prefixed with the location) on malformed input.
return function ()
    local comments, err = consume_white();
    if comments == nil then
        error(table.concat({ filename, line, start }, ":") .. ": " .. err);
    elseif comments == false then
        -- Comment collection is disabled.
        comments = nil;
    end
    local loc = table.concat({ filename, line, start }, ":");
    --- @type table | nil
    local base = { loc = loc, comments = comments, raw = "" };
    local c = chars()
    if c == nil then return nil end
    local b = to_byte(c)
    if c == "." then
        base, err = dot(base)
    elseif c == "0" then
        base, err = zero(base)
    elseif b >= 49 and b <= 57 then -- 1-9
        base, err = number(base, b - 48)
    elseif
        b >= 65 and b <= 90 or -- A-Z
        b >= 97 and b <= 122 or -- a-z
        c == "_"
    then
        base, err = id(base, c)
    elseif c == "\"" then
        base, err = quote_str(base, c);
    elseif c == "\'" then
        base, err = quote_char(base, c);
    else
        -- Walk the operator trie as far as the input allows, remembering the
        -- characters that matched so they can be stored as the token's text.
        local node = op_map;
        local matched = {};
        while true do
            local child = node[c];
            if child == nil then
                -- `c` is not part of this operator (or is nil at end of input).
                unconsume(c);
                break;
            end
            matched[#matched + 1] = c;
            node = child;
            c = chars();
        end
        local text = table.concat(matched);
        local op = node[1];
        if op == nil then
            -- Report the text that could not be turned into an operator; when
            -- nothing matched at all that is the character just read.
            if text == "" then text = tostring(c) end
            base, err = nil, string.format("Unexpected char '%s'", text)
        else
            base.type = TOK_OP;
            base.val = op;
            base.raw = text;
        end
    end
    if base == nil then
        return error(loc .. ": " .. err);
    else
        base.end_loc = table.concat({ filename, line, start }, ":");
        return base;
    end
end
end
--- Chains several token suppliers into one.
--- Each supplier is drained in turn; when one reports the end of its stream
--- (returns nil, or an empty string) the next supplier takes over. A supplier
--- that returns `nil, err` has that pair passed on to the caller. Arguments
--- that are not functions are skipped.
---@param ... fun(): token?, string?
---@return fun(): token?, string?
local function concat_tokens(...)
    local arr = {...};
    local i = 1;
    return function ()
        while true do
            local el = arr[i];
            if el == nil then
                return nil;
            end
            if type(el) == "function" then
                local buff, err = el();
                if buff ~= nil and buff ~= "" then
                    return buff;
                end
                if buff == nil and err ~= nil then
                    return nil, err;
                end
            end
            -- Supplier exhausted (or not callable): move on to the next one.
            i = i + 1;
        end
    end
end
--- Tells whether `second` may be written directly after `first` with nothing
--- in between. Returns false for pairs listed below (for example two
--- identifiers, or "=" followed by "="); the stringifier inserts a space
--- between such tokens.
---@param first token
---@param second token
---@return boolean
local function can_go_after(first, second)
    local ft, st = first.type, second.type

    -- ".", ".." and "..." all start with a dot.
    local function is_dots(op)
        return (
            op == operators.DOT or
            op == operators.CONCAT or
            op == operators.SPREAD
        )
    end

    if ft == TOK_OP and st == TOK_OP then
        local a, b = first.val, second.val
        -- "=", "==", "<", ">", "~" followed by "=" or "=="
        if (
            a == operators.ASSIGN or
            a == operators.EQ or
            a == operators.LESS or
            a == operators.GR or
            a == operators.B_XOR
        ) and (
            b == operators.ASSIGN or
            b == operators.EQ
        ) then return false end
        -- "<" / ">" followed by "<" / ">"
        if (
            a == operators.LESS or a == operators.GR
        ) and (
            b == operators.LESS or b == operators.GR
        ) then return false end
        if is_dots(a) and is_dots(b) then return false end
        -- ":" followed by ":". Only COLON is checked: `operators` defines no
        -- LABEL constant.
        if a == operators.COLON and b == operators.COLON then return false end
        -- "/" / "//" followed by "/" / "//"
        if (
            a == operators.DIV or a == operators.IDIV
        ) and (
            b == operators.DIV or b == operators.IDIV
        ) then return false end
        return true
    elseif ft == TOK_NUM and st == TOK_ID then
        return false
    elseif ft == TOK_ID and st == TOK_NUM then
        return false
    elseif ft == TOK_ID and st == TOK_ID then
        return false
    elseif ft == TOK_NUM and st == TOK_OP then
        return not is_dots(second.val)
    elseif ft == TOK_OP and st == TOK_NUM then
        return not is_dots(first.val)
    elseif ft == TOK_NUM and st == TOK_NUM then
        return false
    elseif ft == TOK_STR and st == TOK_STR then
        return false
    end
    return true
end
--- Turns a token stream into a stream of source-text pieces.
--- Each call yields the `raw` text of the next token, prefixed with a single
--- space when it may not directly follow the previous token. Once the token
--- stream ends, nil (plus the stream's error, if any) is returned and every
--- later call returns nil.
---@param tokens fun(): token?, string?
---@return fun(): string?, string?
local function token_stringifier(tokens)
    local prev
    return function ()
        if tokens == nil then return nil end
        local tok, err = tokens()
        if tok == nil then
            --- @diagnostic disable-next-line: cast-local-type
            tokens = nil
            return nil, err
        end
        local needs_space = prev ~= nil and not can_go_after(prev, tok)
        prev = tok
        if needs_space then
            return " " .. tok.raw
        end
        return tok.raw
    end
end
-- Public interface of the lexer module.
return {
-- token kind tags
TOK_ID = TOK_ID,
TOK_OP = TOK_OP,
TOK_STR = TOK_STR,
TOK_NUM = TOK_NUM,
-- operator codes used as the `val` of operator tokens
operators = operators,
-- character and token streams
char_supplier = char_supplier,
token_supplier = token_supplier,
concat_tokens = concat_tokens,
token_stringifier = token_stringifier,
-- token constructors
tok_id = tok_id,
tok_op = tok_op,
tok_num = tok_num,
tok_str = tok_str,
}