chore: rewrite awful lexer

This commit is contained in:
TopchetoEU 2022-10-10 18:53:31 +03:00
parent 4d6ce93ae3
commit 1192e86c23

View File

@ -3,333 +3,209 @@
#include "utils/message.hh" #include "utils/message.hh"
using namespace ppc; using namespace ppc;
using namespace comp::tree::lex; using namespace ppc::messages;
using namespace ppc::comp::tree::lex;
struct lexlet_t { struct res_t;
struct process_res_t { using lexlet_t = res_t (*)(char c, std::vector<char> &tok);
bool ended;
bool repeat;
bool dont_add;
const lexlet_t *new_parselet;
bool has_message;
messages::message_t msg;
};
bool(*is_valid)(char curr); struct res_t {
process_res_t (*process)(char curr); lexlet_t new_parselet;
token_t::kind_t type; token_t::kind_t type;
bool _repeat;
bool _add;
res_t add(bool val = false) {
this->_add = val;
return *this;
}
res_t repeat(bool val = true) {
this->_repeat = val;
return *this;
}
}; };
using process_res_t = lexlet_t::process_res_t;
static bool is_digit(char c) { static bool isoct(char c) {
return c >= '0' && c <= '9';
}
static bool is_oct(char c) {
return c >= '0' && c <= '7'; return c >= '0' && c <= '7';
} }
// Returns true when `c` is an ASCII hexadecimal digit: 0-9, A-F or a-f.
static bool is_hex(char c) {
    const bool decimal = c >= '0' && c <= '9';
    const bool upper = c >= 'A' && c <= 'F';
    // Both bounds must hold; joining them with `||` would accept every character.
    const bool lower = c >= 'a' && c <= 'f';
    return decimal || upper || lower;
}
// Returns true when `c` lies in the range 'a'..'z'.
static bool is_lower(char c) {
    if (c < 'a') return false;
    return c <= 'z';
}
// Returns true when `c` lies in the range 'A'..'Z'.
static bool is_upper(char c) {
    if (c < 'A') return false;
    return c <= 'Z';
}
// Returns true when `c` is accepted by either is_upper or is_lower.
static bool is_letter(char c) {
    if (is_upper(c)) return true;
    return is_lower(c);
}
// Returns true when `c` is accepted by either is_digit or is_letter.
static bool is_alphanumeric(char c) {
    if (is_digit(c)) return true;
    return is_letter(c);
}
static bool is_any(char c, std::string chars) { static bool is_any(char c, std::string chars) {
auto res = chars.find(c) != std::string::npos; auto res = chars.find(c) != std::string::npos;
return res; return res;
} }
// Returns true when `c` is one of the characters the lexer treats as
// operator/punctuation material.
static bool is_operator(char c) {
    static const char *const operator_chars = "=!<>+-*/%&|^?:,.(){}[];";
    return is_any(c, operator_chars);
}
static process_res_t lexer_switch(const lexlet_t *lexlet) { static res_t lexlet_default(char c, std::vector<char> &tok);
static res_t lexer_switch(lexlet_t lexlet, bool repeat = false) {
return { return {
.ended = false,
.repeat = false,
.new_parselet = lexlet, .new_parselet = lexlet,
._repeat = repeat,
}; };
} }
static process_res_t lexer_repeat_switch(const lexlet_t *lexlet) { static res_t lexer_end(token_t::kind_t type, bool repeat = true) {
return (process_res_t) { return {
.ended = false, .new_parselet = lexlet_default,
.repeat = true, .type = type,
.new_parselet = lexlet, ._repeat = repeat
}; };
} }
static process_res_t lexer_end() { static res_t lexer_none() {
return (process_res_t) { return { ._add = true };
.ended = true,
.repeat = true,
.new_parselet = nullptr,
};
}
static process_res_t lexer_none() {
return (process_res_t) {
.ended = false,
.repeat = false,
.new_parselet = nullptr,
};
} }
static bool last_escape = false; static res_t lexlet_identifier(char c, std::vector<char> &tok) {
static bool literal_ended = false; if (is_operator(c) || isspace(c)) return lexer_end(token_t::IDENTIFIER);
static char first_op; else return lexer_none();
static int op_i = 0; };
static bool only_dot = false; static res_t lexlet_hex(char c, std::vector<char> &tok) {
static bool last_star = false; if (isxdigit(c)) return lexer_none();
else return lexer_end(token_t::HEX_LITERAL);
};
// Lexlet for the digits of a binary literal.
//
// c:   the character currently being examined.
// tok: the token text gathered so far (not inspected here; required by lexlet_t).
//
// Returns lexer_none() while `c` is '0' or '1', otherwise ends the token as
// BIN_LITERAL. Throws a message_t error when `c` is any other decimal digit.
static res_t lexlet_bin(char c, std::vector<char> &tok) {
    if (c == '0' || c == '1') return lexer_none();
    // <cctype> classifiers are undefined for negative values, so widen through unsigned char.
    if (isdigit(static_cast<unsigned char>(c))) throw message_t::error("A binary literal may only contain zeroes and ones.");
    return lexer_end(token_t::BIN_LITERAL);
}
// Lexlet for the digits of an octal literal.
//
// c:   the character currently being examined.
// tok: the token text gathered so far (not inspected here; required by lexlet_t).
//
// Returns lexer_none() while isoct(c) holds, otherwise ends the token as
// OCT_LITERAL. Throws a message_t error when `c` is a non-octal decimal digit.
static res_t lexlet_oct(char c, std::vector<char> &tok) {
    if (isoct(c)) return lexer_none();
    // <cctype> classifiers are undefined for negative values, so widen through unsigned char.
    if (isdigit(static_cast<unsigned char>(c))) throw message_t::error("An octal literal may only contain octal digits.");
    return lexer_end(token_t::OCT_LITERAL);
}
// Lexlet for the digits of a float literal.
//
// c:   the character currently being examined.
// tok: the token text gathered so far (not inspected here; required by lexlet_t).
//
// Returns lexer_none() while `c` is a decimal digit, otherwise ends the token
// as FLOAT_LITERAL.
static res_t lexlet_float(char c, std::vector<char> &tok) {
    // <cctype> classifiers are undefined for negative values, so widen through unsigned char.
    if (isdigit(static_cast<unsigned char>(c))) return lexer_none();
    return lexer_end(token_t::FLOAT_LITERAL);
}
// Lexlet for the digits of a decimal literal.
//
// c:   the character currently being examined.
// tok: the token text gathered so far; the '.' is appended here when the
//      literal turns into a float.
//
// Returns lexer_none() while `c` is a decimal digit, switches to lexlet_float
// on '.', and otherwise ends the token as DEC_LITERAL.
static res_t lexlet_dec(char c, std::vector<char> &tok) {
    // <cctype> classifiers are undefined for negative values, so widen through unsigned char.
    if (isdigit(static_cast<unsigned char>(c))) return lexer_none();
    if (c == '.') {
        // lexer_switch() leaves _add unset, so the '.' would be lost from the
        // token text; store it explicitly, as lexlet_default does for its character.
        tok.push_back(c);
        return lexer_switch(lexlet_float);
    }
    return lexer_end(token_t::DEC_LITERAL);
}
const lexlet_t LEXLET_IDENTIFIER = (lexlet_t) { static res_t lexlet_zero(char c, std::vector<char> &tok) {
.is_valid = [] (char curr) { return is_letter(curr) || curr == '_' || curr == '@' || curr == '$'; }, if (c == '.') return lexer_switch(lexlet_float);
.process = [] (char curr) { else if (c == 'b') return lexer_switch(lexlet_bin);
bool valid = (is_alphanumeric(curr) || curr == '_' || curr == '@' || curr == '$'); else if (c == 'x') return lexer_switch(lexlet_hex);
return (process_res_t) { else if (isdigit(c)) return lexer_switch(lexlet_oct, true);
.ended = !valid, else return lexer_end(token_t::DEC_LITERAL);
.repeat = !valid,
.new_parselet = &LEXLET_IDENTIFIER,
};
},
.type = token_t::IDENTIFIER,
}; };
const lexlet_t LEXLET_HEX = (lexlet_t) { static res_t lexlet_comment(char c, std::vector<char> &tok) {
.process = [] (char curr) { tok.clear();
if (is_hex(curr)) return lexer_none(); if (c == '\n') return lexer_switch(lexlet_default);
else return lexer_end(); else return lexer_none().add(false);
},
.type = token_t::HEX_LITERAL,
}; };
const lexlet_t LEXLET_BIN = (lexlet_t) { static res_t lexlet_multicomment(char c, std::vector<char> &tok) {
.process = [] (char curr) { if (c == '/' && tok.size() && tok.back() == '*') {
if (curr == '0' || curr == '1') return lexer_none(); tok.clear();
else if (is_digit(curr)) return lexer_switch(lexlet_default);
throw messages::message_t(messages::message_t::ERROR, "A binary literal may only contain zeroes and ones.", location_t::NONE); }
else return lexer_end();
}, return lexer_none();
.type = token_t::BIN_LITERAL,
}; };
const lexlet_t LEXLET_OCT = (lexlet_t) { static res_t lexlet_operator(char c, std::vector<char> &tok) {
.process = [] (char curr) { bool failed = false;
if (is_oct(curr)) return lexer_none();
else if (is_digit(curr)) if (tok.size() > 0) {
throw messages::message_t(messages::message_t::ERROR, "An octal literal may only contain octal digits.", location_t::NONE); failed = true;
else return lexer_end(); char first_op = tok[0];
}, size_t op_i = tok.size();
.type = token_t::OCT_LITERAL,
}; if (first_op == c && op_i == 1 && is_any(c, ":+-&|?<>")) failed = false;
const lexlet_t LEXLET_FLOAT = (lexlet_t) { if (c == '=') {
.is_valid = [] (char curr) { return only_dot = curr == '.'; },
.process = [] (char curr) {
if (is_digit(curr)) {
only_dot = false;
return lexer_none();
}
else return lexer_end();
},
.type = token_t::FLOAT_LITERAL,
};
// Table entry for decimal literals.
const lexlet_t LEXLET_DEC = (lexlet_t) {
    // A decimal literal may begin with any decimal digit.
    .is_valid = [] (char curr) { return is_digit(curr); },
    // '.' hands over to LEXLET_FLOAT, further digits are consumed, and any
    // other character finishes the token.
    .process = [] (char curr) {
        if (curr == '.') return lexer_switch(&LEXLET_FLOAT);
        if (!is_digit(curr)) return lexer_end();
        return lexer_none();
    },
    .type = token_t::DEC_LITERAL,
};
// Table entry for a literal that starts with '0'; the following character
// selects the concrete literal kind.
const lexlet_t LEXLET_ZERO = (lexlet_t) {
    .is_valid = [] (char curr) { return curr == '0'; },
    .process = [] (char curr) {
        switch (curr) {
            case '.': return lexer_switch(&LEXLET_FLOAT);
            case 'b': return lexer_switch(&LEXLET_BIN);
            case 'x': return lexer_switch(&LEXLET_HEX);
            default: break;
        }
        // A digit right after the zero is handed to LEXLET_OCT with the
        // repeat flag set.
        if (is_digit(curr)) return lexer_repeat_switch(&LEXLET_OCT);
        return lexer_end();
    },
    .type = token_t::DEC_LITERAL,
};
// Table entry for a single-line comment: runs until the newline and keeps
// none of the comment's characters.
const lexlet_t LEXLET_COMMENT = {
    .process = [] (char curr) {
        if (curr != '\n') {
            // Still inside the comment: carry on without storing the character.
            return (process_res_t) { .ended = false, .dont_add = true };
        }
        return lexer_end();
    },
    .type = token_t::NONE,
};
// Table entry for a block comment: ends at the first "*/" pair and keeps none
// of the comment's characters. Uses the file-level `last_star` flag to record
// whether the previous character was '*'.
const lexlet_t LEXLET_MULTICOMMENT = {
    .process = [] (char curr) {
        if (curr == '/' && last_star) {
            last_star = false;
            return (process_res_t) {
                .ended = true,
            };
        }
        // Track only the immediately preceding character. A sticky flag would
        // let any later '/' close the comment, e.g. in "/* a * b / c */".
        last_star = curr == '*';
        return (process_res_t) {
            .dont_add = true,
        };
    },
    .type = token_t::NONE,
};
const lexlet_t LEXLET_OPERATOR = (lexlet_t) {
.is_valid = [] (char curr) {
if (is_any(curr, "=!<>+-*/%&|^?:,.(){}[];")) {
first_op = curr;
op_i = 1;
return true;
}
else return false;
},
.process = [] (char curr) {
bool failed = true;
if (first_op == curr && op_i == 1 && is_any(curr, ":+-&|?<>")) failed = false;
if (curr == '=') {
if (op_i == 1 && is_any(first_op, "<>=!+-/*%")) failed = false; if (op_i == 1 && is_any(first_op, "<>=!+-/*%")) failed = false;
if (op_i == 2 && is_any(first_op, "<>?")) failed = false; if (op_i == 2 && is_any(first_op, "<>?")) failed = false;
} }
if (first_op == '-' && curr == '>' && op_i == 1) failed = false; if (first_op == '-' && c == '>' && op_i == 1) failed = false;
if (first_op == '/' && op_i == 1) { if (first_op == '/' && op_i == 1) {
if (curr == '/') return lexer_switch(&LEXLET_COMMENT); if (c == '/') return lexer_switch(lexlet_comment);
else if (curr == '*') return lexer_switch(&LEXLET_MULTICOMMENT); else if (c == '*') return lexer_switch(lexlet_multicomment);
} }
}
op_i++; if (failed) return lexer_end(token_t::OPERATOR);
else return lexer_none();
if (failed) return lexer_end();
else return lexer_none();
},
.type = token_t::OPERATOR,
}; };
const lexlet_t LEXLET_STRING_LITERAL = (lexlet_t) { static res_t lexlet_string(char c, std::vector<char> &tok) {
.is_valid = [] (char curr) { if (c == '"' && tok.back() != '\\') return lexer_end(token_t::STRING_LITERAL, true);
last_escape = false; else return lexer_none();
literal_ended = false;
return curr == '"';
},
.process = [] (char curr) {
if (last_escape) {
last_escape = false;
return lexer_none();
}
if (curr == '\\') {
last_escape = true;
}
else if (curr == '"') {
literal_ended = true;
}
else if (literal_ended) return lexer_end();
return lexer_none();
},
.type = token_t::STRING_LITERAL,
}; };
const lexlet_t LEXLET_CHAR_LITERAL = (lexlet_t) { static res_t lexlet_char(char c, std::vector<char> &tok) {
.is_valid = [] (char curr) { if (c == '"' && tok.back() != '\\') return lexer_end(token_t::CHAR_LITERAL, true);
last_escape = false; else return lexer_none();
literal_ended = false;
return curr == '\'';
},
.process = [] (char curr) {
if (last_escape) {
last_escape = false;
return lexer_none();
}
if (curr == '\\') {
last_escape = true;
}
else if (curr == '\'') {
literal_ended = true;
}
else if (literal_ended) return lexer_end();
return lexer_none();
},
.type = token_t::CHAR_LITERAL,
};
const lexlet_t LEXLET_DEFAULT = (lexlet_t) {
.process = [] (char curr) {
if (LEXLET_STRING_LITERAL.is_valid(curr)) return lexer_switch(&LEXLET_STRING_LITERAL);
if (LEXLET_CHAR_LITERAL.is_valid(curr)) return lexer_switch(&LEXLET_CHAR_LITERAL);
if (LEXLET_OPERATOR.is_valid(curr)) return lexer_switch(&LEXLET_OPERATOR);
if (LEXLET_ZERO.is_valid(curr)) return lexer_switch(&LEXLET_ZERO);
if (LEXLET_DEC.is_valid(curr)) return lexer_switch(&LEXLET_DEC);
if (LEXLET_FLOAT.is_valid(curr)) return lexer_switch(&LEXLET_FLOAT);
if (LEXLET_IDENTIFIER.is_valid(curr)) return lexer_switch(&LEXLET_IDENTIFIER);
else return (process_res_t) {
.ended = true,
.repeat = false,
.new_parselet = nullptr,
};
},
.type = token_t::NONE,
}; };
std::vector<token_t> token_t::parse_many(ppc::messages::msg_stack_t &msg_stack, const std::string &filename, const std::string &src) { static res_t lexlet_default(char c, std::vector<char> &tok) {
tok.push_back(c);
if (c == '"') return lexer_switch(lexlet_string);
if (c == '\'') return lexer_switch(lexlet_char);
if (c == '0') return lexer_switch(lexlet_zero);
if (c == '.') return lexer_switch(lexlet_float);
if (is_operator(c)) return lexer_switch(lexlet_operator);
if (isdigit(c)) return lexer_switch(lexlet_dec);
if (isspace(c)) {
tok.clear();
return lexer_none().add(false);
}
return lexer_switch(lexlet_identifier);
};
std::vector<token_t> token_t::parse_many(ppc::messages::msg_stack_t &msg_stack, const std::string &filename, const std::string &_src) {
auto src = _src + '\n';
std::vector<token_t> tokens; std::vector<token_t> tokens;
std::vector<char> curr_token; std::vector<char> curr_token;
lexlet_t curr = LEXLET_DEFAULT; lexlet_t curr = lexlet_default;
std::size_t start = 0, line = 0, curr_start = 0, curr_line = 0, length = 0, i = 0; std::size_t start = 0, line = 0, curr_start = 0, curr_line = 0, i = 0;
while (src[i]) { while (i < src.size()) {
char c = src[i]; char c = src[i];
try { try {
process_res_t res = curr.process(c); res_t res = curr(c, curr_token);
if (i == 0) res.repeat = false; if (i == 0) res._repeat = false;
if (res.has_message) throw res.msg;
if (res.ended) {
if (curr.type) {
location_t loc = { filename, line, start, i - length, length };
tokens.push_back({ curr.type, { curr_token.begin(), curr_token.end() }, loc });
}
if (res._add) {
curr_token.push_back(c);
}
if (res.type) {
size_t len = curr_token.size();
location_t loc(filename, line, start, i - len, len);
tokens.push_back({ res.type, { curr_token.begin(), curr_token.end() }, loc });
curr_token.clear(); curr_token.clear();
length = 0;
curr = LEXLET_DEFAULT;
} }
else { if (res.new_parselet) {
if (res.new_parselet) { if (curr == lexlet_default && res.new_parselet != lexlet_default) {
if (!curr.type) { start = curr_start;
start = curr_start; line = curr_line;
line = curr_line;
}
curr = *res.new_parselet;
}
if (!res.dont_add) {
curr_token.push_back(c);
length++;
} }
curr = res.new_parselet;
} }
if (!res.repeat) { if (!res._repeat) {
curr_start++;
if (c == '\n') {
curr_line++;
curr_start = 0;
}
i++; i++;
curr_start++;
if (i == src.size()) break;
if (c == '\n') {
curr_start = 0;
curr_line++;
}
} }
} }
catch (const messages::message_t &msg) { catch (const messages::message_t &msg) {
throw messages::message_t(msg.level, msg.content, location_t(filename, line, start, i - length, length)); throw message_t(msg.level, msg.content, location_t(filename, line, start, i - curr_token.size(), curr_token.size()));
} }
} }
location_t loc = { filename, line, start, i - length, length }; curr_start--;
if (curr.type) {
tokens.push_back({ if (curr_token.size()) curr_token.pop_back();
curr.type, std::string { curr_token.begin(), curr_token.end() },
{ filename, line, start, i - length, length } if (curr == lexlet_string)
}); throw message_t::error("Unclosed string literal.", location_t(filename, line, start, i - curr_token.size(), curr_token.size()));
} if (curr == lexlet_char)
throw message_t::error("Unclosed char literal.", location_t(filename, line, start, i - curr_token.size(), curr_token.size()));
if (curr != lexlet_default) throw message_t::error("Unexpected end.", location_t(filename, curr_line, curr_start, i, 1));
return tokens; return tokens;
} }