chore: rewrite awful lexer
parent 4d6ce93ae3
commit 1192e86c23

@@ -3,333 +3,209 @@
#include "utils/message.hh"
|
||||
|
||||
using namespace ppc;
|
||||
using namespace comp::tree::lex;
|
||||
using namespace ppc::messages;
|
||||
using namespace ppc::comp::tree::lex;
|
||||
|
||||
-struct lexlet_t {
-    struct process_res_t {
-        bool ended;
-        bool repeat;
-        bool dont_add;
-        const lexlet_t *new_parselet;
-        bool has_message;
-        messages::message_t msg;
-    };
+struct res_t;
+using lexlet_t = res_t (*)(char c, std::vector<char> &tok);
 
-    bool(*is_valid)(char curr);
-    process_res_t (*process)(char curr);
+struct res_t {
+    lexlet_t new_parselet;
     token_t::kind_t type;
+    bool _repeat;
+    bool _add;
+
+    res_t add(bool val = false) {
+        this->_add = val;
+        return *this;
+    }
+    res_t repeat(bool val = true) {
+        this->_repeat = val;
+        return *this;
+    }
 };
 
-using process_res_t = lexlet_t::process_res_t;
 
-static bool is_digit(char c) {
-    return c >= '0' && c <= '9';
-}
-static bool is_oct(char c) {
+static bool isoct(char c) {
     return c >= '0' && c <= '7';
 }
-static bool is_hex(char c) {
-    return is_digit(c) || (c >= 'A' && c <= 'F') || (c >= 'a' || c <= 'f');
-}
-static bool is_lower(char c) {
-    return c >= 'a' && c <= 'z';
-}
-static bool is_upper(char c) {
-    return c >= 'A' && c <= 'Z';
-}
-static bool is_letter(char c) {
-    return is_lower(c) || is_upper(c);
-}
-static bool is_alphanumeric(char c) {
-    return is_letter(c) || is_digit(c);
-}
 static bool is_any(char c, std::string chars) {
     auto res = chars.find(c) != std::string::npos;
     return res;
 }
 static bool is_operator(char c) {
     return is_any(c, "=!<>+-*/%&|^?:,.(){}[];");
 }
 
-static process_res_t lexer_switch(const lexlet_t *lexlet) {
+static res_t lexlet_default(char c, std::vector<char> &tok);
+
+static res_t lexer_switch(lexlet_t lexlet, bool repeat = false) {
     return {
-        .ended = false,
-        .repeat = false,
         .new_parselet = lexlet,
+        ._repeat = repeat,
     };
 }
-static process_res_t lexer_repeat_switch(const lexlet_t *lexlet) {
-    return (process_res_t) {
-        .ended = false,
-        .repeat = true,
-        .new_parselet = lexlet,
+static res_t lexer_end(token_t::kind_t type, bool repeat = true) {
+    return {
+        .new_parselet = lexlet_default,
+        .type = type,
+        ._repeat = repeat
     };
 }
-static process_res_t lexer_end() {
-    return (process_res_t) {
-        .ended = true,
-        .repeat = true,
-        .new_parselet = nullptr,
-    };
-}
-static process_res_t lexer_none() {
-    return (process_res_t) {
-        .ended = false,
-        .repeat = false,
-        .new_parselet = nullptr,
-    };
+static res_t lexer_none() {
+    return { ._add = true };
 }
 
-static bool last_escape = false;
-static bool literal_ended = false;
-static char first_op;
-static int op_i = 0;
-static bool only_dot = false;
-static bool last_star = false;
+static res_t lexlet_identifier(char c, std::vector<char> &tok) {
+    if (is_operator(c) || isspace(c)) return lexer_end(token_t::IDENTIFIER);
+    else return lexer_none();
+};
+static res_t lexlet_hex(char c, std::vector<char> &tok) {
+    if (isxdigit(c)) return lexer_none();
+    else return lexer_end(token_t::HEX_LITERAL);
+};
+static res_t lexlet_bin(char c, std::vector<char> &tok) {
+    if (is_any(c, "01")) return lexer_none();
+    else if (isdigit(c)) throw message_t::error("A binary literal may only contain zeroes and ones.");
+    else return lexer_end(token_t::BIN_LITERAL);
+};
+static res_t lexlet_oct(char c, std::vector<char> &tok) {
+    if (isoct(c)) return lexer_none();
+    else if (isdigit(c)) throw message_t::error("An octal literal may only contain octal digits.");
+    else return lexer_end(token_t::OCT_LITERAL);
+};
+static res_t lexlet_float(char c, std::vector<char> &tok) {
+    if (isdigit(c)) return lexer_none();
+    else return lexer_end(token_t::FLOAT_LITERAL);
+};
+static res_t lexlet_dec(char c, std::vector<char> &tok) {
+    if (isdigit(c)) return lexer_none();
+    else if (c == '.') return lexer_switch(lexlet_float);
+    else return lexer_end(token_t::DEC_LITERAL);
+};
 
-const lexlet_t LEXLET_IDENTIFIER = (lexlet_t) {
-    .is_valid = [] (char curr) { return is_letter(curr) || curr == '_' || curr == '@' || curr == '$'; },
-    .process = [] (char curr) {
-        bool valid = (is_alphanumeric(curr) || curr == '_' || curr == '@' || curr == '$');
-        return (process_res_t) {
-            .ended = !valid,
-            .repeat = !valid,
-            .new_parselet = &LEXLET_IDENTIFIER,
+static res_t lexlet_zero(char c, std::vector<char> &tok) {
+    if (c == '.') return lexer_switch(lexlet_float);
+    else if (c == 'b') return lexer_switch(lexlet_bin);
+    else if (c == 'x') return lexer_switch(lexlet_hex);
+    else if (isdigit(c)) return lexer_switch(lexlet_oct, true);
+    else return lexer_end(token_t::DEC_LITERAL);
 };
-    },
-    .type = token_t::IDENTIFIER,
+static res_t lexlet_comment(char c, std::vector<char> &tok) {
+    tok.clear();
+    if (c == '\n') return lexer_switch(lexlet_default);
+    else return lexer_none().add(false);
 };
-const lexlet_t LEXLET_HEX = (lexlet_t) {
-    .process = [] (char curr) {
-        if (is_hex(curr)) return lexer_none();
-        else return lexer_end();
-    },
-    .type = token_t::HEX_LITERAL,
-};
-const lexlet_t LEXLET_BIN = (lexlet_t) {
-    .process = [] (char curr) {
-        if (curr == '0' || curr == '1') return lexer_none();
-        else if (is_digit(curr))
-            throw messages::message_t(messages::message_t::ERROR, "A binary literal may only contain zeroes and ones.", location_t::NONE);
-        else return lexer_end();
-    },
-    .type = token_t::BIN_LITERAL,
-};
-const lexlet_t LEXLET_OCT = (lexlet_t) {
-    .process = [] (char curr) {
-        if (is_oct(curr)) return lexer_none();
-        else if (is_digit(curr))
-            throw messages::message_t(messages::message_t::ERROR, "An octal literal may only contain octal digits.", location_t::NONE);
-        else return lexer_end();
-    },
-    .type = token_t::OCT_LITERAL,
-};
-const lexlet_t LEXLET_FLOAT = (lexlet_t) {
-    .is_valid = [] (char curr) { return only_dot = curr == '.'; },
-    .process = [] (char curr) {
-        if (is_digit(curr)) {
-            only_dot = false;
+static res_t lexlet_multicomment(char c, std::vector<char> &tok) {
+    if (c == '/' && tok.size() && tok.back() == '*') {
+        tok.clear();
+        return lexer_switch(lexlet_default);
+    }
+
     return lexer_none();
-        }
-        else return lexer_end();
-    },
-    .type = token_t::FLOAT_LITERAL,
 };
-const lexlet_t LEXLET_DEC = (lexlet_t) {
-    .is_valid = [] (char curr) { return is_digit(curr); },
-    .process = [] (char curr) {
-        if (is_digit(curr)) return lexer_none();
-        else if (curr == '.') return lexer_switch(&LEXLET_FLOAT);
-        else return lexer_end();
-    },
-    .type = token_t::DEC_LITERAL,
-};
-const lexlet_t LEXLET_ZERO = (lexlet_t) {
-    .is_valid = [] (char curr) { return curr == '0'; },
-    .process = [] (char curr) {
-        if (curr == '.') return lexer_switch(&LEXLET_FLOAT);
-        else if (curr == 'b') return lexer_switch(&LEXLET_BIN);
-        else if (curr == 'x') return lexer_switch(&LEXLET_HEX);
-        else if (is_digit(curr)) return lexer_repeat_switch(&LEXLET_OCT);
-        else return lexer_end();
-    },
-    .type = token_t::DEC_LITERAL,
-};
-const lexlet_t LEXLET_COMMENT = {
-    .process = [] (char curr) {
-        if (curr == '\n') return lexer_end();
-        else return (process_res_t) {
-            .ended = false,
-            .dont_add = true,
-        };
-    },
-    .type = token_t::NONE,
-};
-const lexlet_t LEXLET_MULTICOMMENT = {
-    .process = [] (char curr) {
-        if (curr == '/' && last_star) {
-            last_star = false;
-            return (process_res_t) {
-                .ended = true,
-            };
-        }
-        if (curr == '*') last_star = true;
+static res_t lexlet_operator(char c, std::vector<char> &tok) {
+    bool failed = false;
 
-        return (process_res_t) {
-            .dont_add = true,
-        };
-    },
-    .type = token_t::NONE,
-};
-const lexlet_t LEXLET_OPERATOR = (lexlet_t) {
-    .is_valid = [] (char curr) {
-        if (is_any(curr, "=!<>+-*/%&|^?:,.(){}[];")) {
-            first_op = curr;
-            op_i = 1;
-            return true;
-        }
-        else return false;
-    },
-    .process = [] (char curr) {
-        bool failed = true;
-        if (first_op == curr && op_i == 1 && is_any(curr, ":+-&|?<>")) failed = false;
-        if (curr == '=') {
+    if (tok.size() > 0) {
+        failed = true;
+        char first_op = tok[0];
+        size_t op_i = tok.size();
+
+        if (first_op == c && op_i == 1 && is_any(c, ":+-&|?<>")) failed = false;
+        if (c == '=') {
            if (op_i == 1 && is_any(first_op, "<>=!+-/*%")) failed = false;
            if (op_i == 2 && is_any(first_op, "<>?")) failed = false;
        }
-        if (first_op == '-' && curr == '>' && op_i == 1) failed = false;
+        if (first_op == '-' && c == '>' && op_i == 1) failed = false;
 
        if (first_op == '/' && op_i == 1) {
-            if (curr == '/') return lexer_switch(&LEXLET_COMMENT);
-            else if (curr == '*') return lexer_switch(&LEXLET_MULTICOMMENT);
+            if (c == '/') return lexer_switch(lexlet_comment);
+            else if (c == '*') return lexer_switch(lexlet_multicomment);
        }
+    }
 
-        op_i++;
 
-        if (failed) return lexer_end();
+    if (failed) return lexer_end(token_t::OPERATOR);
    else return lexer_none();
-    },
-    .type = token_t::OPERATOR,
 };
-const lexlet_t LEXLET_STRING_LITERAL = (lexlet_t) {
-    .is_valid = [] (char curr) {
-        last_escape = false;
-        literal_ended = false;
-        return curr == '"';
-    },
-    .process = [] (char curr) {
-        if (last_escape) {
-            last_escape = false;
-            return lexer_none();
-        }
-
-        if (curr == '\\') {
-            last_escape = true;
-        }
-        else if (curr == '"') {
-            literal_ended = true;
-        }
-        else if (literal_ended) return lexer_end();
-        return lexer_none();
-    },
-    .type = token_t::STRING_LITERAL,
+static res_t lexlet_string(char c, std::vector<char> &tok) {
+    if (c == '"' && tok.back() != '\\') return lexer_end(token_t::STRING_LITERAL, true);
+    else return lexer_none();
 };
-const lexlet_t LEXLET_CHAR_LITERAL = (lexlet_t) {
-    .is_valid = [] (char curr) {
-        last_escape = false;
-        literal_ended = false;
-        return curr == '\'';
-    },
-    .process = [] (char curr) {
-        if (last_escape) {
-            last_escape = false;
-            return lexer_none();
-        }
-
-        if (curr == '\\') {
-            last_escape = true;
-        }
-        else if (curr == '\'') {
-            literal_ended = true;
-        }
-        else if (literal_ended) return lexer_end();
-        return lexer_none();
-    },
-    .type = token_t::CHAR_LITERAL,
-};
-const lexlet_t LEXLET_DEFAULT = (lexlet_t) {
-    .process = [] (char curr) {
-        if (LEXLET_STRING_LITERAL.is_valid(curr)) return lexer_switch(&LEXLET_STRING_LITERAL);
-        if (LEXLET_CHAR_LITERAL.is_valid(curr)) return lexer_switch(&LEXLET_CHAR_LITERAL);
-        if (LEXLET_OPERATOR.is_valid(curr)) return lexer_switch(&LEXLET_OPERATOR);
-        if (LEXLET_ZERO.is_valid(curr)) return lexer_switch(&LEXLET_ZERO);
-        if (LEXLET_DEC.is_valid(curr)) return lexer_switch(&LEXLET_DEC);
-        if (LEXLET_FLOAT.is_valid(curr)) return lexer_switch(&LEXLET_FLOAT);
-        if (LEXLET_IDENTIFIER.is_valid(curr)) return lexer_switch(&LEXLET_IDENTIFIER);
-        else return (process_res_t) {
-            .ended = true,
-            .repeat = false,
-            .new_parselet = nullptr,
-        };
-    },
-    .type = token_t::NONE,
+static res_t lexlet_char(char c, std::vector<char> &tok) {
+    if (c == '"' && tok.back() != '\\') return lexer_end(token_t::CHAR_LITERAL, true);
+    else return lexer_none();
 };
 
-std::vector<token_t> token_t::parse_many(ppc::messages::msg_stack_t &msg_stack, const std::string &filename, const std::string &src) {
+static res_t lexlet_default(char c, std::vector<char> &tok) {
+    tok.push_back(c);
+    if (c == '"') return lexer_switch(lexlet_string);
+    if (c == '\'') return lexer_switch(lexlet_char);
+    if (c == '0') return lexer_switch(lexlet_zero);
+    if (c == '.') return lexer_switch(lexlet_float);
+    if (is_operator(c)) return lexer_switch(lexlet_operator);
+    if (isdigit(c)) return lexer_switch(lexlet_dec);
+    if (isspace(c)) {
+        tok.clear();
+        return lexer_none().add(false);
+    }
+    return lexer_switch(lexlet_identifier);
+};
+
+std::vector<token_t> token_t::parse_many(ppc::messages::msg_stack_t &msg_stack, const std::string &filename, const std::string &_src) {
+    auto src = _src + '\n';
     std::vector<token_t> tokens;
     std::vector<char> curr_token;
-    lexlet_t curr = LEXLET_DEFAULT;
-    std::size_t start = 0, line = 0, curr_start = 0, curr_line = 0, length = 0, i = 0;
+    lexlet_t curr = lexlet_default;
+    std::size_t start = 0, line = 0, curr_start = 0, curr_line = 0, i = 0;
 
-    while (src[i]) {
+    while (i < src.size()) {
         char c = src[i];
         try {
-            process_res_t res = curr.process(c);
-            if (i == 0) res.repeat = false;
-            if (res.has_message) throw res.msg;
+            res_t res = curr(c, curr_token);
+            if (i == 0) res._repeat = false;
 
-            if (res.ended) {
-                if (curr.type) {
-                    location_t loc = { filename, line, start, i - length, length };
-                    tokens.push_back({ curr.type, { curr_token.begin(), curr_token.end() }, loc });
+            if (res._add) {
+                curr_token.push_back(c);
             }
+
+            if (res.type) {
+                size_t len = curr_token.size();
+                location_t loc(filename, line, start, i - len, len);
+                tokens.push_back({ res.type, { curr_token.begin(), curr_token.end() }, loc });
                 curr_token.clear();
-                length = 0;
-                curr = LEXLET_DEFAULT;
             }
-            else {
             if (res.new_parselet) {
-                if (!curr.type) {
+                if (curr == lexlet_default && res.new_parselet != lexlet_default) {
                     start = curr_start;
                     line = curr_line;
                 }
-                curr = *res.new_parselet;
-                }
-                if (!res.dont_add) {
-                    curr_token.push_back(c);
-                    length++;
-                }
+                curr = res.new_parselet;
             }
 
-            if (!res.repeat) {
-                curr_start++;
-                if (c == '\n') {
-                    curr_line++;
-                    curr_start = 0;
-                }
+            if (!res._repeat) {
                 i++;
+                curr_start++;
+                if (i == src.size()) break;
+                if (c == '\n') {
+                    curr_start = 0;
+                    curr_line++;
+                }
             }
         }
         catch (const messages::message_t &msg) {
-            throw messages::message_t(msg.level, msg.content, location_t(filename, line, start, i - length, length));
+            throw message_t(msg.level, msg.content, location_t(filename, line, start, i - curr_token.size(), curr_token.size()));
         }
     }
 
-    location_t loc = { filename, line, start, i - length, length };
-    if (curr.type) {
-        tokens.push_back({
-            curr.type, std::string { curr_token.begin(), curr_token.end() },
-            { filename, line, start, i - length, length }
-        });
-    }
+    curr_start--;
+
+    if (curr_token.size()) curr_token.pop_back();
+
+    if (curr == lexlet_string)
+        throw message_t::error("Unclosed string literal.", location_t(filename, line, start, i - curr_token.size(), curr_token.size()));
+    if (curr == lexlet_char)
+        throw message_t::error("Unclosed char literal.", location_t(filename, line, start, i - curr_token.size(), curr_token.size()));
+    if (curr != lexlet_default) throw message_t::error("Unexpected end.", location_t(filename, curr_line, curr_start, i, 1));
 
     return tokens;
 }
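The rewrite drops the table of lexlet structs (separate is_valid/process function pointers plus global escape/operator state) in favour of plain functions of type res_t (*)(char, std::vector<char> &): each lexlet inspects the next character and the pending token buffer, then answers with lexer_none() (keep accumulating), lexer_switch(other) (hand off to another lexlet), or lexer_end(kind) (emit a token and fall back to lexlet_default, optionally reprocessing the same character). Below is a minimal, self-contained sketch of that dispatch shape; the names mirror the diff, but it is not the repository's code, and the simplified res_t fields and driver loop are assumptions for illustration only.

// Toy model of the rewritten lexlet protocol (illustration only, not ppc's code).
#include <cctype>
#include <iostream>
#include <string>
#include <vector>

struct res_t;
using lexlet_t = res_t (*)(char c, std::vector<char> &tok);

struct res_t {
    lexlet_t next = nullptr;   // switch to this lexlet (nullptr = stay on the current one)
    bool emit = false;         // a token is finished, flush the buffer
    bool repeat = false;       // reprocess the same character with the new lexlet
};

static res_t lexlet_default(char c, std::vector<char> &tok);

// Identifier lexlet: keep accumulating until a non-identifier character shows up,
// then emit and hand the character back to lexlet_default.
static res_t lexlet_identifier(char c, std::vector<char> &tok) {
    if (isalnum((unsigned char)c) || c == '_') { tok.push_back(c); return {}; }
    return { lexlet_default, true, true };
}

static res_t lexlet_default(char c, std::vector<char> &tok) {
    if (isalpha((unsigned char)c) || c == '_') { tok.push_back(c); return { lexlet_identifier }; }
    return {};  // this toy skips everything that is not an identifier
}

int main() {
    std::string src = "foo bar_baz ";
    std::vector<char> tok;
    lexlet_t curr = lexlet_default;
    for (std::size_t i = 0; i < src.size();) {
        res_t r = curr(src[i], tok);
        if (r.emit) { std::cout << std::string(tok.begin(), tok.end()) << '\n'; tok.clear(); }
        if (r.next) curr = r.next;
        if (!r.repeat) i++;  // repeat=true means the same character goes through the next lexlet
    }
}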