fix&refactor: clean up lex and fix some bugs with npos

This commit is contained in:
TopchetoEU 2022-10-04 19:35:19 +03:00
parent a0ff612dd2
commit 96232c88c9

View File

@ -6,7 +6,6 @@ using namespace ppc;
using namespace comp::tree::lex; using namespace comp::tree::lex;
struct lexlet_t { struct lexlet_t {
bool(*is_valid)(char curr);
struct process_res_t { struct process_res_t {
bool ended; bool ended;
bool repeat; bool repeat;
@ -15,26 +14,14 @@ struct lexlet_t {
bool has_message; bool has_message;
messages::message_t msg; messages::message_t msg;
}; };
bool(*is_valid)(char curr);
process_res_t (*process)(char curr); process_res_t (*process)(char curr);
token_t::kind_t type; token_t::kind_t type;
}; };
using process_res_t = lexlet_t::process_res_t; using process_res_t = lexlet_t::process_res_t;
extern const lexlet_t LEXLET_DEFAULT;
extern const lexlet_t LEXLET_IDENTIFIER;
extern const lexlet_t LEXLET_OPERATOR;
extern const lexlet_t LEXLET_ZERO;
extern const lexlet_t LEXLET_FLOAT;
extern const lexlet_t LEXLET_BIN;
extern const lexlet_t LEXLET_OCT;
extern const lexlet_t LEXLET_DEC;
extern const lexlet_t LEXLET_HEX;
extern const lexlet_t LEXLET_STRING_LITERAL;
extern const lexlet_t LEXLET_CHAR_LITERAL;
extern const lexlet_t LEXLET_COMMENT;
extern const lexlet_t LEXLET_MULTICOMMENT;
static bool is_digit(char c) { static bool is_digit(char c) {
return c >= '0' && c <= '9'; return c >= '0' && c <= '9';
} }
@ -57,7 +44,8 @@ static bool is_alphanumeric(char c) {
return is_letter(c) || is_digit(c); return is_letter(c) || is_digit(c);
} }
static bool is_any(char c, std::string chars) { static bool is_any(char c, std::string chars) {
return chars.find(c) != -1u; auto res = chars.find(c) != std::string::npos;
return res;
} }
static process_res_t lexer_switch(const lexlet_t *lexlet) { static process_res_t lexer_switch(const lexlet_t *lexlet) {
@ -88,90 +76,118 @@ static process_res_t lexer_none() {
.new_parselet = nullptr, .new_parselet = nullptr,
}; };
} }
static process_res_t default_process(char curr) {
if (LEXLET_STRING_LITERAL.is_valid(curr)) return lexer_switch(&LEXLET_STRING_LITERAL);
if (LEXLET_CHAR_LITERAL.is_valid(curr)) return lexer_switch(&LEXLET_CHAR_LITERAL);
if (LEXLET_OPERATOR.is_valid(curr)) return lexer_switch(&LEXLET_OPERATOR);
if (LEXLET_ZERO.is_valid(curr)) return lexer_switch(&LEXLET_ZERO);
if (LEXLET_DEC.is_valid(curr)) return lexer_switch(&LEXLET_DEC);
if (LEXLET_FLOAT.is_valid(curr)) return lexer_switch(&LEXLET_FLOAT);
if (LEXLET_IDENTIFIER.is_valid(curr)) return lexer_switch(&LEXLET_IDENTIFIER);
else return (process_res_t) {
.ended = true,
.repeat = false,
.new_parselet = nullptr,
};
}
static bool identifier_is_valid(char curr) { static bool last_escape = false;
return is_letter(curr) || curr == '_' || curr == '@' || curr == '$'; static bool literal_ended = false;
} static char first_op;
static process_res_t identifier_process(char curr) { static int op_i = 0;
static bool only_dot = false;
static bool last_star = false;
const lexlet_t LEXLET_IDENTIFIER = (lexlet_t) {
.is_valid = [] (char curr) { return is_letter(curr) || curr == '_' || curr == '@' || curr == '$'; },
.process = [] (char curr) {
bool valid = (is_alphanumeric(curr) || curr == '_' || curr == '@' || curr == '$'); bool valid = (is_alphanumeric(curr) || curr == '_' || curr == '@' || curr == '$');
return (process_res_t) { return (process_res_t) {
.ended = !valid, .ended = !valid,
.repeat = !valid, .repeat = !valid,
.new_parselet = &LEXLET_IDENTIFIER, .new_parselet = &LEXLET_IDENTIFIER,
}; };
} },
.type = token_t::IDENTIFIER,
static bool last_escape = false; };
static bool literal_ended = false; const lexlet_t LEXLET_HEX = (lexlet_t) {
.process = [] (char curr) {
static bool string_is_valid(char curr) { if (is_hex(curr)) return lexer_none();
last_escape = false; else return lexer_end();
literal_ended = false; },
return curr == '"'; .type = token_t::HEX_LITERAL,
} };
static process_res_t string_process(char curr) { const lexlet_t LEXLET_BIN = (lexlet_t) {
if (last_escape) { .process = [] (char curr) {
last_escape = false; if (curr == '0' || curr == '1') return lexer_none();
else if (is_digit(curr))
throw messages::message_t(messages::message_t::ERROR, "A binary literal may only contain zeroes and ones.", location_t::NONE);
else return lexer_end();
},
.type = token_t::BIN_LITERAL,
};
const lexlet_t LEXLET_OCT = (lexlet_t) {
.process = [] (char curr) {
if (is_oct(curr)) return lexer_none();
else if (is_digit(curr))
throw messages::message_t(messages::message_t::ERROR, "An octal literal may only contain octal digits.", location_t::NONE);
else return lexer_end();
},
.type = token_t::OCT_LITERAL,
};
const lexlet_t LEXLET_FLOAT = (lexlet_t) {
.is_valid = [] (char curr) { return only_dot = curr == '.'; },
.process = [] (char curr) {
if (is_digit(curr)) {
only_dot = false;
return lexer_none(); return lexer_none();
} }
else return lexer_end();
if (curr == '\\') { },
last_escape = true; .type = token_t::FLOAT_LITERAL,
};
const lexlet_t LEXLET_DEC = (lexlet_t) {
.is_valid = [] (char curr) { return is_digit(curr); },
.process = [] (char curr) {
if (is_digit(curr)) return lexer_none();
else if (curr == '.') return lexer_switch(&LEXLET_FLOAT);
else return lexer_end();
},
.type = token_t::DEC_LITERAL,
};
const lexlet_t LEXLET_ZERO = (lexlet_t) {
.is_valid = [] (char curr) { return curr == '0'; },
.process = [] (char curr) {
if (curr == '.') return lexer_switch(&LEXLET_FLOAT);
else if (curr == 'b') return lexer_switch(&LEXLET_BIN);
else if (curr == 'x') return lexer_switch(&LEXLET_HEX);
else if (is_digit(curr)) return lexer_repeat_switch(&LEXLET_OCT);
else return lexer_end();
},
.type = token_t::DEC_LITERAL,
};
const lexlet_t LEXLET_COMMENT = {
.process = [] (char curr) {
if (curr == '\n') return lexer_end();
else return (process_res_t) {
.ended = false,
.dont_add = true,
};
},
.type = token_t::NONE,
};
const lexlet_t LEXLET_MULTICOMMENT = {
.process = [] (char curr) {
if (curr == '/' && last_star) {
last_star = false;
return (process_res_t) {
.ended = true,
};
} }
else if (curr == '"') { if (curr == '*') last_star = true;
literal_ended = true;
}
else if (literal_ended) return lexer_end();
return lexer_none();
}
static bool char_is_valid(char curr) { return (process_res_t) {
last_escape = false; .dont_add = true,
literal_ended = false; };
return curr == '\''; },
} .type = token_t::NONE,
static process_res_t char_process(char curr) { };
if (last_escape) { const lexlet_t LEXLET_OPERATOR = (lexlet_t) {
last_escape = false; .is_valid = [] (char curr) {
return lexer_none();
}
if (curr == '\\') {
last_escape = true;
}
else if (curr == '\'') {
literal_ended = true;
}
else if (literal_ended) return lexer_end();
return lexer_none();
}
static char first_op;
static int op_i = 0;
static bool operator_is_valid(char curr) {
if (is_any(curr, "=!<>+-*/%&|^?:,.(){}[];")) { if (is_any(curr, "=!<>+-*/%&|^?:,.(){}[];")) {
first_op = curr; first_op = curr;
op_i = 1; op_i = 1;
return true; return true;
} }
else return false; else return false;
} },
static process_res_t operator_process(char curr) { .process = [] (char curr) {
bool failed = true; bool failed = true;
if (first_op == curr && op_i == 1 && is_any(curr, "+-&|?<>")) failed = false; if (first_op == curr && op_i == 1 && is_any(curr, "+-&|?<>")) failed = false;
if (curr == '=') { if (curr == '=') {
@ -189,144 +205,70 @@ static process_res_t operator_process(char curr) {
if (failed) return lexer_end(); if (failed) return lexer_end();
else return lexer_none(); else return lexer_none();
} },
.type = token_t::OPERATOR,
static bool zero_is_valid(char curr) { };
return curr == '0'; const lexlet_t LEXLET_STRING_LITERAL = (lexlet_t) {
} .is_valid = [] (char curr) {
static process_res_t zero_process(char curr) { last_escape = false;
if (curr == '.') return lexer_switch(&LEXLET_FLOAT); literal_ended = false;
else if (curr == 'b') return lexer_switch(&LEXLET_BIN); return curr == '"';
else if (curr == 'x') return lexer_switch(&LEXLET_HEX); },
else if (is_digit(curr)) return lexer_repeat_switch(&LEXLET_OCT); .process = [] (char curr) {
else return lexer_end(); if (last_escape) {
} last_escape = false;
static bool dec_is_valid(char curr) {
return is_digit(curr);
}
static process_res_t dec_process(char curr) {
if (is_digit(curr)) return lexer_none();
else if (curr == '.') return lexer_switch(&LEXLET_FLOAT);
else return lexer_end();
}
static bool only_dot = false;
static bool float_is_valid(char curr) {
return only_dot = curr == '.';
}
static process_res_t float_process(char curr) {
if (is_digit(curr)) {
only_dot = false;
return lexer_none(); return lexer_none();
} }
else return lexer_end();
}
static process_res_t hex_process(char curr) { if (curr == '\\') {
if (is_hex(curr)) return lexer_none(); last_escape = true;
else return lexer_end(); }
} else if (curr == '"') {
static process_res_t bin_process(char curr) { literal_ended = true;
if (curr == '0' || curr == '1') return lexer_none(); }
else if (is_digit(curr)) else if (literal_ended) return lexer_end();
throw messages::message_t(messages::message_t::ERROR, "A binary literal may only contain zeroes and ones.", location_t::NONE); return lexer_none();
else return lexer_end(); },
} .type = token_t::STRING_LITERAL,
static process_res_t oct_process(char curr) { };
if (is_oct(curr)) return lexer_none(); const lexlet_t LEXLET_CHAR_LITERAL = (lexlet_t) {
else if (is_digit(curr)) .is_valid = [] (char curr) {
throw messages::message_t(messages::message_t::ERROR, "An octal literal may only contain octal digits.", location_t::NONE); last_escape = false;
else return lexer_end(); literal_ended = false;
} return curr == '\'';
},
.process = [] (char curr) {
if (last_escape) {
last_escape = false;
return lexer_none();
}
static process_res_t comment_process(char curr) { if (curr == '\\') {
if (curr == '\n') return lexer_end(); last_escape = true;
}
else if (curr == '\'') {
literal_ended = true;
}
else if (literal_ended) return lexer_end();
return lexer_none();
},
.type = token_t::CHAR_LITERAL,
};
const lexlet_t LEXLET_DEFAULT = (lexlet_t) {
.process = [] (char curr) {
if (LEXLET_STRING_LITERAL.is_valid(curr)) return lexer_switch(&LEXLET_STRING_LITERAL);
if (LEXLET_CHAR_LITERAL.is_valid(curr)) return lexer_switch(&LEXLET_CHAR_LITERAL);
if (LEXLET_OPERATOR.is_valid(curr)) return lexer_switch(&LEXLET_OPERATOR);
if (LEXLET_ZERO.is_valid(curr)) return lexer_switch(&LEXLET_ZERO);
if (LEXLET_DEC.is_valid(curr)) return lexer_switch(&LEXLET_DEC);
if (LEXLET_FLOAT.is_valid(curr)) return lexer_switch(&LEXLET_FLOAT);
if (LEXLET_IDENTIFIER.is_valid(curr)) return lexer_switch(&LEXLET_IDENTIFIER);
else return (process_res_t) { else return (process_res_t) {
.ended = false,
.dont_add = true,
};
}
static bool last_star = false;
static process_res_t multicomment_process(char curr) {
if (curr == '/' && last_star) {
last_star = false;
return {
.ended = true, .ended = true,
.repeat = false, .repeat = false,
.new_parselet = nullptr, .new_parselet = nullptr,
}; };
} },
if (curr == '*') last_star = true;
return {
.ended = false,
.dont_add = true,
};
}
const lexlet_t LEXLET_DEFAULT = (lexlet_t) {
.process = default_process,
.type = token_t::NONE,
};
const lexlet_t LEXLET_IDENTIFIER = (lexlet_t) {
.is_valid = identifier_is_valid,
.process = identifier_process,
.type = token_t::IDENTIFIER,
};
const lexlet_t LEXLET_ZERO = (lexlet_t) {
.is_valid = zero_is_valid,
.process = zero_process,
.type = token_t::DEC_LITERAL,
};
const lexlet_t LEXLET_DEC = (lexlet_t) {
.is_valid = dec_is_valid,
.process = dec_process,
.type = token_t::DEC_LITERAL,
};
const lexlet_t LEXLET_HEX = (lexlet_t) {
.process = hex_process,
.type = token_t::HEX_LITERAL,
};
const lexlet_t LEXLET_BIN = (lexlet_t) {
.process = bin_process,
.type = token_t::BIN_LITERAL,
};
const lexlet_t LEXLET_OCT = (lexlet_t) {
.process = oct_process,
.type = token_t::OCT_LITERAL,
};
const lexlet_t LEXLET_FLOAT = (lexlet_t) {
.is_valid = float_is_valid,
.process = float_process,
.type = token_t::FLOAT_LITERAL,
};
const lexlet_t LEXLET_OPERATOR = (lexlet_t) {
.is_valid = operator_is_valid,
.process = operator_process,
.type = token_t::OPERATOR,
};
const lexlet_t LEXLET_STRING_LITERAL = (lexlet_t) {
.is_valid = string_is_valid,
.process = string_process,
.type = token_t::STRING_LITERAL,
};
const lexlet_t LEXLET_CHAR_LITERAL = (lexlet_t) {
.is_valid = char_is_valid,
.process = char_process,
.type = token_t::CHAR_LITERAL,
};
const lexlet_t LEXLET_COMMENT = {
.is_valid = nullptr,
.process = comment_process,
.type = token_t::NONE,
};
const lexlet_t LEXLET_MULTICOMMENT = {
.is_valid = nullptr,
.process = multicomment_process,
.type = token_t::NONE, .type = token_t::NONE,
}; };
@ -347,11 +289,10 @@ std::vector<token_t> token_t::parse_many(ppc::messages::msg_stack_t &msg_stack,
if (curr.type) { if (curr.type) {
location_t loc = { filename, line, start, i - length, length }; location_t loc = { filename, line, start, i - length, length };
tokens.push_back({ curr.type, { curr_token.begin(), curr_token.end() }, loc }); tokens.push_back({ curr.type, { curr_token.begin(), curr_token.end() }, loc });
curr_token.clear();
} }
curr_token.clear();
length = 0; length = 0;
curr = LEXLET_DEFAULT; curr = LEXLET_DEFAULT;
} }
else { else {