#include "lexer.h"
|
|
#include <cctype>
|
|
#include <unordered_map>
|
|
|
|
namespace camellya {

Lexer::Lexer(std::string source) : source_(std::move(source)) {}

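// Scan the whole source, appending tokens to tokens_; the stream is always
// terminated with a single END_OF_FILE token.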
std::vector<Token> Lexer::tokenize() {
    while (!is_at_end()) {
        start_ = current_;
        scan_token();
    }

    tokens_.emplace_back(TokenType::END_OF_FILE, "", line_, column_);
    return tokens_;
}

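// Consume the current character, advancing both the cursor and the column.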
char Lexer::advance() {
    column_++;
    return source_[current_++];
}

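// Look at the current character without consuming it; '\0' at end of input.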
char Lexer::peek() const {
    if (is_at_end()) return '\0';
    return source_[current_];
}

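// Look one character past the current one; '\0' if that runs off the end.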
char Lexer::peek_next() const {
    if (current_ + 1 >= source_.length()) return '\0';
    return source_[current_ + 1];
}

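// Consume the current character only if it equals `expected`; used for
// two-character operators such as "->" and "==".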
bool Lexer::match(char expected) {
    if (is_at_end()) return false;
    if (source_[current_] != expected) return false;

    current_++;
    column_++;
    return true;
}

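// Skip spaces, tabs, and carriage returns, tracking line/column across newlines.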
void Lexer::skip_whitespace() {
    while (!is_at_end()) {
        char c = peek();
        switch (c) {
            case ' ':
            case '\r':
            case '\t':
                advance();
                break;
            case '\n':
                // The column is reset before the newline is consumed;
                // advance() then counts the newline itself as column 1.
                line_++;
                column_ = 0;
                advance();
                break;
            default:
                return;
        }
    }
}

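// Skip a "//" line comment up to (but not including) the terminating newline.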
void Lexer::skip_comment() {
    // scan_token() has already consumed the first '/', so peek() is the
    // second '/' of a "//" comment; discard everything up to the newline.
    if (peek() == '/') {
        while (peek() != '\n' && !is_at_end()) {
            advance();
        }
    }
}

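// Scan one token starting at the current position and append it to tokens_.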
void Lexer::scan_token() {
    skip_whitespace();

    if (is_at_end()) return;

    start_ = current_;
    char c = advance();

    switch (c) {
        case '(': add_token(TokenType::LEFT_PAREN); break;
        case ')': add_token(TokenType::RIGHT_PAREN); break;
        case '{': add_token(TokenType::LEFT_BRACE); break;
        case '}': add_token(TokenType::RIGHT_BRACE); break;
        case '[': add_token(TokenType::LEFT_BRACKET); break;
        case ']': add_token(TokenType::RIGHT_BRACKET); break;
        case ',': add_token(TokenType::COMMA); break;
        case '.': add_token(TokenType::DOT); break;
        case ';': add_token(TokenType::SEMICOLON); break;
        case ':': add_token(TokenType::COLON); break;
        case '+': add_token(TokenType::PLUS); break;
        case '*': add_token(TokenType::STAR); break;
        case '%': add_token(TokenType::PERCENT); break;
        case '-':
            if (match('>')) {
                add_token(TokenType::ARROW);
            } else {
                add_token(TokenType::MINUS);
            }
            break;
        case '!':
            add_token(match('=') ? TokenType::BANG_EQUAL : TokenType::BANG);
            break;
        case '=':
            add_token(match('=') ? TokenType::EQUAL_EQUAL : TokenType::EQUAL);
            break;
        case '<':
            add_token(match('=') ? TokenType::LESS_EQUAL : TokenType::LESS);
            break;
        case '>':
            add_token(match('=') ? TokenType::GREATER_EQUAL : TokenType::GREATER);
            break;
        case '/':
            if (peek() == '/') {
                skip_comment();
            } else {
                add_token(TokenType::SLASH);
            }
            break;
        case '"':
            scan_string();
            break;
        default:
            // Cast to unsigned char before calling <cctype> functions:
            // passing a negative char is undefined behavior.
            if (std::isdigit(static_cast<unsigned char>(c))) {
                scan_number();
            } else if (std::isalpha(static_cast<unsigned char>(c)) || c == '_') {
                scan_identifier();
            } else {
                add_token(TokenType::INVALID);
            }
            break;
    }
}

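// Append a token whose lexeme is source_[start_, current_).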
void Lexer::add_token(TokenType type) {
    std::string text = source_.substr(start_, current_ - start_);
    // Back up over the lexeme to recover its start column; this assumes the
    // token does not span multiple lines.
    tokens_.emplace_back(type, text, line_, column_ - static_cast<int>(text.length()));
}

void Lexer::add_token(TokenType type, std::variant<std::monostate, double, std::string> literal) {
    std::string text = source_.substr(start_, current_ - start_);
    tokens_.emplace_back(type, text, std::move(literal), line_, column_ - static_cast<int>(text.length()));
}

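// Scan a double-quoted string literal, handling \n, \t, \r, \\ and \" escapes.
// An unterminated string produces an INVALID token.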
void Lexer::scan_string() {
    std::string value;

    while (peek() != '"' && !is_at_end()) {
        if (peek() == '\n') {
            line_++;
            column_ = 0;
        }
        if (peek() == '\\' && peek_next() != '\0') {
            advance(); // consume the backslash
            char escaped = advance();
            switch (escaped) {
                case 'n': value += '\n'; break;
                case 't': value += '\t'; break;
                case 'r': value += '\r'; break;
                case '\\': value += '\\'; break;
                case '"': value += '"'; break;
                default: value += escaped; break; // unknown escapes kept verbatim
            }
        } else {
            value += advance();
        }
    }

    if (is_at_end()) {
        // Unterminated string: emit the partial lexeme as INVALID.
        add_token(TokenType::INVALID);
        return;
    }

    advance(); // consume the closing '"'
    add_token(TokenType::STRING_LITERAL, value);
}

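// Scan an integer or decimal literal; the value is stored as a double.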
void Lexer::scan_number() {
    while (std::isdigit(static_cast<unsigned char>(peek()))) {
        advance();
    }

    // Accept a fractional part only if the '.' is followed by a digit, so
    // "1.foo" lexes as NUMBER_LITERAL, DOT, IDENTIFIER.
    if (peek() == '.' && std::isdigit(static_cast<unsigned char>(peek_next()))) {
        advance(); // consume the '.'
        while (std::isdigit(static_cast<unsigned char>(peek()))) {
            advance();
        }
    }

    std::string text = source_.substr(start_, current_ - start_);
    double value = std::stod(text);
    add_token(TokenType::NUMBER_LITERAL, value);
}

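// Scan an identifier ([A-Za-z_][A-Za-z0-9_]*) and classify it as a keyword
// where applicable.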
void Lexer::scan_identifier() {
    while (std::isalnum(static_cast<unsigned char>(peek())) || peek() == '_') {
        advance();
    }

    std::string text = source_.substr(start_, current_ - start_);
    TokenType type = get_keyword_type(text);
    add_token(type);
}

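// Map reserved words to their keyword token types; anything else is an
// IDENTIFIER.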
TokenType Lexer::get_keyword_type(const std::string& text) const {
    static const std::unordered_map<std::string, TokenType> keywords = {
        {"class", TokenType::CLASS},
        {"func", TokenType::FUNC},
        {"number", TokenType::NUMBER},
        {"string", TokenType::STRING},
        {"bool", TokenType::BOOL},
        {"list", TokenType::LIST},
        {"map", TokenType::MAP},
        {"if", TokenType::IF},
        {"else", TokenType::ELSE},
        {"while", TokenType::WHILE},
        {"for", TokenType::FOR},
        {"return", TokenType::RETURN},
        {"var", TokenType::VAR},
        {"true", TokenType::TRUE},
        {"false", TokenType::FALSE},
        {"nil", TokenType::NIL},
        {"and", TokenType::AND},
        {"or", TokenType::OR},
        {"this", TokenType::THIS},
    };

    auto it = keywords.find(text);
    if (it != keywords.end()) {
        return it->second;
    }
    return TokenType::IDENTIFIER;
}

} // namespace camellya
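// Usage sketch (a minimal example, assuming lexer.h declares Token and
// TokenType as used above):
//
//   camellya::Lexer lexer("var x = 1.5 + y; // trailing comment");
//   std::vector<camellya::Token> tokens = lexer.tokenize();
//   // -> VAR, IDENTIFIER "x", EQUAL, NUMBER_LITERAL 1.5, PLUS,
//   //    IDENTIFIER "y", SEMICOLON, END_OF_FILE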