#include "lexer.h"
|
|
#include <cctype>
|
|
#include <unordered_map>
|
|
|
|
namespace camellya {

Lexer::Lexer(std::string source) : source_(std::move(source)) {}

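// Scan the whole source, appending tokens to tokens_; the stream is always
// terminated with a single END_OF_FILE token.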
std::vector<Token> Lexer::tokenize() {
    while (!is_at_end()) {
        start_ = current_;
        scan_token();
    }

    tokens_.emplace_back(TokenType::END_OF_FILE, "", line_, column_);
    return tokens_;
}

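// Consume the current character, advancing both the cursor and the column.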
char Lexer::advance() {
    column_++;
    return source_[current_++];
}

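// Look at the current character without consuming it; '\0' at end of input.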
char Lexer::peek() const {
    if (is_at_end()) return '\0';
    return source_[current_];
}

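// Look one character past the current one; '\0' if that runs off the end.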
char Lexer::peek_next() const {
    if (current_ + 1 >= source_.length()) return '\0';
    return source_[current_ + 1];
}

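// Consume the current character only if it equals `expected`; used for
// two-character operators such as "->" and "==".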
bool Lexer::match(char expected) {
    if (is_at_end()) return false;
    if (source_[current_] != expected) return false;

    current_++;
    column_++;
    return true;
}

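// Skip spaces, tabs, and carriage returns, tracking line/column across newlines.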
void Lexer::skip_whitespace() {
    while (!is_at_end()) {
        char c = peek();
        switch (c) {
            case ' ':
            case '\r':
            case '\t':
                advance();
                break;
            case '\n':
                // The column is reset before the newline is consumed;
                // advance() then counts the newline itself as column 1.
                line_++;
                column_ = 0;
                advance();
                break;
            default:
                return;
        }
    }
}

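// Skip a "//" line comment up to (but not including) the terminating newline.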
void Lexer::skip_comment() {
    // scan_token() has already consumed the first '/', so peek() is the
    // second '/' of a "//" comment; discard everything up to the newline.
    if (peek() == '/') {
        while (peek() != '\n' && !is_at_end()) {
            advance();
        }
    }
}

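// Scan one token starting at the current position and append it to tokens_.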
void Lexer::scan_token() {
    skip_whitespace();

    if (is_at_end()) return;

    start_ = current_;
    char c = advance();

    switch (c) {
        case '(': add_token(TokenType::LEFT_PAREN); break;
        case ')': add_token(TokenType::RIGHT_PAREN); break;
        case '{': add_token(TokenType::LEFT_BRACE); break;
        case '}': add_token(TokenType::RIGHT_BRACE); break;
        case '[': add_token(TokenType::LEFT_BRACKET); break;
        case ']': add_token(TokenType::RIGHT_BRACKET); break;
        case ',': add_token(TokenType::COMMA); break;
        case '.': add_token(TokenType::DOT); break;
        case ';': add_token(TokenType::SEMICOLON); break;
        case ':': add_token(TokenType::COLON); break;
        case '+': add_token(TokenType::PLUS); break;
        case '*': add_token(TokenType::STAR); break;
        case '%': add_token(TokenType::PERCENT); break;
        case '-':
            if (match('>')) {
                add_token(TokenType::ARROW);
            } else {
                add_token(TokenType::MINUS);
            }
            break;
        case '!':
            add_token(match('=') ? TokenType::BANG_EQUAL : TokenType::BANG);
            break;
        case '=':
            add_token(match('=') ? TokenType::EQUAL_EQUAL : TokenType::EQUAL);
            break;
        case '<':
            add_token(match('=') ? TokenType::LESS_EQUAL : TokenType::LESS);
            break;
        case '>':
            add_token(match('=') ? TokenType::GREATER_EQUAL : TokenType::GREATER);
            break;
        case '/':
            if (peek() == '/') {
                skip_comment();
            } else {
                add_token(TokenType::SLASH);
            }
            break;
        case '"':
            scan_string();
            break;
        default:
            // Cast to unsigned char before calling <cctype> functions:
            // passing a negative char is undefined behavior.
            if (std::isdigit(static_cast<unsigned char>(c))) {
                scan_number();
            } else if (std::isalpha(static_cast<unsigned char>(c)) || c == '_') {
                scan_identifier();
            } else {
                add_token(TokenType::INVALID);
            }
            break;
    }
}

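// Append a token whose lexeme is source_[start_, current_).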
void Lexer::add_token(TokenType type) {
    std::string text = source_.substr(start_, current_ - start_);
    // Back up over the lexeme to recover its start column; this assumes the
    // token does not span multiple lines.
    tokens_.emplace_back(type, text, line_, column_ - static_cast<int>(text.length()));
}

void Lexer::add_token(TokenType type, std::variant<std::monostate, double, std::string> literal) {
    std::string text = source_.substr(start_, current_ - start_);
    tokens_.emplace_back(type, text, std::move(literal), line_, column_ - static_cast<int>(text.length()));
}

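// Scan a double-quoted string literal, handling \n, \t, \r, \\ and \" escapes.
// An unterminated string produces an INVALID token.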
void Lexer::scan_string() {
    std::string value;

    while (peek() != '"' && !is_at_end()) {
        if (peek() == '\n') {
            line_++;
            column_ = 0;
        }
        if (peek() == '\\' && peek_next() != '\0') {
            advance(); // consume the backslash
            char escaped = advance();
            switch (escaped) {
                case 'n': value += '\n'; break;
                case 't': value += '\t'; break;
                case 'r': value += '\r'; break;
                case '\\': value += '\\'; break;
                case '"': value += '"'; break;
                default: value += escaped; break; // unknown escapes kept verbatim
            }
        } else {
            value += advance();
        }
    }

    if (is_at_end()) {
        // Unterminated string: emit the partial lexeme as INVALID.
        add_token(TokenType::INVALID);
        return;
    }

    advance(); // consume the closing '"'
    add_token(TokenType::STRING_LITERAL, value);
}

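// Scan an integer or decimal literal; the value is stored as a double.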
void Lexer::scan_number() {
    while (std::isdigit(static_cast<unsigned char>(peek()))) {
        advance();
    }

    // Accept a fractional part only if the '.' is followed by a digit, so
    // "1.foo" lexes as NUMBER_LITERAL, DOT, IDENTIFIER.
    if (peek() == '.' && std::isdigit(static_cast<unsigned char>(peek_next()))) {
        advance(); // consume the '.'
        while (std::isdigit(static_cast<unsigned char>(peek()))) {
            advance();
        }
    }

    std::string text = source_.substr(start_, current_ - start_);
    double value = std::stod(text);
    add_token(TokenType::NUMBER_LITERAL, value);
}

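// Scan an identifier ([A-Za-z_][A-Za-z0-9_]*) and classify it as a keyword
// where applicable.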
void Lexer::scan_identifier() {
    while (std::isalnum(static_cast<unsigned char>(peek())) || peek() == '_') {
        advance();
    }

    std::string text = source_.substr(start_, current_ - start_);
    TokenType type = get_keyword_type(text);
    add_token(type);
}

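// Map reserved words to their keyword token types; anything else is an
// IDENTIFIER.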
TokenType Lexer::get_keyword_type(const std::string& text) const {
    static const std::unordered_map<std::string, TokenType> keywords = {
        {"class", TokenType::CLASS},
        {"func", TokenType::FUNC},
        {"number", TokenType::NUMBER},
        {"string", TokenType::STRING},
        {"bool", TokenType::BOOL},
        {"list", TokenType::LIST},
        {"map", TokenType::MAP},
        {"if", TokenType::IF},
        {"else", TokenType::ELSE},
        {"while", TokenType::WHILE},
        {"for", TokenType::FOR},
        {"return", TokenType::RETURN},
        {"var", TokenType::VAR},
        {"true", TokenType::TRUE},
        {"false", TokenType::FALSE},
        {"nil", TokenType::NIL},
        {"and", TokenType::AND},
        {"or", TokenType::OR},
        {"this", TokenType::THIS},
    };

    auto it = keywords.find(text);
    if (it != keywords.end()) {
        return it->second;
    }
    return TokenType::IDENTIFIER;
}

} // namespace camellya
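// Usage sketch (a minimal example, assuming lexer.h declares Token and
// TokenType as used above):
//
//   camellya::Lexer lexer("var x = 1.5 + y; // trailing comment");
//   std::vector<camellya::Token> tokens = lexer.tokenize();
//   // -> VAR, IDENTIFIER "x", EQUAL, NUMBER_LITERAL 1.5, PLUS,
//   //    IDENTIFIER "y", SEMICOLON, END_OF_FILE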