Support utf-8

2026-01-19 21:39:41 +08:00
parent 4247a59146
commit 57def6137b
6 changed files with 129 additions and 32 deletions
--- a/src/lexer.cpp
+++ b/src/lexer.cpp
@@ -1,6 +1,7 @@
 #include "lexer.h"
 #include <cctype>
 #include <unordered_map>
+#include <cstdint>

 namespace camellya {

@@ -17,8 +18,11 @@ std::vector<Token> Lexer::tokenize() {
 }

 char Lexer::advance() {
-    column_++;
-    return source_[current_++];
+    char c = source_[current_++];  // consume exactly one byte; the cursor always moves forward
+    if ((c & 0xC0) != 0x80) {  // 10xxxxxx is a UTF-8 continuation byte: same character, so no new column
+        column_++;
+    }
+    return c;
 }

 char Lexer::peek() const {
@@ -75,7 +79,8 @@ void Lexer::scan_token() {
    if (is_at_end()) return;
    
    start_ = current_;
-    int start_column = column_;
+    start_line_ = line_;  // snapshot the token's starting position before any of its bytes are consumed;
+    start_column_ = column_;  // add_token() reports these instead of deriving the column from the text length
    char c = advance();
    
    switch (c) {
@@ -124,7 +129,7 @@ void Lexer::scan_token() {
        default:
-            if (std::isdigit(c)) {
+            if (std::isdigit(static_cast<unsigned char>(c))) {  // <cctype> classifiers are UB for negative char, so go through unsigned char
                scan_number();
-            } else if (std::isalpha(c) || c == '_') {
+            } else if (std::isalpha(static_cast<unsigned char>(c)) || c == '_' || static_cast<unsigned char>(c) >= 0x80) {  // bytes >= 0x80 belong to a UTF-8 sequence and may start an identifier
                scan_identifier();
            } else {
                add_token(TokenType::INVALID);
@@ -135,12 +140,12 @@ void Lexer::scan_token() {

 void Lexer::add_token(TokenType type) {
    std::string text = source_.substr(start_, current_ - start_);
-    tokens_.emplace_back(type, text, line_, column_ - static_cast<int>(text.length()));
+    tokens_.emplace_back(type, text, start_line_, start_column_);  // position captured by scan_token() before the token's first byte was consumed
 }

 void Lexer::add_token(TokenType type, std::variant<std::monostate, double, std::string> literal) {
    std::string text = source_.substr(start_, current_ - start_);
-    tokens_.emplace_back(type, text, literal, line_, column_ - static_cast<int>(text.length()));
+    tokens_.emplace_back(type, text, literal, start_line_, start_column_);  // position captured by scan_token() before the token's first byte was consumed
 }

 void Lexer::scan_string() {
@@ -160,6 +165,32 @@ void Lexer::scan_string() {
                case 'r': value += '\r'; break;
                case '\\': value += '\\'; break;
                case '"': value += '"'; break;
+                case 'u': {
+                    // \uXXXX escape: exactly four hex digits naming a BMP code point,
+                    // appended to the string value as 1-3 bytes of UTF-8.
+                    std::string hex;
+                    while (hex.length() < 4 && !is_at_end() &&
+                           std::isxdigit(static_cast<unsigned char>(peek()))) {
+                        hex += advance();
+                    }
+                    const uint32_t codepoint =
+                        hex.length() == 4 ? static_cast<uint32_t>(std::stoul(hex, nullptr, 16)) : 0;
+                    if (hex.length() != 4 || (codepoint >= 0xD800 && codepoint <= 0xDFFF)) {
+                        // Too few digits, or a UTF-16 surrogate (which UTF-8 must not
+                        // encode): keep the escape text verbatim instead of bad bytes.
+                        value += "\\u" + hex;
+                    } else if (codepoint <= 0x7F) {
+                        value += static_cast<char>(codepoint);
+                    } else if (codepoint <= 0x7FF) {
+                        value += static_cast<char>(0xC0 | (codepoint >> 6));
+                        value += static_cast<char>(0x80 | (codepoint & 0x3F));
+                    } else {
+                        value += static_cast<char>(0xE0 | (codepoint >> 12));
+                        value += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
+                        value += static_cast<char>(0x80 | (codepoint & 0x3F));
+                    }
+                    break;
+                }
                default: value += escaped; break;
            }
        } else {
@@ -194,7 +225,7 @@ void Lexer::scan_number() {
 }

 void Lexer::scan_identifier() {
-    while (std::isalnum(peek()) || peek() == '_') {
+    while (std::isalnum(static_cast<unsigned char>(peek())) || peek() == '_' || static_cast<unsigned char>(peek()) >= 0x80) {  // cast first: <cctype> classifiers are UB for negative char (every UTF-8 byte >= 0x80)
        advance();
    }
    
--- a/src/lexer.h
+++ b/src/lexer.h
@@ -61,6 +61,8 @@ private:
    size_t current_ = 0;
    int line_ = 1;
    int column_ = 1;
+    int start_line_ = 1;  // value of line_ when the token currently being scanned began
+    int start_column_ = 1;  // value of column_ when the token currently being scanned began
    std::vector<Token> tokens_;
    
    bool is_at_end() const { return current_ >= source_.length(); }