#include #include "lexer.h" #include #include using namespace camellya; TEST_CASE("UTF-8 string support", "[lexer][utf8]") { std::string source = "string s = \"你好, world\";"; Lexer lexer(source); auto tokens = lexer.tokenize(); // Expected tokens: // 1. string (keyword) // 2. s (identifier) // 3. = (equal) // 4. "你好, world" (string literal) // 5. ; (semicolon) // 6. EOF REQUIRE(tokens.size() == 6); REQUIRE(tokens[0].type == TokenType::STRING); REQUIRE(tokens[1].type == TokenType::IDENTIFIER); REQUIRE(tokens[1].lexeme == "s"); REQUIRE(tokens[3].type == TokenType::STRING_LITERAL); // Check value auto literal = std::get(tokens[3].literal); REQUIRE(literal == "你好, world"); REQUIRE(tokens[3].line == 1); REQUIRE(tokens[3].column == 12); REQUIRE(tokens[4].type == TokenType::SEMICOLON); REQUIRE(tokens[4].column == 23); } TEST_CASE("UTF-8 identifier support", "[lexer][utf8]") { std::string source = "var 变量 = 10;"; Lexer lexer(source); auto tokens = lexer.tokenize(); REQUIRE(tokens.size() == 6); REQUIRE(tokens[1].type == TokenType::IDENTIFIER); REQUIRE(tokens[1].lexeme == "变量"); REQUIRE(tokens[1].column == 5); REQUIRE(tokens[2].type == TokenType::EQUAL); // "var " (4) + "变量" (2) + " " (1) = 7. "=" should be at column 8. REQUIRE(tokens[2].column == 8); } TEST_CASE("Unicode escape sequence support", "[lexer][utf8]") { std::string source = "string s = \"\\u4e2d\\u6587\";"; // "中文" Lexer lexer(source); auto tokens = lexer.tokenize(); REQUIRE(tokens.size() == 6); REQUIRE(tokens[3].type == TokenType::STRING_LITERAL); auto literal = std::get(tokens[3].literal); REQUIRE(literal == "中文"); }