#include "lexer.h"

#include <ctype.h>
#include <string.h>

// Upper bound on how many characters a single identifier or string literal
// is scanned for before the scan loop gives up.
enum { MAX_LEXEME_LEN = 65535 };

// Cursor over a NUL-terminated source buffer.
typedef struct {
    const char* src;      // NUL-terminated source text
    const char* filename; // file name stored alongside the cursor
    int pos;              // byte offset of the current character
    int line;             // 1-based line of the current character
    int col;              // 1-based column of the current character
} Lexer;

// <ctype.h> classifiers have undefined behaviour for negative arguments, and
// plain `char` may be signed, so every call goes through an unsigned char cast.
static int is_digit_char(char c) { return isdigit((unsigned char)c); }
static int is_ident_start(char c) { return isalpha((unsigned char)c) || c == '_'; }
static int is_ident_part(char c) { return isalnum((unsigned char)c) || c == '_'; }

// Returns the current character without consuming it ('\0' at end of input).
static char peek(const Lexer* l) {
    return l->src[l->pos];
}

// Returns the character after the current one. Must only be called when the
// current character is not '\0', otherwise it reads past the terminator.
static char peek_next(const Lexer* l) {
    return l->src[l->pos + 1];
}

// Consumes one character, keeping line/column bookkeeping in sync.
static void advance(Lexer* l) {
    if (l->src[l->pos] == '\n') {
        l->line++;
        l->col = 1;
    } else {
        l->col++;
    }
    l->pos++;
}

// Skips blanks, newlines, `//` line comments and `/* ... */` block comments.
// An unterminated block comment silently runs to the end of input.
static void skip_whitespace(Lexer* l) {
    for (;;) {
        char c = peek(l);
        if (c == ' ' || c == '\t' || c == '\r' || c == '\n') {
            advance(l);
            continue;
        }
        if (c == '/' && peek_next(l) == '/') {
            while (peek(l) != '\n' && peek(l) != '\0') {
                advance(l);
            }
            continue;
        }
        if (c == '/' && peek_next(l) == '*') {
            advance(l);
            advance(l);
            while (peek(l) != '\0' && !(peek(l) == '*' && peek_next(l) == '/')) {
                advance(l);
            }
            if (peek(l) != '\0') {
                // Skip the closing "*/".
                advance(l);
                advance(l);
            }
            continue;
        }
        break;
    }
}

// Builds a token whose text is src[start_pos .. start_pos + len) and whose
// reported position is (line, col). The position is passed explicitly so that
// every token kind records where it STARTS, no matter how far the cursor has
// moved by the time the token is built.
static Token make_token(const Lexer* l, TokenKind kind, int start_pos, int len,
                        int line, int col) {
    Token t = {.kind = kind, .start = l->src + start_pos, .length = len,
               .line = line, .col = col};
    return t;
}

// Lexes an integer or float literal starting at the current digit.
// A '.' that is not followed by another '.' turns the literal into a float,
// even when no fraction digits follow it.
static Token lex_number(Lexer* l) {
    int start = l->pos;
    int line = l->line;
    int col = l->col;
    TokenKind kind = TOK_INT_LIT;

    while (is_digit_char(peek(l))) {
        advance(l);
    }
    if (peek(l) == '.' && peek_next(l) != '.') {
        kind = TOK_FLOAT_LIT;
        advance(l);
        while (is_digit_char(peek(l))) {
            advance(l);
        }
    }
    return make_token(l, kind, start, l->pos - start, line, col);
}

// Maps an identifier token to its keyword kind, or TOK_IDENT if its text is
// not a reserved word.
static TokenKind check_keyword(const Token* tok) {
#define KW(s, k) {s, sizeof(s) - 1, k}
    static const struct {
        const char* text;
        size_t len;
        TokenKind kind;
    } keywords[] = {
        KW("fn", TOK_FN),         KW("let", TOK_LET),       KW("var", TOK_VAR),
        KW("if", TOK_IF),         KW("else", TOK_ELSE),     KW("guard", TOK_GUARD),
        KW("while", TOK_WHILE),   KW("for", TOK_FOR),       KW("in", TOK_IN),
        KW("to", TOK_TO),         KW("return", TOK_RETURN), KW("i32", TOK_I32),
        KW("i64", TOK_I64),       KW("u64", TOK_U64),       KW("f64", TOK_F64),
        KW("bool", TOK_BOOL),     KW("char", TOK_CHAR),     KW("str", TOK_STR),
        KW("void", TOK_VOID),     KW("struct", TOK_STRUCT), KW("type", TOK_TYPE),
        KW("enum", TOK_ENUM),     KW("extend", TOK_EXTEND), KW("defer", TOK_DEFER),
        KW("match", TOK_MATCH),   KW("pub", TOK_PUB),       KW("mod", TOK_MOD),
        KW("use", TOK_USE),       KW("trait", TOK_TRAIT),   KW("Self", TOK_SELF),
        KW("_", TOK_UNDERSCORE),  KW("true", TOK_TRUE),     KW("false", TOK_FALSE),
    };
#undef KW

    for (size_t i = 0; i < sizeof keywords / sizeof keywords[0]; i++) {
        if ((size_t)tok->length == keywords[i].len &&
            memcmp(tok->start, keywords[i].text, keywords[i].len) == 0) {
            return keywords[i].kind;
        }
    }
    return TOK_IDENT;
}

// Lexes an identifier ([A-Za-z0-9_]+ starting at the current character) and
// reclassifies it as a keyword when its text matches one.
static Token lex_ident_or_keyword(Lexer* l) {
    int start = l->pos;
    int line = l->line;
    int col = l->col;

    while (is_ident_part(peek(l))) {
        if (l->pos - start > MAX_LEXEME_LEN) {
            break; // identifier length limit
        }
        advance(l);
    }
    Token t = make_token(l, TOK_IDENT, start, l->pos - start, line, col);
    t.kind = check_keyword(&t);
    return t;
}

// Lexes a character literal; the cursor must be on the opening quote.
// The token text excludes the quotes and is either one character or a
// backslash followed by one character. Its position is that of the opening
// quote. Returns 1 on success, 0 if the literal is not properly closed.
static int lex_char_literal(Lexer* l, Token* out) {
    int line = l->line;
    int col = l->col;

    advance(l); // opening '
    int body = l->pos;
    if (peek(l) == '\\') {
        advance(l); // escape introducer: \n \t \\ \'
    }
    if (peek(l) == '\0') {
        // Input ended inside the literal. Advancing here would step past the
        // NUL terminator and make the next peek read out of bounds.
        return 0;
    }
    advance(l); // the character itself
    if (peek(l) != '\'') {
        return 0;
    }
    int len = l->pos - body;
    advance(l); // closing '
    *out = make_token(l, TOK_CHAR_LIT, body, len, line, col);
    return 1;
}

// Lexes a string literal; the cursor must be on the opening quote.
// The token text excludes the quotes; its position is that of the opening
// quote. Strings may not span lines. Returns 1 on success, 0 if no closing
// quote is found on the same line or within the length limit.
// NOTE(review): backslash escapes are not interpreted here, so `\"` ends the
// string, unlike char literals where `\'` is accepted. Confirm this is intended.
static int lex_string_literal(Lexer* l, Token* out) {
    int line = l->line;
    int col = l->col;

    advance(l); // opening "
    int body = l->pos;
    while (peek(l) != '"' && peek(l) != '\0' && peek(l) != '\n') {
        if (l->pos - body > MAX_LEXEME_LEN) {
            break; // string length limit
        }
        advance(l);
    }
    int len = l->pos - body;
    if (peek(l) != '"') {
        return 0;
    }
    advance(l); // closing "
    *out = make_token(l, TOK_STR_LIT, body, len, line, col);
    return 1;
}

// One operator/punctuation spelling. `second == '\0'` marks a one-character
// token; otherwise both characters must match.
typedef struct {
    char first;
    char second;
    TokenKind kind;
} PunctRule;

// Ordered so that two-character spellings are tried before the one-character
// spelling that shares their first character.
static const PunctRule punct_rules[] = {
    {'+', '=', TOK_PLUS_EQ},     {'+', '\0', TOK_PLUS},
    {'-', '=', TOK_MINUS_EQ},    {'-', '>', TOK_ARROW},       {'-', '\0', TOK_MINUS},
    {'*', '=', TOK_STAR_EQ},     {'*', '\0', TOK_STAR},
    {'/', '=', TOK_SLASH_EQ},    {'/', '\0', TOK_SLASH},
    {'%', '\0', TOK_PERCENT},
    {'=', '=', TOK_EQ_EQ},       {'=', '>', TOK_MATCH_ARROW}, {'=', '\0', TOK_ASSIGN},
    {'!', '=', TOK_BANG_EQ},     {'!', '\0', TOK_BANG},
    {'<', '=', TOK_LT_EQ},       {'<', '\0', TOK_LT},
    {'>', '=', TOK_GT_EQ},       {'>', '\0', TOK_GT},
    {'&', '&', TOK_AND_AND},
    {'|', '>', TOK_PIPE},        {'|', '|', TOK_PIPE_PIPE},
    {'.', '\0', TOK_DOT},
    {'[', '\0', TOK_LBRACKET},   {']', '\0', TOK_RBRACKET},
    {'(', '\0', TOK_LPAREN},     {')', '\0', TOK_RPAREN},
    {'{', '\0', TOK_LBRACE},     {'}', '\0', TOK_RBRACE},
    {',', '\0', TOK_COMMA},
    {':', ':', TOK_COLON_COLON}, {':', '\0', TOK_COLON},
    {';', '\0', TOK_SEMICOLON},
};

// Finds the first rule matching the current character `c` and the one after
// it, `next`. Returns NULL when `c` does not start any known operator.
static const PunctRule* find_punct(char c, char next) {
    for (size_t i = 0; i < sizeof punct_rules / sizeof punct_rules[0]; i++) {
        const PunctRule* rule = &punct_rules[i];
        if (rule->first == c && (rule->second == '\0' || rule->second == next)) {
            return rule;
        }
    }
    return NULL;
}

// Tokenizes `source` into an array allocated from arena `a`.
//
// Parameters:
//   a        - arena the token array is allocated from.
//   source   - NUL-terminated source text. Tokens point into this buffer, so
//              it must stay alive for as long as the tokens are used.
//   filename - stored in *error when a lexical error is reported.
//   count    - out: number of tokens produced, including the trailing
//              TOK_EOF token; 0 on any failure.
//   error    - out: filled in when a lexical error is found. It is not
//              written when the allocation itself fails.
//
// Returns the token array (always terminated by a TOK_EOF token), or NULL on
// allocation failure or lexical error. Every token's line/col is the position
// of its first character.
Token* lex(Arena* a, const char* source, const char* filename, size_t* count, ErrorInfo* error) {
    *count = 0; // stays 0 on every failure path

    Lexer l = {.src = source, .filename = filename, .pos = 0, .line = 1, .col = 1};

    // Capacity bound: in the worst case every character is its own token
    // (e.g. "(){}+-"), so src_len plus a little slack for EOF is enough.
    size_t src_len = strlen(source);
    size_t cap = src_len + 16;
    Token* tokens = arena_alloc(a, cap * sizeof *tokens);
    if (!tokens) {
        return NULL;
    }

    size_t idx = 0;
    for (;;) {
        skip_whitespace(&l);
        char c = peek(&l);
        if (c == '\0') {
            break;
        }
        if (idx >= cap) {
            return NULL; // defensive; unreachable given the capacity bound
        }

        // Position of the token's first character, used for diagnostics.
        int line = l.line;
        int col = l.col;

        if (is_digit_char(c)) {
            tokens[idx++] = lex_number(&l);
        } else if (c == '\'') {
            Token t;
            if (!lex_char_literal(&l, &t)) {
                *error = (ErrorInfo){.message = "未闭合的字符字面量", .filename = filename,
                                     .line = line, .col = col};
                return NULL;
            }
            tokens[idx++] = t;
        } else if (c == '"') {
            Token t;
            if (!lex_string_literal(&l, &t)) {
                *error = (ErrorInfo){.message = "未闭合的字符串", .filename = filename,
                                     .line = line, .col = col};
                return NULL;
            }
            tokens[idx++] = t;
        } else if (is_ident_start(c)) {
            tokens[idx++] = lex_ident_or_keyword(&l);
        } else {
            // c is not '\0' here, so looking one character ahead is in bounds.
            const PunctRule* rule = find_punct(c, peek_next(&l));
            if (!rule) {
                *error = (ErrorInfo){.message = "无法识别的字符", .filename = filename,
                                     .line = line, .col = col};
                return NULL;
            }
            int len = (rule->second != '\0') ? 2 : 1;
            tokens[idx++] = make_token(&l, rule->kind, l.pos, len, line, col);
            for (int i = 0; i < len; i++) {
                advance(&l);
            }
        }
    }

    tokens[idx++] = make_token(&l, TOK_EOF, l.pos, 0, l.line, l.col);
    *count = idx;
    return tokens;
}