feat: L Language v0.1 编译器完整实现

5 阶段编译流水线: 词法分析 → 语法分析(Pratt) → 语义分析(类型推断) → LLVM IR → .exe

模块:
- lexer: 手写状态机, 40 种 Token, // 和 /* */ 注释
- parser: Pratt 表达式解析(9 级优先级) + 递归下降语句/函数
- ast: 14 种节点类型 + 工厂函数
- sema: 作用域链符号表 + 类型推断 + 类型检查
- codegen: AST → LLVM-C API, print_i64/f64/bool 内建
- driver: 命令行 + 流水线串联 + 错误报告
- util: Arena bump allocator (8MB)

测试: 65 单元测试(词法41+语法15+语义9) + 5 集成测试全部通过

语言特性: i64/f64/bool/void, let不可变变量, if/else, while, 递归函数
2026-06-05 00:26:59 +08:00
commit 3b7bab1e1b
40 changed files with 5804 additions and 0 deletions
@@ -0,0 +1,127 @@
+#include "lexer.h"
+#include <ctype.h>
+#include <string.h>
+
+typedef struct {  // Cursor state for one lexing pass over a NUL-terminated source string.
+    const char* src;  // source text being scanned
+    const char* filename;  // file name given to lex(); stored here but not read by the helpers in this file
+    int         pos;  // index of the current character within src
+    int         line;  // line of the current character; starts at 1, bumped by advance() on '\n'
+    int         col;  // column of the current character; starts at 1, reset to 1 after '\n'
+} Lexer;
+
+// Returns the character under the cursor without consuming it.
+// At the end of the NUL-terminated source this is '\0'.
+static char peek(const Lexer* l) {
+    const char* cursor = l->src + l->pos;
+    return *cursor;
+}
+// Returns the character one past the cursor without consuming anything.
+// Yields '\0' when the cursor already sits on the terminator, so the read
+// can never go beyond the end of the source buffer.
+static char peek_next(const Lexer* l) {
+    if (l->src[l->pos] == '\0') return '\0';
+    return l->src[l->pos + 1];
+}
+// Consumes one character and keeps the line/column counters in sync:
+// a newline starts a new line at column 1, anything else moves one column right.
+static void advance(Lexer* l) {
+    const char consumed = l->src[l->pos];
+    l->pos += 1;
+    if (consumed != '\n') {
+        l->col += 1;
+        return;
+    }
+    l->line += 1;
+    l->col = 1;
+}
+// Moves the cursor past any run of blanks, line comments (// ...) and
+// block comments (/* ... */), stopping at the first character that starts a token
+// or at the end of input.
+// NOTE(review): a block comment that is never closed is consumed up to the end of
+// input without any diagnostic.
+static void skip_whitespace(Lexer* l) {
+    for (;;) {
+        const char cur = peek(l);
+        if (cur == ' ' || cur == '\t' || cur == '\r' || cur == '\n') {
+            advance(l);
+        } else if (cur == '/' && peek_next(l) == '/') {
+            // Line comment: stop in front of the newline; it is consumed as a blank on the next pass.
+            for (char ch = peek(l); ch != '\n' && ch != '\0'; ch = peek(l)) {
+                advance(l);
+            }
+        } else if (cur == '/' && peek_next(l) == '*') {
+            // Step over the opening "/*".
+            advance(l);
+            advance(l);
+            while (peek(l) != '\0') {
+                if (peek(l) == '*' && peek_next(l) == '/') {
+                    // Step over the closing "*/".
+                    advance(l);
+                    advance(l);
+                    break;
+                }
+                advance(l);
+            }
+        } else {
+            return;
+        }
+    }
+}
+
+// Builds a token of `kind` covering `len` characters starting at src[start_pos].
+//
+// The token is stamped with the position of its FIRST character. Callers invoke this
+// either before consuming the token (operators) or after consuming it (numbers,
+// identifiers), so the column is rewound by however many characters lie between
+// start_pos and the cursor. The tokens produced in this file never contain a newline,
+// which is what makes the column rewind valid and lets the line be taken as-is.
+static Token make_token(Lexer* l, TokenKind kind, int start_pos, int len) {
+    const int consumed = l->pos - start_pos;
+    Token t = {.kind = kind, .start = l->src + start_pos,
+               .length = len, .line = l->line, .col = l->col - consumed};
+    return t;
+}
+
+// Scans a numeric literal starting at the cursor: a run of digits, optionally
+// followed by '.' and another (possibly empty) run of digits.
+// Returns TOK_INT_LIT, or TOK_FLOAT_LIT when a '.' was consumed.
+static Token lex_number(Lexer* l) {
+    int start = l->pos;
+    TokenKind kind = TOK_INT_LIT;
+    // <ctype.h> classifiers require a value representable as unsigned char (or EOF);
+    // passing a negative plain char (e.g. a UTF-8 byte) is undefined behavior.
+    while (isdigit((unsigned char)peek(l))) advance(l);
+    if (peek(l) == '.') {
+        kind = TOK_FLOAT_LIT;
+        advance(l);
+        while (isdigit((unsigned char)peek(l))) advance(l);
+    }
+    return make_token(l, kind, start, l->pos - start);
+}
+
+// Maps the text of an identifier-shaped token to its keyword kind.
+// Returns TOK_IDENT when the text is not one of the reserved words.
+static TokenKind check_keyword(const Token* tok) {
+    static const struct {
+        const char* text;
+        TokenKind   kind;
+    } keywords[] = {
+        {"fn",     TOK_FN},     {"let",    TOK_LET},
+        {"if",     TOK_IF},     {"else",   TOK_ELSE},
+        {"while",  TOK_WHILE},  {"return", TOK_RETURN},
+        {"i64",    TOK_I64},    {"f64",    TOK_F64},
+        {"bool",   TOK_BOOL},   {"void",   TOK_VOID},
+        {"true",   TOK_TRUE},   {"false",  TOK_FALSE},
+    };
+    const size_t total = sizeof keywords / sizeof keywords[0];
+
+    for (size_t i = 0; i < total; i++) {
+        const size_t n = strlen(keywords[i].text);
+        // The token text is not NUL-terminated, so compare length first, then bytes.
+        if ((size_t)tok->length == n && memcmp(tok->start, keywords[i].text, n) == 0) {
+            return keywords[i].kind;
+        }
+    }
+    return TOK_IDENT;
+}
+
+// Scans a run of letters, digits and underscores starting at the cursor and
+// returns it as a keyword token when it matches a reserved word, otherwise TOK_IDENT.
+static Token lex_ident_or_keyword(Lexer* l) {
+    int start = l->pos;
+    // Cast before calling <ctype.h>: a negative plain char (e.g. a UTF-8 byte right
+    // after the identifier) would otherwise be undefined behavior.
+    while (isalnum((unsigned char)peek(l)) || peek(l) == '_') advance(l);
+    Token t = make_token(l, TOK_IDENT, start, l->pos - start);
+    t.kind = check_keyword(&t);
+    return t;
+}
+
+// Classifies the operator or punctuation at the cursor (which must not be at end of input).
+// On a match stores the token kind in *kind and returns its length in characters (1 or 2);
+// returns 0 when the character does not start any known operator.
+static int match_punct(const Lexer* l, TokenKind* kind) {
+    const char next = peek_next(l);
+    switch (peek(l)) {
+    case '+':
+        // '+' directly followed by '=' is rejected (existing behavior, preserved).
+        if (next == '=') return 0;
+        *kind = TOK_PLUS; return 1;
+    case '-':
+        if (next == '>') { *kind = TOK_ARROW; return 2; }
+        *kind = TOK_MINUS; return 1;
+    case '*': *kind = TOK_STAR;    return 1;
+    case '/': *kind = TOK_SLASH;   return 1;
+    case '%': *kind = TOK_PERCENT; return 1;
+    case '=':
+        if (next == '=') { *kind = TOK_EQ_EQ; return 2; }
+        *kind = TOK_ASSIGN; return 1;
+    case '!':
+        if (next == '=') { *kind = TOK_BANG_EQ; return 2; }
+        *kind = TOK_BANG; return 1;
+    case '<':
+        if (next == '=') { *kind = TOK_LT_EQ; return 2; }
+        *kind = TOK_LT; return 1;
+    case '>':
+        if (next == '=') { *kind = TOK_GT_EQ; return 2; }
+        *kind = TOK_GT; return 1;
+    case '&':
+        // A single '&' is not a token.
+        if (next == '&') { *kind = TOK_AND_AND; return 2; }
+        return 0;
+    case '|':
+        // A single '|' is not a token.
+        if (next == '|') { *kind = TOK_PIPE_PIPE; return 2; }
+        return 0;
+    case '(': *kind = TOK_LPAREN;    return 1;
+    case ')': *kind = TOK_RPAREN;    return 1;
+    case '{': *kind = TOK_LBRACE;    return 1;
+    case '}': *kind = TOK_RBRACE;    return 1;
+    case ',': *kind = TOK_COMMA;     return 1;
+    case ':': *kind = TOK_COLON;     return 1;
+    case ';': *kind = TOK_SEMICOLON; return 1;
+    default:  return 0;
+    }
+}
+
+// Appends `tok` to the arena-backed array, moving it to a block twice as large when full.
+// Returns 1 on success, 0 when the arena cannot provide the larger block.
+static int push_token(Arena* a, Token** tokens, size_t* cap, size_t* idx, Token tok) {
+    if (*idx == *cap) {
+        const size_t new_cap = *cap * 2;
+        Token* grown = arena_alloc(a, new_cap * sizeof(Token));
+        if (!grown) return 0;
+        memcpy(grown, *tokens, *idx * sizeof(Token));
+        *tokens = grown;
+        *cap = new_cap;
+    }
+    (*tokens)[(*idx)++] = tok;
+    return 1;
+}
+
+// Tokenizes the NUL-terminated `source`.
+//
+// Returns an array of tokens allocated from arena `a`, terminated by a TOK_EOF token,
+// and stores the number of tokens (including the EOF token) in *count.
+// On failure (an unrecognized character, or the arena running out of memory) fills
+// *error, sets *count to 0 and returns NULL.
+// Token `start` pointers refer into `source`, so `source` must outlive the tokens.
+Token* lex(Arena* a, const char* source, const char* filename,
+           size_t* count, ErrorInfo* error) {
+    Lexer l = {.src = source, .filename = filename, .pos = 0, .line = 1, .col = 1};
+    *count = 0;
+
+    // Initial capacity guess: a third of the source length. Dense input (e.g. a long
+    // run of single-character tokens) can exceed it, so push_token() grows on demand
+    // instead of writing past the end of the array.
+    size_t cap = strlen(source) / 3 + 16;
+    Token* tokens = arena_alloc(a, cap * sizeof(Token));
+    if (!tokens) {
+        *error = (ErrorInfo){
+            .message = "内存不足",
+            .filename = filename, .line = l.line, .col = l.col
+        };
+        return NULL;
+    }
+    size_t idx = 0;
+
+    for (;;) {
+        skip_whitespace(&l);
+        const char c = peek(&l);
+        if (c == '\0') break;
+
+        // Remember where this token starts for error reporting.
+        const int line = l.line, col = l.col;
+        Token tok;
+
+        // Cast before calling <ctype.h>: negative plain chars are undefined behavior there.
+        if (isdigit((unsigned char)c)) {
+            tok = lex_number(&l);
+        } else if (isalpha((unsigned char)c) || c == '_') {
+            tok = lex_ident_or_keyword(&l);
+        } else {
+            TokenKind kind;
+            const int len = match_punct(&l, &kind);
+            if (len == 0) {
+                *error = (ErrorInfo){
+                    .message = "无法识别的字符",
+                    .filename = filename, .line = line, .col = col
+                };
+                return NULL;
+            }
+            tok = make_token(&l, kind, l.pos, len);
+            for (int i = 0; i < len; i++) advance(&l);
+        }
+
+        if (!push_token(a, &tokens, &cap, &idx, tok)) {
+            *error = (ErrorInfo){
+                .message = "内存不足",
+                .filename = filename, .line = line, .col = col
+            };
+            return NULL;
+        }
+    }
+
+    if (!push_token(a, &tokens, &cap, &idx, make_token(&l, TOK_EOF, l.pos, 0))) {
+        *error = (ErrorInfo){
+            .message = "内存不足",
+            .filename = filename, .line = l.line, .col = l.col
+        };
+        return NULL;
+    }
+    *count = idx;
+    return tokens;
+}
@@ -0,0 +1,13 @@
+#ifndef LEXER_H
+#define LEXER_H
+
+#include "token.h"
+#include "arena.h"
+#include "error.h"
+
+// Returns the token array (allocated in the arena); *count receives the number of tokens.
+// On error, `error` is filled in and NULL is returned.
+Token* lex(Arena* a, const char* source, const char* filename,
+           size_t* count, ErrorInfo* error);
+
+#endif
@@ -0,0 +1,49 @@
+#include "token.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <inttypes.h>
+
+static const char* NAMES[] = {  // Display text for each TokenKind, indexed by the enum value; read by tok_name().
+    [TOK_FN] = "fn", [TOK_LET] = "let", [TOK_IF] = "if",
+    [TOK_ELSE] = "else", [TOK_WHILE] = "while", [TOK_RETURN] = "return",
+    [TOK_I64] = "i64", [TOK_F64] = "f64", [TOK_BOOL] = "bool", [TOK_VOID] = "void",
+    [TOK_INT_LIT] = "整数", [TOK_FLOAT_LIT] = "浮点数",
+    [TOK_TRUE] = "true", [TOK_FALSE] = "false",
+    [TOK_IDENT] = "标识符",
+    [TOK_PLUS] = "+", [TOK_MINUS] = "-", [TOK_STAR] = "*",
+    [TOK_SLASH] = "/", [TOK_PERCENT] = "%",
+    [TOK_EQ_EQ] = "==", [TOK_BANG_EQ] = "!=",
+    [TOK_LT] = "<", [TOK_GT] = ">", [TOK_LT_EQ] = "<=", [TOK_GT_EQ] = ">=",
+    [TOK_AND_AND] = "&&", [TOK_PIPE_PIPE] = "||", [TOK_BANG] = "!",
+    [TOK_ARROW] = "->",
+    [TOK_LPAREN] = "(", [TOK_RPAREN] = ")",
+    [TOK_LBRACE] = "{", [TOK_RBRACE] = "}",
+    [TOK_COMMA] = ",", [TOK_COLON] = ":", [TOK_SEMICOLON] = ";",
+    [TOK_ASSIGN] = "=",
+    [TOK_EOF] = "EOF", [TOK_ERROR] = "错误",
+};
+
+// Returns the display text for a token kind (e.g. "fn", "==", "EOF").
+// A kind outside the NAMES table, or one the table has no entry for, yields "未知"
+// instead of reading out of bounds or returning NULL.
+const char* tok_name(TokenKind kind) {
+    const size_t total = sizeof NAMES / sizeof NAMES[0];
+    // Casting to size_t also turns a negative value into an out-of-range index.
+    if ((size_t)kind >= total || NAMES[kind] == NULL) return "未知";
+    return NAMES[kind];
+}
+
+// Reports whether `kind` is one of the type keywords: i64, f64, bool or void.
+bool tok_is_type(TokenKind kind) {
+    switch (kind) {
+    case TOK_I64:
+    case TOK_F64:
+    case TOK_BOOL:
+    case TOK_VOID:
+        return true;
+    default:
+        return false;
+    }
+}
+
+// Parses the token text as a base-10 integer via strtoll.
+// The text is copied into a local NUL-terminated buffer first because the token
+// points into the source and is not terminated. Returns 0 when the token is empty
+// or too long to fit the buffer (32 bytes including the terminator).
+int64_t tok_int_value(const Token* tok) {
+    char text[32];
+    const int len = tok->length;
+    if (len <= 0 || len >= (int)sizeof text) {
+        return 0;
+    }
+    memcpy(text, tok->start, (size_t)len);
+    text[len] = '\0';
+    return strtoll(text, NULL, 10);
+}
+
+// Parses the token text as a floating-point number via strtod.
+// The text is copied into a local NUL-terminated buffer first because the token
+// points into the source and is not terminated. Returns 0.0 when the token is empty
+// or too long to fit the buffer (64 bytes including the terminator).
+double tok_float_value(const Token* tok) {
+    char text[64];
+    const int len = tok->length;
+    if (len <= 0 || len >= (int)sizeof text) {
+        return 0.0;
+    }
+    memcpy(text, tok->start, (size_t)len);
+    text[len] = '\0';
+    return strtod(text, NULL);
+}
@@ -0,0 +1,45 @@
+#ifndef TOKEN_H
+#define TOKEN_H
+
+#include "l_lang.h"
+
+// === Token kind enumeration ===
+typedef enum {
+    // Keywords
+    TOK_FN, TOK_LET, TOK_IF, TOK_ELSE, TOK_WHILE, TOK_RETURN,
+    // Type keywords
+    TOK_I64, TOK_F64, TOK_BOOL, TOK_VOID,
+    // Literals
+    TOK_INT_LIT, TOK_FLOAT_LIT, TOK_TRUE, TOK_FALSE,
+    // Identifier
+    TOK_IDENT,
+    // Operators
+    TOK_PLUS, TOK_MINUS, TOK_STAR, TOK_SLASH, TOK_PERCENT,
+    TOK_EQ_EQ, TOK_BANG_EQ, TOK_LT, TOK_GT, TOK_LT_EQ, TOK_GT_EQ,
+    TOK_AND_AND, TOK_PIPE_PIPE, TOK_BANG,
+    TOK_ARROW,
+    // Delimiters
+    TOK_LPAREN, TOK_RPAREN, TOK_LBRACE, TOK_RBRACE,
+    TOK_COMMA, TOK_COLON, TOK_SEMICOLON, TOK_ASSIGN,
+    // Special
+    TOK_EOF, TOK_ERROR,
+} TokenKind;
+
+// === Token structure ===
+struct Token {
+    TokenKind   kind;
+    const char* start;      // points at the token's first character in the source (not NUL-terminated)
+    int         length;     // length of the token text
+    int         line;
+    int         col;
+};
+
+// === Utility functions ===
+const char* tok_name(TokenKind kind);
+bool        tok_is_type(TokenKind kind);
+
+// Extract a value from a Token
+int64_t  tok_int_value(const Token* tok);
+double   tok_float_value(const Token* tok);
+
+#endif