From 9e41b093188f3b68570b4166a219e282446d4468 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=88=98=E8=88=AA=E5=AE=87?= <3364451258@qq.com>
Date: Fri, 5 Jun 2026 02:36:23 +0800
Subject: [PATCH] =?UTF-8?q?fix:=20str+str=20=E8=BF=90=E8=A1=8C=E6=97=B6?=
 =?UTF-8?q?=E6=8B=BC=E6=8E=A5=20=E2=80=94=20malloc=20+=20strlen=20+=20memc?=
 =?UTF-8?q?py?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- codegen: 声明 CRT 的 malloc/strlen/memcpy
- str+str 拼接: strlen(l)+strlen(r)+1 → malloc → memcpy×2 → 返回指针
- 新增集成测试 08_str_concat.l ("Hello, " + "World!" → "Hello, World!")
- 修复自报告 §5-6 字符串拼接不工作的 bug
---
 ...tecture-analysis-report-2026-06-05-0100.md | 268 ++++++++++++++++++
 src/codegen/codegen.c                         |  62 +++-
 test/programs/08_str_concat.l                 |   7 +
 3 files changed, 335 insertions(+), 2 deletions(-)
 create mode 100644 docs/analysis/architecture-analysis-report-2026-06-05-0100.md
 create mode 100644 test/programs/08_str_concat.l

diff --git a/docs/analysis/architecture-analysis-report-2026-06-05-0100.md b/docs/analysis/architecture-analysis-report-2026-06-05-0100.md
new file mode 100644
index 0000000..00bcb9c
--- /dev/null
+++ b/docs/analysis/architecture-analysis-report-2026-06-05-0100.md
@@ -0,0 +1,268 @@
+# L Language 架构分析报告 (v0.1+)
+
+> 日期: 2026-06-05 01:00 | 自动生成 | 自上次报告后有 4 个提交
+> 上次报告: [docs/architecture-analysis-report-2026-06-05.md](../architecture-analysis-report-2026-06-05.md) (v0.1 基线 3b7bab1)
+
+---
+
+## 变更摘要 (自上次报告)
+
+- [x] P0: `let mut` + 赋值语句
+- [x] P0: 字符串类型 + 字面量 + `print_str`
+- [x] 架构: LLVM 目标初始化解耦 -> `target.h/c`
+- [x] 技术债: codegen `malloc` -> arena
+- [x] 技术债: 添加 `.codegraphignore`
+
+上次报告提出的 4 项 P0 建议已全部落地。本次报告反映 v0.1 + 4 commits 后的当前快照，新增 588 行、删除 53 行代码（净增约 535 行），2 个集成测试程序（06_mut_while.l, 07_hello_str.l）。
+
+| Commit | 功能 |
+|--------|------|
+| `f8c5e18` | 技术债修复：codegen malloc->arena + .codegraphignore |
+| `bd02a49` | `let mut` + 赋值语句 -- while 循环可修改变量 |
+| `9a53d97` | 字符串类型 + 字面量 + `print_str` |
+| `9ff2990` | LLVM 目标初始化解耦 -- 抽取 target.h/c |
+
+---
+
+## 1. 当前架构全景
+
+### 1.1 编译流水线
+
+```
+源码(.l) -> Lexer(词法) -> Parser(语法) -> Sema(语义) -> Codegen(LLVM IR) -> Target(obj) -> 链接(.exe)
+              Token[]        AstNode*        类型标注         LLVMModuleRef       .o 文件       GCC/ld
+```
+
+一条完整的手写单趟编译器流水线，不依赖任何词法/语法分析器生成器。Codegen 层与 Target 层已通过 `target.h/c` 解耦。
+
+### 1.2 模块清单与指标
+
+| 模块 | 文件 | 行数 | 关键数据结构 | 新增功能 |
+|------|------|------|-------------|---------|
+| lexer/ | lexer.c(131) token.c(44) + .h(49) | 224 | Token {kind, start, length, line, col} | TOK_MUT, TOK_STR, TOK_STR_LIT, TOK_ASSIGN |
+| ast/ | ast.c(107) ast.h(93) | 200 | AstNode {kind, type, as{union}} | AST_ASSIGN_STMT, is_mut, str_val |
+| parser/ | parser.c(314) parser.h(10) | 324 | Parser {tokens, pos, filename, arena} | let mut 解析, 赋值语句, 字符串字面量 |
+| sema/ | sema.c(294) symbol.c(46) + .h(41) | 381 | Scope {head, parent}, Symbol {is_mut} | 可变性检查, TYPE_STR, print_str, 赋值检查 |
+| codegen/ | codegen.c(365) target.c(30) + .h(26) | 421 | CgCtx {builder, var_table, fn_table} | Store(赋值), GlobalStringPtr(str) |
+| driver/ | main.c(133) error.c(46) + .h(18) | 197 | 流水线串联 | 调用 target_* 接口 |
+| util/ | arena.c(39) arena.h(13) | 52 | Arena {memory, offset, capacity} | 8MB bump allocator |
+| include/ | l_lang.h | 34 | TypeKind, 公共声明 | TYPE_STR 新增 |
+
+**总计: 22 源文件, ~1,833 行实现代码**
+
+### 1.3 技术选型评估
+
+| 选择 | 评价 | 最新状态 |
+|------|------|---------|
+| C17 + CMake + MinGW | 轻量，学习友好 | 稳定 |
+| Arena bump allocator | 极简内存管理，无 GC 开销 | 已统一（codegen 原先混用 malloc） |
+| LLVM-C API (19.x) | 成熟但版本差异需处理 | 已适配 LLVMContext API |
+| 手写 Lexer/Parser | 0 外部依赖，完整控制 | 稳定，30 种 Token 类型 |
+| Pratt 表达式解析 | 优雅处理优先级，易扩展 | 无变化 |
+| GCC 链接 (`system()`) | 简单但平台绑定 | 待改进 |
+| 分离 target.h/c | 关注点分离，可测试 | **新建模块** |
+
+---
+
+## 2. 功能清单与成熟度
+
+### 2.1 v0.1 基线功能
+
+- [x] 基本类型: i64, f64, bool, void
+- [x] 算术运算: + - * / %
+- [x] 比较运算: == != < > <= >=
+- [x] 逻辑运算: && || !
+- [x] 变量声明: let 不可变，类型标注 + 类型推断
+- [x] 控制流: if/else, while
+- [x] 函数定义与调用: 参数、返回类型、递归
+- [x] 内置函数: print_i64, print_f64, print_bool
+- [x] 注释: // 行注释, /* */ 块注释
+- [x] 错误报告: 词法/语法即时终止, 语义错误批量输出
+- [x] 测试: 65 单元测试 + 5 集成测试
+
+### 2.2 v0.1+ 新增功能
+
+- [x] **let mut 可变变量** -- `let mut x: i64 = 0;` 声明可变变量
+- [x] **赋值语句** -- `x = x + 1;` 对可变变量赋值（LLVMBuildStore）
+- [x] **可变性检查** -- 对不可变变量赋值报 "不能对不可变变量赋值（需用 let mut 声明）"
+- [x] **字符串类型 str** -- 类型系统新增 TYPE_STR
+- [x] **字符串字面量** -- `"Hello, L Language!"` 双引号字面量，LLVMBuildGlobalStringPtr
+- [x] **字符串拼接 (语义)** -- `"a" + "b"` 类型检查为 str + str -> str
+- [x] **print_str 内置函数** -- 委托 printf("%s\n", str)
+- [x] **target.h/c 独立模块** -- LLVM 目标初始化与 Codegen 解耦
+- [x] **集成测试**: 06_mut_while.l (可变 while 循环), 07_hello_str.l (字符串输出)
+
+### 2.3 实现细节
+
+**let mut + 赋值** 贯穿全流水线:
+- Lexer: `TOK_MUT` (关键字), `TOK_ASSIGN` (分隔符)
+- AST: `AST_ASSIGN_STMT` 节点, `let_stmt.is_mut` 标志
+- Parser: `let mut IDENT = expr;` 完整解析; `IDENT = expr;` 赋值语句（标识符后紧跟 `=` 时触发）
+- Sema: `Symbol.is_mut` 标记; 赋值时检查可变性 + 类型一致性
+- Codegen: `LLVMBuildStore` 向现有 alloca 写入新值
+
+**字符串类型** 实现:
+- Lexer: `TOK_STR` (类型关键字), `TOK_STR_LIT` (双引号字面量)
+- AST: `literal.str_val` (arena 拷贝), `AST_LITERAL_EXPR` 支持 TYPE_STR
+- Parser: `TOK_STR_LIT` 创建 `ast_make_literal_str()`
+- Sema: TYPE_STR 加入类型系统; `+` 对 str+str 做类型检查
+- Codegen: `LLVMBuildGlobalStringPtr` 生成全局常量; `print_str` 委派 printf
+- **限制**: 运行时字符串拼接（str + str）在 codegen 中直接返回左操作数，语义层验证通过但**运行时行为不正确**
+
+---
+
+## 3. 与 Rust 对标：缺失功能清单
+
+按 "学习价值 + 实现难度" 排序。已完成项标注 ✓。
+
+### P0: 短期（1-2 天/项）
+
+| # | 功能 | Rust 启发 | 改动范围 | 学习价值 |
+|---|------|----------|---------|---------|
+| ✓ | let mut + 赋值 | 默认不可变 | lexer/parser/sema/codegen | 不可变性贯穿全流水线 |
+| ✓ | 字符串类型 + 字面量 | &str/String | lexer/parser/sema/codegen | LLVM 全局常量和指针 |
+| 1 | **复合赋值** += -= *= /= | 复合赋值运算符 | lexer(4 Token) + parser(desugar) + sema(mut check) | 语法糖去糖 |
+| 2 | **修复 str+str 运行时拼接** | -- | codegen(strcat/memcpy) | 运行时内存操作 |
+| 3 | **for 循环 + Range** | for i in 0..10 {} | lexer(for/in/..) + parser + sema + codegen->while | 去糖为 while 循环 |
+| 4 | **结构体 struct** | struct 具名域 | lexer(struct/.) + parser + sema + codegen(GEP) | GEP 指针计算 |
+
+### P1: 中期（3-5 天/项）
+
+| # | 功能 | 改动范围 | 学习价值 |
+|---|------|---------|---------|
+| 5 | **数组 + 索引** [i64; N], arr[i] | lexer/parser/sema/codegen(GEP) | GEP 多维指针计算 |
+| 6 | **枚举 (C 风格)** | lexer(enum) + parser + sema + codegen(i64) | 为代数类型铺路 |
+| 7 | **match 表达式** | lexer/parser/sema(穷举检查) + codegen | 模式匹配是 PL 核心 |
+| 8 | **类型别名** type Meters = i64 | lexer/parser/sema(展开) | 名称等价 vs 结构等价 |
+
+### P2: 中后期（1-2 周/项）
+
+| # | 功能 | 学习价值 |
+|---|------|---------|
+| 9 | 模块系统 mod + use | 从单文件到多文件跳跃 |
+| 10 | 自定义 IR 层 (三地址码/SSA) | "IR 是编译器的核心抽象" |
+| 11 | 泛型 (单态化) | Rust 零成本抽象的基石 |
+| 12 | 解释器模式 (walk AST) | 快速验证语义，无需 LLVM |
+
+### P3: 长期（2-4 周/项）
+
+| # | 功能 | 备注 |
+|---|------|------|
+| 13 | trait / 接口 | 需泛型 + 虚表 |
+| 14 | Option/Result 错误处理 | 需泛型 + enum |
+| 15 | 所有权 / 借用检查 | 借用检查器，极度复杂 |
+| 16 | 自举 (L 编译 L) | 终极考验 |
+
+### Rust 设计哲学吸收建议
+
+| Rust 特性 | 决策 | 理由 |
+|-----------|------|------|
+| 默认不可变 (let vs let mut) | **已加** | 编译期检查，零实现成本 |
+| 表达式 vs 语句 | 建议加 | if/match/block 作为表达式 |
+| 模式匹配 | 建议加 | 最优雅的控制流之一 |
+| 代数数据类型 | 有条件 | 需要泛型支持 |
+| Option/Result | 有条件 | 依赖泛型 + enum |
+| 所有权 / 借用 | 暂不加 | 借用检查器，远超当前阶段 |
+| 生命周期标注 | 暂不加 | 所有权系统的伴生 |
+| trait 系统 | 暂不加 | 需泛型 + 虚表 |
+| 宏系统 | 暂不加 | 自举后才考虑 |
+
+---
+
+## 4. 架构改进：已完成 vs 待实施
+
+### 4.1 已完成 ✓
+
+1. **代码生成与链接解耦** (来自上次报告 4.1-1)
+   - `src/codegen/target.h/c` 新增 (30+15 行)
+   - 接口: target_init(), target_get_default_triple(), target_create_machine(), target_emit_obj()
+   - main.c 减少 25 行 inline LLVM Target API 调用
+
+2. **内存管理统一** (来自上次报告 7-1)
+   - codegen.c 所有 malloc 替换为 arena_alloc()
+   - 消除内存泄漏风险
+
+3. **索引排除** (来自上次报告 7-5)
+   - .codegraphignore 排除 build/
+
+### 4.2 待实施
+
+4. **错误类型统一** (上次报告 4.1-2)
+   - 当前: ErrorInfo 和 ErrorList 分散多个模块
+   - 建议: 统一为 CompilerError 结构体
+
+5. **测试框架扩展** (上次报告 4.1-3)
+   - 当前: 无 codegen 层测试
+   - 建议: test_codegen.c + LLVMVerifyModule
+
+6. **调用约定抽象** (上次报告 4.2-6)
+   - 建议: 抽 abi.h 封装参数传递和返回值处理
+
+---
+
+## 5. 技术债务与风险
+
+| # | 问题 | 状态 | 影响 | 建议 |
+|---|------|------|------|------|
+| 1 | codegen malloc 混用 arena | **已修复** | -- | -- |
+| 2 | 无 .codegraphignore | **已修复** | -- | -- |
+| 3 | system("gcc ...") 链接 | **仍存在** | 平台不兼容 | 封装 Linker 接口，支持 ld/lld |
+| 4 | 无 codegen 层测试 | **仍存在** | IR 正确性无形式验证 | test_codegen.c |
+| 5 | 无优化 pass (mem2reg) | **仍存在** | alloca 效率低 | LLVMRunPassManager |
+| 6 | 字符串运行时拼接不工作 | **新增** | "a"+"b" 语义通过但运行时错误 | codegen 中实现 strcat/memcpy |
+| 7 | CHANGELOG 未更新 | **新增** | 4 个新 commit 未记录 | 添加 v0.2.0 条目 |
+
+---
+
+## 6. 推荐开发路线图
+
+### v0.2 目标
+
+| 优先级 | 功能 | 预计工时 | 备注 |
+|--------|------|---------|------|
+| P0 | 修复 str+str 运行时拼接 | 0.5 天 | 阻塞后续 |
+| P0 | 复合赋值 += -= *= /= | 0.5 天 | 语法糖，改动面小 |
+| P0 | 结构体 struct | 2-3 天 | 解锁 GEP 和复合类型 |
+| P0 | for 循环 + range | 1 天 | 去糖为 while |
+| P1 | test_codegen.c | 1 天 | 技术债补偿 |
+
+### v0.3 目标
+
+| 优先级 | 功能 | 预计工时 |
+|--------|------|---------|
+| P1 | 数组 + 索引 | 2 天 |
+| P1 | 枚举 (C 风格) | 1 天 |
+| P2 | match 表达式 | 2-3 天 |
+| P1 | 类型别名 | 0.5 天 |
+| P2 | 解释器模式 | 2 天 |
+
+### 长期
+
+| 优先级 | 功能 | 预计工时 |
+|--------|------|---------|
+| P2 | 自定义 IR 层 | 3-5 天 |
+| P2 | 模块系统 | 3-5 天 |
+| P3 | 泛型 (单态化) | 5-7 天 |
+| P3 | trait / 接口 | 5-7 天 |
+| P3 | 自举尝试 | 数周 |
+
+---
+
+## 7. 度量汇总
+
+| 指标 | v0.1 基线 | 当前 (v0.1+) | 变化 |
+|------|----------|-------------|------|
+| 源文件数 | 19 | 22 | +3 |
+| 实现代码行 | ~1,300 | ~1,833 | +~530 |
+| Token 类型 | 25 | 30 | +5 |
+| AST 节点类型 | 13 | 15 | +2 |
+| 类型系统 | 4 种 | 5 种 (+str) | +1 |
+| 内置函数 | 3 | 4 (+print_str) | +1 |
+| 集成测试 | 5 程序 | 7 程序 | +2 |
+| 单元测试 | 65 | 65 | 无变化 |
+| LLVM 模块 | 单体 codegen.c | codegen.c + target.c | 解耦 |
+| 报告 P0 完成度 | -- | 4/4 | 100% |
+
+---
+
+*本报告由 Codex 自动生成。自上次报告后代码有显著变化 (4 commits, +588/-53 行)，上次报告的 4 项 P0 建议全部实现。*
diff --git a/src/codegen/codegen.c b/src/codegen/codegen.c
index f5364a9..d11490e 100644
--- a/src/codegen/codegen.c
+++ b/src/codegen/codegen.c
@@ -31,6 +31,10 @@ typedef struct {
     // printf 运行时支持（内置 print 函数委托给 printf）
     LLVMValueRef      printf_fn;
     LLVMTypeRef       printf_ty;
+    // 字符串拼接运行时支持
+    LLVMValueRef      malloc_fn;
+    LLVMValueRef      strlen_fn;
+    LLVMValueRef      memcpy_fn;
 } CgCtx;
 
 // === 类型映射（需要 Context）===
@@ -124,8 +128,39 @@ static LLVMValueRef codegen_expr(CgCtx* ctx, AstNode* node) {
         LLVMValueRef r = codegen_expr(ctx, node->as.binary.right);
         if (!l || !r) return NULL;
 
-        // 字符串拼接：暂不支持运行时拼接，直接返回左操作数
-        if (node->type.kind == TYPE_STR) return l;
+        // 字符串拼接：malloc 堆缓冲区，strlen 求长度 + memcpy×2（含结尾 '\0'）
+        if (node->type.kind == TYPE_STR) {
+            // strlen(left)
+            LLVMValueRef len_l = LLVMBuildCall2(ctx->builder,
+                LLVMGlobalGetValueType(ctx->strlen_fn), ctx->strlen_fn,
+                (LLVMValueRef[]){l}, 1, "strlen_l");
+            // strlen(right)
+            LLVMValueRef len_r = LLVMBuildCall2(ctx->builder,
+                LLVMGlobalGetValueType(ctx->strlen_fn), ctx->strlen_fn,
+                (LLVMValueRef[]){r}, 1, "strlen_r");
+            // total = len_l + len_r + 1
+            LLVMValueRef total = LLVMBuildAdd(ctx->builder, len_l, len_r, "total");
+            total = LLVMBuildAdd(ctx->builder, total,
+                LLVMConstInt(LLVMInt64TypeInContext(ctx->context), 1, false), "total_1");
+            // char* buf = malloc(total)
+            LLVMValueRef buf = LLVMBuildCall2(ctx->builder,
+                LLVMGlobalGetValueType(ctx->malloc_fn), ctx->malloc_fn,
+                (LLVMValueRef[]){total}, 1, "str_buf");
+            // memcpy(buf, left, len_l)
+            LLVMBuildCall2(ctx->builder,
+                LLVMGlobalGetValueType(ctx->memcpy_fn), ctx->memcpy_fn,
+                (LLVMValueRef[]){buf, l, len_l}, 3, "");
+            // memcpy(buf + len_l, right, len_r + 1)  -- includes null terminator
+            LLVMValueRef offset_ptr = LLVMBuildGEP2(ctx->builder,
+                LLVMInt8TypeInContext(ctx->context), buf,
+                (LLVMValueRef[]){len_l}, 1, "offset");
+            LLVMValueRef len_r1 = LLVMBuildAdd(ctx->builder, len_r,
+                LLVMConstInt(LLVMInt64TypeInContext(ctx->context), 1, false), "len_r1");
+            LLVMBuildCall2(ctx->builder,
+                LLVMGlobalGetValueType(ctx->memcpy_fn), ctx->memcpy_fn,
+                (LLVMValueRef[]){offset_ptr, r, len_r1}, 3, "");
+            return buf;
+        }
 
         bool is_float = (node->type.kind == TYPE_F64);
 
@@ -352,6 +387,29 @@ LLVMModuleRef codegen_module(AstNode* ast, Arena* codegen_arena,
         LLVMInt32TypeInContext(ctx.context), printf_param_types, 1, true);
     ctx.printf_fn = LLVMAddFunction(ctx.module, "printf", ctx.printf_ty);
 
+    // 声明 malloc: void* malloc(size_t)
+    LLVMTypeRef malloc_args[] = { LLVMInt64TypeInContext(ctx.context) };
+    LLVMTypeRef malloc_ty = LLVMFunctionType(
+        LLVMPointerType(LLVMInt8TypeInContext(ctx.context), 0), malloc_args, 1, false);
+    ctx.malloc_fn = LLVMAddFunction(ctx.module, "malloc", malloc_ty);
+
+    // 声明 strlen: size_t strlen(const char*)
+    LLVMTypeRef strlen_args[] = { LLVMPointerType(LLVMInt8TypeInContext(ctx.context), 0) };
+    LLVMTypeRef strlen_ty = LLVMFunctionType(
+        LLVMInt64TypeInContext(ctx.context), strlen_args, 1, false);
+    ctx.strlen_fn = LLVMAddFunction(ctx.module, "strlen", strlen_ty);
+
+    // 声明 memcpy: void* memcpy(void*, const void*, size_t)
+    LLVMTypeRef memcpy_args[] = {
+        LLVMPointerType(LLVMInt8TypeInContext(ctx.context), 0),
+        LLVMPointerType(LLVMInt8TypeInContext(ctx.context), 0),
+        LLVMInt64TypeInContext(ctx.context),
+    };
+    LLVMTypeRef memcpy_ty = LLVMFunctionType(
+        LLVMPointerType(LLVMInt8TypeInContext(ctx.context), 0),
+        memcpy_args, 3, false);
+    ctx.memcpy_fn = LLVMAddFunction(ctx.module, "memcpy", memcpy_ty);
+
     // 第一遍：声明所有 L 函数
     for (size_t i = 0; i < ast->as.program.fn_count; i++) {
         AstNode* fn = ast->as.program.functions[i];
diff --git a/test/programs/08_str_concat.l b/test/programs/08_str_concat.l
new file mode 100644
index 0000000..b80d1f8
--- /dev/null
+++ b/test/programs/08_str_concat.l
@@ -0,0 +1,7 @@
+fn main() -> i64 {
+    let hello: str = "Hello, ";
+    let world: str = "World!";
+    let msg: str = hello + world;
+    print_str(msg);
+    return 0;
+}