diff --git a/notebooks/eda.ipynb b/notebooks/eda.ipynb new file mode 100644 index 0000000..d9c609f --- /dev/null +++ b/notebooks/eda.ipynb @@ -0,0 +1,173 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 高温热浪与银发群体健康风险 -- 探索性数据分析\n", + "焦作市 . 郑州市 | 2010-2024 年气象数据" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "from pathlib import Path\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "sns.set_style(\"whitegrid\")\n", + "plt.rcParams[\"font.sans-serif\"] = [\"SimHei\", \"Microsoft YaHei\", \"DejaVu Sans\"]\n", + "plt.rcParams[\"axes.unicode_minus\"] = False\n", + "\n", + "from src.utils.config import DATA_PROCESSED, DATA_EXTERNAL, OUTPUT_FIGURES, CITIES\n", + "\n", + "# 尝试加载数据\n", + "try:\n", + " df_jz = pd.read_csv(DATA_PROCESSED / \"jiaozuo_processed.csv\", parse_dates=[\"time\"])\n", + " df_zz = pd.read_csv(DATA_PROCESSED / \"zhengzhou_processed.csv\", parse_dates=[\"time\"])\n", + " df_combined = pd.read_csv(DATA_PROCESSED / \"combined_processed.csv\", parse_dates=[\"time\"])\n", + " print(f\"焦作: {df_jz.shape[0]} 天, 郑州: {df_zz.shape[0]} 天\")\n", + " data_loaded = True\n", + "except FileNotFoundError:\n", + " print(\"处理后的数据不存在,请先运行 preprocess.py\")\n", + " print(\"将使用模拟数据演示分析框架\")\n", + " data_loaded = False\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if data_loaded:\n", + " fig, axes = plt.subplots(1, 2, figsize=(14, 5))\n", + " for ax, (df, name) in zip(axes, [(df_jz, \"焦作\"), (df_zz, \"郑州\")]):\n", + " annual = df.groupby(df[\"time\"].dt.year)[\"temp_mean\"].agg([\"mean\", \"max\", \"min\"])\n", + " annual.plot(ax=ax, color=[\"#ff9800\", \"#f44336\", \"#5b9bd5\"])\n", + " ax.set_title(f\"{name} - 年均气温趋势\", fontsize=14)\n", + " ax.set_ylabel(\"温度 (C)\")\n", + " ax.set_xlabel(\"年份\")\n", + " ax.legend([\"平均\", \"最高\", \"最低\"])\n", + " fig.tight_layout()\n", + " plt.savefig(OUTPUT_FIGURES / \"annual_temp_trend.png\", dpi=150, bbox_inches=\"tight\")\n", + " plt.show()\n", + "else:\n", + " print(\"需要数据文件\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if data_loaded:\n", + " fig, axes = plt.subplots(1, 2, figsize=(12, 5))\n", + " labels = [\"低\", \"中\", \"高\", \"严重\"]\n", + " colors = [\"#00e676\", \"#ffeb3b\", \"#ff9800\", \"#f44336\"]\n", + " for ax, (df, name) in zip(axes, [(df_jz, \"焦作\"), (df_zz, \"郑州\")]):\n", + " counts = df[\"risk_label\"].value_counts().sort_index()\n", + " values = [counts.get(i, 0) for i in range(4)]\n", + " ax.bar(labels, values, color=colors)\n", + " ax.set_title(f\"{name} - 风险等级分布\", fontsize=14)\n", + " for i, v in enumerate(values):\n", + " ax.text(i, v + max(values)*0.01, str(v), ha='center')\n", + " fig.tight_layout()\n", + " plt.savefig(OUTPUT_FIGURES / \"risk_distribution.png\", dpi=150, bbox_inches=\"tight\")\n", + " plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if data_loaded:\n", + " for df, name in [(df_jz, \"焦作\"), (df_zz, \"郑州\")]:\n", + " annual_hw = df.groupby(df[\"time\"].dt.year)[\"heatwave\"].sum()\n", + " print(f\"\\n{name} 热浪天数统计:\")\n", + " print(annual_hw.describe())\n", + " print(f\" 年均热浪天数: {annual_hw.mean():.1f} 天\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " er = pd.read_csv(DATA_EXTERNAL / \"exposure_response.csv\")\n", + " fig, ax = plt.subplots(figsize=(8, 5))\n", + " ax.plot(er[\"percentile\"], er[\"rr\"], \"o-\", color=\"#f44336\", linewidth=2, markersize=8)\n", + " ax.axhline(y=1.0, color=\"gray\", linestyle=\"--\", alpha=0.7)\n", + " ax.set_xlabel(\"温度百分位数 (%)\", fontsize=12)\n", + " ax.set_ylabel(\"相对风险 (RR)\", fontsize=12)\n", + " ax.set_title(\"温度-老年人死亡率暴露反应曲线\\n(来源: Chen et al. 2018, Lancet Planet Health)\", fontsize=13)\n", + " ax.fill_between(er[\"percentile\"], 1.0, er[\"rr\"], alpha=0.2, color=\"#f44336\")\n", + " plt.tight_layout()\n", + " plt.savefig(OUTPUT_FIGURES / \"exposure_response.png\", dpi=150, bbox_inches=\"tight\")\n", + " plt.show()\n", + "except Exception as e:\n", + " print(f\"无法加载暴露反应数据: {e}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if data_loaded:\n", + " fig, axes = plt.subplots(1, 2, figsize=(14, 5))\n", + " for ax, (df, name) in zip(axes, [(df_jz, \"焦作\"), (df_zz, \"郑州\")]):\n", + " monthly = df.groupby(df[\"time\"].dt.month)[\"temp_mean\"].agg([\"mean\", \"std\"])\n", + " ax.fill_between(monthly.index, monthly[\"mean\"]-monthly[\"std\"],\n", + " monthly[\"mean\"]+monthly[\"std\"], alpha=0.3, color=\"#ff9800\")\n", + " ax.plot(monthly.index, monthly[\"mean\"], \"o-\", color=\"#f44336\", linewidth=2)\n", + " ax.set_title(f\"{name} - 月均气温模式\", fontsize=14)\n", + " ax.set_xlabel(\"月份\")\n", + " ax.set_ylabel(\"温度 (C)\")\n", + " ax.set_xticks(range(1, 13))\n", + " fig.tight_layout()\n", + " plt.savefig(OUTPUT_FIGURES / \"monthly_temp_pattern.png\", dpi=150, bbox_inches=\"tight\")\n", + " plt.show()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## EDA 小结\n", + "\n", + "1. 郑州和焦作两市气温趋势高度一致,均呈缓慢上升趋势\n", + "2. 夏季(6-8月)是高温热浪高发期,7月风险最高\n", + "3. 风险等级分布呈长尾特征:低风险占多数,严重风险为稀有事件\n", + "4. 温度-死亡率暴露反应曲线呈 J 型,高温端风险显著上升\n", + "5. 两市老龄化率均在 11-13%,郑州老年人口绝对数量更大\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.13.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/models/lstm_attention.py b/src/models/lstm_attention.py new file mode 100644 index 0000000..c7854ca --- /dev/null +++ b/src/models/lstm_attention.py @@ -0,0 +1,82 @@ +"""LSTM + Multi-Head Attention 多时间尺度预警模型""" +import torch +import torch.nn as nn +import torch.nn.functional as F +from src.utils.config import HIDDEN_DIM, LSTM_LAYERS, ATTENTION_HEADS, DROPOUT + + +class MultiHeadSelfAttention(nn.Module): + """多头自注意力层""" + def __init__(self, embed_dim: int, num_heads: int = ATTENTION_HEADS, + dropout: float = DROPOUT): + super().__init__() + assert embed_dim % num_heads == 0, f"embed_dim {embed_dim} must be divisible by num_heads {num_heads}" + self.embed_dim = embed_dim + self.num_heads = num_heads + self.head_dim = embed_dim // num_heads + + self.qkv = nn.Linear(embed_dim, 3 * embed_dim) + self.out_proj = nn.Linear(embed_dim, embed_dim) + self.dropout = nn.Dropout(dropout) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + B, T, D = x.shape + qkv = self.qkv(x).reshape(B, T, 3, self.num_heads, self.head_dim) + qkv = qkv.permute(2, 0, 3, 1, 4) # (3, B, heads, T, head_dim) + q, k, v = qkv[0], qkv[1], qkv[2] + + scale = self.head_dim ** -0.5 + attn = (q @ k.transpose(-2, -1)) * scale + attn = F.softmax(attn, dim=-1) + attn = self.dropout(attn) + + out = attn @ v # (B, heads, T, head_dim) + out = out.permute(0, 2, 1, 3).reshape(B, T, D) + return self.out_proj(out) + + +class HeatRiskPredictor(nn.Module): + """LSTM-Attention 多时间尺度高温风险预测模型 + + Input: (B, T, input_dim) sequence of meteorological features + Output: dict with 'short', 'medium', 'long' keys, each (B, 4) logits + """ + def __init__(self, input_dim: int, hidden_dim: int = HIDDEN_DIM, + num_layers: int = LSTM_LAYERS, num_classes: int = 4): + super().__init__() + self.input_proj = nn.Linear(input_dim, hidden_dim) + self.lstm = nn.LSTM( + hidden_dim, hidden_dim, num_layers, + batch_first=True, bidirectional=True, dropout=DROPOUT if num_layers > 1 else 0, + ) + lstm_out_dim = hidden_dim * 2 # bidirectional + self.attention = MultiHeadSelfAttention(lstm_out_dim) + self.lstm_proj = nn.Linear(lstm_out_dim, hidden_dim) + + self.head_short = nn.Sequential( + nn.Linear(hidden_dim, hidden_dim // 2), + nn.ReLU(), nn.Dropout(DROPOUT), + nn.Linear(hidden_dim // 2, num_classes), + ) + self.head_medium = nn.Sequential( + nn.Linear(hidden_dim, hidden_dim // 2), + nn.ReLU(), nn.Dropout(DROPOUT), + nn.Linear(hidden_dim // 2, num_classes), + ) + self.head_long = nn.Sequential( + nn.Linear(hidden_dim, hidden_dim // 2), + nn.ReLU(), nn.Dropout(DROPOUT), + nn.Linear(hidden_dim // 2, num_classes), + ) + + def forward(self, x: torch.Tensor) -> dict: + x = self.input_proj(x) + lstm_out, _ = self.lstm(x) + attn_out = self.attention(lstm_out) + last_hidden = self.lstm_proj(attn_out[:, -1, :]) + + return { + "short": self.head_short(last_hidden), + "medium": self.head_medium(last_hidden), + "long": self.head_long(last_hidden), + } diff --git a/src/models/xgboost_baseline.py b/src/models/xgboost_baseline.py new file mode 100644 index 0000000..dbca6af --- /dev/null +++ b/src/models/xgboost_baseline.py @@ -0,0 +1,91 @@ +"""XGBoost 基线模型 — 三个独立分类器用于多时间尺度对比""" +import numpy as np +import xgboost as xgb +from sklearn.metrics import accuracy_score, f1_score + + +def train_xgboost_baseline(X_train: np.ndarray, y_train: np.ndarray, + X_test: np.ndarray, y_test: np.ndarray) -> dict: + """ + 训练三个独立的 XGBoost 分类器 (短/中/长期)。 + + Args: + X_train: (N, T, D) 训练特征,自动展平为 (N, T*D) + y_train: (N, 3) 标签矩阵,列顺序: short, medium, long + X_test: (N_test, T, D) 测试特征 + y_test: (N_test, 3) 测试标签 + + Returns: + dict: { + "short": {"model": ..., "accuracy": ..., "f1_macro": ..., "predictions": ...}, + "medium": {...}, + "long": {...}, + } + """ + # 展平时序特征为二维 + N_train, T, D = X_train.shape + X_train_flat = X_train.reshape(N_train, T * D) + N_test = X_test.shape[0] + X_test_flat = X_test.reshape(N_test, T * D) + + horizon_names = ["short", "medium", "long"] + results = {} + + for i, name in enumerate(horizon_names): + model = xgb.XGBClassifier( + n_estimators=200, + max_depth=6, + learning_rate=0.05, + subsample=0.8, + colsample_bytree=0.8, + objective="multi:softmax", + num_class=4, + eval_metric="mlogloss", + random_state=42, + device="cuda", + ) + model.fit( + X_train_flat, y_train[:, i], + eval_set=[(X_test_flat, y_test[:, i])], + verbose=False, + ) + y_pred = model.predict(X_test_flat) + acc = accuracy_score(y_test[:, i], y_pred) + f1 = f1_score(y_test[:, i], y_pred, average="macro") + + results[name] = { + "model": model, + "accuracy": acc, + "f1_macro": f1, + "predictions": y_pred, + } + print(f"XGBoost {name}: Accuracy={acc:.4f}, F1 Macro={f1:.4f}") + + return results + + +def train_xgboost_single(X_train: np.ndarray, y_train: np.ndarray, + X_test: np.ndarray, y_test: np.ndarray, + horizon_idx: int = 0) -> dict: + """训练单个时间尺度的XGBoost模型(用于单独调用)""" + N_train, T, D = X_train.shape + X_train_flat = X_train.reshape(N_train, T * D) + N_test = X_test.shape[0] + X_test_flat = X_test.reshape(N_test, T * D) + + model = xgb.XGBClassifier( + n_estimators=200, max_depth=6, learning_rate=0.05, + subsample=0.8, colsample_bytree=0.8, + objective="multi:softmax", num_class=4, + eval_metric="mlogloss", random_state=42, + device="cuda", + ) + model.fit(X_train_flat, y_train[:, horizon_idx], verbose=False) + y_pred = model.predict(X_test_flat) + + return { + "model": model, + "accuracy": accuracy_score(y_test[:, horizon_idx], y_pred), + "f1_macro": f1_score(y_test[:, horizon_idx], y_pred, average="macro"), + "predictions": y_pred, + }