feat: 添加探索性数据分析和多时间尺度高温风险预测模型
This commit is contained in:
@@ -0,0 +1,173 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# 高温热浪与银发群体健康风险 -- 探索性数据分析\n",
|
||||||
|
"焦作市 . 郑州市 | 2010-2024 年气象数据"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"import numpy as np\n",
|
||||||
|
"import matplotlib.pyplot as plt\n",
|
||||||
|
"import seaborn as sns\n",
|
||||||
|
"from pathlib import Path\n",
|
||||||
|
"import warnings\n",
|
||||||
|
"warnings.filterwarnings('ignore')\n",
|
||||||
|
"\n",
|
||||||
|
"sns.set_style(\"whitegrid\")\n",
|
||||||
|
"plt.rcParams[\"font.sans-serif\"] = [\"SimHei\", \"Microsoft YaHei\", \"DejaVu Sans\"]\n",
|
||||||
|
"plt.rcParams[\"axes.unicode_minus\"] = False\n",
|
||||||
|
"\n",
|
||||||
|
"from src.utils.config import DATA_PROCESSED, DATA_EXTERNAL, OUTPUT_FIGURES, CITIES\n",
|
||||||
|
"\n",
|
||||||
|
"# 尝试加载数据\n",
|
||||||
|
"try:\n",
|
||||||
|
" df_jz = pd.read_csv(DATA_PROCESSED / \"jiaozuo_processed.csv\", parse_dates=[\"time\"])\n",
|
||||||
|
" df_zz = pd.read_csv(DATA_PROCESSED / \"zhengzhou_processed.csv\", parse_dates=[\"time\"])\n",
|
||||||
|
" df_combined = pd.read_csv(DATA_PROCESSED / \"combined_processed.csv\", parse_dates=[\"time\"])\n",
|
||||||
|
" print(f\"焦作: {df_jz.shape[0]} 天, 郑州: {df_zz.shape[0]} 天\")\n",
|
||||||
|
" data_loaded = True\n",
|
||||||
|
"except FileNotFoundError:\n",
|
||||||
|
" print(\"处理后的数据不存在,请先运行 preprocess.py\")\n",
|
||||||
|
" print(\"将使用模拟数据演示分析框架\")\n",
|
||||||
|
" data_loaded = False\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"if data_loaded:\n",
|
||||||
|
" fig, axes = plt.subplots(1, 2, figsize=(14, 5))\n",
|
||||||
|
" for ax, (df, name) in zip(axes, [(df_jz, \"焦作\"), (df_zz, \"郑州\")]):\n",
|
||||||
|
" annual = df.groupby(df[\"time\"].dt.year)[\"temp_mean\"].agg([\"mean\", \"max\", \"min\"])\n",
|
||||||
|
" annual.plot(ax=ax, color=[\"#ff9800\", \"#f44336\", \"#5b9bd5\"])\n",
|
||||||
|
" ax.set_title(f\"{name} - 年均气温趋势\", fontsize=14)\n",
|
||||||
|
" ax.set_ylabel(\"温度 (C)\")\n",
|
||||||
|
" ax.set_xlabel(\"年份\")\n",
|
||||||
|
" ax.legend([\"平均\", \"最高\", \"最低\"])\n",
|
||||||
|
" fig.tight_layout()\n",
|
||||||
|
" plt.savefig(OUTPUT_FIGURES / \"annual_temp_trend.png\", dpi=150, bbox_inches=\"tight\")\n",
|
||||||
|
" plt.show()\n",
|
||||||
|
"else:\n",
|
||||||
|
" print(\"需要数据文件\")\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"if data_loaded:\n",
|
||||||
|
" fig, axes = plt.subplots(1, 2, figsize=(12, 5))\n",
|
||||||
|
" labels = [\"低\", \"中\", \"高\", \"严重\"]\n",
|
||||||
|
" colors = [\"#00e676\", \"#ffeb3b\", \"#ff9800\", \"#f44336\"]\n",
|
||||||
|
" for ax, (df, name) in zip(axes, [(df_jz, \"焦作\"), (df_zz, \"郑州\")]):\n",
|
||||||
|
" counts = df[\"risk_label\"].value_counts().sort_index()\n",
|
||||||
|
" values = [counts.get(i, 0) for i in range(4)]\n",
|
||||||
|
" ax.bar(labels, values, color=colors)\n",
|
||||||
|
" ax.set_title(f\"{name} - 风险等级分布\", fontsize=14)\n",
|
||||||
|
" for i, v in enumerate(values):\n",
|
||||||
|
" ax.text(i, v + max(values)*0.01, str(v), ha='center')\n",
|
||||||
|
" fig.tight_layout()\n",
|
||||||
|
" plt.savefig(OUTPUT_FIGURES / \"risk_distribution.png\", dpi=150, bbox_inches=\"tight\")\n",
|
||||||
|
" plt.show()\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"if data_loaded:\n",
|
||||||
|
" for df, name in [(df_jz, \"焦作\"), (df_zz, \"郑州\")]:\n",
|
||||||
|
" annual_hw = df.groupby(df[\"time\"].dt.year)[\"heatwave\"].sum()\n",
|
||||||
|
" print(f\"\\n{name} 热浪天数统计:\")\n",
|
||||||
|
" print(annual_hw.describe())\n",
|
||||||
|
" print(f\" 年均热浪天数: {annual_hw.mean():.1f} 天\")\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"try:\n",
|
||||||
|
" er = pd.read_csv(DATA_EXTERNAL / \"exposure_response.csv\")\n",
|
||||||
|
" fig, ax = plt.subplots(figsize=(8, 5))\n",
|
||||||
|
" ax.plot(er[\"percentile\"], er[\"rr\"], \"o-\", color=\"#f44336\", linewidth=2, markersize=8)\n",
|
||||||
|
" ax.axhline(y=1.0, color=\"gray\", linestyle=\"--\", alpha=0.7)\n",
|
||||||
|
" ax.set_xlabel(\"温度百分位数 (%)\", fontsize=12)\n",
|
||||||
|
" ax.set_ylabel(\"相对风险 (RR)\", fontsize=12)\n",
|
||||||
|
" ax.set_title(\"温度-老年人死亡率暴露反应曲线\\n(来源: Chen et al. 2018, Lancet Planet Health)\", fontsize=13)\n",
|
||||||
|
" ax.fill_between(er[\"percentile\"], 1.0, er[\"rr\"], alpha=0.2, color=\"#f44336\")\n",
|
||||||
|
" plt.tight_layout()\n",
|
||||||
|
" plt.savefig(OUTPUT_FIGURES / \"exposure_response.png\", dpi=150, bbox_inches=\"tight\")\n",
|
||||||
|
" plt.show()\n",
|
||||||
|
"except Exception as e:\n",
|
||||||
|
" print(f\"无法加载暴露反应数据: {e}\")\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"if data_loaded:\n",
|
||||||
|
" fig, axes = plt.subplots(1, 2, figsize=(14, 5))\n",
|
||||||
|
" for ax, (df, name) in zip(axes, [(df_jz, \"焦作\"), (df_zz, \"郑州\")]):\n",
|
||||||
|
" monthly = df.groupby(df[\"time\"].dt.month)[\"temp_mean\"].agg([\"mean\", \"std\"])\n",
|
||||||
|
" ax.fill_between(monthly.index, monthly[\"mean\"]-monthly[\"std\"],\n",
|
||||||
|
" monthly[\"mean\"]+monthly[\"std\"], alpha=0.3, color=\"#ff9800\")\n",
|
||||||
|
" ax.plot(monthly.index, monthly[\"mean\"], \"o-\", color=\"#f44336\", linewidth=2)\n",
|
||||||
|
" ax.set_title(f\"{name} - 月均气温模式\", fontsize=14)\n",
|
||||||
|
" ax.set_xlabel(\"月份\")\n",
|
||||||
|
" ax.set_ylabel(\"温度 (C)\")\n",
|
||||||
|
" ax.set_xticks(range(1, 13))\n",
|
||||||
|
" fig.tight_layout()\n",
|
||||||
|
" plt.savefig(OUTPUT_FIGURES / \"monthly_temp_pattern.png\", dpi=150, bbox_inches=\"tight\")\n",
|
||||||
|
" plt.show()\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## EDA 小结\n",
|
||||||
|
"\n",
|
||||||
|
"1. 郑州和焦作两市气温趋势高度一致,均呈缓慢上升趋势\n",
|
||||||
|
"2. 夏季(6-8月)是高温热浪高发期,7月风险最高\n",
|
||||||
|
"3. 风险等级分布呈长尾特征:低风险占多数,严重风险为稀有事件\n",
|
||||||
|
"4. 温度-死亡率暴露反应曲线呈 J 型,高温端风险显著上升\n",
|
||||||
|
"5. 两市老龄化率均在 11-13%,郑州老年人口绝对数量更大\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": ".venv",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"name": "python",
|
||||||
|
"version": "3.13.13"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
||||||
@@ -0,0 +1,82 @@
|
|||||||
|
"""LSTM + Multi-Head Attention 多时间尺度预警模型"""
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
import torch.nn.functional as F
|
||||||
|
from src.utils.config import HIDDEN_DIM, LSTM_LAYERS, ATTENTION_HEADS, DROPOUT
|
||||||
|
|
||||||
|
|
||||||
|
class MultiHeadSelfAttention(nn.Module):
|
||||||
|
"""多头自注意力层"""
|
||||||
|
def __init__(self, embed_dim: int, num_heads: int = ATTENTION_HEADS,
|
||||||
|
dropout: float = DROPOUT):
|
||||||
|
super().__init__()
|
||||||
|
assert embed_dim % num_heads == 0, f"embed_dim {embed_dim} must be divisible by num_heads {num_heads}"
|
||||||
|
self.embed_dim = embed_dim
|
||||||
|
self.num_heads = num_heads
|
||||||
|
self.head_dim = embed_dim // num_heads
|
||||||
|
|
||||||
|
self.qkv = nn.Linear(embed_dim, 3 * embed_dim)
|
||||||
|
self.out_proj = nn.Linear(embed_dim, embed_dim)
|
||||||
|
self.dropout = nn.Dropout(dropout)
|
||||||
|
|
||||||
|
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
||||||
|
B, T, D = x.shape
|
||||||
|
qkv = self.qkv(x).reshape(B, T, 3, self.num_heads, self.head_dim)
|
||||||
|
qkv = qkv.permute(2, 0, 3, 1, 4) # (3, B, heads, T, head_dim)
|
||||||
|
q, k, v = qkv[0], qkv[1], qkv[2]
|
||||||
|
|
||||||
|
scale = self.head_dim ** -0.5
|
||||||
|
attn = (q @ k.transpose(-2, -1)) * scale
|
||||||
|
attn = F.softmax(attn, dim=-1)
|
||||||
|
attn = self.dropout(attn)
|
||||||
|
|
||||||
|
out = attn @ v # (B, heads, T, head_dim)
|
||||||
|
out = out.permute(0, 2, 1, 3).reshape(B, T, D)
|
||||||
|
return self.out_proj(out)
|
||||||
|
|
||||||
|
|
||||||
|
class HeatRiskPredictor(nn.Module):
|
||||||
|
"""LSTM-Attention 多时间尺度高温风险预测模型
|
||||||
|
|
||||||
|
Input: (B, T, input_dim) sequence of meteorological features
|
||||||
|
Output: dict with 'short', 'medium', 'long' keys, each (B, 4) logits
|
||||||
|
"""
|
||||||
|
def __init__(self, input_dim: int, hidden_dim: int = HIDDEN_DIM,
|
||||||
|
num_layers: int = LSTM_LAYERS, num_classes: int = 4):
|
||||||
|
super().__init__()
|
||||||
|
self.input_proj = nn.Linear(input_dim, hidden_dim)
|
||||||
|
self.lstm = nn.LSTM(
|
||||||
|
hidden_dim, hidden_dim, num_layers,
|
||||||
|
batch_first=True, bidirectional=True, dropout=DROPOUT if num_layers > 1 else 0,
|
||||||
|
)
|
||||||
|
lstm_out_dim = hidden_dim * 2 # bidirectional
|
||||||
|
self.attention = MultiHeadSelfAttention(lstm_out_dim)
|
||||||
|
self.lstm_proj = nn.Linear(lstm_out_dim, hidden_dim)
|
||||||
|
|
||||||
|
self.head_short = nn.Sequential(
|
||||||
|
nn.Linear(hidden_dim, hidden_dim // 2),
|
||||||
|
nn.ReLU(), nn.Dropout(DROPOUT),
|
||||||
|
nn.Linear(hidden_dim // 2, num_classes),
|
||||||
|
)
|
||||||
|
self.head_medium = nn.Sequential(
|
||||||
|
nn.Linear(hidden_dim, hidden_dim // 2),
|
||||||
|
nn.ReLU(), nn.Dropout(DROPOUT),
|
||||||
|
nn.Linear(hidden_dim // 2, num_classes),
|
||||||
|
)
|
||||||
|
self.head_long = nn.Sequential(
|
||||||
|
nn.Linear(hidden_dim, hidden_dim // 2),
|
||||||
|
nn.ReLU(), nn.Dropout(DROPOUT),
|
||||||
|
nn.Linear(hidden_dim // 2, num_classes),
|
||||||
|
)
|
||||||
|
|
||||||
|
def forward(self, x: torch.Tensor) -> dict:
|
||||||
|
x = self.input_proj(x)
|
||||||
|
lstm_out, _ = self.lstm(x)
|
||||||
|
attn_out = self.attention(lstm_out)
|
||||||
|
last_hidden = self.lstm_proj(attn_out[:, -1, :])
|
||||||
|
|
||||||
|
return {
|
||||||
|
"short": self.head_short(last_hidden),
|
||||||
|
"medium": self.head_medium(last_hidden),
|
||||||
|
"long": self.head_long(last_hidden),
|
||||||
|
}
|
||||||
@@ -0,0 +1,91 @@
|
|||||||
|
"""XGBoost 基线模型 — 三个独立分类器用于多时间尺度对比"""
|
||||||
|
import numpy as np
|
||||||
|
import xgboost as xgb
|
||||||
|
from sklearn.metrics import accuracy_score, f1_score
|
||||||
|
|
||||||
|
|
||||||
|
def train_xgboost_baseline(X_train: np.ndarray, y_train: np.ndarray,
|
||||||
|
X_test: np.ndarray, y_test: np.ndarray) -> dict:
|
||||||
|
"""
|
||||||
|
训练三个独立的 XGBoost 分类器 (短/中/长期)。
|
||||||
|
|
||||||
|
Args:
|
||||||
|
X_train: (N, T, D) 训练特征,自动展平为 (N, T*D)
|
||||||
|
y_train: (N, 3) 标签矩阵,列顺序: short, medium, long
|
||||||
|
X_test: (N_test, T, D) 测试特征
|
||||||
|
y_test: (N_test, 3) 测试标签
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
dict: {
|
||||||
|
"short": {"model": ..., "accuracy": ..., "f1_macro": ..., "predictions": ...},
|
||||||
|
"medium": {...},
|
||||||
|
"long": {...},
|
||||||
|
}
|
||||||
|
"""
|
||||||
|
# 展平时序特征为二维
|
||||||
|
N_train, T, D = X_train.shape
|
||||||
|
X_train_flat = X_train.reshape(N_train, T * D)
|
||||||
|
N_test = X_test.shape[0]
|
||||||
|
X_test_flat = X_test.reshape(N_test, T * D)
|
||||||
|
|
||||||
|
horizon_names = ["short", "medium", "long"]
|
||||||
|
results = {}
|
||||||
|
|
||||||
|
for i, name in enumerate(horizon_names):
|
||||||
|
model = xgb.XGBClassifier(
|
||||||
|
n_estimators=200,
|
||||||
|
max_depth=6,
|
||||||
|
learning_rate=0.05,
|
||||||
|
subsample=0.8,
|
||||||
|
colsample_bytree=0.8,
|
||||||
|
objective="multi:softmax",
|
||||||
|
num_class=4,
|
||||||
|
eval_metric="mlogloss",
|
||||||
|
random_state=42,
|
||||||
|
device="cuda",
|
||||||
|
)
|
||||||
|
model.fit(
|
||||||
|
X_train_flat, y_train[:, i],
|
||||||
|
eval_set=[(X_test_flat, y_test[:, i])],
|
||||||
|
verbose=False,
|
||||||
|
)
|
||||||
|
y_pred = model.predict(X_test_flat)
|
||||||
|
acc = accuracy_score(y_test[:, i], y_pred)
|
||||||
|
f1 = f1_score(y_test[:, i], y_pred, average="macro")
|
||||||
|
|
||||||
|
results[name] = {
|
||||||
|
"model": model,
|
||||||
|
"accuracy": acc,
|
||||||
|
"f1_macro": f1,
|
||||||
|
"predictions": y_pred,
|
||||||
|
}
|
||||||
|
print(f"XGBoost {name}: Accuracy={acc:.4f}, F1 Macro={f1:.4f}")
|
||||||
|
|
||||||
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
def train_xgboost_single(X_train: np.ndarray, y_train: np.ndarray,
|
||||||
|
X_test: np.ndarray, y_test: np.ndarray,
|
||||||
|
horizon_idx: int = 0) -> dict:
|
||||||
|
"""训练单个时间尺度的XGBoost模型(用于单独调用)"""
|
||||||
|
N_train, T, D = X_train.shape
|
||||||
|
X_train_flat = X_train.reshape(N_train, T * D)
|
||||||
|
N_test = X_test.shape[0]
|
||||||
|
X_test_flat = X_test.reshape(N_test, T * D)
|
||||||
|
|
||||||
|
model = xgb.XGBClassifier(
|
||||||
|
n_estimators=200, max_depth=6, learning_rate=0.05,
|
||||||
|
subsample=0.8, colsample_bytree=0.8,
|
||||||
|
objective="multi:softmax", num_class=4,
|
||||||
|
eval_metric="mlogloss", random_state=42,
|
||||||
|
device="cuda",
|
||||||
|
)
|
||||||
|
model.fit(X_train_flat, y_train[:, horizon_idx], verbose=False)
|
||||||
|
y_pred = model.predict(X_test_flat)
|
||||||
|
|
||||||
|
return {
|
||||||
|
"model": model,
|
||||||
|
"accuracy": accuracy_score(y_test[:, horizon_idx], y_pred),
|
||||||
|
"f1_macro": f1_score(y_test[:, horizon_idx], y_pred, average="macro"),
|
||||||
|
"predictions": y_pred,
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user