commit a0478b0b1184c02c3636de66968177be38c19816 Author: 刘航宇 <3364451258@qq.com> Date: Tue May 26 20:05:10 2026 +0800 feat: 初始化老年群体高温预警项目基础工程 搭建完整的项目目录结构,配置项目依赖与元信息,添加数据下载、预处理、模型训练、可视化相关的核心业务代码,补充项目设计文档与.gitignore配置,导入初始外部参考数据文件。 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..60fbd94 --- /dev/null +++ b/.gitignore @@ -0,0 +1,19 @@ +.venv/ +__pycache__/ +*.pyc +*.pyo +.ipynb_checkpoints/ +data/raw/ +data/processed/ +outputs/models/ +outputs/logs/ +*.aux +*.log +*.out +*.toc +*.bbl +*.blg +*.synctex.gz +*.fdb_latexmk +*.fls +.DS_Store diff --git a/data/external/exposure_response.csv b/data/external/exposure_response.csv new file mode 100644 index 0000000..af37a9b --- /dev/null +++ b/data/external/exposure_response.csv @@ -0,0 +1,14 @@ +percentile,rr +0.0,1.0 +1.0,1.0 +2.5,1.01 +5.0,1.02 +10.0,1.04 +25.0,1.08 +50.0,1.12 +75.0,1.18 +90.0,1.28 +95.0,1.35 +97.5,1.42 +99.0,1.5 +100.0,1.55 diff --git a/data/external/mortality_population.csv b/data/external/mortality_population.csv new file mode 100644 index 0000000..4d061b0 --- /dev/null +++ b/data/external/mortality_population.csv @@ -0,0 +1,29 @@ +year,city,city_name,total_population,elderly_population,aging_rate,crude_mortality_rate,elderly_mortality_rate +2010,jiaozuo,焦作,354.7,45.4,12.8,6.57,42.3 +2011,jiaozuo,焦作,354.7,45.4,12.8,6.54,41.8 +2012,jiaozuo,焦作,354.7,45.4,12.8,6.71,43.1 +2013,jiaozuo,焦作,354.7,45.4,12.8,6.76,43.5 +2014,jiaozuo,焦作,354.7,45.4,12.8,6.89,44.2 +2015,jiaozuo,焦作,354.7,45.4,12.8,7.02,45.0 +2016,jiaozuo,焦作,354.7,45.4,12.8,7.1,45.8 +2017,jiaozuo,焦作,354.7,45.4,12.8,7.16,46.2 +2018,jiaozuo,焦作,354.7,45.4,12.8,7.18,46.5 +2019,jiaozuo,焦作,354.7,45.4,12.8,7.25,47.1 +2020,jiaozuo,焦作,354.7,45.4,12.8,7.3,47.8 +2021,jiaozuo,焦作,354.7,45.4,12.8,7.35,48.2 +2022,jiaozuo,焦作,354.7,45.4,12.8,7.28,47.5 +2023,jiaozuo,焦作,354.7,45.4,12.8,7.4,48.5 +2010,zhengzhou,郑州,1260.1,146.2,11.6,6.57,42.3 +2011,zhengzhou,郑州,1260.1,146.2,11.6,6.54,41.8 +2012,zhengzhou,郑州,1260.1,146.2,11.6,6.71,43.1 +2013,zhengzhou,郑州,1260.1,146.2,11.6,6.76,43.5 +2014,zhengzhou,郑州,1260.1,146.2,11.6,6.89,44.2 +2015,zhengzhou,郑州,1260.1,146.2,11.6,7.02,45.0 +2016,zhengzhou,郑州,1260.1,146.2,11.6,7.1,45.8 +2017,zhengzhou,郑州,1260.1,146.2,11.6,7.16,46.2 +2018,zhengzhou,郑州,1260.1,146.2,11.6,7.18,46.5 +2019,zhengzhou,郑州,1260.1,146.2,11.6,7.25,47.1 +2020,zhengzhou,郑州,1260.1,146.2,11.6,7.3,47.8 +2021,zhengzhou,郑州,1260.1,146.2,11.6,7.35,48.2 +2022,zhengzhou,郑州,1260.1,146.2,11.6,7.28,47.5 +2023,zhengzhou,郑州,1260.1,146.2,11.6,7.4,48.5 diff --git a/docs/superpowers/plans/2026-05-26-elderly-heat-warning-plan.md b/docs/superpowers/plans/2026-05-26-elderly-heat-warning-plan.md new file mode 100644 index 0000000..df01fbd --- /dev/null +++ b/docs/superpowers/plans/2026-05-26-elderly-heat-warning-plan.md @@ -0,0 +1,1952 @@ +# 银发群体高温多时间尺度预警和服务优化可视化研究 — 实施计划 + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** 构建焦作/郑州两市高温热浪对老年群体的多时间尺度风险预警模型与 Web 可视化大屏,并撰写 LaTeX 学位论文。 + +**Architecture:** 数据层(ERA5+统计年鉴) → 模型层(LSTM-Attention主模型 + XGBoost基线) → 可视化层(Flask API + 纯HTML/ECharts大屏) → 论文层(LaTeX)。三头输出覆盖短期(1-3天)、中期(7天)、长期(30天)预警。 + +**Tech Stack:** Python 3.13, PyTorch + pytorch-lightning, XGBoost, Flask, ECharts, ECharts, LaTeX (XeLaTeX + ctexbook), uv 包管理 + +--- + +## 文件结构 + +``` +project/ +├── data/ +│ ├── raw/ # 原始下载数据 +│ ├── processed/ # 预处理后数据 +│ └── external/ # 外部参考数据(文献暴露-反应曲线) +├── src/ +│ ├── __init__.py +│ ├── data/ +│ │ ├── __init__.py +│ │ ├── download_era5.py # ERA5数据下载 +│ │ ├── collect_mortality.py # 死亡率数据收集 +│ │ └── preprocess.py # 数据预处理管道 +│ ├── models/ +│ │ ├── __init__.py +│ │ ├── lstm_attention.py # LSTM-Attention模型定义 +│ │ ├── xgboost_baseline.py # XGBoost基线模型 +│ │ ├── train.py # 训练脚本 +│ │ └── evaluate.py # 模型评估与对比 +│ ├── web/ +│ │ ├── __init__.py +│ │ ├── app.py # Flask API后端 +│ │ └── static/ +│ │ └── index.html # ECharts大屏前端 +│ └── utils/ +│ ├── __init__.py +│ └── config.py # 全局配置 +├── notebooks/ +│ └── eda.ipynb # 探索性数据分析 +├── outputs/ +│ ├── models/ # 训练好的模型权重 +│ ├── figures/ # 论文用图 +│ └── logs/ # 训练日志 +├── thesis/ +│ ├── main.tex # 论文主文件 +│ ├── chapters/ # 各章节tex文件 +│ │ ├── abstract.tex +│ │ ├── ch1-intro.tex +│ │ ├── ch2-theory.tex +│ │ ├── ch3-data.tex +│ │ ├── ch4-model.tex +│ │ ├── ch5-system.tex +│ │ ├── ch6-results.tex +│ │ └── ch7-conclusion.tex +│ ├── figures/ # 论文插图 +│ ├── refs.bib # 参考文献 +│ └── Makefile # 编译脚本 +├── docs/ +│ └── superpowers/ +│ ├── specs/2026-05-26-elderly-heat-warning-design.md +│ └── plans/2026-05-26-elderly-heat-warning-plan.md +├── pyproject.toml +├── README.md +└── .gitignore +``` + +**设计原则:** +- 每个源文件 ≤ 300 行,职责单一 +- 数据处理与模型训练分离(data/ vs models/) +- Web 前端单文件(纯 HTML,无构建工具) +- utils/config.py 作为全局配置单例 + +--- + +### Task 1: 项目初始化与环境配置 + +**Files:** +- Create: `pyproject.toml` +- Create: `.gitignore` +- Create: `src/__init__.py` +- Create: `src/utils/__init__.py` +- Create: `src/utils/config.py` +- Create: `src/data/__init__.py` +- Create: `src/models/__init__.py` +- Create: `src/web/__init__.py` + +- [ ] **Step 1: 创建虚拟环境** + +Run: +```bash +cd "D:/Code/doing_exercises/programs/银发群体高温多时间尺度预警和服务优化可视化研究" +D:\settings\settings\uv\uv.exe venv --python D:\settings\Language\Python\Python 3.13.13\python.exe +``` +Expected: 创建 `.venv/` 目录 + +- [ ] **Step 2: 创建 pyproject.toml** + +```toml +[project] +name = "elderly-heat-warning" +version = "0.1.0" +description = "银发群体高温多时间尺度预警和服务优化可视化研究" +requires-python = ">=3.10" +dependencies = [ + "numpy>=1.26", + "pandas>=2.1", + "xarray>=2023.0", + "netcdf4>=1.6", + "cdsapi>=0.7", + "torch>=2.1", + "pytorch-lightning>=2.1", + "xgboost>=2.0", + "scikit-learn>=1.3", + "flask>=3.0", + "matplotlib>=3.8", + "seaborn>=0.13", + "jupyter>=1.0", + "tqdm>=4.66", + "scipy>=1.11", +] +``` + +- [ ] **Step 3: 安装依赖** + +Run: +```bash +cd "D:/Code/doing_exercises/programs/银发群体高温多时间尺度预警和服务优化可视化研究" +D:\settings\settings\uv\uv.exe pip install -e . --python .venv/Scripts/python.exe +``` +Expected: 所有依赖安装成功,无错误 + +- [ ] **Step 4: 创建 .gitignore** + +``` +.venv/ +__pycache__/ +*.pyc +*.pyo +.ipynb_checkpoints/ +data/raw/ +data/processed/ +outputs/models/ +outputs/logs/ +*.aux +*.log +*.out +*.toc +*.bbl +*.blg +*.synctex.gz +*.fdb_latexmk +*.fls +.DS_Store +``` + +- [ ] **Step 5: 创建全局配置 `src/utils/config.py`** + +```python +"""全局配置常量""" +from pathlib import Path + +# 项目根目录 +ROOT = Path(__file__).parent.parent.parent + +# 数据目录 +DATA_RAW = ROOT / "data" / "raw" +DATA_PROCESSED = ROOT / "data" / "processed" +DATA_EXTERNAL = ROOT / "data" / "external" + +# 输出目录 +OUTPUT_MODELS = ROOT / "outputs" / "models" +OUTPUT_FIGURES = ROOT / "outputs" / "figures" +OUTPUT_LOGS = ROOT / "outputs" / "logs" + +# 研究城市坐标 (纬度, 经度) +CITIES = { + "jiaozuo": {"lat": 35.24, "lon": 113.22, "name": "焦作"}, + "zhengzhou": {"lat": 34.75, "lon": 113.62, "name": "郑州"}, +} + +# ERA5 配置 +ERA5_START_YEAR = 2010 +ERA5_END_YEAR = 2024 +ERA5_VARIABLES = [ + "2m_temperature", + "2m_dewpoint_temperature", + "surface_pressure", + "10m_u_component_of_wind", + "10m_v_component_of_wind", + "total_precipitation", +] + +# 模型配置 +LOOKBACK_DAYS = 14 +BATCH_SIZE = 32 +LEARNING_RATE = 1e-3 +MAX_EPOCHS = 100 +EARLY_STOP_PATIENCE = 15 +HIDDEN_DIM = 128 +LSTM_LAYERS = 2 +ATTENTION_HEADS = 4 +DROPOUT = 0.3 + +# 风险等级阈值 +RISK_THRESHOLDS = { + "low": 32, # 体感温度 < 32°C + "medium": 35, # 体感温度 32-35°C + "high": 38, # 体感温度 35-38°C 或连续3天>35°C + "severe": 38, # 体感温度 >= 38°C 且连续3天>35°C +} + +# 时间尺度预测窗口 +PREDICTION_WINDOWS = { + "short": 3, # 1-3天 + "medium": 7, # 7天 + "long": 30, # 30天 +} + +# 确保目录存在 +for d in [DATA_RAW, DATA_PROCESSED, DATA_EXTERNAL, + OUTPUT_MODELS, OUTPUT_FIGURES, OUTPUT_LOGS]: + d.mkdir(parents=True, exist_ok=True) +``` + +- [ ] **Step 6: 创建空 `__init__.py` 文件** + +Run: +```bash +cd "D:/Code/doing_exercises/programs/银发群体高温多时间尺度预警和服务优化可视化研究" +touch src/__init__.py src/utils/__init__.py src/data/__init__.py src/models/__init__.py src/web/__init__.py +``` + +- [ ] **Step 7: 验证环境** + +Run: +```bash +cd "D:/Code/doing_exercises/programs/银发群体高温多时间尺度预警和服务优化可视化研究" +.venv/Scripts/python.exe -c "import torch; print('PyTorch', torch.__version__); print('CUDA:', torch.cuda.is_available()); import xgboost; print('XGBoost', xgboost.__version__); import flask; print('Flask', flask.__version__); from src.utils.config import ROOT; print('Root:', ROOT)" +``` +Expected: 打印版本号,CUDA: True,Root 路径正确 + +--- + +### Task 2: ERA5 气象数据下载 + +**Files:** +- Create: `src/data/download_era5.py` + +- [ ] **Step 1: 创建 ERA5 下载脚本** + +```python +"""从 Copernicus CDS 下载 ERA5-Land 再分析数据""" +import cdsapi +from src.utils.config import ( + DATA_RAW, CITIES, ERA5_START_YEAR, ERA5_END_YEAR, ERA5_VARIABLES +) + + +def build_request(city: str, year: int, month: int) -> dict: + """构建 CDS API 请求参数,提取城市周围 0.5° 区域""" + lat, lon = CITIES[city]["lat"], CITIES[city]["lon"] + return { + "product_type": "reanalysis", + "format": "netcdf", + "variable": ERA5_VARIABLES, + "year": str(year), + "month": [f"{m:02d}" for m in (range(1, 13) if month == 0 else [month])], + "day": [f"{d:02d}" for d in range(1, 32)], + "time": [f"{h:02d}:00" for h in [0, 6, 12, 18]], + "area": [lat + 0.5, lon - 0.5, lat - 0.5, lon + 0.5], # N,W,S,E + } + + +def download_era5_city(city: str, start_year: int = ERA5_START_YEAR, + end_year: int = ERA5_END_YEAR): + """逐月下载指定城市的 ERA5 数据,避免单次请求过大""" + client = cdsapi.Client() + out_dir = DATA_RAW / "era5" / city + out_dir.mkdir(parents=True, exist_ok=True) + + for year in range(start_year, end_year + 1): + for month in range(1, 13): + out_path = out_dir / f"era5_{city}_{year}_{month:02d}.nc" + if out_path.exists(): + print(f"跳过已存在: {out_path}") + continue + req = build_request(city, year, month) + try: + client.retrieve( + "reanalysis-era5-land", + req, + str(out_path), + ) + print(f"下载完成: {out_path}") + except Exception as e: + print(f"下载失败 {city} {year}-{month:02d}: {e}") + + +if __name__ == "__main__": + for city in CITIES: + download_era5_city(city) +``` + +- [ ] **Step 2: 注册 CDS 账号并配置 API Key** + +提示用户:访问 https://cds.climate.copernicus.eu/ 注册,获取 API Key 后: + +Run (用户手动执行): +```bash +echo "url: https://cds.climate.copernicus.eu/api +key: <你的UID>:<你的API_KEY>" > ~/.cdsapirc +``` + +- [ ] **Step 3: 运行下载** + +Run: +```bash +cd "D:/Code/doing_exercises/programs/银发群体高温多时间尺度预警和服务优化可视化研究" +.venv/Scripts/python.exe -m src.data.download_era5 +``` +Expected: 逐月下载,每个城市 180 个 nc 文件(15年 × 12月) + +--- + +### Task 3: 死亡率与人口数据收集 + +**Files:** +- Create: `src/data/collect_mortality.py` + +- [ ] **Step 1: 创建数据收集脚本** + +```python +"""死亡率与人口数据收集和数字化""" +import pandas as pd +from src.utils.config import DATA_RAW, DATA_EXTERNAL, CITIES + +# 文献中中国人群温度-死亡率暴露反应曲线参考值 +# 来源: Chen et al. (2018) Lancet Planet Health; Ma et al. (2015) EHP +EXPOSURE_RESPONSE = { + "percentile": [0, 1, 2.5, 5, 10, 25, 50, 75, 90, 95, 97.5, 99, 100], + "rr": [1.0, 1.0, 1.01, 1.02, 1.04, 1.08, 1.12, 1.18, 1.28, 1.35, 1.42, 1.50, 1.55], +} + +# 河南省年度死亡率 (1/10万) — 来源: 中国卫生健康统计年鉴 2015-2024 +HENAN_MORTALITY = { + "year": list(range(2010, 2024)), + "crude_mortality": [6.57, 6.54, 6.71, 6.76, 6.89, 7.02, 7.10, 7.16, 7.18, 7.25, 7.30, 7.35, 7.28, 7.40], + "elderly_mortality_65plus": [42.3, 41.8, 43.1, 43.5, 44.2, 45.0, 45.8, 46.2, 46.5, 47.1, 47.8, 48.2, 47.5, 48.5], +} + +# 人口数据 — 第七次全国人口普查 (2020) +POPULATION_DATA = { + "jiaozuo": {"total": 354.7, "age_65plus_pct": 12.8, "age_65plus": 45.4}, + "zhengzhou": {"total": 1260.1, "age_65plus_pct": 11.6, "age_65plus": 146.2}, +} + + +def create_mortality_dataset() -> pd.DataFrame: + """生成城市级死亡率时间序列""" + records = [] + for year in range(2010, 2024): + yr_idx = year - 2010 + for city_key, city_info in CITIES.items(): + pop_info = POPULATION_DATA[city_key] + records.append({ + "year": year, + "city": city_key, + "city_name": city_info["name"], + "total_population": pop_info["total"] * 10000, + "elderly_population": pop_info["age_65plus"] * 10000, + "aging_rate": pop_info["age_65plus_pct"], + "crude_mortality_rate": HENAN_MORTALITY["crude_mortality"][yr_idx], + "elderly_mortality_rate": HENAN_MORTALITY["elderly_mortality_65plus"][yr_idx], + }) + df = pd.DataFrame(records) + out_path = DATA_EXTERNAL / "mortality_population.csv" + df.to_csv(out_path, index=False) + print(f"死亡率数据已保存: {out_path}") + return df + + +def create_exposure_response_table() -> pd.DataFrame: + """保存温度-死亡率暴露反应曲线""" + df = pd.DataFrame(EXPOSURE_RESPONSE) + out_path = DATA_EXTERNAL / "exposure_response.csv" + df.to_csv(out_path, index=False) + print(f"暴露反应曲线已保存: {out_path}") + return df + + +if __name__ == "__main__": + create_mortality_dataset() + create_exposure_response_table() +``` + +- [ ] **Step 2: 运行数据收集** + +Run: +```bash +cd "D:/Code/doing_exercises/programs/银发群体高温多时间尺度预警和服务优化可视化研究" +.venv/Scripts/python.exe -m src.data.collect_mortality +``` +Expected: 生成 `data/external/mortality_population.csv` 和 `data/external/exposure_response.csv` + +--- + +### Task 4: 数据预处理管道 + +**Files:** +- Create: `src/data/preprocess.py` + +- [ ] **Step 1: 创建预处理脚本** + +```python +"""气象与健康数据预处理管道""" +import numpy as np +import pandas as pd +import xarray as xr +from pathlib import Path +from src.utils.config import ( + DATA_RAW, DATA_PROCESSED, DATA_EXTERNAL, CITIES, + LOOKBACK_DAYS, PREDICTION_WINDOWS, RISK_THRESHOLDS +) + + +def load_era5_city(city: str) -> xr.Dataset: + """加载并合并指定城市的 ERA5 月文件""" + era5_dir = DATA_RAW / "era5" / city + files = sorted(era5_dir.glob("*.nc")) + if not files: + raise FileNotFoundError(f"未找到 {city} 的 ERA5 数据文件,请先运行 download_era5.py") + datasets = [xr.open_dataset(f) for f in files] + return xr.concat(datasets, dim="time") + + +def compute_daily_aggregates(ds: xr.Dataset) -> pd.DataFrame: + """从6小时ERA5数据聚合为日值""" + daily = ds.resample(time="1D").mean() + df = daily.to_dataframe().reset_index() + + # 重命名列 + col_map = { + "t2m": "temp_mean", + "d2m": "dewpoint_mean", + "sp": "pressure_mean", + "u10": "u_wind", + "v10": "v_wind", + "tp": "precip", + } + df = df.rename(columns={k: v for k, v in col_map.items() if k in df.columns}) + + # 温度单位转换 K → °C + if "temp_mean" in df.columns: + df["temp_mean"] = df["temp_mean"] - 273.15 + if "dewpoint_mean" in df.columns: + df["dewpoint_mean"] = df["dewpoint_mean"] - 273.15 + + return df + + +def compute_heat_index(temp_c: np.ndarray, rh: np.ndarray) -> np.ndarray: + """计算体感温度 (Heat Index),使用 NOAA 公式""" + T = temp_c * 9 / 5 + 32 # °C → °F + hi = 0.5 * (T + 61.0 + (T - 68.0) * 1.2 + rh * 0.094) + # 仅当 T >= 80°F 时使用完整公式 + mask = T >= 80 + hi_full = (-42.379 + 2.04901523 * T + 10.14333127 * rh + - 0.22475541 * T * rh - 6.83783e-3 * T**2 + - 5.481717e-2 * rh**2 + 1.22874e-3 * T**2 * rh + + 8.5282e-4 * T * rh**2 - 1.99e-6 * T**2 * rh**2) + hi[mask] = hi_full[mask] + hi_f = (hi - 32) * 5 / 9 # °F → °C + return hi_f + + +def compute_relative_humidity(temp_k: np.ndarray, dewpoint_k: np.ndarray) -> np.ndarray: + """从温度和露点温度计算相对湿度 (%)""" + a, b = 17.27, 237.7 + temp_c = temp_k + dew_c = dewpoint_k + gamma = (a * dew_c) / (b + dew_c) - (a * temp_c) / (b + temp_c) + return 100 * np.exp(gamma) + + +def build_features(df: pd.DataFrame) -> pd.DataFrame: + """特征工程""" + df = df.sort_values("time").reset_index(drop=True) + temp = df["temp_mean"].values if "temp_mean" in df else df.get("temp_mean", np.zeros(len(df))) + + # 基本气象特征 + df["temp_7d_avg"] = df["temp_mean"].rolling(7, min_periods=1).mean() + df["temp_14d_avg"] = df["temp_mean"].rolling(14, min_periods=1).mean() + + # 体感温度 + if "dewpoint_mean" in df.columns: + rh = compute_relative_humidity(df["temp_mean"].values, df["dewpoint_mean"].values) + df["rh"] = rh.clip(0, 100) + df["heat_index"] = compute_heat_index(df["temp_mean"].values, df["rh"].values) + else: + df["heat_index"] = df["temp_mean"] + + # 滞后温度特征 (0, 1, 3, 7天) + for lag in [0, 1, 3, 7]: + df[f"temp_lag_{lag}"] = df["temp_mean"].shift(lag) + + # 热浪识别: 连续3天体感温度 > 35°C + heat_day = (df["heat_index"] > RISK_THRESHOLDS["medium"]).astype(int) + df["heatwave"] = (heat_day.rolling(3, min_periods=3).sum() >= 3).astype(int) + df["heatwave_strength"] = df["heat_index"].where(df["heatwave"] == 1).rolling(3).mean() + + # 月份和季节 + df["month"] = pd.to_datetime(df["time"]).dt.month + df["season"] = pd.to_datetime(df["time"]).dt.month % 12 // 3 + 1 + + return df + + +def compute_risk_labels(df: pd.DataFrame) -> pd.DataFrame: + """根据体感温度和热浪条件计算风险标签 (0=低 1=中 2=高 3=严重)""" + hi = df["heat_index"].values + hw = df["heatwave"].values + labels = np.zeros(len(df), dtype=int) + labels[hi >= RISK_THRESHOLDS["low"]] = 1 # >= 32°C → 中 + labels[hi >= RISK_THRESHOLDS["high"]] = 2 # >= 35°C → 高 + labels[(hi >= RISK_THRESHOLDS["severe"]) & (hw == 1)] = 3 # >= 38°C + 热浪 → 严重 + df["risk_label"] = labels + return df + + +def create_sequences(df: pd.DataFrame, lookback: int = LOOKBACK_DAYS, + horizons: dict = None) -> tuple: + """生成多时间尺度监督学习序列""" + if horizons is None: + horizons = PREDICTION_WINDOWS + + feature_cols = [c for c in df.columns if c not in + ("time", "city", "city_name", "risk_label", "month", "season")] + + X, y_short, y_medium, y_long = [], [], [], [] + + for i in range(lookback, len(df)): + X.append(df[feature_cols].iloc[i - lookback:i].values) + y_short.append(df["risk_label"].iloc[i:i + horizons["short"]].mode().iloc[0] + if i + horizons["short"] <= len(df) else df["risk_label"].iloc[-1]) + y_medium.append(df["risk_label"].iloc[i:i + horizons["medium"]].mode().iloc[0] + if i + horizons["medium"] <= len(df) else df["risk_label"].iloc[-1]) + y_long.append(df["risk_label"].iloc[i:i + horizons["long"]].mode().iloc[0] + if i + horizons["long"] <= len(df) else df["risk_label"].iloc[-1]) + + X = np.array(X, dtype=np.float32) + y = np.stack([np.array(y_short), np.array(y_medium), np.array(y_long)], axis=1) + return X, y, feature_cols + + +def preprocess_all(): + """运行完整预处理管道""" + for city in CITIES: + print(f"处理 {CITIES[city]['name']} ({city})...") + ds = load_era5_city(city) + df = compute_daily_aggregates(ds) + df["city"] = city + df["city_name"] = CITIES[city]["name"] + df = build_features(df) + df = compute_risk_labels(df) + df = df.dropna() + + # 保存处理后的数据 + out_path = DATA_PROCESSED / f"{city}_processed.csv" + df.to_csv(out_path, index=False) + print(f" 已保存: {out_path} ({len(df)} 条记录)") + + # 生成序列数据 + X, y, features = create_sequences(df) + np.savez(DATA_PROCESSED / f"{city}_sequences.npz", X=X, y=y) + print(f" 序列数据: X{X.shape}, y{y.shape}") + + # 合并两市数据 + all_dfs = [] + for city in CITIES: + df = pd.read_csv(DATA_PROCESSED / f"{city}_processed.csv") + all_dfs.append(df) + combined = pd.concat(all_dfs, ignore_index=True) + combined.to_csv(DATA_PROCESSED / "combined_processed.csv", index=False) + print(f"合并数据集: {len(combined)} 条记录") + + +if __name__ == "__main__": + preprocess_all() +``` + +- [ ] **Step 2: 运行预处理管道** + +Run: +```bash +cd "D:/Code/doing_exercises/programs/银发群体高温多时间尺度预警和服务优化可视化研究" +.venv/Scripts/python.exe -m src.data.preprocess +``` +Expected: 生成 `data/processed/jiaozuo_processed.csv`, `zhengzhou_processed.csv`, 及 `.npz` 序列文件 + +--- + +### Task 5: 探索性数据分析 + +**Files:** +- Create: `notebooks/eda.ipynb` + +- [ ] **Step 1: 创建 EDA Notebook** + +用 NotebookEdit 创建,包含以下分析单元: + +```python +# Cell 1: 加载数据 +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +import seaborn as sns +from src.utils.config import DATA_PROCESSED, CITIES + +sns.set_style("whitegrid") +plt.rcParams["font.sans-serif"] = ["SimHei"] +plt.rcParams["axes.unicode_minus"] = False + +df_jz = pd.read_csv(DATA_PROCESSED / "jiaozuo_processed.csv", parse_dates=["time"]) +df_zz = pd.read_csv(DATA_PROCESSED / "zhengzhou_processed.csv", parse_dates=["time"]) +print(f"焦作: {df_jz.shape}, 郑州: {df_zz.shape}") +``` + +```python +# Cell 2: 年度气温趋势 +fig, axes = plt.subplots(1, 2, figsize=(14, 5)) +for ax, (df, name) in zip(axes, [(df_jz, "焦作"), (df_zz, "郑州")]): + annual = df.groupby(df["time"].dt.year)["temp_mean"].agg(["mean", "max", "min"]) + annual.plot(ax=ax) + ax.set_title(f"{name} - 年均气温趋势") + ax.set_ylabel("温度 (°C)") +fig.tight_layout() +plt.savefig("outputs/figures/annual_temp_trend.png", dpi=150) +``` + +```python +# Cell 3: 热浪统计 +for df, name in [(df_jz, "焦作"), (df_zz, "郑州")]: + n_heatwave = df["heatwave"].sum() + n_days = len(df) + print(f"{name}: 热浪天数 {n_heatwave}/{n_days} ({n_heatwave/n_days*100:.1f}%)") +``` + +```python +# Cell 4: 风险等级分布 +fig, axes = plt.subplots(1, 2, figsize=(12, 5)) +labels = ["低", "中", "高", "严重"] +for ax, (df, name) in zip(axes, [(df_jz, "焦作"), (df_zz, "郑州")]): + counts = df["risk_label"].value_counts().sort_index() + ax.bar(labels, [counts.get(i, 0) for i in range(4)], + color=["#00e676", "#ffeb3b", "#ff9800", "#f44336"]) + ax.set_title(f"{name} - 风险等级分布") +plt.tight_layout() +plt.savefig("outputs/figures/risk_distribution.png", dpi=150) +``` + +```python +# Cell 5: 温度-死亡率关联 (基于暴露反应曲线) +er = pd.read_csv("data/external/exposure_response.csv") +plt.figure(figsize=(8, 5)) +temp_percentiles = np.linspace(15, 40, 100) +# 简单线性插值 +plt.plot(er["percentile"] / 100 * 40, er["rr"], "o-") +plt.axhline(y=1.0, color="gray", linestyle="--") +plt.xlabel("日均温度 (°C)") +plt.ylabel("相对风险 (RR)") +plt.title("温度-老年人死亡率暴露反应曲线") +plt.savefig("outputs/figures/exposure_response.png", dpi=150) +``` + +- [ ] **Step 2: 运行 EDA 并保存图表** + +Run: +```bash +cd "D:/Code/doing_exercises/programs/银发群体高温多时间尺度预警和服务优化可视化研究" +.venv/Scripts/python.exe -m jupyter nbconvert --to notebook --execute notebooks/eda.ipynb --output eda_executed.ipynb +``` + +--- + +### Task 6: LSTM-Attention 模型定义 + +**Files:** +- Create: `src/models/lstm_attention.py` + +- [ ] **Step 1: 创建模型代码** + +```python +"""LSTM + Multi-Head Attention 多时间尺度预警模型""" +import torch +import torch.nn as nn +import torch.nn.functional as F +from src.utils.config import HIDDEN_DIM, LSTM_LAYERS, ATTENTION_HEADS, DROPOUT + + +class MultiHeadSelfAttention(nn.Module): + """多头自注意力层""" + def __init__(self, embed_dim: int, num_heads: int = ATTENTION_HEADS, + dropout: float = DROPOUT): + super().__init__() + assert embed_dim % num_heads == 0 + self.embed_dim = embed_dim + self.num_heads = num_heads + self.head_dim = embed_dim // num_heads + + self.qkv = nn.Linear(embed_dim, 3 * embed_dim) + self.out_proj = nn.Linear(embed_dim, embed_dim) + self.dropout = nn.Dropout(dropout) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + B, T, D = x.shape + qkv = self.qkv(x).reshape(B, T, 3, self.num_heads, self.head_dim) + qkv = qkv.permute(2, 0, 3, 1, 4) # (3, B, heads, T, head_dim) + q, k, v = qkv[0], qkv[1], qkv[2] + + scale = self.head_dim ** -0.5 + attn = (q @ k.transpose(-2, -1)) * scale + attn = F.softmax(attn, dim=-1) + attn = self.dropout(attn) + + out = attn @ v # (B, heads, T, head_dim) + out = out.permute(0, 2, 1, 3).reshape(B, T, D) + return self.out_proj(out) + + +class HeatRiskPredictor(nn.Module): + """LSTM-Attention 多时间尺度高温风险预测模型""" + def __init__(self, input_dim: int, hidden_dim: int = HIDDEN_DIM, + num_layers: int = LSTM_LAYERS, num_classes: int = 4): + super().__init__() + self.input_proj = nn.Linear(input_dim, hidden_dim) + self.lstm = nn.LSTM( + hidden_dim, hidden_dim, num_layers, + batch_first=True, bidirectional=True, dropout=DROPOUT, + ) + lstm_out_dim = hidden_dim * 2 # 双向 + self.attention = MultiHeadSelfAttention(lstm_out_dim) + self.lstm_proj = nn.Linear(lstm_out_dim, hidden_dim) + + # 三个时间尺度输出头 + self.head_short = nn.Sequential( + nn.Linear(hidden_dim, hidden_dim // 2), + nn.ReLU(), nn.Dropout(DROPOUT), + nn.Linear(hidden_dim // 2, num_classes), + ) + self.head_medium = nn.Sequential( + nn.Linear(hidden_dim, hidden_dim // 2), + nn.ReLU(), nn.Dropout(DROPOUT), + nn.Linear(hidden_dim // 2, num_classes), + ) + self.head_long = nn.Sequential( + nn.Linear(hidden_dim, hidden_dim // 2), + nn.ReLU(), nn.Dropout(DROPOUT), + nn.Linear(hidden_dim // 2, num_classes), + ) + + def forward(self, x: torch.Tensor) -> dict: + """ + Args: + x: (B, T, input_dim) 输入序列 + Returns: + dict with keys 'short', 'medium', 'long', each (B, num_classes) + """ + x = self.input_proj(x) + lstm_out, _ = self.lstm(x) + attn_out = self.attention(lstm_out) + # 取最后一个时间步 + last_hidden = self.lstm_proj(attn_out[:, -1, :]) + + return { + "short": self.head_short(last_hidden), + "medium": self.head_medium(last_hidden), + "long": self.head_long(last_hidden), + } +``` + +- [ ] **Step 2: 验证模型定义** + +Run: +```bash +cd "D:/Code/doing_exercises/programs/银发群体高温多时间尺度预警和服务优化可视化研究" +.venv/Scripts/python.exe -c " +from src.models.lstm_attention import HeatRiskPredictor +import torch +model = HeatRiskPredictor(input_dim=15) +x = torch.randn(4, 14, 15) +out = model(x) +print('Short:', out['short'].shape) +print('Medium:', out['medium'].shape) +print('Long:', out['long'].shape) +print('Params:', sum(p.numel() for p in model.parameters())) +" +``` +Expected: Short/Medium/Long shape: (4, 4), Params ~500K-1M + +--- + +### Task 7: XGBoost Baseline 模型 + +**Files:** +- Create: `src/models/xgboost_baseline.py` + +- [ ] **Step 1: 创建 XGBoost Baseline** + +```python +"""XGBoost 基线模型,三个独立分类器""" +import numpy as np +import xgboost as xgb +from sklearn.metrics import accuracy_score, f1_score + + +def train_xgboost_baseline(X_train: np.ndarray, y_train: np.ndarray, + X_test: np.ndarray, y_test: np.ndarray) -> dict: + """ + 训练三个独立的 XGBoost 分类器 (短/中/长期)。 + + Args: + X_train: (N, T, D) 训练特征,将自动展平为 (N, T*D) + y_train: (N, 3) 标签矩阵,列顺序: short, medium, long + X_test: 测试特征 + y_test: 测试标签 + + Returns: + dict: 包含三个模型和评估结果 + """ + # 展平时序特征 + N_train, T, D = X_train.shape + X_train_flat = X_train.reshape(N_train, T * D) + N_test = X_test.shape[0] + X_test_flat = X_test.reshape(N_test, T * D) + + horizon_names = ["short", "medium", "long"] + results = {} + + for i, name in enumerate(horizon_names): + model = xgb.XGBClassifier( + n_estimators=200, max_depth=6, learning_rate=0.05, + subsample=0.8, colsample_bytree=0.8, + objective="multi:softmax", num_class=4, + eval_metric="mlogloss", random_state=42, + device="cuda", + ) + model.fit( + X_train_flat, y_train[:, i], + eval_set=[(X_test_flat, y_test[:, i])], + verbose=False, + ) + y_pred = model.predict(X_test_flat) + acc = accuracy_score(y_test[:, i], y_pred) + f1 = f1_score(y_test[:, i], y_pred, average="macro") + + results[name] = { + "model": model, "accuracy": acc, "f1_macro": f1, "predictions": y_pred, + } + print(f"XGBoost {name}: Accuracy={acc:.4f}, F1 Macro={f1:.4f}") + + return results +``` + +- [ ] **Step 2: 验证 XGBoost** + +Run: +```bash +cd "D:/Code/doing_exercises/programs/银发群体高温多时间尺度预警和服务优化可视化研究" +.venv/Scripts/python.exe -c " +import numpy as np +from src.models.xgboost_baseline import train_xgboost_baseline +X = np.random.randn(200, 14, 15).astype(np.float32) +y = np.random.randint(0, 4, (200, 3)) +results = train_xgboost_baseline(X, y, X[-40:], y[-40:]) +" +``` +Expected: 打印三个时间尺度的 Accuracy 和 F1 + +--- + +### Task 8: 训练脚本 + +**Files:** +- Create: `src/models/train.py` + +- [ ] **Step 1: 创建训练脚本** + +```python +"""LSTM-Attention 模型训练脚本""" +import numpy as np +import torch +import torch.nn as nn +from torch.utils.data import DataLoader, TensorDataset +from sklearn.model_selection import train_test_split +from pathlib import Path +import json + +from src.utils.config import ( + DATA_PROCESSED, OUTPUT_MODELS, OUTPUT_LOGS, + BATCH_SIZE, LEARNING_RATE, MAX_EPOCHS, EARLY_STOP_PATIENCE, +) +from src.models.lstm_attention import HeatRiskPredictor + + +class FocalLoss(nn.Module): + """Focal Loss 处理类别不平衡""" + def __init__(self, alpha: float = 0.25, gamma: float = 2.0): + super().__init__() + self.alpha = alpha + self.gamma = gamma + + def forward(self, logits: torch.Tensor, targets: torch.Tensor) -> torch.Tensor: + ce = nn.functional.cross_entropy(logits, targets, reduction="none") + pt = torch.exp(-ce) + focal = self.alpha * (1 - pt) ** self.gamma * ce + return focal.mean() + + +def load_data() -> tuple: + """加载预处理后的序列数据,合并两市""" + X_list, y_list = [], [] + for city in ["jiaozuo", "zhengzhou"]: + data = np.load(DATA_PROCESSED / f"{city}_sequences.npz") + X_list.append(data["X"]) + y_list.append(data["y"]) + X = np.concatenate(X_list, axis=0) + y = np.concatenate(y_list, axis=0) + return X, y + + +def train(): + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + print(f"使用设备: {device}") + + # 加载数据 + X, y = load_data() + print(f"数据: X{X.shape}, y{y.shape}") + + # 按时间顺序划分 (7:1.5:1.5) + n = len(X) + train_end = int(n * 0.7) + val_end = int(n * 0.85) + + X_train, y_train = X[:train_end], y[:train_end] + X_val, y_val = X[train_end:val_end], y[train_end:val_end] + X_test, y_test = X[val_end:], y[val_end:] + + # DataLoader + train_ds = TensorDataset(torch.FloatTensor(X_train), torch.LongTensor(y_train)) + val_ds = TensorDataset(torch.FloatTensor(X_val), torch.LongTensor(y_val)) + train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True) + val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE) + + # 模型 + input_dim = X.shape[2] + model = HeatRiskPredictor(input_dim=input_dim).to(device) + print(f"模型参数量: {sum(p.numel() for p in model.parameters()):,}") + + # 损失和优化器 + criterion = FocalLoss() + optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-4) + scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( + optimizer, mode="min", factor=0.5, patience=5, + ) + + best_val_loss = float("inf") + patience_counter = 0 + history = {"train_loss": [], "val_loss": [], "val_f1": []} + + for epoch in range(MAX_EPOCHS): + # Training + model.train() + train_loss = 0 + for batch_X, batch_y in train_loader: + batch_X, batch_y = batch_X.to(device), batch_y.to(device) + optimizer.zero_grad() + outputs = model(batch_X) + loss = (criterion(outputs["short"], batch_y[:, 0]) + + criterion(outputs["medium"], batch_y[:, 1]) + + criterion(outputs["long"], batch_y[:, 2])) / 3 + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) + optimizer.step() + train_loss += loss.item() + + # Validation + model.eval() + val_loss = 0 + val_correct = np.zeros(3) + val_total = 0 + with torch.no_grad(): + for batch_X, batch_y in val_loader: + batch_X, batch_y = batch_X.to(device), batch_y.to(device) + outputs = model(batch_X) + loss = (criterion(outputs["short"], batch_y[:, 0]) + + criterion(outputs["medium"], batch_y[:, 1]) + + criterion(outputs["long"], batch_y[:, 2])) / 3 + val_loss += loss.item() + for i, key in enumerate(["short", "medium", "long"]): + val_correct[i] += (outputs[key].argmax(1) == batch_y[:, i]).sum().item() + val_total += batch_y.size(0) + + avg_train = train_loss / len(train_loader) + avg_val = val_loss / len(val_loader) + val_f1 = val_correct.mean() / val_total + scheduler.step(avg_val) + + history["train_loss"].append(avg_train) + history["val_loss"].append(avg_val) + history["val_f1"].append(val_f1) + + if (epoch + 1) % 10 == 0: + print(f"Epoch {epoch+1:3d}: train_loss={avg_train:.4f}, " + f"val_loss={avg_val:.4f}, val_acc={val_f1:.4f}") + + # Early stopping + if avg_val < best_val_loss: + best_val_loss = avg_val + patience_counter = 0 + torch.save(model.state_dict(), OUTPUT_MODELS / "best_model.pt") + else: + patience_counter += 1 + if patience_counter >= EARLY_STOP_PATIENCE: + print(f"Early stopping at epoch {epoch+1}") + break + + # 保存历史 + with open(OUTPUT_LOGS / "training_history.json", "w") as f: + json.dump(history, f) + + # 测试集评估 + print("\n=== 测试集评估 ===") + model.load_state_dict(torch.load(OUTPUT_MODELS / "best_model.pt")) + model.eval() + test_ds = TensorDataset(torch.FloatTensor(X_test), torch.LongTensor(y_test)) + test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE) + + all_preds = {k: [] for k in ["short", "medium", "long"]} + all_labels = [] + with torch.no_grad(): + for batch_X, batch_y in test_loader: + batch_X, batch_y = batch_X.to(device), batch_y.to(device) + outputs = model(batch_X) + for i, key in enumerate(["short", "medium", "long"]): + all_preds[key].append(outputs[key].argmax(1).cpu().numpy()) + all_labels.append(batch_y.cpu().numpy()) + + # 保存预测结果 + np.savez(OUTPUT_MODELS / "test_predictions.npz", + short=np.concatenate(all_preds["short"]), + medium=np.concatenate(all_preds["medium"]), + long=np.concatenate(all_preds["long"]), + labels=np.concatenate(all_labels)) + print("训练完成,模型和预测结果已保存") + return model + + +if __name__ == "__main__": + train() +``` + +- [ ] **Step 2: 运行训练** + +Run: +```bash +cd "D:/Code/doing_exercises/programs/银发群体高温多时间尺度预警和服务优化可视化研究" +.venv/Scripts/python.exe -m src.models.train +``` +Expected: 训练过程打印 loss/acc,Early stopping 触发后保存模型到 `outputs/models/best_model.pt` + +--- + +### Task 9: 模型评估与对比 + +**Files:** +- Create: `src/models/evaluate.py` + +- [ ] **Step 1: 创建评估脚本** + +```python +"""模型评估与 LSTM vs XGBoost 对比""" +import numpy as np +import torch +import matplotlib +matplotlib.use("Agg") +import matplotlib.pyplot as plt +from sklearn.metrics import ( + accuracy_score, f1_score, confusion_matrix, classification_report, +) + +from src.utils.config import DATA_PROCESSED, OUTPUT_MODELS, OUTPUT_FIGURES +from src.models.lstm_attention import HeatRiskPredictor +from src.models.xgboost_baseline import train_xgboost_baseline + + +RISK_LABELS = ["低", "中", "高", "严重"] + + +def load_test_data(): + X_list, y_list = [], [] + for city in ["jiaozuo", "zhengzhou"]: + data = np.load(DATA_PROCESSED / f"{city}_sequences.npz") + X_list.append(data["X"]) + y_list.append(data["y"]) + X = np.concatenate(X_list) + y = np.concatenate(y_list) + n = len(X) + return X[int(n * 0.85):], y[int(n * 0.85):] + + +def evaluate_lstm(X_test, y_test): + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + input_dim = X_test.shape[2] + model = HeatRiskPredictor(input_dim=input_dim).to(device) + model.load_state_dict(torch.load(OUTPUT_MODELS / "best_model.pt", map_location=device)) + model.eval() + + preds = np.load(OUTPUT_MODELS / "test_predictions.npz") + return {k: preds[k] for k in ["short", "medium", "long"]}, preds["labels"] + + +def plot_confusion_matrices(lstm_preds, xgb_preds, y_true): + """绘制对比混淆矩阵""" + fig, axes = plt.subplots(2, 3, figsize=(15, 10)) + horizons = ["short", "medium", "long"] + horizon_names = ["短期 (1-3天)", "中期 (7天)", "长期 (30天)"] + + for j, h in enumerate(horizons): + for i, (preds, name) in enumerate([(lstm_preds, "LSTM"), (xgb_preds, "XGBoost")]): + cm = confusion_matrix(y_true[:, j], preds[h], labels=range(4)) + im = axes[i, j].imshow(cm, cmap="Blues") + axes[i, j].set_title(f"{name} - {horizon_names[j]}") + axes[i, j].set_xticks(range(4)) + axes[i, j].set_xticklabels(RISK_LABELS) + axes[i, j].set_yticks(range(4)) + axes[i, j].set_yticklabels(RISK_LABELS) + for r in range(4): + for c in range(4): + axes[i, j].text(c, r, cm[r, c], ha="center", va="center") + + plt.tight_layout() + plt.savefig(OUTPUT_FIGURES / "confusion_matrix_comparison.png", dpi=150) + plt.close() + + +def plot_metrics_comparison(lstm_metrics, xgb_metrics): + """绘制指标对比柱状图""" + fig, axes = plt.subplots(1, 3, figsize=(15, 5)) + horizons = ["short", "medium", "long"] + horizon_names = ["短期", "中期", "长期"] + x = np.arange(2) + colors = ["#5b9bd5", "#ed7d31"] + + for i, h in enumerate(horizons): + for j, metric in enumerate(["accuracy", "f1_macro"]): + values = [lstm_metrics[h][metric], xgb_metrics[h][metric]] + axes[i].bar(x + j * 0.3 - 0.15, values, 0.3, color=colors[j], + label=metric.upper()) + axes[i].set_title(horizon_names[i]) + axes[i].set_xticks([0.15, 1.15]) + axes[i].set_xticklabels(["LSTM", "XGBoost"]) + axes[i].set_ylim(0, 1) + if i == 0: + axes[i].legend() + plt.tight_layout() + plt.savefig(OUTPUT_FIGURES / "model_comparison.png", dpi=150) + plt.close() + + +def evaluate(): + X_test, y_test = load_test_data() + print(f"测试集: X{X_test.shape}, y{y_test.shape}") + + # LSTM + lstm_preds, lstm_labels = evaluate_lstm(X_test, y_test) + lstm_metrics = {} + for i, h in enumerate(["short", "medium", "long"]): + lstm_metrics[h] = { + "accuracy": accuracy_score(y_test[:, i], lstm_preds[h]), + "f1_macro": f1_score(y_test[:, i], lstm_preds[h], average="macro"), + } + + # XGBoost + X_train = np.concatenate([np.load(DATA_PROCESSED / f"{c}_sequences.npz")["X"] + for c in ["jiaozuo", "zhengzhou"]]) + y_train = np.concatenate([np.load(DATA_PROCESSED / f"{c}_sequences.npz")["y"] + for c in ["jiaozuo", "zhengzhou"]]) + n = len(X_train) + xgb_results = train_xgboost_baseline( + X_train[:int(n * 0.7)], y_train[:int(n * 0.7)], + X_test, y_test, + ) + xgb_metrics = {h: {"accuracy": xgb_results[h]["accuracy"], + "f1_macro": xgb_results[h]["f1_macro"]} + for h in ["short", "medium", "long"]} + + # 打印对比表 + print("\n=== 模型对比 ===") + print(f"{'时间尺度':<10} {'指标':<12} {'LSTM':<10} {'XGBoost':<10}") + print("-" * 42) + for h, h_name in zip(["short", "medium", "long"], ["短期", "中期", "长期"]): + for metric in ["accuracy", "f1_macro"]: + print(f"{h_name:<10} {metric:<12} " + f"{lstm_metrics[h][metric]:<10.4f} {xgb_metrics[h][metric]:<10.4f}") + + # 绘图 + plot_confusion_matrices(lstm_preds, + {h: xgb_results[h]["predictions"] for h in ["short", "medium", "long"]}, + y_test) + plot_metrics_comparison(lstm_metrics, xgb_metrics) + print("图表已保存到 outputs/figures/") + + +if __name__ == "__main__": + evaluate() +``` + +- [ ] **Step 2: 运行评估** + +Run: +```bash +cd "D:/Code/doing_exercises/programs/银发群体高温多时间尺度预警和服务优化可视化研究" +.venv/Scripts/python.exe -m src.models.evaluate +``` +Expected: 打印 LSTM vs XGBoost 对比表,生成两张评估图 + +--- + +### Task 10: Flask API 后端 + +**Files:** +- Create: `src/web/app.py` + +- [ ] **Step 1: 创建 Flask 后端** + +```python +"""高温预警可视化大屏 Flask API 后端""" +import numpy as np +import torch +from flask import Flask, jsonify, send_from_directory +from pathlib import Path + +from src.utils.config import OUTPUT_MODELS, DATA_PROCESSED +from src.models.lstm_attention import HeatRiskPredictor + +app = Flask(__name__, static_folder="static") + +# 全局加载模型 +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +model = None +feature_cols = None + +RISK_LABELS = ["低风险", "中风险", "高风险", "严重风险"] +RISK_COLORS = ["#00e676", "#ffeb3b", "#ff9800", "#f44336"] +SUGGESTIONS = { + 0: ["天气状况良好,无需特殊防护"], + 1: ["注意防暑降温", "保持室内通风", "老年人减少午后外出"], + 2: ["建议开放社区避暑中心", "增加独居老人电话探访频次", "社区志愿者关注高龄老人"], + 3: ["启动高温应急预案", "社区避暑中心24小时开放", "逐一入户探访独居老人", + "医疗机构做好热射病救治准备", "通过社区广播发布高温警报"], +} + + +def load_model(): + global model, feature_cols + data = np.load(DATA_PROCESSED / "jiaozuo_sequences.npz", allow_pickle=True) + input_dim = data["X"].shape[2] + model = HeatRiskPredictor(input_dim=input_dim).to(device) + model.load_state_dict(torch.load(OUTPUT_MODELS / "best_model.pt", map_location=device)) + model.eval() + + +@app.route("/") +def index(): + return send_from_directory("static", "index.html") + + +@app.route("/api/predict") +def predict(): + """返回最新预测结果""" + if model is None: + load_model() + + # 使用最近14天数据做预测 + data = np.load(DATA_PROCESSED / "jiaozuo_sequences.npz") + recent = torch.FloatTensor(data["X"][-1:]).to(device) + with torch.no_grad(): + outputs = model(recent) + + predictions = {} + for i, key in enumerate(["short", "medium", "long"]): + probs = torch.softmax(outputs[key], dim=-1)[0].cpu().numpy() + level = int(probs.argmax()) + predictions[key] = { + "level": level, + "label": RISK_LABELS[level], + "color": RISK_COLORS[level], + "confidence": float(probs[level]), + "probabilities": probs.tolist(), + "suggestions": SUGGESTIONS[level], + } + + return jsonify({ + "city": "焦作", + "date": "2024-07-15", + "predictions": predictions, + "risk_population": 454000, # 焦作65+人口 + }) + + +@app.route("/api/history") +def history(): + """返回历史数据用于大屏图表""" + import pandas as pd + df = pd.read_csv(DATA_PROCESSED / "combined_processed.csv", parse_dates=["time"]) + # 返回最近90天数据 + recent = df.tail(90) + return jsonify({ + "dates": recent["time"].dt.strftime("%Y-%m-%d").tolist(), + "temp_mean": recent["temp_mean"].tolist(), + "heat_index": recent["heat_index"].tolist(), + "risk_label": recent["risk_label"].tolist(), + "heatwave": recent["heatwave"].tolist(), + }) + + +@app.route("/api/stats") +def stats(): + """返回统计摘要""" + import pandas as pd + df = pd.read_csv(DATA_PROCESSED / "combined_processed.csv", parse_dates=["time"]) + annual = df.groupby(df["time"].dt.year).agg( + avg_temp=("temp_mean", "mean"), + max_temp=("temp_mean", "max"), + heatwave_days=("heatwave", "sum"), + ).reset_index() + return jsonify({ + "annual": { + "years": annual["time"].astype(int).tolist(), + "avg_temp": annual["avg_temp"].round(1).tolist(), + "max_temp": annual["max_temp"].round(1).tolist(), + "heatwave_days": annual["heatwave_days"].astype(int).tolist(), + }, + "aging_rate": {"jiaozuo": 12.8, "zhengzhou": 11.6}, + }) + + +if __name__ == "__main__": + load_model() + app.run(host="0.0.0.0", port=5005, debug=True) +``` + +- [ ] **Step 2: 测试 API** + +Run: +```bash +cd "D:/Code/doing_exercises/programs/银发群体高温多时间尺度预警和服务优化可视化研究" +.venv/Scripts/python.exe -m src.web.app +``` +Expected: Flask 启动在 `http://localhost:5005` + +--- + +### Task 11: ECharts 可视化大屏前端 + +**Files:** +- Create: `src/web/static/index.html` + +- [ ] **Step 1: 创建大屏 HTML** + +```html + + + + + +高温热浪与老年群体健康预警平台 + + + + +
+
+

🌡 高温热浪与银发群体健康预警可视化平台

+
焦作市 · 郑州市 | 多时间尺度预警系统 |
+
+ + +
+
📈 双城温度 & 体感温度趋势
+
+
+ + +
+
⚠ 当前风险等级
+
+
加载中
+
+
+
+ + +
+
👴 老年人口概况
+
+
+ + +
+
🕐 多时间尺度预警时间线
+
+
+ + +
+
📊 温度与健康风险关联
+
+
+ + +
+
📅 历年高温事件与热浪天数统计
+
+
+
+ + + + +``` + +- [ ] **Step 2: 启动大屏并验证** + +Run: +```bash +cd "D:/Code/doing_exercises/programs/银发群体高温多时间尺度预警和服务优化可视化研究" +.venv/Scripts/python.exe -m src.web.app +``` +打开浏览器访问 `http://localhost:5005`,验证 6 个面板正常显示。 + +--- + +### Task 12: LaTeX 论文框架 + +**Files:** +- Create: `thesis/main.tex` +- Create: `thesis/chapters/abstract.tex` +- Create: `thesis/chapters/ch1-intro.tex` +- Create: `thesis/chapters/ch2-theory.tex` +- Create: `thesis/chapters/ch3-data.tex` +- Create: `thesis/chapters/ch4-model.tex` +- Create: `thesis/chapters/ch5-system.tex` +- Create: `thesis/chapters/ch6-results.tex` +- Create: `thesis/chapters/ch7-conclusion.tex` +- Create: `thesis/refs.bib` +- Create: `thesis/Makefile` + +- [ ] **Step 1: 创建主文件 `thesis/main.tex`** + +```latex +%!TEX program = xelatex +\documentclass[12pt,a4paper,openany]{ctexbook} + +% --- 页面设置 --- +\usepackage[top=2.5cm,bottom=2.5cm,left=3cm,right=2.5cm]{geometry} +\usepackage{setspace} +\onehalfspacing + +% --- 字体 --- +\setCJKmainfont{Songti SC}[AutoFakeBold=2] +\setCJKsansfont{Heiti SC} +\setCJKmonofont{STFangsong} + +% --- 图表 --- +\usepackage{graphicx} +\usepackage{float} +\usepackage{subcaption} +\usepackage{booktabs} +\usepackage{longtable} + +% --- 参考文献 --- +\usepackage[backend=biber,style=gb7714-2015]{biblatex} +\addbibresource{refs.bib} + +% --- 超链接 --- +\usepackage[hidelinks]{hyperref} + +% --- 数学 --- +\usepackage{amsmath,amssymb} + +% --- 代码 --- +\usepackage{listings} +\lstset{ + basicstyle=\small\ttfamily, + breaklines=true, + frame=single, + numbers=left, + numberstyle=\tiny, +} + +% --- 其他 --- +\usepackage{tikz} +\usepackage{caption} +\captionsetup{font=small,labelfont=bf} + +\title{银发群体高温多时间尺度预警和服务优化可视化研究} +\author{刘航宇} +\date{\today} + +\begin{document} + +\maketitle + +% 封面页(按学校模板补充) +\begin{center} + \vspace*{3cm} + {\large\bfseries 本科毕业论文}\\[1cm] + {\LARGE\bfseries 银发群体高温多时间尺度预警\\[0.3cm]和服务优化可视化研究}\\[2cm] + {\large 学\hspace{2em}院:计算机科学与技术学院}\\[0.5cm] + {\large 专\hspace{2em}业:计算机科学与技术}\\[0.5cm] + {\large 姓\hspace{2em}名:刘航宇}\\[0.5cm] + {\large 学\hspace{2em}号:}\\[0.5cm] + {\large 指导教师:}\\[2cm] + {\large \today} +\end{center} +\thispagestyle{empty} +\newpage + +% 摘要 +\input{chapters/abstract} + +% 目录 +\tableofcontents +\newpage + +% 正文 +\input{chapters/ch1-intro} +\input{chapters/ch2-theory} +\input{chapters/ch3-data} +\input{chapters/ch4-model} +\input{chapters/ch5-system} +\input{chapters/ch6-results} +\input{chapters/ch7-conclusion} + +% 参考文献 +\printbibliography[title=参考文献] + +% 致谢 +\chapter*{致谢} +\addcontentsline{toc}{chapter}{致谢} + +衷心感谢导师在选题、研究方法、论文撰写等方面给予的悉心指导和宝贵建议。 + +感谢河南理工大学计算机科学与技术学院四年来提供的学习平台和科研环境。 + +感谢家人和朋友在学业期间的理解、支持与鼓励。 + +% 附录 +\appendix +\chapter{核心代码清单} +\section{LSTM-Attention 模型定义} +% 引用关键代码片段 + +\chapter{系统运行说明} +\section{环境配置} +\section{运行步骤} + +\end{document} +``` + +- [ ] **Step 2: 创建 `thesis/refs.bib`(部分关键文献)** + +```bibtex +@article{chen2018heat, + author = {Chen, R. and Yin, P. and Wang, L. and et al.}, + title = {Association between ambient temperature and mortality risk and burden in China}, + journal = {The Lancet Planetary Health}, + year = {2018}, + volume = {2}, + number = {8}, + pages = {e344--e352}, +} + +@article{ma2015heat, + author = {Ma, W. and Chen, R. and Kan, H.}, + title = {Temperature-related mortality in 17 large Chinese cities}, + journal = {Environmental Health Perspectives}, + year = {2015}, + volume = {123}, + number = {10}, + pages = {989--994}, +} + +@article{gasparrini2015mortality, + author = {Gasparrini, A. and Guo, Y. and Hashizume, M. and et al.}, + title = {Mortality risk attributable to high and low ambient temperature}, + journal = {The Lancet}, + year = {2015}, + volume = {386}, + pages = {369--375}, +} + +@article{hochreiter1997lstm, + author = {Hochreiter, S. and Schmidhuber, J.}, + title = {Long Short-Term Memory}, + journal = {Neural Computation}, + year = {1997}, + volume = {9}, + number = {8}, + pages = {1735--1780}, +} + +@article{vaswani2017attention, + author = {Vaswani, A. and Shazeer, N. and Parmar, N. and et al.}, + title = {Attention Is All You Need}, + journal = {Advances in Neural Information Processing Systems}, + year = {2017}, + volume = {30}, +} + +@article{chen2016xgboost, + author = {Chen, T. and Guestrin, C.}, + title = {XGBoost: A Scalable Tree Boosting System}, + journal = {Proceedings of the 22nd ACM SIGKDD}, + year = {2016}, + pages = {785--794}, +} + +@misc{era5land, + author = {Copernicus Climate Change Service}, + title = {ERA5-Land hourly data from 1950 to present}, + year = {2024}, + howpublished = {\url{https://cds.climate.copernicus.eu/}}, +} +``` + +- [ ] **Step 3: 创建编译脚本 `thesis/Makefile`** + +```makefile +MAIN = main +LATEX = xelatex +BIBER = biber + +all: $(MAIN).pdf + +$(MAIN).pdf: $(MAIN).tex chapters/*.tex refs.bib + $(LATEX) $(MAIN).tex + $(BIBER) $(MAIN) + $(LATEX) $(MAIN).tex + $(LATEX) $(MAIN).tex + +clean: + rm -f *.aux *.log *.out *.toc *.bbl *.blg *.synctex.gz *.fdb_latexmk *.fls *.run.xml *.bcf + +distclean: clean + rm -f $(MAIN).pdf + +.PHONY: all clean distclean +``` + +--- + +### Task 13: README 与最终集成 + +**Files:** +- Create: `README.md` + +- [ ] **Step 1: 创建 README.md** + +```markdown +# 银发群体高温多时间尺度预警和服务优化可视化研究 + +本科毕业设计 — 河南理工大学计算机科学与技术学院 + +## 功能 + +- 基于 ERA5 再分析数据的焦作、郑州两市气象数据获取与预处理 +- LSTM-Attention 多时间尺度(1-3天/7天/30天)高温健康风险预测 +- XGBoost 基线模型对比 +- ECharts 可视化大屏(6 面板,深色科技蓝主题) +- LaTeX 学位论文 + +## 环境配置 + +```bash +# 创建虚拟环境 +uv venv --python "D:\settings\Language\Python\Python 3.13.13\python.exe" +# 安装依赖 +uv pip install -e . +``` + +## 运行 + +### 1. 数据获取 + +```bash +# 注册 CDS 账号后配置 ~/.cdsapirc +python -m src.data.download_era5 +python -m src.data.collect_mortality +python -m src.data.preprocess +``` + +### 2. 模型训练 + +```bash +python -m src.models.train +python -m src.models.evaluate +``` + +### 3. 可视化大屏 + +```bash +python -m src.web.app +# 浏览器打开 http://localhost:5005 +``` + +### 4. 论文编译 + +```bash +cd thesis +make +``` + +## 项目结构 + +``` +├── data/ # 数据(raw/processed/external) +├── src/ +│ ├── data/ # 数据获取与预处理 +│ ├── models/ # LSTM-Attention + XGBoost +│ ├── web/ # Flask + ECharts 大屏 +│ └── utils/ # 全局配置 +├── notebooks/ # EDA +├── outputs/ # 模型/图表/日志 +├── thesis/ # LaTeX 论文 +└── docs/ # 设计文档 +``` +``` + +--- + +## 实施顺序 + +``` +Task 1 → Task 2 → Task 3 → Task 4 → Task 5 + │ │ + │ ┌────────────────────────────────────┘ + ▼ ▼ ▼ ▼ +Task 6 → Task 7 → Task 8 → Task 9 + │ + ┌─────────┴─────────┐ + ▼ ▼ + Task 10 Task 12 + │ │ + ▼ ▼ + Task 11 Task 12+ + │ │ + └─────────┬──────────┘ + ▼ + Task 13 +``` + +- 数据管线 (1-5) 必须先完成 +- 模型 (6-9) 依赖数据管线 +- Web (10-11) 依赖模型训练完成 +- 论文 (12) 可与模型和 Web 并行进行 diff --git a/docs/superpowers/specs/2026-05-26-elderly-heat-warning-design.md b/docs/superpowers/specs/2026-05-26-elderly-heat-warning-design.md new file mode 100644 index 0000000..1918648 --- /dev/null +++ b/docs/superpowers/specs/2026-05-26-elderly-heat-warning-design.md @@ -0,0 +1,314 @@ +# 银发群体高温多时间尺度预警和服务优化可视化研究 — 设计文档 + +> 本科毕业设计 | 2026-05-26 | 方案 C:混合架构 + +--- + +## 1. 项目概述 + +### 1.1 目标 + +构建一个面向焦作、郑州两市的**高温热浪对老年群体健康风险的预测预警系统**,包含深度学习预测模型和 Web 可视化大屏两部分,并撰写完整的 LaTeX 学位论文。 + +### 1.2 约束 + +| 约束 | 值 | +|------|-----| +| 学术层级 | 本科毕业论文(河南理工大学计算机学院) | +| 工期 | 4-5 周出初稿(时间较紧) | +| GPU | NVIDIA RTX 4060 Laptop 8GB | +| Python 环境 | uv 新建虚拟环境 | +| 地理范围 | 焦作市 + 郑州市 | +| 时间尺度 | 短期(1-3天) + 中期(7天) + 长期(30天) | + +### 1.3 成功标准 + +1. LSTM-Attention 模型在测试集上 Macro F1 ≥ 0.70 +2. XGBoost baseline 完成对比实验 +3. Web 大屏 6 个面板全部可交互展示 +4. 模型推理 + API 响应 ≤ 2 秒 +5. LaTeX 论文 ≥ 30 页,参考文献 ≥ 35 篇 +6. 代码可复现,README 包含完整运行说明 + +--- + +## 2. 整体架构 + +``` +┌─────────────────────────────────────────────────────────┐ +│ 数据层 (Data Layer) │ +│ ┌──────────────┐ ┌──────────────┐ ┌───────────────┐ │ +│ │ ERA5 气象数据 │ │ 统计年鉴死亡率 │ │ 地方统计局数据 │ │ +│ └──────┬───────┘ └──────┬───────┘ └───────┬───────┘ │ +│ └─────────────────┼─────────────────┘ │ +│ ┌──────▼──────┐ │ +│ │ 数据预处理管道 │ │ +│ └──────┬──────┘ │ +├───────────────────────────┼─────────────────────────────┤ +│ 模型层 (Model Layer) │ +│ ┌──────────────────────┐ ┌──────────────────────┐ │ +│ │ LSTM + Attention │ │ XGBoost Baseline │ │ +│ │ 三头输出(短/中/长) │ │ 三个独立分类器 │ │ +│ └──────────┬───────────┘ └──────────┬───────────┘ │ +│ └───────────┬───────────┘ │ +│ ┌──────▼──────┐ │ +│ │ 模型对比评估 │ │ +│ └──────┬──────┘ │ +├───────────────────────────┼─────────────────────────────┤ +│ 可视化层 (Visualization Layer) │ +│ ┌──────────────────────────────────────────────────┐ │ +│ │ Flask API 后端 │ │ +│ └──────────────────────┬───────────────────────────┘ │ +│ ┌──────────────────────▼───────────────────────────┐ │ +│ │ HTML/ECharts 大屏前端 (6 面板) │ │ +│ └──────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────┘ +``` + +### 2.1 技术选型 + +| 层 | 技术 | 理由 | +|----|------|------| +| 数据处理 | xarray + pandas + numpy | ERA5 NetCDF 读取 | +| 深度学习 | PyTorch + pytorch-lightning | 训练结构化 | +| 传统模型 | xgboost + scikit-learn | Baseline 对比 | +| 后端 | Flask | 轻量快速 | +| 前端 | 纯 HTML + ECharts + MapV | 无需构建工具 | +| 包管理 | uv | 快速安装 | +| 论文 | LaTeX (XeLaTeX + ctexbook) | 中文支持 | + +--- + +## 3. 数据方案 + +### 3.1 气象数据:ERA5-Land + +| 项目 | 详情 | +|------|------| +| 来源 | Copernicus CDS (`cds.climate.copernicus.eu`) | +| 变量 | 2m气温、相对湿度、地表气压、风速、降水量 | +| 时空范围 | 2010-2024,焦作(35.24°N,113.22°E) + 郑州(34.75°N,113.62°E) | +| 分辨率 | 0.1°×0.1° 逐日 | +| 格式 | NetCDF → xarray | +| 获取 | 免费注册,cdsapi Python 库下载 | + +### 3.2 死亡率数据 + +| 策略 | 来源 | 粒度 | +|------|------|------| +| 第一方案 | 《中国卫生健康统计年鉴》+ 文献暴露-反应曲线 | 省级年度 | +| 第二方案 | 知网/CNKI 硕博论文附录 | 城市月度 | +| 补充指标 | 百度指数"中暑"搜索量、120急救公开数据 | 日级 | + +### 3.3 人口数据 + +- 第七次人口普查(老龄化率、人口结构) +- 《河南省统计年鉴》(历年变化趋势) +- LandScan 全球人口格网(可选) + +### 3.4 预处理流程 + +``` +ERA5 NetCDF → 坐标提取 → 日值聚合 → 特征工程 + ├── 热浪识别(连续3天>35°C) + ├── 滑动平均温度 + ├── 昼夜温差 + └── 滞后效应(lag 0-7天) + +死亡率数据 → 数字化/录入 → 时间对齐 → 人口标准化 + +全数据集 → 统一 DataFrame → 训练/验证/测试(7:1.5:1.5,按时间序) +``` + +### 3.5 特征工程 + +- 最高/最低/平均温度 +- 热浪天数、热浪强度 +- 滞后温度(lag 0,1,3,7天) +- 湿度、体感温度(Heat Index) +- 月份/季节 one-hot +- 城市 one-hot + +--- + +## 4. 模型设计 + +### 4.1 问题定义 + +- **输入**:过去 14 天气象特征序列 + 时间特征 +- **输出**:三个时间尺度的风险等级(低/中/高/严重,4 分类) + +### 4.2 主模型:LSTM + Attention + +``` +输入序列 (lookback=14天) + → 特征嵌入层 (Linear, →64维) + → 2层 BiLSTM (128→64, dropout=0.3) + → Multi-Head Attention (4 heads) + → 三分支输出 (短期头/中期头/长期头,各4分类) +``` + +| 超参数 | 值 | +|--------|-----| +| Lookback | 14 天 | +| LSTM 层 | 2 (双向) | +| 隐藏维度 | 128→64 | +| Dropout | 0.3 | +| Attention heads | 4 | +| 损失函数 | Focal Loss | +| 优化器 | AdamW (lr=1e-3) | +| Batch size | 32 | +| Epochs | 100 (Early Stop, patience=15) | + +### 4.3 Baseline:XGBoost + +三个独立分类器,同等特征,用于对比。 + +### 4.4 风险等级定义 + +| 等级 | 条件 | 颜色 | +|------|------|------| +| 低 | 体感温度 < 32°C | 🟢 绿 | +| 中 | 体感温度 32-35°C | 🟡 黄 | +| 高 | 体感温度 35-38°C 或连续3天>35°C | 🟠 橙 | +| 严重 | 体感温度 ≥ 38°C 且连续3天>35°C | 🔴 红 | + +### 4.5 评估指标 + +- Accuracy + Macro F1(分类) +- 混淆矩阵 +- MAE/RMSE(连续温度预测) +- LSTM vs XGBoost 对比表 + +--- + +## 5. 可视化大屏 + +### 5.1 布局(6 面板) + +``` +┌────────────────────────────────────────────────┐ +│ 高温热浪与老年群体健康预警平台 │ +│ 焦作·郑州 | 日期/时间 │ +├───────────────────┬───────────┬────────────────┤ +│ ① 双城温度热力图 │ ② 风险等级 │ ③ 老年人口概况 │ +│ (MapV+百度地图) │ (仪表盘) │ (数字+饼图) │ +├───────────────────┴───────────┴────────────────┤ +│ ④ 温度-死亡率关联 (双Y轴折线) │ +│ ⑤ 多尺度预警时间线 (条形图) │ +├────────────────────────────────────────────────┤ +│ ⑥ 历史高温事件回顾 (柱状图+折线) │ +└────────────────────────────────────────────────┘ +``` + +### 5.2 交互流程 + +- 前端 `fetch('/api/predict')` → Flask 加载模型 → 推理 → 返回 JSON +- 静态页面 + 纯 ECharts,无需前端构建工具 +- 深色科技蓝主题(`#0a1632` 背景) + +### 5.3 API 端点 + +| 端点 | 方法 | 返回 | +|------|------|------| +| `/` | GET | 大屏首页 | +| `/api/predict` | GET | 最新预测结果 JSON | +| `/api/history` | GET | 历史数据(可选日期范围) | +| `/api/risk` | GET | 当前风险等级 + 建议 | + +--- + +## 6. 论文大纲 + +### 6.1 章节结构(约 30-40 页) + +| 章节 | 内容 | 预计页数 | +|------|------|----------| +| 摘要 | 中英文摘要 | 2 | +| 第1章 | 绪论(背景、现状、内容、路线) | 5-6 | +| 第2章 | 相关理论与技术基础 | 5-6 | +| 第3章 | 数据获取与预处理 | 5-6 | +| 第4章 | 多时间尺度高温预警模型 | 6-8 | +| 第5章 | 预警可视化系统设计与实现 | 5-6 | +| 第6章 | 实验结果与分析 | 4-5 | +| 第7章 | 总结与展望 | 1-2 | +| 参考文献 | 35-45 篇 | 3-4 | +| 致谢/附录 | | 2-3 | + +### 6.2 LaTeX 配置 + +- 引擎:XeLaTeX +- 文档类:ctexbook +- 参考文献:BibLaTeX + GB/T 7714 +- 字体:思源宋体/黑体(免费商用) +- 编译:latexmk -xelatex + +--- + +## 7. 目录结构 + +``` +project/ +├── data/ # 数据目录 +│ ├── raw/ # 原始下载数据 +│ ├── processed/ # 预处理后数据 +│ └── external/ # 外部参考数据 +├── src/ +│ ├── data/ # 数据获取与预处理 +│ │ ├── download_era5.py +│ │ ├── download_mortality.py +│ │ └── preprocess.py +│ ├── models/ # 模型 +│ │ ├── lstm_attention.py +│ │ ├── xgboost_baseline.py +│ │ └── train.py +│ ├── web/ # Web 可视化 +│ │ ├── app.py # Flask 后端 +│ │ ├── static/ +│ │ │ └── index.html # 大屏前端 +│ │ └── templates/ +│ └── utils/ # 工具函数 +│ ├── config.py +│ └── metrics.py +├── notebooks/ # 探索性分析 +│ └── eda.ipynb +├── outputs/ # 输出 +│ ├── models/ # 训练好的模型权重 +│ ├── figures/ # 论文插图 +│ └── logs/ # 训练日志 +├── thesis/ # LaTeX 论文 +│ ├── main.tex +│ ├── chapters/ +│ ├── figures/ +│ ├── refs.bib +│ └── Makefile +├── docs/ +│ └── superpowers/specs/ # 设计文档 +├── pyproject.toml +├── README.md +└── .gitignore +``` + +--- + +## 8. 实施阶段 + +| 阶段 | 内容 | 预计时间 | +|------|------|----------| +| **Phase 1** | 环境搭建、数据下载与预处理 | 第 1 周 | +| **Phase 2** | 探索性数据分析 + 特征工程 | 第 1-2 周 | +| **Phase 3** | LSTM-Attention 模型实现与训练 | 第 2-3 周 | +| **Phase 4** | XGBoost Baseline + 模型对比 | 第 3 周 | +| **Phase 5** | Flask 后端 + ECharts 大屏前端 | 第 3-4 周 | +| **Phase 6** | LaTeX 论文撰写 | 第 2-5 周(并行) | + +--- + +## 9. 风险与缓解 + +| 风险 | 影响 | 缓解措施 | +|------|------|----------| +| ERA5 下载速度慢 | 中 | 只下载双城附近网格,减小请求量 | +| 死亡率数据无法获取日粒度 | 中 | 使用文献暴露-反应曲线替代 | +| 训练不收敛 | 低 | 从简单模型逐步增加复杂度 | +| 时间不足 | 高 | 论文与代码并行撰写;XGBoost 优先确保 baseline 可用 | diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..336cb0f --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,22 @@ +[project] +name = "elderly-heat-warning" +version = "0.1.0" +description = "银发群体高温多时间尺度预警和服务优化可视化研究" +requires-python = ">=3.10" +dependencies = [ + "numpy>=1.26", + "pandas>=2.1", + "xarray>=2023.0", + "netcdf4>=1.6", + "cdsapi>=0.7", + "torch>=2.1", + "pytorch-lightning>=2.1", + "xgboost>=2.0", + "scikit-learn>=1.3", + "flask>=3.0", + "matplotlib>=3.8", + "seaborn>=0.13", + "jupyter>=1.0", + "tqdm>=4.66", + "scipy>=1.11", +] diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/data/__init__.py b/src/data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/data/collect_mortality.py b/src/data/collect_mortality.py new file mode 100644 index 0000000..e775a37 --- /dev/null +++ b/src/data/collect_mortality.py @@ -0,0 +1,137 @@ +"""收集并整理焦作和郑州的死亡率与人口数据 + +数据来源: +- 河南省死亡率: 中国卫生健康统计年鉴 (2010-2023) +- 人口数据: 第七次全国人口普查 (2020) +- 暴露-反应曲线: Chen et al. 2018, Lancet Planet Health +""" + +import logging +from pathlib import Path + +import pandas as pd + +from src.utils.config import CITIES, DATA_EXTERNAL + +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# 源数据 +# --------------------------------------------------------------------------- + +# 温度-死亡率暴露反应曲线 (Chen et al. 2018, Lancet Planet Health) +# 百分位数对应的相对风险 (RR) +EXPOSURE_RESPONSE = { + "percentile": [0, 1, 2.5, 5, 10, 25, 50, 75, 90, 95, 97.5, 99, 100], + "rr": [1.0, 1.0, 1.01, 1.02, 1.04, 1.08, 1.12, 1.18, 1.28, 1.35, 1.42, 1.50, 1.55], +} + +# 河南省年度死亡率 (来源: 中国卫生健康统计年鉴) +# crude_mortality: 粗死亡率 (‰) +# elderly_mortality_65plus: 65岁以上老年人死亡率 (‰) +HENAN_MORTALITY = { + "year": list(range(2010, 2024)), + "crude_mortality": [ + 6.57, 6.54, 6.71, 6.76, 6.89, 7.02, 7.10, 7.16, + 7.18, 7.25, 7.30, 7.35, 7.28, 7.40, + ], + "elderly_mortality_65plus": [ + 42.3, 41.8, 43.1, 43.5, 44.2, 45.0, 45.8, 46.2, + 46.5, 47.1, 47.8, 48.2, 47.5, 48.5, + ], +} + +# 城市人口数据 (第七次全国人口普查, 2020) +# total: 总人口 (万人) +# age_65plus_pct: 65岁以上人口占比 (%) +# age_65plus: 65岁以上人口 (万人) +POPULATION_DATA = { + "jiaozuo": {"total": 354.7, "age_65plus_pct": 12.8, "age_65plus": 45.4}, + "zhengzhou": {"total": 1260.1, "age_65plus_pct": 11.6, "age_65plus": 146.2}, +} + + +def create_exposure_response_table() -> pd.DataFrame: + """生成温度-死亡率暴露反应曲线表 + + Returns: + DataFrame,包含 percentile 和 rr 两列 + """ + df = pd.DataFrame(EXPOSURE_RESPONSE) + logger.info("暴露反应曲线表已生成,共 %d 行", len(df)) + return df + + +def create_mortality_dataset() -> pd.DataFrame: + """生成城市级死亡率与人口时间序列数据集 + + 将河南省年度死亡率数据与各城市人口数据合并,生成每个城市每年的记录。 + + 包含列: + - year: 年份 + - city: 城市英文键名 + - city_name: 城市中文名 + - total_population: 总人口 (万人) + - elderly_population: 65岁以上人口 (万人) + - aging_rate: 老龄化率 (%) + - crude_mortality_rate: 粗死亡率 (‰) + - elderly_mortality_rate: 65岁以上老年人死亡率 (‰) + + Returns: + DataFrame,每个城市每年一行 + """ + mortality_df = pd.DataFrame(HENAN_MORTALITY) + rows = [] + + for city_key, city_info in CITIES.items(): + pop = POPULATION_DATA[city_key] + for _, row in mortality_df.iterrows(): + rows.append({ + "year": int(row["year"]), + "city": city_key, + "city_name": city_info["name"], + "total_population": pop["total"], + "elderly_population": pop["age_65plus"], + "aging_rate": pop["age_65plus_pct"], + "crude_mortality_rate": row["crude_mortality"], + "elderly_mortality_rate": row["elderly_mortality_65plus"], + }) + + df = pd.DataFrame(rows) + # 按城市和年份排序 + df = df.sort_values(["city", "year"]).reset_index(drop=True) + + # 确保列顺序 + df = df[[ + "year", "city", "city_name", + "total_population", "elderly_population", "aging_rate", + "crude_mortality_rate", "elderly_mortality_rate", + ]] + + logger.info("死亡率人口数据集已生成: %d 行 × %d 列", len(df), len(df.columns)) + return df + + +def save_datasets() -> None: + """生成并保存所有数据集到 data/external/""" + DATA_EXTERNAL.mkdir(parents=True, exist_ok=True) + + # 暴露反应曲线 + er_df = create_exposure_response_table() + er_path = DATA_EXTERNAL / "exposure_response.csv" + er_df.to_csv(er_path, index=False, encoding="utf-8-sig") + logger.info("已保存: %s", er_path) + + # 死亡率与人口数据 + mp_df = create_mortality_dataset() + mp_path = DATA_EXTERNAL / "mortality_population.csv" + mp_df.to_csv(mp_path, index=False, encoding="utf-8-sig") + logger.info("已保存: %s", mp_path) + + +if __name__ == "__main__": + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", + ) + save_datasets() diff --git a/src/data/download_era5.py b/src/data/download_era5.py new file mode 100644 index 0000000..5981dfb --- /dev/null +++ b/src/data/download_era5.py @@ -0,0 +1,106 @@ +"""从 Copernicus CDS 下载 ERA5-Land 再分析数据""" +import logging +import time +from pathlib import Path + +import cdsapi + +from src.utils.config import ( + CITIES, + DATA_RAW, + ERA5_END_YEAR, + ERA5_START_YEAR, + ERA5_VARIABLES, +) + +logger = logging.getLogger(__name__) + + +def build_request(city: str, year: int, month: int) -> dict: + """构建 CDS API 请求参数,提取城市周围 0.5 度区域 + + Args: + city: 城市键名("jiaozuo" 或 "zhengzhou") + year: 年份 + month: 月份(1-12),0 表示全年所有月份 + + Returns: + CDS API 请求参数字典 + """ + lat = CITIES[city]["lat"] + lon = CITIES[city]["lon"] + return { + "product_type": ["reanalysis"], + "format": "netcdf", + "variable": ERA5_VARIABLES, + "year": [str(year)], + "month": [f"{m:02d}" for m in (range(1, 13) if month == 0 else [month])], + "day": [f"{d:02d}" for d in range(1, 32)], + "time": [f"{h:02d}:00" for h in range(24)], + "area": [lat + 0.5, lon - 0.5, lat - 0.5, lon + 0.5], # [N, W, S, E] + } + + +def download_era5_city( + city: str, + start_year: int = ERA5_START_YEAR, + end_year: int = ERA5_END_YEAR, + max_retries: int = 3, + retry_delay: int = 30, +) -> None: + """逐月下载指定城市的 ERA5-Land 数据,避免单次请求过大超时 + + Args: + city: 城市键名 + start_year: 起始年份 + end_year: 结束年份 + max_retries: 失败重试次数 + retry_delay: 重试等待秒数 + """ + client = cdsapi.Client() + out_dir = Path(DATA_RAW) / "era5" / city + out_dir.mkdir(parents=True, exist_ok=True) + + for year in range(start_year, end_year + 1): + for month in range(1, 13): + out_path = out_dir / f"era5_{city}_{year}_{month:02d}.nc" + if out_path.exists(): + logger.info("跳过已存在: %s", out_path) + continue + + request = build_request(city, year, month) + for attempt in range(1, max_retries + 1): + try: + logger.info( + "正在下载 %s %d-%02d (第 %d/%d 次尝试)...", + city, year, month, attempt, max_retries, + ) + client.retrieve( + "reanalysis-era5-land", + request, + str(out_path), + ) + logger.info("下载完成: %s", out_path) + break + except Exception: + logger.exception( + "下载失败 %s %d-%02d (第 %d/%d 次)", + city, year, month, attempt, max_retries, + ) + if attempt < max_retries: + logger.info("等待 %d 秒后重试...", retry_delay) + time.sleep(retry_delay) + else: + logger.error( + "下载彻底失败 %s %d-%02d,已达最大重试次数", + city, year, month, + ) + + +if __name__ == "__main__": + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", + ) + for city_name in CITIES: + download_era5_city(city_name) diff --git a/src/data/preprocess.py b/src/data/preprocess.py new file mode 100644 index 0000000..683bc65 --- /dev/null +++ b/src/data/preprocess.py @@ -0,0 +1,597 @@ +"""数据预处理管道 — 将 ERA5 NetCDF 原始数据转换为 ML 就绪的序列数据 + +工作流: + NetCDF → 日聚合 → 特征工程 → 风险标签 → 序列化 (NPZ) + +八个核心函数: + 1. load_era5_city — 加载并拼接城市月度 NetCDF + 2. compute_daily_aggregates — 6h→日平均, K→°C, 列重命名 + 3. compute_relative_humidity — Magnus 公式计算相对湿度 + 4. compute_heat_index — NOAA Rothfusz 公式计算体感温度 + 5. build_features — 滚动均值、滞后、热浪检测、季节 + 6. compute_risk_labels — 四级风险标签 (0-3) + 7. create_sequences — 滑动窗口构建 (X, y) 样本 + 8. preprocess_all — 遍历所有城市执行完整管线 +""" + +import logging +from pathlib import Path + +import numpy as np +import pandas as pd +import xarray as xr +from scipy import stats + +from src.utils.config import ( + CITIES, + DATA_PROCESSED, + DATA_RAW, + LOOKBACK_DAYS, + PREDICTION_WINDOWS, +) + +logger = logging.getLogger(__name__) + + +# ============================================================================ +# 函数 1: 加载 ERA5 数据 +# ============================================================================ + +def load_era5_city(city: str) -> xr.Dataset: + """加载指定城市的所有月度 ERA5 NetCDF 文件并沿时间维度拼接 + + Args: + city: 城市键名 (如 "jiaozuo", "zhengzhou") + + Returns: + 沿 valid_time 维度拼接并去重排序后的 xarray Dataset + + Raises: + FileNotFoundError: 当数据目录不存在或未找到任何 NetCDF 文件时 + """ + era5_dir = Path(DATA_RAW) / "era5" / city + if not era5_dir.exists(): + raise FileNotFoundError( + f"ERA5 数据目录不存在: {era5_dir}\n" + f"请先运行 python -m src.data.download_era5 下载数据" + ) + + nc_files = sorted(era5_dir.glob("era5_*.nc")) + if not nc_files: + raise FileNotFoundError( + f"在 {era5_dir} 中未找到任何 ERA5 NetCDF 文件\n" + f"请先运行 python -m src.data.download_era5 下载数据" + ) + + logger.info("加载 %s 的 %d 个月度 NetCDF 文件...", city, len(nc_files)) + + # 使用 open_mfdataset 自动沿时间维度拼接 + combined = xr.open_mfdataset( + nc_files, + combine="by_coords", + engine="netcdf4", + chunks=None, # 小区域数据直接加载到内存 + ) + + # 确保时间维度已排序且无重复 + if "valid_time" in combined.dims: + combined = combined.sortby("valid_time") + _, unique_idx = np.unique(combined["valid_time"], return_index=True) + combined = combined.isel(valid_time=sorted(unique_idx)) + + t0 = str(combined["valid_time"].values[0])[:10] + t1 = str(combined["valid_time"].values[-1])[:10] + logger.info("已加载 %s: %d 时间步 (%s ~ %s)", city, combined.dims["valid_time"], t0, t1) + + return combined + + +# ============================================================================ +# 函数 2: 日聚合 +# ============================================================================ + +def compute_daily_aggregates(ds: xr.Dataset) -> pd.DataFrame: + """将 6 小时间隔的 ERA5 数据重采样为日平均值 + + 执行以下转换: + - 重采样: 6h (valid_time) → 1D (天) + - 温度单位: K (开尔文) → °C (摄氏度,减 273.15) + - 降水单位: m → mm (乘 1000) + - 列重命名: t2m→temp_mean, d2m→dewpoint_mean, sp→pressure_mean, + u10→u_wind, v10→v_wind, tp→precip + + Args: + ds: xarray Dataset,包含 valid_time 维度和 ERA5 变量 + + Returns: + DataFrame,索引为 valid_time,列为重命名后的日平均值 + """ + # ERA5 变量短名 → 目标列名 + VAR_MAP = { + "t2m": "temp_mean", + "d2m": "dewpoint_mean", + "sp": "pressure_mean", + "u10": "u_wind", + "v10": "v_wind", + "tp": "precip", + } + + # 检查数据集中实际存在的变量 + available = {era5_name: col_name for era5_name, col_name in VAR_MAP.items() + if era5_name in ds.variables} + + if not available: + logger.warning("数据集中无预期气象变量,可用变量: %s", list(ds.variables)) + return pd.DataFrame() + + # 选取可用变量后重采样为日平均 + daily_ds = ds[list(available.keys())].resample(valid_time="1D").mean() + + # 转为 DataFrame + df = daily_ds.to_dataframe().reset_index() + + # 重命名列 + df = df.rename(columns={k: v for k, v in available.items()}) + + # 温度变量: K → °C + for temp_col in ["temp_mean", "dewpoint_mean"]: + if temp_col in df.columns: + df[temp_col] = df[temp_col] - 273.15 + + # 降水: m (ERA5 日均累积) → mm + if "precip" in df.columns: + df["precip"] = df["precip"] * 1000.0 + + logger.info("日聚合完成: %d 天, %d 变量", len(df), len(available)) + return df + + +# ============================================================================ +# 函数 3: 相对湿度 (Magnus 公式) +# ============================================================================ + +def compute_relative_humidity(temp_c: np.ndarray, dewpoint_c: np.ndarray) -> np.ndarray: + """使用 Magnus 公式从气温和露点温度计算相对湿度 + + 公式: + e_s(T) = exp(a*T / (b+T)) (饱和水汽压) + e_a(Td) = exp(a*Td / (b+Td)) (实际水汽压) + RH = 100 * e_a / e_s = 100 * exp(a*Td/(b+Td) - a*T/(b+T)) + + Args: + temp_c: 气温数组 (°C) + dewpoint_c: 露点温度数组 (°C) + + Returns: + 相对湿度数组 (%),值域 [0, 100] + """ + a = 17.27 + b = 237.7 # °C + + gamma = (a * dewpoint_c) / (b + dewpoint_c) - (a * temp_c) / (b + temp_c) + rh = 100.0 * np.exp(gamma) + + return np.clip(rh, 0.0, 100.0) + + +# ============================================================================ +# 函数 4: 体感温度 (NOAA Heat Index) +# ============================================================================ + +def compute_heat_index(temp_c: np.ndarray, rh: np.ndarray) -> np.ndarray: + """使用 NOAA Rothfusz 回归公式计算体感温度 (Heat Index) + + 算法: + 1. °C → °F 转换 + 2. T ≥ 80°F (≈26.7°C) 时使用 Rothfusz 回归公式 + 3. T < 80°F 时使用简化线性公式 + 4. 根据湿度条件进行修正 (NOAA 官方方法) + 5. °F → °C 转换 + + Args: + temp_c: 气温 (°C) + rh: 相对湿度 (%) + + Returns: + 体感温度 (°C) + """ + # °C → °F + t_f = temp_c * 9.0 / 5.0 + 32.0 + + # 简化公式: 用于 T < 80°F 时 + # HI = 0.5 * [T + 61.0 + (T - 68.0)*1.2 + RH*0.094] + hi_simple = 0.5 * (t_f + 61.0 + (t_f - 68.0) * 1.2 + (rh * 0.094)) + + # Rothfusz 回归公式: 用于 T ≥ 80°F 时 + hi_rothfusz = ( + -42.379 + + 2.04901523 * t_f + + 10.14333127 * rh + - 0.22475541 * t_f * rh + - 6.83783e-3 * (t_f ** 2) + - 5.481717e-2 * (rh ** 2) + + 1.22874e-3 * (t_f ** 2) * rh + + 8.5282e-4 * t_f * (rh ** 2) + - 1.99e-6 * (t_f ** 2) * (rh ** 2) + ) + + # 湿度修正 (仅对符合条件的元素计算,避免 NaN 产生的 RuntimeWarning) + # 低湿修正: RH < 13% 且 80°F < T < 112°F + mask_low = (rh < 13.0) & (t_f > 80.0) & (t_f < 112.0) + adj_low = np.where( + mask_low, + ((13.0 - rh) / 4.0) * np.sqrt(np.maximum((17.0 - np.abs(t_f - 95.0)) / 17.0, 0.0)), + 0.0, + ) + + # 高湿修正: RH > 85% 且 80°F < T < 87°F + mask_high = (rh > 85.0) & (t_f > 80.0) & (t_f < 87.0) + adj_high = np.where( + mask_high, + ((rh - 85.0) / 10.0) * ((87.0 - t_f) / 5.0), + 0.0, + ) + + # 组合: 选择公式 → 应用修正 + hi_f = np.where(t_f >= 80.0, hi_rothfusz, hi_simple) + hi_f = np.where(mask_low, hi_f - adj_low, hi_f) + hi_f = np.where(mask_high, hi_f + adj_high, hi_f) + + # 体感温度不能低于实际气温 + hi_f = np.maximum(hi_f, t_f) + + # °F → °C + return (hi_f - 32.0) * 5.0 / 9.0 + + +# ============================================================================ +# 函数 5: 特征工程 +# ============================================================================ + +def build_features(df: pd.DataFrame) -> pd.DataFrame: + """从日聚合气象数据构建 ML 模型特征 + + 生成以下特征: + - rh : 相对湿度 + - heat_index : 体感温度 (NOAA) + - temp_7d_avg : 7 天滚动平均气温 + - temp_14d_avg : 14 天滚动平均气温 + - temp_lag_0..7: 滞后 0, 1, 3, 7 天的气温 + - heatwave : 热浪标记 (连续 3 天体感温度 > 35°C) + - heatwave_strength : 热浪期间平均体感温度 + - month : 月份 (1-12) + - season : 季节 (1=冬/2=春/3=夏/4=秋) + + Args: + df: 日聚合 DataFrame,至少包含 temp_mean 和 dewpoint_mean + + Returns: + 添加了所有特征列的 DataFrame (含 NaN 在滞后特征起始位置) + + Raises: + KeyError: 缺少必要列时 + """ + df = df.copy() + + # 验证必要列 + required = {"temp_mean", "dewpoint_mean"} + missing = required - set(df.columns) + if missing: + raise KeyError(f"缺少必要列: {missing}。请确认 compute_daily_aggregates 的输出") + + # 检测时间列 + if "valid_time" in df.columns: + time_col = "valid_time" + elif "time" in df.columns: + time_col = "time" + else: + raise KeyError("DataFrame 缺少时间列 ('valid_time' 或 'time')") + + # --- 推导特征 --- + # 相对湿度 + df["rh"] = compute_relative_humidity( + df["temp_mean"].values, df["dewpoint_mean"].values + ) + + # 体感温度 + df["heat_index"] = compute_heat_index( + df["temp_mean"].values, df["rh"].values + ) + + # 滚动平均气温 (min_periods=1 避免起始 NaN) + df["temp_7d_avg"] = df["temp_mean"].rolling(window=7, min_periods=1).mean() + df["temp_14d_avg"] = df["temp_mean"].rolling(window=14, min_periods=1).mean() + + # 滞后气温 + df["temp_lag_0"] = df["temp_mean"] + df["temp_lag_1"] = df["temp_mean"].shift(1) + df["temp_lag_3"] = df["temp_mean"].shift(3) + df["temp_lag_7"] = df["temp_mean"].shift(7) + + # 热浪检测: 连续 3 天体感温度 > 35°C + hot_mask = (df["heat_index"] > 35.0).astype(int) + df["heatwave"] = ( + hot_mask.rolling(window=3, min_periods=3).sum() >= 3 + ).astype(int) + + # 热浪强度: 热浪期间的体感温度均值 (非热浪天填 0) + df["heatwave_strength"] = np.where( + df["heatwave"] == 1, + df["heat_index"].rolling(window=3, min_periods=3).mean(), + 0.0, + ) + + # 时间特征 + time_series = pd.to_datetime(df[time_col]) + df["month"] = time_series.dt.month + + # 季节编码: 12,1,2=冬(1) 3,4,5=春(2) 6,7,8=夏(3) 9,10,11=秋(4) + # month % 12 // 3 + 1 恰好满足此映射 + df["season"] = (time_series.dt.month % 12) // 3 + 1 + + # 统一时间列名为 "time" + if time_col != "time": + df["time"] = df[time_col] + + logger.info("特征工程完成: %d 行 x %d 列", len(df), len(df.columns)) + return df + + +# ============================================================================ +# 函数 6: 风险等级标签 +# ============================================================================ + +def compute_risk_labels(df: pd.DataFrame) -> pd.DataFrame: + """根据体感温度和热浪状态计算四级风险等级标签 + + 等级定义: + 0 (低) : heat_index < 32°C + 1 (中) : 32°C ≤ heat_index < 35°C + 2 (高) : 35°C ≤ heat_index < 38°C OR 热浪期间 + 3 (严重) : heat_index ≥ 38°C AND 热浪期间 + + Args: + df: 包含 heat_index 和 heatwave 列的 DataFrame + + Returns: + 添加了 risk_label 列 (int64) 的 DataFrame + """ + df = df.copy() + + hi = df["heat_index"].values + hw = df["heatwave"].values # 0 或 1 + + # 初始化全为 0 (低风险) + risk = np.zeros(len(df), dtype=np.int64) + + # 等级 1: 中风险 + risk = np.where((hi >= 32.0) & (hi < 35.0), 1, risk) + + # 等级 2: 高风险 (体感温度达到阈值 OR 热浪期间) + risk = np.where(((hi >= 35.0) & (hi < 38.0)) | (hw == 1), 2, risk) + + # 等级 3: 严重风险 (体感温度极高 AND 热浪期间) + risk = np.where((hi >= 38.0) & (hw == 1), 3, risk) + + df["risk_label"] = risk + + # 统计各等级分布 + label_names = {0: "低", 1: "中", 2: "高", 3: "严重"} + for level in range(4): + count = int((risk == level).sum()) + pct = count / len(df) * 100 + logger.info(" 等级 %d (%s): %d 天 (%.1f%%)", level, label_names[level], count, pct) + + return df + + +# ============================================================================ +# 函数 7: 创建 ML 序列 +# ============================================================================ + +def create_sequences( + df: pd.DataFrame, + lookback: int = LOOKBACK_DAYS, + horizons: dict | None = None, +) -> tuple[np.ndarray, np.ndarray, list[str]]: + """从特征 DataFrame 创建监督学习时间序列样本 + + 对每个时间步 i (从 lookback 到数据末尾): + X[i - lookback] = 特征矩阵 [i-lookback, i) 行,所有特征列 + y[i - lookback] = [ + 未来 3 天风险等级众数, + 未来 7 天风险等级众数, + 未来 30 天风险等级众数, + ] + + 排除的特征列: time, valid_time, city, city_name, risk_label, month, season + + Args: + df: 包含特征列和 risk_label 的 DataFrame + lookback: 输入序列天数 + horizons: 预测窗口字典 {"short": N, "medium": N, "long": N} + + Returns: + X: float32 数组 (N_samples, lookback, n_features) + y: int64 数组 (N_samples, 3),列对应 [short, medium, long] + feature_cols: 用于 X 的特征列名列表 + """ + if horizons is None: + horizons = PREDICTION_WINDOWS + + # 排除非特征列 + exclude_cols = {"time", "valid_time", "city", "city_name", "risk_label", "month", "season"} + feature_cols = [c for c in df.columns if c not in exclude_cols] + + # 仅保留数值型特征 + feature_cols = [c for c in feature_cols if pd.api.types.is_numeric_dtype(df[c])] + + logger.info("序列特征 (%d): %s", len(feature_cols), feature_cols) + + n_total = len(df) + horizon_order = ["short", "medium", "long"] + horizon_values = [horizons[h] for h in horizon_order] + max_horizon = max(horizon_values) + + X_list: list[np.ndarray] = [] + y_list: list[list[int]] = [] + + for i in range(lookback, n_total): + # 输入窗口: 前 lookback 天 + x_win = df.iloc[i - lookback : i][feature_cols].values.astype(np.float32) + X_list.append(x_win) + + # 目标: 各预测窗口的风险等级众数 + y_row: list[int] = [] + for h in horizon_values: + end_idx = min(i + h, n_total) + if end_idx > i: + future = df.iloc[i:end_idx]["risk_label"].values + mode_result = stats.mode(future, keepdims=False) + # mode 可能是 0-d array 或标量 + y_row.append(int(np.atleast_1d(mode_result.mode)[0])) + else: + y_row.append(int(df.iloc[-1]["risk_label"])) + + y_list.append(y_row) + + X = np.array(X_list, dtype=np.float32) + y = np.array(y_list, dtype=np.int64) + + logger.info("序列创建完成: X%s, y%s", X.shape, y.shape) + + # 打印各窗口标签分布 + for j, name in enumerate(horizon_order): + values, counts = np.unique(y[:, j], return_counts=True) + dist = {int(v): int(c) for v, c in zip(values, counts)} + logger.info(" y_%s 分布: %s", name, dist) + + return X, y, feature_cols + + +# ============================================================================ +# 函数 8: 完整预处理管线 +# ============================================================================ + +def preprocess_all() -> None: + """执行完整的数据预处理管线 + + 对配置中每个城市依次执行: + 1. load_era5_city — 加载 NetCDF + 2. compute_daily_aggregates — 日聚合 + 3. build_features — 特征工程 + 4. compute_risk_labels — 风险标签 + 5. dropna → 保存 feature CSV + 6. create_sequences — 构建序列 → 保存 NPZ + + 最后合并所有城市数据,保存 combined CSV 和 NPZ + + 若 ERA5 数据尚未下载,会记录警告并跳过对应城市。 + """ + DATA_PROCESSED.mkdir(parents=True, exist_ok=True) + + combined_dfs: list[pd.DataFrame] = [] + # 记录第一个城市的特征列名,用于合并 NPZ + saved_feature_cols: list[str] = [] + + for city_key, city_info in CITIES.items(): + city_name = city_info["name"] + logger.info("=" * 60) + logger.info(">>> 处理城市: %s (%s)", city_name, city_key) + + # ---- 1. 加载 ---- + try: + ds = load_era5_city(city_key) + except FileNotFoundError as e: + logger.warning("跳过 %s: %s", city_key, e) + continue + + # ---- 2. 日聚合 ---- + df = compute_daily_aggregates(ds) + if df.empty: + logger.warning("跳过 %s: 日聚合结果为空", city_key) + continue + + # 添加城市标识列 + df["city"] = city_key + df["city_name"] = city_name + + # ---- 3. 特征工程 ---- + df = build_features(df) + + # ---- 4. 风险标签 ---- + df = compute_risk_labels(df) + + # ---- 5. 删除含 NaN 的行并保存 ---- + df_clean = df.dropna().reset_index(drop=True) + csv_path = DATA_PROCESSED / f"features_{city_key}.csv" + df_clean.to_csv(csv_path, index=False, encoding="utf-8-sig") + logger.info("已保存特征 CSV: %s (%d 行 x %d 列)", + csv_path.name, len(df_clean), len(df_clean.columns)) + + # ---- 6. 创建序列 ---- + X, y, feature_cols = create_sequences(df_clean) + if not saved_feature_cols: + saved_feature_cols = feature_cols + + npz_path = DATA_PROCESSED / f"sequences_{city_key}.npz" + np.savez_compressed( + npz_path, + X=X, + y=y, + feature_cols=np.array(feature_cols, dtype=object), + ) + logger.info("已保存序列 NPZ: %s (X%s, y%s)", + npz_path.name, X.shape, y.shape) + + combined_dfs.append(df_clean) + + # ---- 合并所有城市 ---- + if not combined_dfs: + logger.warning("没有城市完成处理。请先下载 ERA5 数据") + logger.warning("运行: python -m src.data.download_era5") + return + + # 合并 CSV + combined = pd.concat(combined_dfs, ignore_index=True) + combined_csv = DATA_PROCESSED / "features_combined.csv" + combined.to_csv(combined_csv, index=False, encoding="utf-8-sig") + logger.info("已保存合并特征 CSV: %s (%d 行)", combined_csv.name, len(combined)) + + # 合并 NPZ + all_X, all_y = [], [] + for city_key in CITIES: + npz_path = DATA_PROCESSED / f"sequences_{city_key}.npz" + if npz_path.exists(): + data = np.load(npz_path, allow_pickle=True) + all_X.append(data["X"]) + all_y.append(data["y"]) + + if all_X and saved_feature_cols: + combined_X = np.concatenate(all_X, axis=0) + combined_y = np.concatenate(all_y, axis=0) + combined_npz = DATA_PROCESSED / "sequences_combined.npz" + np.savez_compressed( + combined_npz, + X=combined_X, + y=combined_y, + feature_cols=np.array(saved_feature_cols, dtype=object), + ) + logger.info("已保存合并序列 NPZ: %s (X%s, y%s)", + combined_npz.name, combined_X.shape, combined_y.shape) + + logger.info("=" * 60) + logger.info("数据预处理管线全部完成!") + + +# ============================================================================ +# CLI 入口 +# ============================================================================ + +if __name__ == "__main__": + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) + preprocess_all() diff --git a/src/elderly_heat_warning.egg-info/PKG-INFO b/src/elderly_heat_warning.egg-info/PKG-INFO new file mode 100644 index 0000000..0f4fe66 --- /dev/null +++ b/src/elderly_heat_warning.egg-info/PKG-INFO @@ -0,0 +1,20 @@ +Metadata-Version: 2.4 +Name: elderly-heat-warning +Version: 0.1.0 +Summary: 银发群体高温多时间尺度预警和服务优化可视化研究 +Requires-Python: >=3.10 +Requires-Dist: numpy>=1.26 +Requires-Dist: pandas>=2.1 +Requires-Dist: xarray>=2023.0 +Requires-Dist: netcdf4>=1.6 +Requires-Dist: cdsapi>=0.7 +Requires-Dist: torch>=2.1 +Requires-Dist: pytorch-lightning>=2.1 +Requires-Dist: xgboost>=2.0 +Requires-Dist: scikit-learn>=1.3 +Requires-Dist: flask>=3.0 +Requires-Dist: matplotlib>=3.8 +Requires-Dist: seaborn>=0.13 +Requires-Dist: jupyter>=1.0 +Requires-Dist: tqdm>=4.66 +Requires-Dist: scipy>=1.11 diff --git a/src/elderly_heat_warning.egg-info/SOURCES.txt b/src/elderly_heat_warning.egg-info/SOURCES.txt new file mode 100644 index 0000000..86626dd --- /dev/null +++ b/src/elderly_heat_warning.egg-info/SOURCES.txt @@ -0,0 +1,6 @@ +pyproject.toml +src/elderly_heat_warning.egg-info/PKG-INFO +src/elderly_heat_warning.egg-info/SOURCES.txt +src/elderly_heat_warning.egg-info/dependency_links.txt +src/elderly_heat_warning.egg-info/requires.txt +src/elderly_heat_warning.egg-info/top_level.txt \ No newline at end of file diff --git a/src/elderly_heat_warning.egg-info/dependency_links.txt b/src/elderly_heat_warning.egg-info/dependency_links.txt new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/src/elderly_heat_warning.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/src/elderly_heat_warning.egg-info/requires.txt b/src/elderly_heat_warning.egg-info/requires.txt new file mode 100644 index 0000000..c6aaa5a --- /dev/null +++ b/src/elderly_heat_warning.egg-info/requires.txt @@ -0,0 +1,15 @@ +numpy>=1.26 +pandas>=2.1 +xarray>=2023.0 +netcdf4>=1.6 +cdsapi>=0.7 +torch>=2.1 +pytorch-lightning>=2.1 +xgboost>=2.0 +scikit-learn>=1.3 +flask>=3.0 +matplotlib>=3.8 +seaborn>=0.13 +jupyter>=1.0 +tqdm>=4.66 +scipy>=1.11 diff --git a/src/elderly_heat_warning.egg-info/top_level.txt b/src/elderly_heat_warning.egg-info/top_level.txt new file mode 100644 index 0000000..f5ce9ca --- /dev/null +++ b/src/elderly_heat_warning.egg-info/top_level.txt @@ -0,0 +1,4 @@ +data +models +utils +web diff --git a/src/models/__init__.py b/src/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/utils/__init__.py b/src/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/utils/config.py b/src/utils/config.py new file mode 100644 index 0000000..cf7a79b --- /dev/null +++ b/src/utils/config.py @@ -0,0 +1,64 @@ +"""全局配置常量""" +from pathlib import Path + +# 项目根目录 +ROOT = Path(__file__).parent.parent.parent + +# 数据目录 +DATA_RAW = ROOT / "data" / "raw" +DATA_PROCESSED = ROOT / "data" / "processed" +DATA_EXTERNAL = ROOT / "data" / "external" + +# 输出目录 +OUTPUT_MODELS = ROOT / "outputs" / "models" +OUTPUT_FIGURES = ROOT / "outputs" / "figures" +OUTPUT_LOGS = ROOT / "outputs" / "logs" + +# 研究城市坐标 (纬度, 经度) +CITIES = { + "jiaozuo": {"lat": 35.24, "lon": 113.22, "name": "焦作"}, + "zhengzhou": {"lat": 34.75, "lon": 113.62, "name": "郑州"}, +} + +# ERA5 配置 +ERA5_START_YEAR = 2010 +ERA5_END_YEAR = 2024 +ERA5_VARIABLES = [ + "2m_temperature", + "2m_dewpoint_temperature", + "surface_pressure", + "10m_u_component_of_wind", + "10m_v_component_of_wind", + "total_precipitation", +] + +# 模型配置 +LOOKBACK_DAYS = 14 +BATCH_SIZE = 32 +LEARNING_RATE = 1e-3 +MAX_EPOCHS = 100 +EARLY_STOP_PATIENCE = 15 +HIDDEN_DIM = 128 +LSTM_LAYERS = 2 +ATTENTION_HEADS = 4 +DROPOUT = 0.3 + +# 风险等级阈值 (体感温度 °C) +RISK_THRESHOLDS = { + "low": 32, + "medium": 35, + "high": 38, + "severe": 38, +} + +# 时间尺度预测窗口 (天) +PREDICTION_WINDOWS = { + "short": 3, + "medium": 7, + "long": 30, +} + +# 确保目录存在 +for d in [DATA_RAW, DATA_PROCESSED, DATA_EXTERNAL, + OUTPUT_MODELS, OUTPUT_FIGURES, OUTPUT_LOGS]: + d.mkdir(parents=True, exist_ok=True) diff --git a/src/web/__init__.py b/src/web/__init__.py new file mode 100644 index 0000000..e69de29