Files
elderly-heat-warning/src/data/collect_mortality.py
T
Serendipity a0478b0b11 feat: 初始化老年群体高温预警项目基础工程
搭建完整的项目目录结构,配置项目依赖与元信息,添加数据下载、预处理、模型训练、可视化相关的核心业务代码,补充项目设计文档与.gitignore配置,导入初始外部参考数据文件。
2026-05-26 20:05:10 +08:00

138 lines
4.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""收集并整理焦作和郑州的死亡率与人口数据
数据来源:
- 河南省死亡率: 中国卫生健康统计年鉴 (2010-2023)
- 人口数据: 第七次全国人口普查 (2020)
- 暴露-反应曲线: Chen et al. 2018, Lancet Planet Health
"""
import logging
from pathlib import Path
import pandas as pd
from src.utils.config import CITIES, DATA_EXTERNAL
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# 源数据
# ---------------------------------------------------------------------------
# Temperature-mortality exposure-response curve (Chen et al. 2018, Lancet Planet Health).
# Each point pairs a percentile with its relative risk (RR); keeping the two
# values side by side makes it impossible for the columns to drift out of step.
_EXPOSURE_RESPONSE_POINTS = (
    (0, 1.0),
    (1, 1.0),
    (2.5, 1.01),
    (5, 1.02),
    (10, 1.04),
    (25, 1.08),
    (50, 1.12),
    (75, 1.18),
    (90, 1.28),
    (95, 1.35),
    (97.5, 1.42),
    (99, 1.50),
    (100, 1.55),
)
# Column-oriented view of the points above: parallel "percentile" and "rr" lists.
EXPOSURE_RESPONSE = {
    "percentile": [pct for pct, _ in _EXPOSURE_RESPONSE_POINTS],
    "rr": [risk for _, risk in _EXPOSURE_RESPONSE_POINTS],
}
# Henan province annual mortality (source: China Health Statistics Yearbook).
# Each record is (year, crude mortality in per mille,
#                 mortality of people aged 65+ in per mille).
_HENAN_ANNUAL_RATES = (
    (2010, 6.57, 42.3),
    (2011, 6.54, 41.8),
    (2012, 6.71, 43.1),
    (2013, 6.76, 43.5),
    (2014, 6.89, 44.2),
    (2015, 7.02, 45.0),
    (2016, 7.10, 45.8),
    (2017, 7.16, 46.2),
    (2018, 7.18, 46.5),
    (2019, 7.25, 47.1),
    (2020, 7.30, 47.8),
    (2021, 7.35, 48.2),
    (2022, 7.28, 47.5),
    (2023, 7.40, 48.5),
)
# Pivot the per-year records into one list per column, in record order.
HENAN_MORTALITY = {
    column: [record[position] for record in _HENAN_ANNUAL_RATES]
    for position, column in enumerate(
        ("year", "crude_mortality", "elderly_mortality_65plus")
    )
}
# City population figures (Seventh National Population Census, 2020).
#   total:          total population (units of 10,000 people)
#   age_65plus_pct: share of the population aged 65+ (%)
#   age_65plus:     population aged 65+ (units of 10,000 people)
POPULATION_DATA = {
    "jiaozuo": dict(total=354.7, age_65plus_pct=12.8, age_65plus=45.4),
    "zhengzhou": dict(total=1260.1, age_65plus_pct=11.6, age_65plus=146.2),
}
def create_exposure_response_table() -> pd.DataFrame:
    """Build the temperature-mortality exposure-response table.

    Returns:
        A DataFrame built from ``EXPOSURE_RESPONSE``, with the two columns
        ``percentile`` and ``rr``.
    """
    table = pd.DataFrame(data=EXPOSURE_RESPONSE)
    row_count = len(table)
    logger.info("暴露反应曲线表已生成,共 %d 行", row_count)
    return table
def create_mortality_dataset() -> pd.DataFrame:
    """Build the city-level mortality and population time series.

    The Henan province annual mortality rates (``HENAN_MORTALITY``) are
    repeated for every city in ``CITIES`` and combined with that city's
    population figures from ``POPULATION_DATA``, giving one row per city
    per year.

    Output columns, in order:
        - year: calendar year
        - city: city key (English)
        - city_name: city name (Chinese), taken from ``CITIES[key]["name"]``
        - total_population: total population (units of 10,000 people)
        - elderly_population: population aged 65+ (units of 10,000 people)
        - aging_rate: share of the population aged 65+ (%)
        - crude_mortality_rate: crude mortality (per mille)
        - elderly_mortality_rate: mortality of people aged 65+ (per mille)

    Returns:
        DataFrame sorted by ``city`` then ``year`` with a fresh integer
        index. If ``CITIES`` is empty, an empty DataFrame with the columns
        above is returned.

    Raises:
        KeyError: if a city in ``CITIES`` has no entry in ``POPULATION_DATA``.
    """
    columns = [
        "year", "city", "city_name",
        "total_population", "elderly_population", "aging_rate",
        "crude_mortality_rate", "elderly_mortality_rate",
    ]

    # Province-level rates, renamed once to the output column names.
    rates = pd.DataFrame(HENAN_MORTALITY).rename(columns={
        "crude_mortality": "crude_mortality_rate",
        "elderly_mortality_65plus": "elderly_mortality_rate",
    })

    frames = []
    for city_key, city_info in CITIES.items():
        try:
            pop = POPULATION_DATA[city_key]
        except KeyError:
            # Same exception type as a plain lookup, but say which table is incomplete.
            raise KeyError(
                f"POPULATION_DATA has no entry for city {city_key!r} listed in CITIES"
            ) from None
        # Broadcast the city's constant attributes over every year at once
        # instead of building the rows one by one.
        frames.append(rates.assign(
            city=city_key,
            city_name=city_info["name"],
            total_population=pop["total"],
            elderly_population=pop["age_65plus"],
            aging_rate=pop["age_65plus_pct"],
        ))

    if frames:
        df = pd.concat(frames, ignore_index=True)
        # Sort by city and year, then fix the column order.
        df = df.sort_values(["city", "year"]).reset_index(drop=True)[columns]
    else:
        # No cities configured: sorting a column-less frame would raise,
        # so return an empty frame that still has the expected columns.
        df = pd.DataFrame(columns=columns)

    # Report both dimensions with their units (rows × columns).
    logger.info("死亡率人口数据集已生成: %d 行 × %d 列", len(df), len(df.columns))
    return df
def save_datasets() -> None:
    """Generate every dataset and write it as CSV under ``DATA_EXTERNAL``.

    The output directory is created if it does not exist. Each table is
    written without its index, encoded as ``utf-8-sig``, and the saved
    path is logged.
    """
    DATA_EXTERNAL.mkdir(parents=True, exist_ok=True)

    # (builder, output file name), processed in this order:
    # exposure-response curve first, then mortality and population data.
    outputs = (
        (create_exposure_response_table, "exposure_response.csv"),
        (create_mortality_dataset, "mortality_population.csv"),
    )
    for build, filename in outputs:
        frame = build()
        target = DATA_EXTERNAL / filename
        frame.to_csv(target, index=False, encoding="utf-8-sig")
        logger.info("已保存: %s", target)
if __name__ == "__main__":
    # Script entry point: log at INFO level with a timestamped format,
    # then generate and save all datasets.
    log_settings = {
        "level": logging.INFO,
        "format": "%(asctime)s [%(levelname)s] %(message)s",
    }
    logging.basicConfig(**log_settings)
    save_datasets()