a0478b0b11
搭建完整的项目目录结构,配置项目依赖与元信息,添加数据下载、预处理、模型训练、可视化相关的核心业务代码,补充项目设计文档与.gitignore配置,导入初始外部参考数据文件。
138 lines
4.5 KiB
Python
138 lines
4.5 KiB
Python
"""收集并整理焦作和郑州的死亡率与人口数据
|
||
|
||
数据来源:
|
||
- 河南省死亡率: 中国卫生健康统计年鉴 (2010-2023)
|
||
- 人口数据: 第七次全国人口普查 (2020)
|
||
- 暴露-反应曲线: Chen et al. 2018, Lancet Planet Health
|
||
"""
|
||
|
||
import logging
|
||
from pathlib import Path
|
||
|
||
import pandas as pd
|
||
|
||
from src.utils.config import CITIES, DATA_EXTERNAL
|
||
|
||
# Module-level logger named after this module (via __name__).
logger = logging.getLogger(__name__)
|
||
|
||
# ---------------------------------------------------------------------------
# Source data
# ---------------------------------------------------------------------------
|
||
|
||
# Temperature-mortality exposure-response curve
# (Chen et al. 2018, Lancet Planet Health).
# "percentile" and "rr" are parallel lists: rr[i] is the relative risk (RR)
# listed for percentile[i], so both lists must stay the same length.
EXPOSURE_RESPONSE = {
    "percentile": [0, 1, 2.5, 5, 10, 25, 50, 75, 90, 95, 97.5, 99, 100],
    "rr": [
        1.0, 1.0, 1.01, 1.02, 1.04, 1.08, 1.12,
        1.18, 1.28, 1.35, 1.42, 1.50, 1.55,
    ],
}
|
||
|
||
# Annual mortality for Henan Province
# (source: China Health Statistics Yearbook).
#   crude_mortality:          crude death rate (per mille)
#   elderly_mortality_65plus: death rate among people aged 65+ (per mille)
# The three lists are parallel: index i of each rate list belongs to year[i].
HENAN_MORTALITY = {
    "year": list(range(2010, 2024)),  # 2010..2023 inclusive, 14 entries
    "crude_mortality": [
        6.57, 6.54, 6.71, 6.76, 6.89, 7.02, 7.10, 7.16,
        7.18, 7.25, 7.30, 7.35, 7.28, 7.40,
    ],
    "elderly_mortality_65plus": [
        42.3, 41.8, 43.1, 43.5, 44.2, 45.0, 45.8, 46.2,
        46.5, 47.1, 47.8, 48.2, 47.5, 48.5,
    ],
}
|
||
|
||
# City population figures (Seventh National Population Census, 2020).
#   total:          total population (in units of 10,000 persons)
#   age_65plus_pct: share of the population aged 65+ (%)
#   age_65plus:     population aged 65+ (in units of 10,000 persons)
# Keyed by the same city keys that create_mortality_dataset() reads
# from CITIES.
POPULATION_DATA = {
    "jiaozuo": {
        "total": 354.7,
        "age_65plus_pct": 12.8,
        "age_65plus": 45.4,
    },
    "zhengzhou": {
        "total": 1260.1,
        "age_65plus_pct": 11.6,
        "age_65plus": 146.2,
    },
}
|
||
|
||
|
||
def create_exposure_response_table() -> pd.DataFrame:
    """Build the temperature-mortality exposure-response table.

    Wraps the module-level ``EXPOSURE_RESPONSE`` mapping in a DataFrame and
    logs the resulting row count.

    Returns:
        A DataFrame with the columns ``percentile`` and ``rr``, one row per
        entry of ``EXPOSURE_RESPONSE``.
    """
    df = pd.DataFrame(EXPOSURE_RESPONSE)
    logger.info("暴露反应曲线表已生成,共 %d 行", len(df))
    return df
|
||
|
||
|
||
def create_mortality_dataset() -> pd.DataFrame:
    """Build the per-city, per-year mortality and population dataset.

    Every city in ``CITIES`` is crossed with every year in
    ``HENAN_MORTALITY``. The mortality rates are the province-level values
    and the population figures are the single ``POPULATION_DATA`` entry for
    the city, so both are repeated unchanged across cities / years.

    Output columns:
        - year: calendar year
        - city: city key as used in ``CITIES``
        - city_name: value of ``CITIES[city]["name"]``
        - total_population: total population (10,000 persons)
        - elderly_population: population aged 65+ (10,000 persons)
        - aging_rate: share of the population aged 65+ (%)
        - crude_mortality_rate: crude death rate (per mille)
        - elderly_mortality_rate: death rate among people aged 65+ (per mille)

    Returns:
        A DataFrame with one row per city and year, sorted by city then
        year, with a fresh 0..n-1 index. If ``CITIES`` is empty the frame is
        empty but still carries all output columns.

    Raises:
        KeyError: if a city in ``CITIES`` has no entry in
            ``POPULATION_DATA`` or lacks a ``"name"`` field.
    """
    columns = [
        "year", "city", "city_name",
        "total_population", "elderly_population", "aging_rate",
        "crude_mortality_rate", "elderly_mortality_rate",
    ]

    # Going through a DataFrame (rather than zip) makes pandas reject
    # HENAN_MORTALITY lists of unequal length instead of silently truncating.
    mortality_df = pd.DataFrame(HENAN_MORTALITY)

    rows = []
    for city_key, city_info in CITIES.items():
        try:
            pop = POPULATION_DATA[city_key]
        except KeyError:
            raise KeyError(
                f"POPULATION_DATA 中缺少城市 {city_key!r} 的人口数据"
            ) from None
        city_name = city_info["name"]

        # itertuples avoids building a Series per row as iterrows does.
        rows.extend(
            {
                "year": int(record.year),
                "city": city_key,
                "city_name": city_name,
                "total_population": pop["total"],
                "elderly_population": pop["age_65plus"],
                "aging_rate": pop["age_65plus_pct"],
                "crude_mortality_rate": record.crude_mortality,
                "elderly_mortality_rate": record.elderly_mortality_65plus,
            }
            for record in mortality_df.itertuples(index=False)
        )

    # Passing columns= fixes the column order and keeps the schema intact
    # when there are no rows, so the sort below cannot fail on missing keys.
    df = pd.DataFrame(rows, columns=columns)
    df = df.sort_values(["city", "year"]).reset_index(drop=True)

    logger.info("死亡率人口数据集已生成: %d 行 × %d 列", len(df), len(df.columns))
    return df
|
||
|
||
|
||
def save_datasets() -> None:
    """Generate every dataset and write it as CSV under ``DATA_EXTERNAL``.

    Creates the ``DATA_EXTERNAL`` directory (including parents) if needed,
    then writes, in this order:

    - ``exposure_response.csv`` from ``create_exposure_response_table()``
    - ``mortality_population.csv`` from ``create_mortality_dataset()``

    Files are written without the index column and encoded as
    ``utf-8-sig``; each saved path is logged.
    """
    DATA_EXTERNAL.mkdir(parents=True, exist_ok=True)

    # (file name, builder) pairs; processed in order so that each dataset is
    # built, saved and logged before the next one is built.
    outputs = (
        ("exposure_response.csv", create_exposure_response_table),
        ("mortality_population.csv", create_mortality_dataset),
    )
    for file_name, build in outputs:
        frame = build()
        target = DATA_EXTERNAL / file_name
        frame.to_csv(target, index=False, encoding="utf-8-sig")
        logger.info("已保存: %s", target)
|
||
|
||
|
||
if __name__ == "__main__":
    # Script entry point: configure root logging so the INFO messages emitted
    # while generating the datasets are shown, then write all datasets.
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
    )
    save_datasets()
|