"""Collect and organize mortality and population data for Jiaozuo and Zhengzhou.

Data sources (as recorded by the original author):
- Henan Province mortality rates: China Health Statistics Yearbook (2010-2023)
- Population data: Seventh National Population Census (2020)
- Exposure-response curve: Chen et al. 2018, Lancet Planet Health
"""

import logging
from pathlib import Path

import pandas as pd

from src.utils.config import CITIES, DATA_EXTERNAL

logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Source data
# ---------------------------------------------------------------------------

# Temperature-mortality exposure-response curve
# (Chen et al. 2018, Lancet Planet Health).
# Relative risk (RR) associated with each percentile.
EXPOSURE_RESPONSE = {
    "percentile": [0, 1, 2.5, 5, 10, 25, 50, 75, 90, 95, 97.5, 99, 100],
    "rr": [
        1.0, 1.0, 1.01, 1.02, 1.04, 1.08, 1.12,
        1.18, 1.28, 1.35, 1.42, 1.50, 1.55,
    ],
}

# Henan Province annual mortality (source: China Health Statistics Yearbook).
# crude_mortality: crude mortality rate (per mille)
# elderly_mortality_65plus: mortality rate of people aged 65+ (per mille)
HENAN_MORTALITY = {
    "year": list(range(2010, 2024)),
    "crude_mortality": [
        6.57, 6.54, 6.71, 6.76, 6.89, 7.02, 7.10,
        7.16, 7.18, 7.25, 7.30, 7.35, 7.28, 7.40,
    ],
    "elderly_mortality_65plus": [
        42.3, 41.8, 43.1, 43.5, 44.2, 45.0, 45.8,
        46.2, 46.5, 47.1, 47.8, 48.2, 47.5, 48.5,
    ],
}

# City population data (Seventh National Population Census, 2020).
# total: total population (units of 10,000 people)
# age_65plus_pct: share of the population aged 65+ (%)
# age_65plus: population aged 65+ (units of 10,000 people)
POPULATION_DATA = {
    "jiaozuo": {"total": 354.7, "age_65plus_pct": 12.8, "age_65plus": 45.4},
    "zhengzhou": {"total": 1260.1, "age_65plus_pct": 11.6, "age_65plus": 146.2},
}


def create_exposure_response_table() -> pd.DataFrame:
    """Build the temperature-mortality exposure-response table.

    Returns:
        DataFrame with the two columns ``percentile`` and ``rr`` taken
        directly from ``EXPOSURE_RESPONSE``.
    """
    table = pd.DataFrame(EXPOSURE_RESPONSE)
    logger.info("暴露反应曲线表已生成,共 %d 行", len(table))
    return table


def create_mortality_dataset() -> pd.DataFrame:
    """Build the city-level mortality and population time series.

    Every city in ``CITIES`` is paired with every year of
    ``HENAN_MORTALITY``. The mortality rates are the province-level values
    and the population figures are the single ``POPULATION_DATA`` entry for
    that city, so both are repeated unchanged across a city's rows.

    Output columns:
        - year: calendar year
        - city: city key (as used in ``CITIES``)
        - city_name: value of the city's ``"name"`` entry in ``CITIES``
        - total_population: total population (units of 10,000 people)
        - elderly_population: population aged 65+ (units of 10,000 people)
        - aging_rate: share of the population aged 65+ (%)
        - crude_mortality_rate: crude mortality rate (per mille)
        - elderly_mortality_rate: mortality rate of people aged 65+ (per mille)

    Returns:
        DataFrame with one row per city per year, sorted by city then year.

    Raises:
        KeyError: if a key of ``CITIES`` has no entry in ``POPULATION_DATA``.
    """
    column_order = [
        "year",
        "city",
        "city_name",
        "total_population",
        "elderly_population",
        "aging_rate",
        "crude_mortality_rate",
        "elderly_mortality_rate",
    ]
    yearly_rates = pd.DataFrame(HENAN_MORTALITY)

    records = []
    for city_key, city_info in CITIES.items():
        census = POPULATION_DATA[city_key]
        for rec in yearly_rates.itertuples(index=False):
            records.append(
                {
                    "year": int(rec.year),
                    "city": city_key,
                    "city_name": city_info["name"],
                    "total_population": census["total"],
                    "elderly_population": census["age_65plus"],
                    "aging_rate": census["age_65plus_pct"],
                    "crude_mortality_rate": rec.crude_mortality,
                    "elderly_mortality_rate": rec.elderly_mortality_65plus,
                }
            )

    # Sort by city and year, renumber the index, then pin the column order.
    dataset = (
        pd.DataFrame(records)
        .sort_values(["city", "year"])
        .reset_index(drop=True)[column_order]
    )

    logger.info(
        "死亡率人口数据集已生成: %d 行 × %d 列",
        len(dataset),
        len(dataset.columns),
    )
    return dataset


def save_datasets() -> None:
    """Generate every dataset and write it as CSV under ``DATA_EXTERNAL``.

    The output directory is created if it does not exist. Files are written
    without the index, using the ``utf-8-sig`` encoding.
    """
    DATA_EXTERNAL.mkdir(parents=True, exist_ok=True)

    # (file name, builder) pairs, processed in order: the exposure-response
    # curve first, then the mortality and population data.
    outputs = (
        ("exposure_response.csv", create_exposure_response_table),
        ("mortality_population.csv", create_mortality_dataset),
    )
    for file_name, build in outputs:
        frame = build()
        target = DATA_EXTERNAL / file_name
        frame.to_csv(target, index=False, encoding="utf-8-sig")
        logger.info("已保存: %s", target)


if __name__ == "__main__":
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
    )
    save_datasets()