Files
rl-atari/强化学习个人课程作业报告/notebooks/insurance_premium_risk.ipynb
T
Serendipity d353133b31 feat: 添加强化学习项目报告及重构课程作业报告代码结构
- 新增强化学习个人项目报告,包含基于PyTorch从零实现的PPO算法
- 重构课程作业报告代码结构,提取运行时路径管理和notebook执行逻辑到独立模块
- 更新依赖文件requirements.txt,添加强化学习相关依赖
- 简化模型比较结果表格,仅保留基线逻辑回归模型数据
2026-04-30 16:54:41 +08:00

1063 lines
38 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"id": "170d0b4f",
"metadata": {},
"source": [
"# Insurance Premium Risk Classification\n",
"## DTS304TC Machine Learning - Coursework 1\n",
"\n",
"**Student ID**: 1234560 (Last digit = 0)\n",
"**Compulsory Category**: A - Data Quality & Missingness\n",
"**Optional Category**: D - Robustness & Soft Voting Ensemble\n",
"\n",
"**Primary metric**: macro-F1 (imbalanced dataset)\n",
"**Secondary metric**: accuracy\n",
"\n",
"---\n",
"\n",
"## Workflow\n",
"1. Data loading & EDA\n",
"2. Leakage feature identification & removal\n",
"3. Preprocessing pipeline construction\n",
"4. Baseline model (Logistic Regression)\n",
"5. Controlled comparison: Random Forest vs XGBoost\n",
"6. Advanced hyperparameter optimisation (Optuna/TPE)\n",
"7. Personalised improvement (Category A + Category D)\n",
"8. K-Means & GMM unsupervised exploration\n",
"9. Final model selection\n",
"10. Hidden-test CSV export"
]
},
{
"cell_type": "markdown",
"id": "463d3e6d",
"metadata": {},
"source": [
"## Step 1: Setup & Data Loading"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a12f069a",
"metadata": {},
"outputs": [],
"source": [
"# All imports consolidated here so the notebook survives Restart & Run All.\n",
"# (Previously np/pd/plt/sns/sklearn/os/time were used without being imported.)\n",
"import os\n",
"import time\n",
"import subprocess\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import xgboost as xgb\n",
"import optuna\n",
"\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.ensemble import RandomForestClassifier, VotingClassifier\n",
"from sklearn.model_selection import cross_val_score\n",
"from sklearn.metrics import (accuracy_score, f1_score, classification_report,\n",
"                             confusion_matrix, ConfusionMatrixDisplay, silhouette_score)\n",
"from sklearn.cluster import KMeans\n",
"from sklearn.mixture import GaussianMixture\n",
"from sklearn.decomposition import PCA\n",
"\n",
"optuna.logging.set_verbosity(optuna.logging.WARNING)\n",
"\n",
"# Runtime paths. NOTE(review): DATA_DIR/OUTPUT_DIR were never defined anywhere\n",
"# in this notebook -- defaults below are a guess; confirm the project layout.\n",
"DATA_DIR = os.environ.get('DATA_DIR', 'data')\n",
"OUTPUT_DIR = os.environ.get('OUTPUT_DIR', 'outputs')\n",
"for sub in ('figures', 'tables', 'predictions'):\n",
"    os.makedirs(os.path.join(OUTPUT_DIR, sub), exist_ok=True)\n",
"\n",
"# GPU fallback: probe nvidia-smi and switch to CPU when it is absent/fails.\n",
"try:\n",
"    result = subprocess.run(['nvidia-smi'], capture_output=True, text=True)\n",
"    USE_GPU = result.returncode == 0\n",
"except (FileNotFoundError, OSError):  # narrow except: do not mask real bugs\n",
"    USE_GPU = False\n",
"\n",
"XGB_TREE_METHOD = 'gpu_hist' if USE_GPU else 'hist'\n",
"XGB_DEVICE = 'cuda' if USE_GPU else 'cpu'\n",
"print(f'XGBoost compute method: {\"GPU (CUDA)\" if USE_GPU else \"CPU\"}')\n",
"\n",
"RANDOM_STATE = 42  # single global seed for reproducibility\n",
"np.random.seed(RANDOM_STATE)\n",
"plt.rcParams['figure.figsize'] = (10, 6)\n",
"plt.rcParams['font.size'] = 12\n",
"sns.set_style('whitegrid')\n",
"print('All libraries imported successfully!')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1c4b453a",
"metadata": {},
"outputs": [],
"source": [
"train_df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))\n",
"val_df = pd.read_csv(os.path.join(DATA_DIR, 'val.csv'))\n",
"test_df = pd.read_csv(os.path.join(DATA_DIR, 'test_features.csv'))\n",
"\n",
"print(f'Train shape: {train_df.shape}')\n",
"print(f'Val shape: {val_df.shape}')\n",
"print(f'Test shape: {test_df.shape}')"
]
},
{
"cell_type": "markdown",
"id": "8b8e7ad9",
"metadata": {},
"source": [
"## Step 2: Exploratory Data Analysis (EDA)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "45e520e2",
"metadata": {},
"outputs": [],
"source": [
"print('=== TARGET DISTRIBUTION (TRAIN) ===')\n",
"target_counts = train_df['premium_risk'].value_counts()\n",
"print(target_counts)\n",
"print((target_counts / len(train_df) * 100).round(2))\n",
"\n",
"fig, ax = plt.subplots(figsize=(8, 5))\n",
"colors = ['#4CAF50', '#FFC107', '#F44336']\n",
"target_counts.sort_index().plot(kind='bar', ax=ax, color=colors)\n",
"ax.set_title('Target Variable Distribution (Train)', fontsize=14)\n",
"ax.set_xlabel('Premium Risk')\n",
"ax.set_ylabel('Count')\n",
"ax.set_xticklabels(ax.get_xticklabels(), rotation=0)\n",
"for i, (idx, val) in enumerate(target_counts.sort_index().items()):\n",
" ax.text(i, val + 300, f'{val}\\n({val/len(train_df)*100:.1f}%)', ha='center')\n",
"plt.tight_layout()\n",
"plt.savefig(os.path.join(OUTPUT_DIR, 'figures', 'target_distribution.png'), dpi=150)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9e2428e4",
"metadata": {},
"outputs": [],
"source": [
"print('=== MISSING VALUES (TRAIN) ===')\n",
"missing = train_df.isnull().sum()\n",
"missing = missing[missing > 0].sort_values(ascending=False)\n",
"print(missing)\n",
"\n",
"fig, ax = plt.subplots(figsize=(12, 6))\n",
"missing.plot(kind='barh', ax=ax, color='coral')\n",
"ax.set_title('Missing Values per Column (Train)', fontsize=14)\n",
"ax.set_xlabel('Count')\n",
"plt.tight_layout()\n",
"plt.savefig(os.path.join(OUTPUT_DIR, 'figures', 'missing_values.png'), dpi=150)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a5cafc5e",
"metadata": {},
"outputs": [],
"source": [
"noise_cols = [c for c in train_df.columns if 'noise' in c.lower()]\n",
"print(f'Noise features: {noise_cols}')\n",
"\n",
"print('\\n=== bureau_risk_index stats ===')\n",
"print(train_df['bureau_risk_index'].describe())\n",
"\n",
"fig, ax = plt.subplots(figsize=(8, 5))\n",
"train_df.boxplot(column='bureau_risk_index', by='premium_risk', ax=ax)\n",
"ax.set_title('bureau_risk_index by Premium Risk')\n",
"ax.set_xlabel('Premium Risk')\n",
"ax.set_ylabel('bureau_risk_index')\n",
"plt.suptitle('')\n",
"plt.tight_layout()\n",
"plt.savefig(os.path.join(OUTPUT_DIR, 'figures', 'bureau_risk_boxplot.png'), dpi=150)\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"id": "4db79797",
"metadata": {},
"source": [
"## Step 3: Leakage Feature Identification & Removal\n",
"\n",
"**Strategy**: Train a DecisionTree with each feature individually.\n",
"Features with abnormally high macro-F1 are suspected leakage."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fbf59f43",
"metadata": {},
"outputs": [],
"source": [
"def screen_single_feature_leakage(df, target_col, feature_cols, scoring='f1_macro'):\n",
"    \"\"\"Score every feature alone with a shallow decision tree (3-fold CV).\n",
"\n",
"    A near-perfect macro-F1 from a single feature is a strong signal of\n",
"    target leakage. Rows with NaN in the candidate column are dropped so the\n",
"    tree can fit without imputation.\n",
"\n",
"    Returns a DataFrame (feature, mean_f1_macro, std) sorted best-first.\n",
"    \"\"\"\n",
"    from sklearn.tree import DecisionTreeClassifier\n",
"    results = []\n",
"    for col in feature_cols:\n",
"        temp_df = df[[col, target_col]].dropna()\n",
"        X_temp = temp_df[[col]].values\n",
"        y_temp = temp_df[target_col].values\n",
"        le = LabelEncoder()\n",
"        y_enc = le.fit_transform(y_temp)\n",
"        try:\n",
"            clf = DecisionTreeClassifier(random_state=RANDOM_STATE, max_depth=3)\n",
"            scores = cross_val_score(clf, X_temp, y_enc, cv=3, scoring=scoring)\n",
"            results.append({'feature': col, 'mean_f1_macro': scores.mean(), 'std': scores.std()})\n",
"        # BUGFIX: was a bare `except:` -- narrowed so KeyboardInterrupt and\n",
"        # genuine programming errors are no longer swallowed. Non-numeric\n",
"        # columns a tree cannot split on simply score 0.\n",
"        except Exception:\n",
"            results.append({'feature': col, 'mean_f1_macro': 0.0, 'std': 0.0})\n",
"    return pd.DataFrame(results).sort_values('mean_f1_macro', ascending=False)\n",
"\n",
"feature_to_test = [c for c in train_df.columns if c not in ['applicant_id', 'customer_key', 'premium_risk']]\n",
"print('Screening single features for leakage detection (this may take a few minutes)...')\n",
"leakage_results = screen_single_feature_leakage(train_df, 'premium_risk', feature_to_test)\n",
"print('\\n=== TOP 10 SINGLE-FEATURE F1 MACRO SCORES ===')\n",
"print(leakage_results.head(10))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ec03b578",
"metadata": {},
"outputs": [],
"source": [
"LEAKAGE_THRESHOLD = 0.85\n",
"print('=== LEAKAGE DETECTION RESULTS ===')\n",
"print(leakage_results.head(10))\n",
"\n",
"bureau_score = leakage_results[leakage_results['feature'] == 'bureau_risk_index']['mean_f1_macro'].values[0]\n",
"print(f'\\nbureau_risk_index F1 macro: {bureau_score:.4f}')\n",
"\n",
"if bureau_score > LEAKAGE_THRESHOLD:\n",
" print('\\n*** ALERT: bureau_risk_index shows abnormally high predictive power! ***')\n",
" print('*** This is consistent with a leakage feature. ***')\n",
" print('*** ACTION: bureau_risk_index will be removed from features. ***')\n",
" LEAKAGE_FEATURE = 'bureau_risk_index'\n",
"else:\n",
" top_feat = leakage_results.iloc[0]['feature']\n",
" top_score = leakage_results.iloc[0]['mean_f1_macro']\n",
" print(f'\\nTop feature: {top_feat} with F1 macro = {top_score:.4f}')\n",
" if top_score > 0.80:\n",
" LEAKAGE_FEATURE = top_feat\n",
" else:\n",
" LEAKAGE_FEATURE = None"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f01746fe",
"metadata": {},
"outputs": [],
"source": [
"if LEAKAGE_FEATURE:\n",
" print(f'Removing leakage feature: {LEAKAGE_FEATURE}')\n",
" train_df_clean = train_df.drop(columns=[LEAKAGE_FEATURE])\n",
" val_df_clean = val_df.drop(columns=[LEAKAGE_FEATURE])\n",
" test_df_clean = test_df.drop(columns=[LEAKAGE_FEATURE])\n",
"else:\n",
" print('No leakage feature to remove.')\n",
" train_df_clean = train_df.copy()\n",
" val_df_clean = val_df.copy()\n",
" test_df_clean = test_df.copy()\n",
"\n",
"print(f'After removal - Train: {train_df_clean.shape}, Val: {val_df_clean.shape}, Test: {test_df_clean.shape}')"
]
},
{
"cell_type": "markdown",
"id": "ed28be55",
"metadata": {},
"source": [
"## Step 4: Preprocessing Pipeline Construction"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6f56180d",
"metadata": {},
"outputs": [],
"source": [
"ID_COLS = ['applicant_id', 'customer_key', 'applicant_ref_code']\n",
"NOISE_COLS = ['noise_feature_1', 'noise_feature_2', 'noise_feature_3', 'noise_feature_4', 'noise_feature_5']\n",
"TARGET_COL = 'premium_risk'\n",
"\n",
"all_cols = train_df_clean.columns.tolist()\n",
"feature_cols_all = [c for c in all_cols if c not in ID_COLS + NOISE_COLS + [TARGET_COL]]\n",
"\n",
"NUMERIC_FEATURES = train_df_clean[feature_cols_all].select_dtypes(include=[np.number]).columns.tolist()\n",
"CATEGORICAL_FEATURES = train_df_clean[feature_cols_all].select_dtypes(include=['object']).columns.tolist()\n",
"\n",
"print(f'Total features: {len(feature_cols_all)}')\n",
"print(f'Numeric ({len(NUMERIC_FEATURES)}): {NUMERIC_FEATURES}')\n",
"print(f'Categorical ({len(CATEGORICAL_FEATURES)}): {CATEGORICAL_FEATURES}')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2fbe754d",
"metadata": {},
"outputs": [],
"source": [
"numeric_transformer = Pipeline(steps=[\n",
" ('imputer', SimpleImputer(strategy='median')),\n",
" ('scaler', StandardScaler())\n",
"])\n",
"\n",
"categorical_transformer = Pipeline(steps=[\n",
" ('imputer', SimpleImputer(strategy='most_frequent')),\n",
" ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))\n",
"])\n",
"\n",
"preprocessor = ColumnTransformer(\n",
" transformers=[\n",
" ('num', numeric_transformer, NUMERIC_FEATURES),\n",
" ('cat', categorical_transformer, CATEGORICAL_FEATURES)\n",
" ],\n",
" remainder='drop'\n",
")\n",
"print('Preprocessing pipeline created!')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0e797d98",
"metadata": {},
"outputs": [],
"source": [
"X_train = train_df_clean[feature_cols_all]\n",
"y_train = train_df_clean[TARGET_COL]\n",
"X_val = val_df_clean[feature_cols_all]\n",
"y_val = val_df_clean[TARGET_COL]\n",
"X_test = test_df_clean[feature_cols_all]\n",
"\n",
"le_target = LabelEncoder()\n",
"y_train_enc = le_target.fit_transform(y_train)\n",
"y_val_enc = le_target.transform(y_val)\n",
"\n",
"print(f'Classes: {le_target.classes_}')\n",
"print(f'X_train: {X_train.shape} | X_val: {X_val.shape} | X_test: {X_test.shape}')"
]
},
{
"cell_type": "markdown",
"id": "481e4b48",
"metadata": {},
"source": [
"## Step 5: Baseline Model - Logistic Regression"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1a900d26",
"metadata": {},
"outputs": [],
"source": [
"def evaluate_model(pipeline, X_tr, y_tr, X_v, y_v, le, model_name='Model'):\n",
" y_tr_pred = pipeline.predict(X_tr)\n",
" y_v_pred = pipeline.predict(X_v)\n",
" results = {\n",
" 'model': model_name,\n",
" 'train_accuracy': accuracy_score(y_tr, y_tr_pred),\n",
" 'val_accuracy': accuracy_score(y_v, y_v_pred),\n",
" 'train_f1_macro': f1_score(y_tr, y_tr_pred, average='macro'),\n",
" 'val_f1_macro': f1_score(y_v, y_v_pred, average='macro'),\n",
" }\n",
" f1_per_class = f1_score(y_v, y_v_pred, average=None)\n",
" for i, cls in enumerate(le.classes_):\n",
" results[f'val_f1_{cls}'] = f1_per_class[i]\n",
" return results\n",
"\n",
"def plot_confusion_matrix(pipeline, X_v, y_v, le, title, save_path):\n",
" y_pred = pipeline.predict(X_v)\n",
" fig, ax = plt.subplots(figsize=(8, 6))\n",
" cm = confusion_matrix(y_v, y_pred)\n",
" disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=le.classes_)\n",
" disp.plot(ax=ax, cmap='Blues', values_format='d')\n",
" ax.set_title(title, fontsize=14)\n",
" plt.tight_layout()\n",
" plt.savefig(save_path, dpi=150)\n",
" plt.show()\n",
" return cm"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c8992d98",
"metadata": {},
"outputs": [],
"source": [
"print('Training Baseline: Logistic Regression...')\n",
"baseline_pipeline = Pipeline(steps=[\n",
" ('preprocessor', preprocessor),\n",
" ('classifier', LogisticRegression(class_weight='balanced', max_iter=1000, random_state=RANDOM_STATE, n_jobs=-1))\n",
"])\n",
"baseline_pipeline.fit(X_train, y_train_enc)\n",
"\n",
"baseline_results = evaluate_model(baseline_pipeline, X_train, y_train_enc, X_val, y_val_enc, le_target, 'Baseline_LR')\n",
"\n",
"print('\\n=== BASELINE MODEL RESULTS ===')\n",
"for k, v in baseline_results.items():\n",
" if k != 'model':\n",
" print(f'{k}: {v:.4f}')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5ff29071",
"metadata": {},
"outputs": [],
"source": [
"plot_confusion_matrix(baseline_pipeline, X_val, y_val_enc, le_target,\n",
" 'Baseline: Logistic Regression - Confusion Matrix',\n",
" os.path.join(OUTPUT_DIR, 'figures', 'baseline_confusion_matrix.png'))\n",
"\n",
"print('\\n=== CLASSIFICATION REPORT (VAL) ===')\n",
"y_val_pred = baseline_pipeline.predict(X_val)\n",
"print(classification_report(y_val_enc, y_val_pred, target_names=le_target.classes_))\n",
"\n",
"all_results = [baseline_results]\n",
"pd.DataFrame(all_results).to_csv(\n",
" os.path.join(OUTPUT_DIR, 'tables', 'model_comparison_summary.csv'), index=False)"
]
},
{
"cell_type": "markdown",
"id": "8675fd8e",
"metadata": {},
"source": [
"## Step 6: Controlled Comparison - Random Forest vs XGBoost"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "30cd02ce",
"metadata": {},
"outputs": [],
"source": [
"# Controlled comparison: both models share the identical preprocessor so the\n",
"# only varying factor is the classifier (bagging vs boosting).\n",
"# BUGFIX: `time` was first imported several cells below (Step 7), so this\n",
"# cell raised NameError under Restart & Run All; import it here.\n",
"import time\n",
"\n",
"print('Training Random Forest...')\n",
"start = time.time()\n",
"rf_pipeline = Pipeline(steps=[\n",
" ('preprocessor', preprocessor),\n",
" ('classifier', RandomForestClassifier(n_estimators=200, class_weight='balanced', random_state=RANDOM_STATE, n_jobs=-1))\n",
"])\n",
"rf_pipeline.fit(X_train, y_train_enc)\n",
"rf_time = time.time() - start\n",
"\n",
"rf_results = evaluate_model(rf_pipeline, X_train, y_train_enc, X_val, y_val_enc, le_target, 'RandomForest')\n",
"rf_results['train_time'] = rf_time\n",
"\n",
"print('Training XGBoost...')\n",
"start = time.time()\n",
"xgb_pipeline = Pipeline(steps=[\n",
" ('preprocessor', preprocessor),\n",
" ('classifier', xgb.XGBClassifier(n_estimators=200, learning_rate=0.1, max_depth=6,\n",
" objective='multi:softmax', num_class=3,\n",
" tree_method=XGB_TREE_METHOD, device=XGB_DEVICE,\n",
" random_state=RANDOM_STATE, verbosity=0))\n",
"])\n",
"xgb_pipeline.fit(X_train, y_train_enc)\n",
"xgb_time = time.time() - start\n",
"\n",
"xgb_results = evaluate_model(xgb_pipeline, X_train, y_train_enc, X_val, y_val_enc, le_target, 'XGBoost')\n",
"xgb_results['train_time'] = xgb_time\n",
"\n",
"print(f'RF time: {rf_time:.2f}s | XGB time: {xgb_time:.2f}s')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "814e6787",
"metadata": {},
"outputs": [],
"source": [
"all_results.append(rf_results)\n",
"all_results.append(xgb_results)\n",
"results_df = pd.DataFrame(all_results)\n",
"\n",
"print('\\n=== MODEL COMPARISON SUMMARY ===')\n",
"display_cols = ['model', 'train_accuracy', 'val_accuracy', 'train_f1_macro', 'val_f1_macro', 'train_time']\n",
"print(results_df[display_cols].round(4).to_string(index=False))\n",
"\n",
"print('\\n=== CLASS-WISE F1 (VAL) ===')\n",
"class_cols = [c for c in results_df.columns if c.startswith('val_f1_') and c != 'val_f1_macro']\n",
"print(results_df[['model'] + class_cols].round(4).to_string(index=False))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "704d4061",
"metadata": {},
"outputs": [],
"source": [
"fig, axes = plt.subplots(1, 2, figsize=(14, 5))\n",
"models = results_df['model'].tolist()\n",
"val_f1 = results_df['val_f1_macro'].tolist()\n",
"val_acc = results_df['val_accuracy'].tolist()\n",
"\n",
"bars1 = axes[0].bar(models, val_f1, color=['#2196F3', '#4CAF50', '#FF9800'])\n",
"axes[0].set_title('Validation Macro-F1 Comparison', fontsize=13)\n",
"axes[0].set_ylabel('Macro-F1')\n",
"axes[0].set_ylim(0, 1)\n",
"for bar, val in zip(bars1, val_f1):\n",
" axes[0].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01, f'{val:.4f}', ha='center')\n",
"\n",
"bars2 = axes[1].bar(models, val_acc, color=['#2196F3', '#4CAF50', '#FF9800'])\n",
"axes[1].set_title('Validation Accuracy Comparison', fontsize=13)\n",
"axes[1].set_ylabel('Accuracy')\n",
"axes[1].set_ylim(0, 1)\n",
"for bar, val in zip(bars2, val_acc):\n",
" axes[1].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01, f'{val:.4f}', ha='center')\n",
"\n",
"plt.tight_layout()\n",
"plt.savefig(os.path.join(OUTPUT_DIR, 'figures', 'model_comparison.png'), dpi=150)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "89747cf4",
"metadata": {},
"outputs": [],
"source": [
"plot_confusion_matrix(rf_pipeline, X_val, y_val_enc, le_target,\n",
" 'Random Forest - Confusion Matrix',\n",
" os.path.join(OUTPUT_DIR, 'figures', 'rf_confusion_matrix.png'))\n",
"\n",
"plot_confusion_matrix(xgb_pipeline, X_val, y_val_enc, le_target,\n",
" 'XGBoost - Confusion Matrix',\n",
" os.path.join(OUTPUT_DIR, 'figures', 'xgb_confusion_matrix.png'))"
]
},
{
"cell_type": "markdown",
"id": "d9e3d57d",
"metadata": {},
"source": [
"### Bagging vs Boosting Analysis"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "81508463",
"metadata": {},
"outputs": [],
"source": [
"print('=== BAGGING VS BOOSTING ANALYSIS ===')\n",
"rf_val_f1 = rf_results['val_f1_macro']\n",
"rf_train_f1 = rf_results['train_f1_macro']\n",
"rf_gap = rf_train_f1 - rf_val_f1\n",
"\n",
"xgb_val_f1 = xgb_results['val_f1_macro']\n",
"xgb_train_f1 = xgb_results['train_f1_macro']\n",
"xgb_gap = xgb_train_f1 - xgb_val_f1\n",
"\n",
"print(f'Random Forest - val_f1_macro: {rf_val_f1:.4f}, overfitting gap: {rf_gap:.4f}')\n",
"print(f'XGBoost - val_f1_macro: {xgb_val_f1:.4f}, overfitting gap: {xgb_gap:.4f}')"
]
},
{
"cell_type": "markdown",
"id": "de4a5bc9",
"metadata": {},
"source": [
"## Step 7: Advanced Hyperparameter Optimisation (Optuna)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e6361576",
"metadata": {},
"outputs": [],
"source": [
"def objective(trial):\n",
" params = {\n",
" 'n_estimators': trial.suggest_int('n_estimators', 100, 500),\n",
" 'max_depth': trial.suggest_int('max_depth', 3, 10),\n",
" 'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),\n",
" 'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),\n",
" 'subsample': trial.suggest_float('subsample', 0.5, 1.0),\n",
" 'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),\n",
" 'gamma': trial.suggest_float('gamma', 0, 5),\n",
" 'reg_alpha': trial.suggest_float('reg_alpha', 1e-4, 10.0, log=True),\n",
" 'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 10.0, log=True),\n",
" 'objective': 'multi:softmax',\n",
" 'num_class': 3,\n",
" 'random_state': RANDOM_STATE,\n",
" 'tree_method': XGB_TREE_METHOD,\n",
" 'device': XGB_DEVICE,\n",
" 'verbosity': 0\n",
" }\n",
" pipeline = Pipeline(steps=[\n",
" ('preprocessor', preprocessor),\n",
" ('classifier', xgb.XGBClassifier(**params))\n",
" ])\n",
" pipeline.fit(X_train, y_train_enc)\n",
" y_pred = pipeline.predict(X_val)\n",
" score = f1_score(y_val_enc, y_pred, average='macro')\n",
" return score\n",
"\n",
"print('Starting Optuna hyperparameter optimisation (30 trials)...')\n",
"study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE))\n",
"study.optimize(objective, n_trials=30, show_progress_bar=False)\n",
"\n",
"print(f'Best trial: {study.best_trial.number} | Best macro-F1: {study.best_value:.4f}')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f7ba4f2a",
"metadata": {},
"outputs": [],
"source": [
"print('\\n=== BEST HYPERPARAMETERS ===')\n",
"best_params = study.best_params\n",
"for k, v in best_params.items():\n",
" print(f' {k}: {v}')\n",
"\n",
"fig = optuna.visualization.matplotlib.plot_optimization_history(study)\n",
"plt.title('Optuna Optimization History')\n",
"plt.tight_layout()\n",
"plt.savefig(os.path.join(OUTPUT_DIR, 'figures', 'optuna_optimization_history.png'), dpi=150)\n",
"plt.show()\n",
"\n",
"fig = optuna.visualization.matplotlib.plot_param_importances(study)\n",
"plt.title('Hyperparameter Importance')\n",
"plt.tight_layout()\n",
"plt.savefig(os.path.join(OUTPUT_DIR, 'figures', 'optuna_param_importance.png'), dpi=150)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "640263ea",
"metadata": {},
"outputs": [],
"source": [
"best_xgb_params = {\n",
" **study.best_params,\n",
" 'objective': 'multi:softmax',\n",
" 'num_class': 3,\n",
" 'random_state': RANDOM_STATE,\n",
" 'tree_method': XGB_TREE_METHOD,\n",
" 'device': XGB_DEVICE,\n",
" 'verbosity': 0\n",
"}\n",
"\n",
"print('Training tuned XGBoost...')\n",
"import time\n",
"start = time.time()\n",
"tuned_xgb_pipeline = Pipeline(steps=[\n",
" ('preprocessor', preprocessor),\n",
" ('classifier', xgb.XGBClassifier(**best_xgb_params))\n",
"])\n",
"tuned_xgb_pipeline.fit(X_train, y_train_enc)\n",
"tuned_time = time.time() - start\n",
"\n",
"tuned_results = evaluate_model(tuned_xgb_pipeline, X_train, y_train_enc, X_val, y_val_enc, le_target, 'XGBoost_Tuned')\n",
"tuned_results['train_time'] = tuned_time\n",
"\n",
"print('\\n=== TUNED XGBOOST RESULTS ===')\n",
"for k, v in tuned_results.items():\n",
" if k != 'model':\n",
" print(f'{k}: {v:.4f}')\n",
"\n",
"print(f'\\nTuning improvement (macro-F1): +{tuned_results[\"val_f1_macro\"] - xgb_results[\"val_f1_macro\"]:.4f}')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "19742e63",
"metadata": {},
"outputs": [],
"source": [
"all_results.append(tuned_results)\n",
"results_df = pd.DataFrame(all_results)\n",
"\n",
"print('\\n=== BEFORE VS AFTER TUNING ===')\n",
"print(results_df[['model', 'val_f1_macro', 'val_accuracy', 'train_time']].round(4).to_string(index=False))"
]
},
{
"cell_type": "markdown",
"id": "d01bcca7",
"metadata": {},
"source": [
"## Step 8: Personalised Improvement (Category A + Category D)\n",
"\n",
"**Student ID last digit = 0 → Category A (Compulsory) + Category D (Optional)**\n",
"\n",
"- **Category A** (Data Quality & Missingness): Add missing value indicator features\n",
"- **Category D** (Robustness & Ensemble): Soft Voting Ensemble (RF + XGBoost)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1f662833",
"metadata": {},
"outputs": [],
"source": [
"print('=== CATEGORY A: IMPROVED MISSING VALUE HANDLING ===')\n",
"\n",
"MISSING_COLS = ['net_monthly_income_gbp', 'avg_payment_delay_days', 'monthly_investment_gbp',\n",
" 'prior_debt_products', 'account_tenure']\n",
"\n",
"for col in MISSING_COLS:\n",
" missing_col_name = f'{col}_missing'\n",
" train_df_clean[missing_col_name] = train_df_clean[col].isnull().astype(int)\n",
" val_df_clean[missing_col_name] = val_df_clean[col].isnull().astype(int)\n",
" test_df_clean[missing_col_name] = test_df_clean[col].isnull().astype(int)\n",
" print(f'Added missing indicator: {missing_col_name}')\n",
"\n",
"feature_cols_catA = feature_cols_all + [f'{c}_missing' for c in MISSING_COLS]\n",
"print(f'\\nFeature columns after adding indicators: {len(feature_cols_catA)}')\n",
"\n",
"X_train_A = train_df_clean[feature_cols_catA]\n",
"X_val_A = val_df_clean[feature_cols_catA]\n",
"X_test_A = test_df_clean[feature_cols_catA]\n",
"\n",
"NUMERIC_FEATURES_A = X_train_A.select_dtypes(include=[np.number]).columns.tolist()\n",
"CATEGORICAL_FEATURES_A = X_train_A.select_dtypes(include=['object']).columns.tolist()\n",
"\n",
"preprocessor_A = ColumnTransformer(\n",
" transformers=[\n",
" ('num', numeric_transformer, NUMERIC_FEATURES_A),\n",
" ('cat', categorical_transformer, CATEGORICAL_FEATURES_A)\n",
" ],\n",
" remainder='drop'\n",
")\n",
"\n",
"catA_pipeline = Pipeline(steps=[\n",
" ('preprocessor', preprocessor_A),\n",
" ('classifier', xgb.XGBClassifier(**best_xgb_params))\n",
"])\n",
"catA_pipeline.fit(X_train_A, y_train_enc)\n",
"\n",
"catA_results = evaluate_model(catA_pipeline, X_train_A, y_train_enc, X_val_A, y_val_enc, le_target, 'XGB_CatA_MissingHandling')\n",
"\n",
"print('\\n=== CATEGORY A RESULTS ===')\n",
"print(f'val_f1_macro: {catA_results[\"val_f1_macro\"]:.4f}')\n",
"print(f'val_accuracy: {catA_results[\"val_accuracy\"]:.4f}')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0069e9d5",
"metadata": {},
"outputs": [],
"source": [
"print('=== CATEGORY D: SOFT VOTING ENSEMBLE ===')\n",
"print('Training Soft Voting Ensemble (RF + XGBoost)...')\n",
"\n",
"rf_clf = RandomForestClassifier(n_estimators=200, class_weight='balanced', random_state=RANDOM_STATE, n_jobs=-1)\n",
"xgb_clf = xgb.XGBClassifier(**best_xgb_params)\n",
"\n",
"voting_clf = VotingClassifier(\n",
" estimators=[\n",
" ('rf', rf_clf),\n",
" ('xgb', xgb_clf)\n",
" ],\n",
" voting='soft',\n",
" n_jobs=-1\n",
")\n",
"\n",
"ensemble_pipeline = Pipeline(steps=[\n",
" ('preprocessor', preprocessor),\n",
" ('classifier', voting_clf)\n",
"])\n",
"ensemble_pipeline.fit(X_train, y_train_enc)\n",
"\n",
"ensemble_results = evaluate_model(ensemble_pipeline, X_train, y_train_enc, X_val, y_val_enc, le_target, 'Ensemble_SoftVoting')\n",
"\n",
"print(f'Ensemble val_f1_macro: {ensemble_results[\"val_f1_macro\"]:.4f}')\n",
"print(f'Ensemble val_accuracy: {ensemble_results[\"val_accuracy\"]:.4f}')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c95e0008",
"metadata": {},
"outputs": [],
"source": [
"all_results.append(catA_results)\n",
"all_results.append(ensemble_results)\n",
"results_df = pd.DataFrame(all_results)\n",
"\n",
"print('\\n=== PERSONALISED IMPROVEMENT SUMMARY ===')\n",
"print(results_df[['model', 'val_f1_macro', 'val_accuracy']].round(4).to_string(index=False))\n",
"\n",
"results_df.to_csv(\n",
" os.path.join(OUTPUT_DIR, 'tables', 'personalised_improvement_summary.csv'), index=False)\n",
"\n",
"improve_A = catA_results['val_f1_macro'] - tuned_results['val_f1_macro']\n",
"improve_D = ensemble_results['val_f1_macro'] - tuned_results['val_f1_macro']\n",
"print(f'\\nCategory A improvement (vs Tuned): +{improve_A:.4f}')\n",
"print(f'Category D improvement (vs Tuned): +{improve_D:.4f}')"
]
},
{
"cell_type": "markdown",
"id": "df4d2cc2",
"metadata": {},
"source": [
"## Step 9: K-Means & GMM Unsupervised Exploration"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5ddfd4d3",
"metadata": {},
"outputs": [],
"source": [
"print('=== K-MEANS & GMM CLUSTERING ===')\n",
"\n",
"preprocessor_eval = ColumnTransformer(\n",
" transformers=[\n",
" ('num', numeric_transformer, NUMERIC_FEATURES),\n",
" ('cat', categorical_transformer, CATEGORICAL_FEATURES)\n",
" ],\n",
" remainder='drop'\n",
")\n",
"\n",
"X_train_scaled = preprocessor_eval.fit_transform(X_train)\n",
"print(f'Scaled training data shape: {X_train_scaled.shape}')\n",
"\n",
"pca = PCA(n_components=2, random_state=RANDOM_STATE)\n",
"X_train_pca = pca.fit_transform(X_train_scaled)\n",
"print(f'PCA explained variance: {pca.explained_variance_ratio_.sum():.4f}')\n",
"\n",
"k_range = range(2, 9)\n",
"kmeans_results = []\n",
"gmm_results = []\n",
"\n",
"for k in k_range:\n",
" print(f' Running k={k}...')\n",
" \n",
" km = KMeans(n_clusters=k, random_state=RANDOM_STATE, n_init=10)\n",
" km_labels = km.fit_predict(X_train_scaled)\n",
" sil_km = silhouette_score(X_train_scaled, km_labels)\n",
" \n",
" gmm_model = GaussianMixture(n_components=k, random_state=RANDOM_STATE, n_init=5)\n",
" gmm_labels = gmm_model.fit_predict(X_train_scaled)\n",
" sil_gmm = silhouette_score(X_train_scaled, gmm_labels)\n",
" \n",
" kmeans_results.append({\n",
" 'k': k,\n",
" 'inertia': km.inertia_,\n",
" 'silhouette_x': sil_km\n",
" })\n",
" gmm_results.append({\n",
" 'k': k,\n",
" 'log_likelihood': gmm_model.score(X_train_scaled) * X_train_scaled.shape[0],\n",
" 'bic': gmm_model.bic(X_train_scaled),\n",
" 'aic': gmm_model.aic(X_train_scaled),\n",
" 'silhouette_y': sil_gmm\n",
" })\n",
"\n",
"km_df = pd.DataFrame(kmeans_results)\n",
"gmm_df = pd.DataFrame(gmm_results)\n",
"cluster_df = km_df.merge(gmm_df, on='k')\n",
"print('\\n=== CLUSTERING COMPARISON ===')\n",
"print(cluster_df.round(4).to_string(index=False))\n",
"\n",
"cluster_df.to_csv(os.path.join(OUTPUT_DIR, 'tables', 'clustering_comparison.csv'), index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d438c228",
"metadata": {},
"outputs": [],
"source": [
"fig, axes = plt.subplots(1, 3, figsize=(15, 4))\n",
"\n",
"axes[0].plot(cluster_df['k'], cluster_df['inertia'], 'bo-', label='K-Means Inertia', linewidth=2)\n",
"axes[0].set_xlabel('k')\n",
"axes[0].set_ylabel('Inertia')\n",
"axes[0].set_title('K-Means: Elbow Method')\n",
"axes[0].grid(True)\n",
"\n",
"axes[1].plot(cluster_df['k'], cluster_df['bic'], 'g^-', label='BIC', linewidth=2)\n",
"axes[1].plot(cluster_df['k'], cluster_df['aic'], 'rs--', label='AIC', linewidth=2)\n",
"axes[1].set_xlabel('k')\n",
"axes[1].set_ylabel('Score')\n",
"axes[1].set_title('GMM: BIC & AIC (lower is better)')\n",
"axes[1].legend()\n",
"axes[1].grid(True)\n",
"\n",
"axes[2].plot(cluster_df['k'], cluster_df['silhouette_x'], 'bo-', label='K-Means', linewidth=2)\n",
"axes[2].plot(cluster_df['k'], cluster_df['silhouette_y'], 'g^-', label='GMM', linewidth=2)\n",
"axes[2].set_xlabel('k')\n",
"axes[2].set_ylabel('Silhouette Score')\n",
"axes[2].set_title('Silhouette Score Comparison (higher is better)')\n",
"axes[2].legend()\n",
"axes[2].grid(True)\n",
"\n",
"plt.tight_layout()\n",
"plt.savefig(os.path.join(OUTPUT_DIR, 'figures', 'clustering_comparison.png'), dpi=150)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "08ba45ed",
"metadata": {},
"outputs": [],
"source": [
"best_k = cluster_df.loc[cluster_df['silhouette_x'].idxmax(), 'k']\n",
"print(f'Best K for K-Means (by silhouette): {best_k}')\n",
"\n",
"km_best = KMeans(n_clusters=int(best_k), random_state=RANDOM_STATE, n_init=10)\n",
"km_best_labels = km_best.fit_predict(X_train_scaled)\n",
"\n",
"fig, ax = plt.subplots(figsize=(8, 6))\n",
"scatter = ax.scatter(X_train_pca[:, 0], X_train_pca[:, 1],\n",
" c=km_best_labels, cmap='viridis', alpha=0.5, s=10)\n",
"ax.set_xlabel('PC1')\n",
"ax.set_ylabel('PC2')\n",
"ax.set_title(f'K-Means Clustering (k={best_k}) - PCA Visualization')\n",
"plt.colorbar(scatter, ax=ax, label='Cluster')\n",
"plt.tight_layout()\n",
"plt.savefig(os.path.join(OUTPUT_DIR, 'figures', 'clustering_visualization.png'), dpi=150)\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"id": "48c4ad67",
"metadata": {},
"source": [
"## Step 10: Final Model Selection & Hidden-Test Export"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "34692aa5",
"metadata": {},
"outputs": [],
"source": [
"print('=== FINAL MODEL SELECTION ===')\n",
"print('Based on val_f1_macro (primary metric):')\n",
"final_model_name = results_df.loc[results_df['val_f1_macro'].idxmax(), 'model']\n",
"print(f'Selected model: {final_model_name} (val_f1_macro = {results_df[\"val_f1_macro\"].max():.4f})')\n",
"\n",
"# Pick the fitted pipeline and the matching validation/test frames. The\n",
"# Category A model must receive the frames that carry its *_missing columns.\n",
"if final_model_name == 'XGB_CatA_MissingHandling':\n",
"    final_pipeline = catA_pipeline\n",
"    X_val_final, X_test_final = X_val_A, X_test_A\n",
"elif final_model_name == 'Ensemble_SoftVoting':\n",
"    final_pipeline = ensemble_pipeline\n",
"    X_val_final, X_test_final = X_val, X_test\n",
"else:\n",
"    final_pipeline = tuned_xgb_pipeline\n",
"    X_val_final, X_test_final = X_val, X_test\n",
"\n",
"# BUGFIX: the previous version predicted on the *test* features here but\n",
"# named the result y_val_*, and its frame choice could omit the Category A\n",
"# indicator columns (crashing the ColumnTransformer). Evaluate on the\n",
"# validation frame that matches the selected model instead.\n",
"plot_confusion_matrix(final_pipeline, X_val_final, y_val_enc, le_target,\n",
"                      f'Final Model: {final_model_name} - Confusion Matrix',\n",
"                      os.path.join(OUTPUT_DIR, 'figures', 'final_model_confusion_matrix.png'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e5d2526b",
"metadata": {},
"outputs": [],
"source": [
"print('\\n=== FINAL CLASSIFICATION REPORT (VAL) ===')\n",
"y_val_pred_final = final_pipeline.predict(X_val_A if final_model_name == 'XGB_CatA_MissingHandling' else X_val)\n",
"print(classification_report(y_val_enc, y_val_pred_final, target_names=le_target.classes_))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "89d712c0",
"metadata": {},
"outputs": [],
"source": [
"STUDENT_ID = '1234560'  # must match the ID on the report cover\n",
"\n",
"# The Category A model was fitted on the frame carrying *_missing indicator\n",
"# columns, so it must predict on that frame; every other model uses X_test.\n",
"# (The previous elif/else branches were byte-identical -- collapsed.)\n",
"if final_model_name == 'XGB_CatA_MissingHandling':\n",
"    y_test_pred = final_pipeline.predict(X_test_A)\n",
"else:\n",
"    y_test_pred = final_pipeline.predict(X_test)\n",
"\n",
"# Map encoded predictions back to the original string labels.\n",
"y_test_labels = le_target.inverse_transform(y_test_pred)\n",
"\n",
"# IDs come from the untouched test_df, which is row-aligned with X_test.\n",
"submission_df = pd.DataFrame({\n",
"    'applicant_id': test_df['applicant_id'],\n",
"    'customer_key': test_df['customer_key'],\n",
"    'premium_risk': y_test_labels\n",
"})\n",
"\n",
"print('=== SUBMISSION CSV VALIDATION ===')\n",
"print(f'Shape: {submission_df.shape}')\n",
"print(f'Columns: {list(submission_df.columns)}')\n",
"print(submission_df.head())\n",
"\n",
"print('\\nPrediction counts:')\n",
"print(submission_df['premium_risk'].value_counts())\n",
"\n",
"csv_path = os.path.join(OUTPUT_DIR, 'predictions', f'test_result_{STUDENT_ID}.csv')\n",
"submission_df.to_csv(csv_path, index=False)\n",
"print(f'\\n*** CSV saved to: {csv_path} ***')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "my_env",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.10.18"
}
},
"nbformat": 4,
"nbformat_minor": 5
}