- added xgboost to classify AU and EyeTracking features

This commit is contained in:
TimoKurz 2026-01-11 17:21:07 +01:00
parent 10bc7c568a
commit fc6c593f6b
2 changed files with 884 additions and 0 deletions

View File

@ -0,0 +1,77 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "e790b157",
"metadata": {},
"source": [
"Im folgenden wird auf die Daten das MAD Outlier removal angewendet."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "46bd036d",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"def calculate_mad_params(df, columns):\n",
" \"\"\"\n",
" Calculate median and MAD parameters for each column.\n",
" This should be run ONLY on the training data.\n",
" \n",
" Returns a dictionary: {col: (median, mad)}\n",
" \"\"\"\n",
" params = {}\n",
" for col in columns:\n",
" median = df[col].median()\n",
" mad = np.median(np.abs(df[col] - median))\n",
" params[col] = (median, mad)\n",
" return params"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e0691732",
"metadata": {},
"outputs": [],
"source": [
"def apply_mad_filter(df, params, threshold=3.5):\n",
" \"\"\"\n",
" Apply MAD-based outlier removal using precomputed parameters.\n",
" Works on training, validation, and test data.\n",
" \n",
" df: DataFrame to filter\n",
" params: dictionary {col: (median, mad)} from training data\n",
" threshold: cutoff for robust Z-score\n",
" \"\"\"\n",
" df_clean = df.copy()\n",
"\n",
" for col, (median, mad) in params.items():\n",
" if mad == 0:\n",
" continue # no spread; nothing to remove for this column\n",
"\n",
" robust_z = 0.6745 * (df_clean[col] - median) / mad\n",
" outlier_mask = np.abs(robust_z) > threshold\n",
"\n",
" # Remove values only in this specific column\n",
" df_clean.loc[outlier_mask, col] = median\n",
" print(df_clean.shape)\n",
" \n",
" print(df_clean.shape)\n",
" return df_clean"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -0,0 +1,807 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "e3be057e-8d2a-4d05-bd42-6b1dc75df5ed",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from pathlib import Path\n",
"from sklearn.preprocessing import StandardScaler, MinMaxScaler"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "13ad96f5",
"metadata": {},
"outputs": [],
"source": [
"# data_path = Path(r\"~/Fahrsimulator_MSY2526_AI/model_training/xgboost/output_windowed.parquet\")\n",
"data_path = Path(r\"~/data-paulusjafahrsimulator-gpu/new_datasets/combined_dataset_25hz.parquet\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4aa1e32c",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"\n",
"def performance_based_split(\n",
" subject_ids,\n",
" performance_df,\n",
" split_ratio=0.33,\n",
" threshold=0.01,\n",
" max_iter=100,\n",
" random_seed=None\n",
"):\n",
" \"\"\"\n",
" Split subjects into two groups based on performance scores with balanced means.\n",
" \n",
" Parameters\n",
" ----------\n",
" subject_ids : array-like\n",
" List or array of subject IDs present in your dataset\n",
" performance_df : pd.DataFrame\n",
" DataFrame containing 'subjectID' and 'overall_score' columns\n",
" split_ratio : float, default=0.33\n",
" Proportion of subjects for the smaller group (0 < split_ratio < 1)\n",
" threshold : float, default=0.01\n",
" Target difference threshold between group means\n",
" max_iter : int, default=100\n",
" Maximum number of swap iterations\n",
" random_seed : int, optional\n",
" Random seed for reproducibility\n",
" \n",
" Returns\n",
" -------\n",
" group_small_ids : np.ndarray\n",
" Subject IDs for the smaller group\n",
" group_large_ids : np.ndarray\n",
" Subject IDs for the larger group\n",
" score_diff : float\n",
" Final absolute difference between group means\n",
" \n",
" Raises\n",
" ------\n",
" ValueError\n",
" If subjects are missing performance scores or no subjects match\n",
" \"\"\"\n",
" if random_seed is not None:\n",
" np.random.seed(random_seed)\n",
" \n",
" # Filter performance data\n",
" perf_filtered = performance_df[\n",
" performance_df[\"subjectID\"].isin(subject_ids)\n",
" ][[\"subjectID\", \"overall_score\"]]\n",
" \n",
" # Merge to get only subjects present in both dataset and performance file\n",
" merged = (\n",
" pd.DataFrame({\"subjectID\": subject_ids})\n",
" .merge(perf_filtered, on=\"subjectID\", how=\"inner\")\n",
" )\n",
" \n",
" if len(merged) == 0:\n",
" raise ValueError(\"No subjects found in both dataset and performance file.\")\n",
" \n",
" # Check for missing scores\n",
" if merged[\"overall_score\"].isna().any():\n",
" raise ValueError(\"Missing score values for some subjects.\")\n",
" \n",
" merged_sorted = merged.sort_values(\"overall_score\", ascending=False).reset_index(drop=True)\n",
" \n",
" scores = merged_sorted[\"overall_score\"].values\n",
" n_total = len(merged_sorted)\n",
" n_small = int(n_total * split_ratio)\n",
" n_large = n_total - n_small\n",
" \n",
" # Initial random split\n",
" idx = np.arange(n_total)\n",
" np.random.shuffle(idx)\n",
" \n",
" small_idx = idx[:n_small]\n",
" large_idx = idx[n_small:]\n",
" \n",
" def score_diff(small_idx, large_idx):\n",
" return abs(scores[small_idx].mean() - scores[large_idx].mean())\n",
" \n",
" diff = score_diff(small_idx, large_idx)\n",
" count = 0\n",
" \n",
" # Optimize via random swaps\n",
" while diff > threshold and count < max_iter:\n",
" si = np.random.choice(small_idx)\n",
" li = np.random.choice(large_idx)\n",
" \n",
" new_small_idx = small_idx.copy()\n",
" new_large_idx = large_idx.copy()\n",
" \n",
" new_small_idx[new_small_idx == si] = li\n",
" new_large_idx[new_large_idx == li] = si\n",
" \n",
" new_diff = score_diff(new_small_idx, new_large_idx)\n",
" \n",
" if new_diff < diff:\n",
" small_idx = new_small_idx\n",
" large_idx = new_large_idx\n",
" diff = new_diff\n",
" \n",
" count += 1\n",
" \n",
" # Extract subject IDs\n",
" group_small_ids = merged_sorted.loc[small_idx, \"subjectID\"].values\n",
" group_large_ids = merged_sorted.loc[large_idx, \"subjectID\"].values\n",
" \n",
" return group_small_ids, group_large_ids, diff"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "95e1a351",
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_parquet(path=data_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "248d519b",
"metadata": {},
"outputs": [],
"source": [
"performance_path = Path(r\"/home/jovyan/data-paulusjafahrsimulator-gpu/subject_performance/3new_au_performance.csv\")\n",
"performance_df = pd.read_csv(performance_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8b9992e0",
"metadata": {},
"outputs": [],
"source": [
"train_ids, temp_ids, diff1 = performance_based_split(\n",
" subject_ids=df[\"subjectID\"].unique(),\n",
" performance_df=performance_df,\n",
" split_ratio=0.6, # 60% train, 40% temp\n",
" random_seed=42\n",
")\n",
"\n",
"val_ids, test_ids, diff2 = performance_based_split(\n",
" subject_ids=temp_ids,\n",
" performance_df=performance_df,\n",
" split_ratio=0.5, # 50/50 split of remaining 40%\n",
" random_seed=43\n",
")\n",
"print(diff1, diff2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "68afd83e",
"metadata": {},
"outputs": [],
"source": [
"subjects = df['subjectID'].unique()\n",
"print(subjects)\n",
"print(len(subjects))\n",
"print(len(subjects)*0.66)\n",
"print(len(subjects)*0.33)\n",
"print(df.columns)\n",
"print(df['STUDY'].unique())\n",
"print(df['LEVEL'].unique())\n",
"print(df['PHASE'].unique())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "52dfd885",
"metadata": {},
"outputs": [],
"source": [
"low_all = df[\n",
" ((df[\"PHASE\"] == \"baseline\") |\n",
" ((df[\"STUDY\"] == \"n-back\") & (df[\"PHASE\"] != \"baseline\") & (df[\"LEVEL\"].isin([1, 4]))))\n",
"]\n",
"print(f\"low all: {low_all.shape}\")\n",
"\n",
"high_nback = df[\n",
" (df[\"STUDY\"]==\"n-back\") &\n",
" (df[\"LEVEL\"].isin([2, 3, 5, 6])) &\n",
" (df[\"PHASE\"].isin([\"train\", \"test\"]))\n",
"]\n",
"print(f\"high n-back: {high_nback.shape}\")\n",
"\n",
"high_kdrive = df[\n",
" (df[\"STUDY\"] == \"k-drive\") & (df[\"PHASE\"] != \"baseline\")\n",
"]\n",
"print(f\"high k-drive: {high_kdrive.shape}\")\n",
"\n",
"high_all = pd.concat([high_nback, high_kdrive])\n",
"print(f\"high all: {high_all.shape}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8fba6edf",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.preprocessing import MinMaxScaler, StandardScaler\n",
"import pandas as pd\n",
"\n",
"def fit_normalizer(train_data, au_columns, method='standard', scope='global'):\n",
" \"\"\"\n",
" Fit normalization scalers on training data.\n",
" \n",
" Parameters:\n",
" -----------\n",
" train_data : pd.DataFrame\n",
" Training dataframe with AU columns and subjectID\n",
" au_columns : list\n",
" List of AU column names to normalize\n",
" method : str, default='standard'\n",
" Normalization method: 'standard' for StandardScaler or 'minmax' for MinMaxScaler\n",
" scope : str, default='global'\n",
" Normalization scope: 'subject' for per-subject or 'global' for across all subjects\n",
" \n",
" Returns:\n",
" --------\n",
" dict\n",
" Dictionary containing fitted scalers\n",
" \"\"\"\n",
" # Select scaler based on method\n",
" if method == 'standard':\n",
" Scaler = StandardScaler\n",
" elif method == 'minmax':\n",
" Scaler = MinMaxScaler\n",
" else:\n",
" raise ValueError(\"method must be 'standard' or 'minmax'\")\n",
" \n",
" scalers = {}\n",
" \n",
" if scope == 'subject':\n",
" # Fit one scaler per subject\n",
" for subject in train_data['subjectID'].unique():\n",
" subject_mask = train_data['subjectID'] == subject\n",
" scaler = Scaler()\n",
" scaler.fit(train_data.loc[subject_mask, au_columns])\n",
" scalers[subject] = scaler\n",
" \n",
" elif scope == 'global':\n",
" # Fit one scaler for all subjects\n",
" scaler = Scaler()\n",
" scaler.fit(train_data[au_columns])\n",
" scalers['global'] = scaler\n",
" \n",
" else:\n",
" raise ValueError(\"scope must be 'subject' or 'global'\")\n",
" \n",
" return {'scalers': scalers, 'method': method, 'scope': scope}\n",
"\n",
"def apply_normalizer(data, au_columns, normalizer_dict):\n",
" \"\"\"\n",
" Apply fitted normalization scalers to data.\n",
" \n",
" Parameters:\n",
" -----------\n",
" data : pd.DataFrame\n",
" Dataframe with AU columns and subjectID\n",
" au_columns : list\n",
" List of AU column names to normalize\n",
" normalizer_dict : dict\n",
" Dictionary containing fitted scalers from fit_normalizer()\n",
" \n",
" Returns:\n",
" --------\n",
" pd.DataFrame\n",
" DataFrame with normalized AU columns\n",
" \"\"\"\n",
" normalized_data = data.copy()\n",
" scalers = normalizer_dict['scalers']\n",
" scope = normalizer_dict['scope']\n",
" \n",
" if scope == 'subject':\n",
" # Apply per-subject normalization\n",
" for subject in data['subjectID'].unique():\n",
" subject_mask = data['subjectID'] == subject\n",
" \n",
" # Use the subject's scaler if available, otherwise use a fitted scaler from training\n",
" if subject in scalers:\n",
" scaler = scalers[subject]\n",
" else:\n",
" # For new subjects not seen in training, use the first available scaler\n",
" # (This is a fallback - ideally all test subjects should be in training for subject-level normalization)\n",
" print(f\"Warning: Subject {subject} not found in training data. Using fallback scaler.\")\n",
" scaler = list(scalers.values())[0]\n",
" \n",
" normalized_data.loc[subject_mask, au_columns] = scaler.transform(\n",
" data.loc[subject_mask, au_columns]\n",
" )\n",
" \n",
" elif scope == 'global':\n",
" # Apply global normalization\n",
" scaler = scalers['global']\n",
" normalized_data[au_columns] = scaler.transform(data[au_columns])\n",
" \n",
" return normalized_data"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "24e3a77b",
"metadata": {},
"outputs": [],
"source": [
"%pip install xgboost"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8e7fa0fa",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from sklearn.model_selection import train_test_split,StratifiedKFold, GridSearchCV\n",
"from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report, confusion_matrix\n",
"import xgboost as xgb\n",
"import joblib\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "325ef71c",
"metadata": {},
"outputs": [],
"source": [
"low = low_all.copy()\n",
"high = high_all.copy()\n",
"\n",
"low[\"label\"] = 0\n",
"high[\"label\"] = 1\n",
"\n",
"data = pd.concat([low, high], ignore_index=True)\n",
"data = data.drop_duplicates()\n",
"\n",
"print(\"Label distribution:\")\n",
"print(data[\"label\"].value_counts())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "67d70e84",
"metadata": {},
"outputs": [],
"source": [
"face_au_cols = [c for c in train_df.columns if c.startswith(\"FACE_AU\")]\n",
"eye_cols = ['Fix_count_short_66_150', 'Fix_count_medium_300_500',\n",
" 'Fix_count_long_gt_1000', 'Fix_count_100', 'Fix_mean_duration',\n",
" 'Fix_median_duration', 'Sac_count', 'Sac_mean_amp', 'Sac_mean_dur',\n",
" 'Sac_median_dur', 'Blink_count', 'Blink_mean_dur', 'Blink_median_dur',\n",
" 'Pupil_mean', 'Pupil_IPA']\n",
"print(len(eye_cols))\n",
"all_signal_columns = face_au_cols+eye_cols\n",
"print(len(all_signal_columns))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b19eb87b",
"metadata": {},
"outputs": [],
"source": [
"low_all = df[\n",
" ((df[\"PHASE\"] == \"baseline\") |\n",
" ((df[\"STUDY\"] == \"n-back\") & (df[\"PHASE\"] != \"baseline\") & (df[\"LEVEL\"].isin([1, 4]))))\n",
"]\n",
"print(f\"low all: {low_all.shape}\")\n",
"\n",
"high_nback = df[\n",
" (df[\"STUDY\"]==\"n-back\") &\n",
" (df[\"LEVEL\"].isin([2, 3, 5, 6])) &\n",
" (df[\"PHASE\"].isin([\"train\", \"test\"]))\n",
"]\n",
"print(f\"high n-back: {high_nback.shape}\")\n",
"\n",
"high_kdrive = df[\n",
" (df[\"STUDY\"] == \"k-drive\") & (df[\"PHASE\"] != \"baseline\")\n",
"]\n",
"print(f\"high k-drive: {high_kdrive.shape}\")\n",
"\n",
"high_all = pd.concat([high_nback, high_kdrive])\n",
"print(f\"high all: {high_all.shape}\")\n",
"\n",
"\n",
"\n",
"low = low_all.copy()\n",
"high = high_all.copy()\n",
"\n",
"low[\"label\"] = 0\n",
"high[\"label\"] = 1\n",
"\n",
"data = pd.concat([low, high], ignore_index=True)\n",
"df = data.drop_duplicates()\n",
"\n",
"print(\"Label distribution:\")\n",
"print(df[\"label\"].value_counts())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "960bb8c7",
"metadata": {},
"outputs": [],
"source": [
"train_df = df[\n",
" (df.subjectID.isin(train_ids)) & (df['label'] == 0)\n",
"].copy()\n",
"\n",
"# Validation: balanced sampling of label=0 and label=1\n",
"val_df_full = df[df.subjectID.isin(val_ids)].copy()\n",
"\n",
"# Get all label=0 samples\n",
"val_df_label0 = val_df_full[val_df_full['label'] == 0]\n",
"\n",
"# Sample same number from label=1\n",
"n_samples = len(val_df_label0)\n",
"val_df_label1 = val_df_full[val_df_full['label'] == 1].sample(\n",
" n=n_samples, random_state=42\n",
")\n",
"\n",
"# Combine\n",
"val_df = pd.concat([val_df_label0, val_df_label1], ignore_index=True)\n",
"test_df = df[df.subjectID.isin(test_ids)]\n",
"print(train_df.shape, val_df.shape,test_df.shape)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dbb58abd",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"def calculate_mad_params(df, columns):\n",
" \"\"\"\n",
" Calculate median and MAD parameters for each column.\n",
" This should be run ONLY on the training data.\n",
" \n",
" Returns a dictionary: {col: (median, mad)}\n",
" \"\"\"\n",
" params = {}\n",
" for col in columns:\n",
" median = df[col].median()\n",
" mad = np.median(np.abs(df[col] - median))\n",
" params[col] = (median, mad)\n",
" return params\n",
"\n",
"def apply_mad_filter(df, params, threshold=3.5):\n",
" \"\"\"\n",
" Apply MAD-based outlier removal using precomputed parameters.\n",
" Works on training, validation, and test data.\n",
" \n",
" df: DataFrame to filter\n",
" params: dictionary {col: (median, mad)} from training data\n",
" threshold: cutoff for robust Z-score\n",
" \"\"\"\n",
" df_clean = df.copy()\n",
"\n",
" for col, (median, mad) in params.items():\n",
" if mad == 0:\n",
" continue # no spread; nothing to remove for this column\n",
"\n",
" robust_z = 0.6745 * (df_clean[col] - median) / mad\n",
" outlier_mask = np.abs(robust_z) > threshold\n",
"\n",
" # Remove values only in this specific column\n",
" df_clean.loc[outlier_mask, col] = median\n",
" \n",
" return df_clean"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0f03f1b4",
"metadata": {},
"outputs": [],
"source": [
"# # Step 1: Fit parameters on training data\n",
"# params = calculate_mad_params(train_df, au_columns)\n",
"\n",
"# # Step 2: Apply filter consistently\n",
"# train_outlier_removed = apply_mad_filter(train_df, params, threshold=3.5)\n",
"# val_outlier_removed = apply_mad_filter(val_df, params, threshold=50)\n",
"# test_outlier_removed = apply_mad_filter(test_df, params, threshold=50)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "289f6b89",
"metadata": {},
"outputs": [],
"source": [
"print(train_df.subjectID.unique())\n",
"print(df.subjectID.unique())\n",
"\n",
"normalizer = fit_normalizer(df, all_signal_columns, method='standard', scope='subject')\n",
"train_df_norm = apply_normalizer(train_df, all_signal_columns, normalizer)\n",
"val_df_norm = apply_normalizer(val_df, all_signal_columns, normalizer)\n",
"test_df_norm = apply_normalizer(test_df, all_signal_columns, normalizer)\n",
"\n",
"# normalizer = fit_normalizer(train_outlier_removed, au_columns, method=\"standard\", scope=\"global\")\n",
"\n",
"# train_scaled = apply_normalizer(train_outlier_removed, normalizer, au_columns)\n",
"# val_scaled = apply_normalizer(val_df, normalizer, au_columns)\n",
"# test_scaled = apply_normalizer(test_df, normalizer, au_columns)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5df30e8d",
"metadata": {},
"outputs": [],
"source": [
"X_train, y_train = train_df[all_signal_columns].values, train_df[\"label\"].values\n",
"X_val, y_val = val_df[all_signal_columns].values, val_df[\"label\"].values\n",
"X_test, y_test = test_df[all_signal_columns].values, test_df[\"label\"].values"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6fb7c86a",
"metadata": {},
"outputs": [],
"source": [
"import xgboost as xgb\n",
"from sklearn.model_selection import GroupKFold, GridSearchCV\n",
"\n",
"# EarlyStopping mit kürzerem Patience\n",
"early_stop = xgb.callback.EarlyStopping(\n",
" rounds=25, metric_name='auc', data_name='validation_0', save_best=True\n",
")\n",
"\n",
"# Basis-Modell: nur feste Parameter, keine Optimierungswerte\n",
"xgb_clf = xgb.XGBClassifier(\n",
" objective=\"binary:logistic\",\n",
" scale_pos_weight=1100/1550, # Klassenungleichgewicht berücksichtigen\n",
" eval_metric=[\"logloss\", \"auc\", \"error\"],\n",
" use_label_encoder=False,\n",
" random_state=42,\n",
" callbacks=[early_stop],\n",
" verbosity=0\n",
")\n",
"\n",
"# Parameter-Raster für GridSearch\n",
"param_grid = {\n",
" \"learning_rate\": [0.01, 0.05, 0.1],\n",
" \"max_depth\": [2, 3],\n",
" \"subsample\": [0.5, 0.6, 0.7],\n",
" \"colsample_bytree\": [0.5, 0.6, 0.7],\n",
" \"reg_alpha\": [0.1, 1, 5, 10],\n",
" \"reg_lambda\": [5, 10, 20, 50],\n",
" \"min_child_weight\": [10, 20, 50],\n",
" \"max_delta_step\": [1, 5, 10],\n",
" \"n_estimators\": [500, 1000, 2000]\n",
"}\n",
"\n",
"# K-Fold Cross Validation\n",
"cv = GroupKFold(n_splits=5, shuffle=True, random_state=42)\n",
"\n",
"# Grid Search Setup\n",
"grid_search = GridSearchCV(\n",
" estimator=xgb_clf,\n",
" param_grid=param_grid,\n",
" scoring=\"roc_auc\",\n",
" n_jobs=-1,\n",
" cv=cv,\n",
" verbose=2\n",
")\n",
"\n",
"# Training mit Cross Validation, Gruppen übergeben\n",
"X_train = train_df[all_signal_columns].values\n",
"y_train = train_df[\"label\"].values\n",
"groups = train_df[\"subjectID\"].values\n",
"\n",
"# Training mit Cross Validation\n",
"grid_search.fit(\n",
" X_train, y_train,\n",
" groups=groups,\n",
" eval_set=[(X_train, y_train), (X_val, y_val)],\n",
" verbose=False,\n",
")\n",
"\n",
"print(\"Beste Parameter:\", grid_search.best_params_)\n",
"print(\"Bestes AUC:\", grid_search.best_score_)\n",
"\n",
"# Bestes Modell extrahieren\n",
"model = grid_search.best_estimator_"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d2681022",
"metadata": {},
"outputs": [],
"source": [
"# Plots\n",
"\n",
"results = model.evals_result()\n",
"epochs = len(results['validation_0']['auc'])\n",
"x_axis = range(0, epochs)\n",
"\n",
"# --- Plot Loss ---\n",
"plt.figure(figsize=(8,6))\n",
"plt.plot(x_axis, results['validation_0']['logloss'], label='Validation Loss')\n",
"plt.plot(x_axis, results['validation_1']['logloss'], label='Training Loss')\n",
"plt.legend()\n",
"plt.xlabel('Epochs')\n",
"plt.ylabel('Logloss')\n",
"plt.title('XGBoost Loss during Training')\n",
"plt.grid(True)\n",
"plt.show()\n",
"\n",
"# --- Plot Accuracy ---\n",
"plt.figure(figsize=(8,6))\n",
"plt.plot(x_axis, [1-e for e in results['validation_0']['error']], label='Validation Accuracy')\n",
"plt.plot(x_axis, [1-e for e in results['validation_1']['error']], label='Training Accuracy')\n",
"plt.legend()\n",
"plt.xlabel('Epochs')\n",
"plt.ylabel('Accuracy')\n",
"plt.title('XGBoost Accuracy during Training')\n",
"plt.grid(True)\n",
"plt.show()\n",
"\n",
"# Plot AUC\n",
"\n",
"plt.figure(figsize=(8,6))\n",
"plt.plot(x_axis, results['validation_0']['auc'], label='Validation AUC')\n",
"plt.plot(x_axis, results['validation_1']['auc'], marker='o')\n",
"plt.legend()\n",
"plt.xlabel('Epochs')\n",
"plt.ylabel('AUC')\n",
"plt.title('XGBoost AUC during Training')\n",
"plt.grid(True)\n",
"plt.show()\n",
"\n",
"# ROC-Kurve plotten\n",
"y_pred_proba = model.predict_proba(X_val)[:, 1]\n",
"# RocCurveDisplay.from_predictions(y_val, y_pred_proba)\n",
"plt.title(\"ROC Curve (Validation Set)\")\n",
"plt.grid(True)\n",
"plt.show()\n",
"\n",
"# Test: Loss und Accuracy\n",
"y_test_proba = model.predict_proba(X_test)[:,1]\n",
"y_test_pred = (y_test_proba > 0.5).astype(int)\n",
"\n",
"# print(\"Test Loss:\", log_loss(y_test, y_test_proba))\n",
"print(\"Test Accuracy:\", accuracy_score(y_test, y_test_pred))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "09a8cd21",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, roc_auc_score, classification_report, ConfusionMatrixDisplay\n",
"\n",
"def evaluate(model, X, y, title=\"Evaluation\"):\n",
" # Vorhersagen\n",
" preds_proba = model.predict_proba(X)[:, 1]\n",
" preds = (preds_proba > 0.5).astype(int)\n",
"\n",
" # Metriken ausgeben\n",
" print(\"Accuracy:\", accuracy_score(y, preds))\n",
" print(\"F1:\", f1_score(y, preds))\n",
" print(\"AUC:\", roc_auc_score(y, preds))\n",
" print(\"Confusion:\\n\", confusion_matrix(y, preds))\n",
" print(classification_report(y, preds))\n",
"\n",
" # Confusion Matrix plotten\n",
" def plot_confusion_matrix(true_labels, predictions, label_names):\n",
" for normalize in [None, 'true']:\n",
" cm = confusion_matrix(true_labels, predictions, normalize=normalize)\n",
" cm_disp = ConfusionMatrixDisplay(cm, display_labels=label_names)\n",
" cm_disp.plot(cmap=\"Blues\")\n",
" #cm = confusion_matrix(y, preds)\n",
" plot_confusion_matrix(y,preds, label_names=['Low','High'])\n",
" # plt.figure(figsize=(5,4))\n",
" # sns.heatmap(cm, annot=True, fmt=\"d\", cmap=\"Blues\", cbar=False,\n",
" # xticklabels=[\"Predicted low\", \"Predicted high\"],\n",
" # yticklabels=[\"Actual low\", \"Actual high\"])\n",
" # plt.title(f\"Confusion Matrix - {title}\")\n",
" # plt.ylabel(\"True label\")\n",
" # plt.xlabel(\"Predicted label\")\n",
" # plt.show()\n",
"\n",
"# Aufrufen für Train/Val/Test\n",
"print(\"TRAIN:\")\n",
"evaluate(model, X_train, y_train, title=\"Train\")\n",
"\n",
"print(\"VAL:\")\n",
"evaluate(model, X_val, y_val, title=\"Validation\")\n",
"\n",
"print(\"TEST:\")\n",
"evaluate(model, X_test, y_test, title=\"Test\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c43b0c80",
"metadata": {},
"outputs": [],
"source": [
"joblib.dump(model, \"xgb_model_with_MAD.joblib\")\n",
"joblib.dump(normalizer, \"normalizer_with_MAD.joblib\")\n",
"print(\"Model gespeichert.\")\n",
"\n",
"model.save_model(\"xgb_model_with_MAD.json\") # als JSON (lesbar, portabel)\n",
"model.save_model(\"xgb_model_with_MAD.bin\") # als Binärdatei (kompakt)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3195cc84",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"os.getcwd()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}