def calculate_mad_params(df, columns):
    """
    Compute the median and the median absolute deviation (MAD) per column.

    The parameters are meant to be fitted on the training data only and then
    reused unchanged for validation and test data.

    Parameters
    ----------
    df : pd.DataFrame
        Data to derive the parameters from.
    columns : iterable of str
        Names of the columns to process.

    Returns
    -------
    dict
        Mapping ``{column: (median, mad)}`` in the order of ``columns``.
    """
    params = {}
    for col in columns:
        values = df[col]
        median = values.median()
        # Series.median() ignores NaN whereas np.median() propagates it.
        # Using the pandas reduction for both statistics keeps them
        # consistent, so one missing value no longer turns the MAD into NaN
        # (which would silently disable filtering for that column).
        mad = (values - median).abs().median()
        params[col] = (median, mad)
    return params
" print(df_clean.shape)\n", + " \n", + " print(df_clean.shape)\n", + " return df_clean" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/model_training/xgboost/xgboost_new_dataset.ipynb b/model_training/xgboost/xgboost_new_dataset.ipynb new file mode 100644 index 0000000..3c4baa2 --- /dev/null +++ b/model_training/xgboost/xgboost_new_dataset.ipynb @@ -0,0 +1,807 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "e3be057e-8d2a-4d05-bd42-6b1dc75df5ed", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from pathlib import Path\n", + "from sklearn.preprocessing import StandardScaler, MinMaxScaler" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "13ad96f5", + "metadata": {}, + "outputs": [], + "source": [ + "# data_path = Path(r\"~/Fahrsimulator_MSY2526_AI/model_training/xgboost/output_windowed.parquet\")\n", + "data_path = Path(r\"~/data-paulusjafahrsimulator-gpu/new_datasets/combined_dataset_25hz.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4aa1e32c", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "\n", + "def performance_based_split(\n", + " subject_ids,\n", + " performance_df,\n", + " split_ratio=0.33,\n", + " threshold=0.01,\n", + " max_iter=100,\n", + " random_seed=None\n", + "):\n", + " \"\"\"\n", + " Split subjects into two groups based on performance scores with balanced means.\n", + " \n", + " Parameters\n", + " ----------\n", + " subject_ids : array-like\n", + " List or array of subject IDs present in your dataset\n", + " performance_df : pd.DataFrame\n", + " DataFrame containing 'subjectID' and 'overall_score' columns\n", + " split_ratio : float, default=0.33\n", + " Proportion of subjects for the smaller group (0 < split_ratio < 1)\n", + " threshold : float, default=0.01\n", + " Target 
difference threshold between group means\n", + " max_iter : int, default=100\n", + " Maximum number of swap iterations\n", + " random_seed : int, optional\n", + " Random seed for reproducibility\n", + " \n", + " Returns\n", + " -------\n", + " group_small_ids : np.ndarray\n", + " Subject IDs for the smaller group\n", + " group_large_ids : np.ndarray\n", + " Subject IDs for the larger group\n", + " score_diff : float\n", + " Final absolute difference between group means\n", + " \n", + " Raises\n", + " ------\n", + " ValueError\n", + " If subjects are missing performance scores or no subjects match\n", + " \"\"\"\n", + " if random_seed is not None:\n", + " np.random.seed(random_seed)\n", + " \n", + " # Filter performance data\n", + " perf_filtered = performance_df[\n", + " performance_df[\"subjectID\"].isin(subject_ids)\n", + " ][[\"subjectID\", \"overall_score\"]]\n", + " \n", + " # Merge to get only subjects present in both dataset and performance file\n", + " merged = (\n", + " pd.DataFrame({\"subjectID\": subject_ids})\n", + " .merge(perf_filtered, on=\"subjectID\", how=\"inner\")\n", + " )\n", + " \n", + " if len(merged) == 0:\n", + " raise ValueError(\"No subjects found in both dataset and performance file.\")\n", + " \n", + " # Check for missing scores\n", + " if merged[\"overall_score\"].isna().any():\n", + " raise ValueError(\"Missing score values for some subjects.\")\n", + " \n", + " merged_sorted = merged.sort_values(\"overall_score\", ascending=False).reset_index(drop=True)\n", + " \n", + " scores = merged_sorted[\"overall_score\"].values\n", + " n_total = len(merged_sorted)\n", + " n_small = int(n_total * split_ratio)\n", + " n_large = n_total - n_small\n", + " \n", + " # Initial random split\n", + " idx = np.arange(n_total)\n", + " np.random.shuffle(idx)\n", + " \n", + " small_idx = idx[:n_small]\n", + " large_idx = idx[n_small:]\n", + " \n", + " def score_diff(small_idx, large_idx):\n", + " return abs(scores[small_idx].mean() - 
scores[large_idx].mean())\n", + " \n", + " diff = score_diff(small_idx, large_idx)\n", + " count = 0\n", + " \n", + " # Optimize via random swaps\n", + " while diff > threshold and count < max_iter:\n", + " si = np.random.choice(small_idx)\n", + " li = np.random.choice(large_idx)\n", + " \n", + " new_small_idx = small_idx.copy()\n", + " new_large_idx = large_idx.copy()\n", + " \n", + " new_small_idx[new_small_idx == si] = li\n", + " new_large_idx[new_large_idx == li] = si\n", + " \n", + " new_diff = score_diff(new_small_idx, new_large_idx)\n", + " \n", + " if new_diff < diff:\n", + " small_idx = new_small_idx\n", + " large_idx = new_large_idx\n", + " diff = new_diff\n", + " \n", + " count += 1\n", + " \n", + " # Extract subject IDs\n", + " group_small_ids = merged_sorted.loc[small_idx, \"subjectID\"].values\n", + " group_large_ids = merged_sorted.loc[large_idx, \"subjectID\"].values\n", + " \n", + " return group_small_ids, group_large_ids, diff" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "95e1a351", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_parquet(path=data_path)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "248d519b", + "metadata": {}, + "outputs": [], + "source": [ + "performance_path = Path(r\"/home/jovyan/data-paulusjafahrsimulator-gpu/subject_performance/3new_au_performance.csv\")\n", + "performance_df = pd.read_csv(performance_path)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8b9992e0", + "metadata": {}, + "outputs": [], + "source": [ + "train_ids, temp_ids, diff1 = performance_based_split(\n", + " subject_ids=df[\"subjectID\"].unique(),\n", + " performance_df=performance_df,\n", + " split_ratio=0.6, # 60% train, 40% temp\n", + " random_seed=42\n", + ")\n", + "\n", + "val_ids, test_ids, diff2 = performance_based_split(\n", + " subject_ids=temp_ids,\n", + " performance_df=performance_df,\n", + " split_ratio=0.5, # 50/50 split of remaining 40%\n", + " 
def fit_normalizer(train_data, au_columns, method='standard', scope='global'):
    """
    Fit normalization scalers on training data.

    Parameters
    ----------
    train_data : pd.DataFrame
        Dataframe holding the columns to normalize and a 'subjectID' column.
    au_columns : list
        Names of the columns to normalize.
    method : str, default='standard'
        'standard' for StandardScaler or 'minmax' for MinMaxScaler.
    scope : str, default='global'
        'subject' fits one scaler per subject, 'global' fits a single scaler
        over all rows.

    Returns
    -------
    dict
        ``{'scalers': ..., 'method': method, 'scope': scope}`` where
        ``'scalers'`` maps each subject ID (scope 'subject') or the key
        'global' (scope 'global') to its fitted scaler.

    Raises
    ------
    ValueError
        If ``method`` or ``scope`` is not one of the supported values.
    """
    # Validate both options before any fitting work is done.
    if method not in ('standard', 'minmax'):
        raise ValueError("method must be 'standard' or 'minmax'")
    if scope not in ('subject', 'global'):
        raise ValueError("scope must be 'subject' or 'global'")

    from sklearn.preprocessing import MinMaxScaler, StandardScaler

    scaler_cls = StandardScaler if method == 'standard' else MinMaxScaler

    scalers = {}
    if scope == 'subject':
        for subject in train_data['subjectID'].unique():
            subject_mask = train_data['subjectID'] == subject
            scalers[subject] = scaler_cls().fit(train_data.loc[subject_mask, au_columns])
    else:
        scalers['global'] = scaler_cls().fit(train_data[au_columns])

    return {'scalers': scalers, 'method': method, 'scope': scope}


def apply_normalizer(data, au_columns, normalizer_dict):
    """
    Apply fitted normalization scalers to data.

    Parameters
    ----------
    data : pd.DataFrame
        Dataframe holding the columns to normalize and a 'subjectID' column.
        It is not modified.
    au_columns : list
        Names of the columns to normalize.
    normalizer_dict : dict
        Result of ``fit_normalizer()``; its 'scalers' and 'scope' entries
        are used.

    Returns
    -------
    pd.DataFrame
        Copy of ``data`` with the given columns normalized.

    Raises
    ------
    ValueError
        If the stored scope is neither 'subject' nor 'global', or if a
        fallback scaler is needed for an unseen subject but none was fitted.

    Notes
    -----
    With scope 'subject', a subject without its own scaler is transformed
    with the first fitted scaler and a warning is printed. That scaler
    belongs to a different subject, so treat such output with care.
    """
    scalers = normalizer_dict['scalers']
    scope = normalizer_dict['scope']

    # Fail loudly instead of handing back un-normalized data.
    if scope not in ('subject', 'global'):
        raise ValueError("scope must be 'subject' or 'global'")

    normalized_data = data.copy()

    if scope == 'global':
        normalized_data[au_columns] = scalers['global'].transform(data[au_columns])
        return normalized_data

    # Scaled values are floats; cast once up front so the row-wise writes
    # below never have to upcast integer columns implicitly.
    normalized_data[au_columns] = normalized_data[au_columns].astype(float)

    fallback = next(iter(scalers.values()), None)
    for subject in data['subjectID'].unique():
        subject_mask = data['subjectID'] == subject

        if subject in scalers:
            scaler = scalers[subject]
        else:
            if fallback is None:
                raise ValueError(
                    "normalizer_dict contains no fitted scalers to fall back on."
                )
            print(f"Warning: Subject {subject} not found in training data. Using fallback scaler.")
            scaler = fallback

        normalized_data.loc[subject_mask, au_columns] = scaler.transform(
            data.loc[subject_mask, au_columns]
        )

    return normalized_data
# Feature columns: facial action units plus eye-tracking statistics.
# The action-unit columns are taken from ``df`` because ``train_df`` is only
# created further down; referencing it here fails with a NameError when the
# notebook is executed from top to bottom.
face_au_cols = [c for c in df.columns if c.startswith("FACE_AU")]
eye_cols = ['Fix_count_short_66_150', 'Fix_count_medium_300_500',
            'Fix_count_long_gt_1000', 'Fix_count_100', 'Fix_mean_duration',
            'Fix_median_duration', 'Sac_count', 'Sac_mean_amp', 'Sac_mean_dur',
            'Sac_median_dur', 'Blink_count', 'Blink_mean_dur', 'Blink_median_dur',
            'Pupil_mean', 'Pupil_IPA']
print(len(eye_cols))
all_signal_columns = face_au_cols + eye_cols
print(len(all_signal_columns))

# Low load: every baseline phase, plus n-back levels 1 and 4 outside baseline.
low_all = df[
    ((df["PHASE"] == "baseline") |
     ((df["STUDY"] == "n-back") & (df["PHASE"] != "baseline") & (df["LEVEL"].isin([1, 4]))))
]
print(f"low all: {low_all.shape}")

# High load: n-back levels 2, 3, 5, 6 during the train/test phases ...
high_nback = df[
    (df["STUDY"] == "n-back") &
    (df["LEVEL"].isin([2, 3, 5, 6])) &
    (df["PHASE"].isin(["train", "test"]))
]
print(f"high n-back: {high_nback.shape}")

# ... and every non-baseline k-drive sample.
high_kdrive = df[
    (df["STUDY"] == "k-drive") & (df["PHASE"] != "baseline")
]
print(f"high k-drive: {high_kdrive.shape}")

high_all = pd.concat([high_nback, high_kdrive])
print(f"high all: {high_all.shape}")

low = low_all.copy()
high = high_all.copy()

low["label"] = 0
high["label"] = 1

# From here on ``df`` is the labelled, de-duplicated dataset.
data = pd.concat([low, high], ignore_index=True)
df = data.drop_duplicates()

print("Label distribution:")
print(df["label"].value_counts())
"metadata": {}, + "outputs": [], + "source": [ + "train_df = df[\n", + " (df.subjectID.isin(train_ids)) & (df['label'] == 0)\n", + "].copy()\n", + "\n", + "# Validation: balanced sampling of label=0 and label=1\n", + "val_df_full = df[df.subjectID.isin(val_ids)].copy()\n", + "\n", + "# Get all label=0 samples\n", + "val_df_label0 = val_df_full[val_df_full['label'] == 0]\n", + "\n", + "# Sample same number from label=1\n", + "n_samples = len(val_df_label0)\n", + "val_df_label1 = val_df_full[val_df_full['label'] == 1].sample(\n", + " n=n_samples, random_state=42\n", + ")\n", + "\n", + "# Combine\n", + "val_df = pd.concat([val_df_label0, val_df_label1], ignore_index=True)\n", + "test_df = df[df.subjectID.isin(test_ids)]\n", + "print(train_df.shape, val_df.shape,test_df.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dbb58abd", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "def calculate_mad_params(df, columns):\n", + " \"\"\"\n", + " Calculate median and MAD parameters for each column.\n", + " This should be run ONLY on the training data.\n", + " \n", + " Returns a dictionary: {col: (median, mad)}\n", + " \"\"\"\n", + " params = {}\n", + " for col in columns:\n", + " median = df[col].median()\n", + " mad = np.median(np.abs(df[col] - median))\n", + " params[col] = (median, mad)\n", + " return params\n", + "\n", + "def apply_mad_filter(df, params, threshold=3.5):\n", + " \"\"\"\n", + " Apply MAD-based outlier removal using precomputed parameters.\n", + " Works on training, validation, and test data.\n", + " \n", + " df: DataFrame to filter\n", + " params: dictionary {col: (median, mad)} from training data\n", + " threshold: cutoff for robust Z-score\n", + " \"\"\"\n", + " df_clean = df.copy()\n", + "\n", + " for col, (median, mad) in params.items():\n", + " if mad == 0:\n", + " continue # no spread; nothing to remove for this column\n", + "\n", + " robust_z = 0.6745 * 
(df_clean[col] - median) / mad\n", + " outlier_mask = np.abs(robust_z) > threshold\n", + "\n", + " # Remove values only in this specific column\n", + " df_clean.loc[outlier_mask, col] = median\n", + " \n", + " return df_clean" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f03f1b4", + "metadata": {}, + "outputs": [], + "source": [ + "# # Step 1: Fit parameters on training data\n", + "# params = calculate_mad_params(train_df, au_columns)\n", + "\n", + "# # Step 2: Apply filter consistently\n", + "# train_outlier_removed = apply_mad_filter(train_df, params, threshold=3.5)\n", + "# val_outlier_removed = apply_mad_filter(val_df, params, threshold=50)\n", + "# test_outlier_removed = apply_mad_filter(test_df, params, threshold=50)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "289f6b89", + "metadata": {}, + "outputs": [], + "source": [ + "print(train_df.subjectID.unique())\n", + "print(df.subjectID.unique())\n", + "\n", + "normalizer = fit_normalizer(df, all_signal_columns, method='standard', scope='subject')\n", + "train_df_norm = apply_normalizer(train_df, all_signal_columns, normalizer)\n", + "val_df_norm = apply_normalizer(val_df, all_signal_columns, normalizer)\n", + "test_df_norm = apply_normalizer(test_df, all_signal_columns, normalizer)\n", + "\n", + "# normalizer = fit_normalizer(train_outlier_removed, au_columns, method=\"standard\", scope=\"global\")\n", + "\n", + "# train_scaled = apply_normalizer(train_outlier_removed, normalizer, au_columns)\n", + "# val_scaled = apply_normalizer(val_df, normalizer, au_columns)\n", + "# test_scaled = apply_normalizer(test_df, normalizer, au_columns)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5df30e8d", + "metadata": {}, + "outputs": [], + "source": [ + "X_train, y_train = train_df[all_signal_columns].values, train_df[\"label\"].values\n", + "X_val, y_val = val_df[all_signal_columns].values, val_df[\"label\"].values\n", + "X_test, y_test = 
import xgboost as xgb
from sklearn.metrics import RocCurveDisplay, accuracy_score, log_loss
from sklearn.model_selection import GridSearchCV, GroupKFold

# NOTE(review): this cell builds its features from the un-normalized
# ``train_df``; the ``*_df_norm`` frames produced earlier are not used.
# NOTE(review): ``train_df`` is created elsewhere with ``label == 0`` rows
# only, so ``y_train`` holds a single class - confirm this is intended.

# Early stopping with a short patience. 'validation_0' is the first entry of
# ``eval_set`` passed to ``fit`` below, which is the validation split.
early_stop = xgb.callback.EarlyStopping(
    rounds=25, metric_name='auc', data_name='validation_0', save_best=True
)

# Base model: fixed parameters only, nothing that is tuned by the search.
xgb_clf = xgb.XGBClassifier(
    objective="binary:logistic",
    scale_pos_weight=1100/1550,  # hard-coded class-imbalance correction
    eval_metric=["logloss", "auc", "error"],
    use_label_encoder=False,
    random_state=42,
    callbacks=[early_stop],
    verbosity=0
)

# Parameter grid for the search.
param_grid = {
    "learning_rate": [0.01, 0.05, 0.1],
    "max_depth": [2, 3],
    "subsample": [0.5, 0.6, 0.7],
    "colsample_bytree": [0.5, 0.6, 0.7],
    "reg_alpha": [0.1, 1, 5, 10],
    "reg_lambda": [5, 10, 20, 50],
    "min_child_weight": [10, 20, 50],
    "max_delta_step": [1, 5, 10],
    "n_estimators": [500, 1000, 2000]
}

# Subject-wise folds so no subject is in both the fit and the scoring part.
cv = GroupKFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    estimator=xgb_clf,
    param_grid=param_grid,
    scoring="roc_auc",
    n_jobs=-1,
    cv=cv,
    verbose=2
)

X_train = train_df[all_signal_columns].values
y_train = train_df["label"].values
groups = train_df["subjectID"].values

# The validation split comes first so that it is recorded as 'validation_0'
# (monitored by early stopping and plotted as "Validation" below); the
# training split follows as 'validation_1'. With the training split first,
# early stopping watched the training AUC and all legends were swapped.
grid_search.fit(
    X_train, y_train,
    groups=groups,
    eval_set=[(X_val, y_val), (X_train, y_train)],
    verbose=False,
)

print("Beste Parameter:", grid_search.best_params_)
print("Bestes AUC:", grid_search.best_score_)

# Extract the best model.
model = grid_search.best_estimator_

# Plots

results = model.evals_result()
epochs = len(results['validation_0']['auc'])
x_axis = range(0, epochs)


def _plot_training_curve(validation_values, training_values, name, ylabel):
    """Plot one metric for the validation and the training split."""
    plt.figure(figsize=(8, 6))
    plt.plot(x_axis, validation_values, label=f'Validation {name}')
    plt.plot(x_axis, training_values, label=f'Training {name}')
    plt.legend()
    plt.xlabel('Epochs')
    plt.ylabel(ylabel)
    plt.title(f'XGBoost {name} during Training')
    plt.grid(True)
    plt.show()


_plot_training_curve(
    results['validation_0']['logloss'],
    results['validation_1']['logloss'],
    'Loss', 'Logloss',
)
# XGBoost reports the error rate; accuracy is its complement.
_plot_training_curve(
    [1 - e for e in results['validation_0']['error']],
    [1 - e for e in results['validation_1']['error']],
    'Accuracy', 'Accuracy',
)
_plot_training_curve(
    results['validation_0']['auc'],
    results['validation_1']['auc'],
    'AUC', 'AUC',
)

# ROC curve on the validation split.
y_pred_proba = model.predict_proba(X_val)[:, 1]
RocCurveDisplay.from_predictions(y_val, y_pred_proba)
plt.title("ROC Curve (Validation Set)")
plt.grid(True)
plt.show()

# Test split: loss and accuracy.
y_test_proba = model.predict_proba(X_test)[:, 1]
y_test_pred = (y_test_proba > 0.5).astype(int)

print("Test Loss:", log_loss(y_test, y_test_proba))
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))
"plt.grid(True)\n", + "plt.show()\n", + "\n", + "# Test: Loss und Accuracy\n", + "y_test_proba = model.predict_proba(X_test)[:,1]\n", + "y_test_pred = (y_test_proba > 0.5).astype(int)\n", + "\n", + "# print(\"Test Loss:\", log_loss(y_test, y_test_proba))\n", + "print(\"Test Accuracy:\", accuracy_score(y_test, y_test_pred))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "09a8cd21", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, roc_auc_score, classification_report, ConfusionMatrixDisplay\n", + "\n", + "def evaluate(model, X, y, title=\"Evaluation\"):\n", + " # Vorhersagen\n", + " preds_proba = model.predict_proba(X)[:, 1]\n", + " preds = (preds_proba > 0.5).astype(int)\n", + "\n", + " # Metriken ausgeben\n", + " print(\"Accuracy:\", accuracy_score(y, preds))\n", + " print(\"F1:\", f1_score(y, preds))\n", + " print(\"AUC:\", roc_auc_score(y, preds))\n", + " print(\"Confusion:\\n\", confusion_matrix(y, preds))\n", + " print(classification_report(y, preds))\n", + "\n", + " # Confusion Matrix plotten\n", + " def plot_confusion_matrix(true_labels, predictions, label_names):\n", + " for normalize in [None, 'true']:\n", + " cm = confusion_matrix(true_labels, predictions, normalize=normalize)\n", + " cm_disp = ConfusionMatrixDisplay(cm, display_labels=label_names)\n", + " cm_disp.plot(cmap=\"Blues\")\n", + " #cm = confusion_matrix(y, preds)\n", + " plot_confusion_matrix(y,preds, label_names=['Low','High'])\n", + " # plt.figure(figsize=(5,4))\n", + " # sns.heatmap(cm, annot=True, fmt=\"d\", cmap=\"Blues\", cbar=False,\n", + " # xticklabels=[\"Predicted low\", \"Predicted high\"],\n", + " # yticklabels=[\"Actual low\", \"Actual high\"])\n", + " # plt.title(f\"Confusion Matrix - {title}\")\n", + " # plt.ylabel(\"True label\")\n", + " # plt.xlabel(\"Predicted label\")\n", + " # plt.show()\n", + "\n", + "# Aufrufen für Train/Val/Test\n", + "print(\"TRAIN:\")\n", + 
"evaluate(model, X_train, y_train, title=\"Train\")\n", + "\n", + "print(\"VAL:\")\n", + "evaluate(model, X_val, y_val, title=\"Validation\")\n", + "\n", + "print(\"TEST:\")\n", + "evaluate(model, X_test, y_test, title=\"Test\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c43b0c80", + "metadata": {}, + "outputs": [], + "source": [ + "joblib.dump(model, \"xgb_model_with_MAD.joblib\")\n", + "joblib.dump(normalizer, \"normalizer_with_MAD.joblib\")\n", + "print(\"Model gespeichert.\")\n", + "\n", + "model.save_model(\"xgb_model_with_MAD.json\") # als JSON (lesbar, portabel)\n", + "model.save_model(\"xgb_model_with_MAD.bin\") # als Binärdatei (kompakt)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3195cc84", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.getcwd()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}