- added earlyFusionTest

- changed to group split
This commit is contained in:
TimoKurz 2026-02-24 19:51:29 +01:00
parent 7a63c7acd3
commit e69000fbd8
2 changed files with 604 additions and 245 deletions

View File

@ -0,0 +1,529 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "47f6de7b",
"metadata": {},
"source": [
"Bibliotheken importieren"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "99294260",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd \n",
"import numpy as np \n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns \n",
"import random \n",
"import joblib \n",
"from pathlib import Path \n",
"\n",
"from sklearn.model_selection import GroupKFold, GroupShuffleSplit\n",
"from sklearn.preprocessing import StandardScaler \n",
"from sklearn.metrics import ( \n",
" precision_score, recall_score,\n",
" confusion_matrix, roc_curve, auc, \n",
" precision_recall_curve, f1_score, \n",
" balanced_accuracy_score, accuracy_score\n",
") \n",
"\n",
"import tensorflow as tf \n",
"from tensorflow.keras import Input, layers, models, regularizers"
]
},
{
"cell_type": "markdown",
"id": "52b4ca8c",
"metadata": {},
"source": [
"Seed festlegen"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6e49d281",
"metadata": {},
"outputs": [],
"source": [
"# Fix every RNG seed up front so reruns are reproducible\n",
"SEED = 42\n",
"random.seed(SEED)\n",
"np.random.seed(SEED)\n",
"tf.random.set_seed(SEED)"
]
},
{
"cell_type": "markdown",
"id": "ae1a715f",
"metadata": {},
"source": [
"Daten laden"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "870f01c3",
"metadata": {},
"outputs": [],
"source": [
"# Resolve \"~\" explicitly: pathlib.Path does NOT expand the home directory on\n",
"# its own, so downstream consumers would otherwise see a literal \"~\" path\n",
"data_path = Path(\"~/data-paulusjafahrsimulator-gpu/new_datasets/50s_25Hz_dataset.parquet\").expanduser()\n",
"\n",
"data = pd.read_parquet(path=data_path)"
]
},
{
"cell_type": "markdown",
"id": "bedbc23b",
"metadata": {},
"source": [
"Labels erstellen"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "38848515",
"metadata": {},
"outputs": [],
"source": [
"# Low workload: every baseline phase, plus the easy n-back levels (1 and 4)\n",
"is_baseline = data[\"PHASE\"] == \"baseline\"\n",
"is_easy_nback = (data[\"STUDY\"] == \"n-back\") & ~is_baseline & data[\"LEVEL\"].isin([1, 4])\n",
"low_all = data[is_baseline | is_easy_nback].copy()\n",
"\n",
"# High workload: hard n-back levels during train/test, and all non-baseline k-drive phases\n",
"hard_nback = (data[\"STUDY\"] == \"n-back\") & data[\"LEVEL\"].isin([2, 3, 5, 6]) & data[\"PHASE\"].isin([\"train\", \"test\"])\n",
"kdrive_load = (data[\"STUDY\"] == \"k-drive\") & (data[\"PHASE\"] != \"baseline\")\n",
"high_all = pd.concat([data[hard_nback], data[kdrive_load]]).copy()\n",
"\n",
"low_all[\"label\"] = 0\n",
"high_all[\"label\"] = 1\n",
"data = pd.concat([low_all, high_all], ignore_index=True).drop_duplicates()"
]
},
{
"cell_type": "markdown",
"id": "0b282acf",
"metadata": {},
"source": [
"Features und Labels"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5edb00a0",
"metadata": {},
"outputs": [],
"source": [
"# Face action units: every column whose name contains \"face\"\n",
"au_columns = [col for col in data.columns if \"face\" in col.lower()]\n",
"\n",
"# Hand-picked eye-tracking features\n",
"eye_columns = [\n",
"    'Fix_count_short_66_150',\n",
"    'Fix_count_medium_300_500',\n",
"    'Fix_count_long_gt_1000',\n",
"    'Fix_count_100',\n",
"    'Fix_mean_duration',\n",
"    'Fix_median_duration',\n",
"    'Sac_count',\n",
"    'Sac_mean_amp',\n",
"    'Sac_mean_dur',\n",
"    'Sac_median_dur',\n",
"    'Blink_count',\n",
"    'Blink_mean_dur',\n",
"    'Blink_median_dur',\n",
"    'Pupil_mean',\n",
"    'Pupil_IPA'\n",
"]\n",
"\n",
"# Early fusion: concatenate both modalities into one flat feature vector\n",
"feature_columns = au_columns + eye_columns\n",
"\n",
"# Drop rows with missing values in any feature or in the label\n",
"data = data.dropna(subset=feature_columns + [\"label\"])\n",
"\n",
"# Trailing channel axis for Conv1D: X has shape (samples, features, 1)\n",
"X = data[feature_columns].values[..., np.newaxis]\n",
"y = data[\"label\"].values\n",
"groups = data[\"subjectID\"].values\n",
"\n",
"print(data.columns.tolist())\n",
"print(\"Gefundene FACE_AU-Spalten:\", au_columns)\n",
"print(\"Gefundene Eye Features:\" , eye_columns)\n",
"print(\"Anzahl FACE_AUs:\", len(au_columns))\n",
"print(\"Anzahl EYE Features:\", len(eye_columns))\n",
"print(\"Gesamtzahl Features:\", len(feature_columns))"
]
},
{
"cell_type": "markdown",
"id": "d8689679",
"metadata": {},
"source": [
"Train-Test-Split"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b5cf88c3",
"metadata": {},
"outputs": [],
"source": [
"# Group-aware hold-out split: no subject appears in both train and test.\n",
"# Use the SEED constant instead of a repeated magic 42 (same value, consistent).\n",
"gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=SEED)\n",
"train_idx, test_idx = next(gss.split(X, y, groups))\n",
"\n",
"# NOTE(review): these hold feature ARRAYS, not column names — the names are\n",
"# misleading, but they are kept because later cells reference them.\n",
"feature_columns_train, feature_columns_test = X[train_idx], X[test_idx]\n",
"y_train, y_test = y[train_idx], y[test_idx]\n",
"groups_train, groups_test = groups[train_idx], groups[test_idx]\n",
"\n",
"print(\"Train:\", len(y_train), \" | Test:\", len(y_test))"
]
},
{
"cell_type": "markdown",
"id": "a539b83b",
"metadata": {},
"source": [
"CNN-Modell"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e4a7f496",
"metadata": {},
"outputs": [],
"source": [
"def build_model(input_shape, lr=1e-4):\n",
"    \"\"\"Build and compile a small 1D-CNN binary classifier.\n",
"\n",
"    Args:\n",
"        input_shape: shape of a single sample, e.g. (n_features, 1).\n",
"        lr: Adam learning rate.\n",
"\n",
"    Returns:\n",
"        A compiled tf.keras Sequential model (binary cross-entropy,\n",
"        accuracy and AUC metrics).\n",
"    \"\"\"\n",
"    l2 = regularizers.l2(0.001)\n",
"\n",
"    model = models.Sequential()\n",
"    model.add(Input(shape=input_shape))\n",
"\n",
"    # Two conv stages with batch norm; global pooling keeps the head small\n",
"    model.add(layers.Conv1D(32, kernel_size=3, activation=\"relu\", kernel_regularizer=l2))\n",
"    model.add(layers.BatchNormalization())\n",
"    model.add(layers.MaxPooling1D(pool_size=2))\n",
"    model.add(layers.Conv1D(64, kernel_size=3, activation=\"relu\", kernel_regularizer=l2))\n",
"    model.add(layers.BatchNormalization())\n",
"    model.add(layers.GlobalAveragePooling1D())\n",
"\n",
"    # Regularized dense head with dropout, single sigmoid output\n",
"    model.add(layers.Dense(32, activation=\"relu\", kernel_regularizer=l2))\n",
"    model.add(layers.Dropout(0.5))\n",
"    model.add(layers.Dense(1, activation=\"sigmoid\"))\n",
"\n",
"    model.compile(\n",
"        optimizer=tf.keras.optimizers.Adam(learning_rate=lr),\n",
"        loss=\"binary_crossentropy\",\n",
"        metrics=[\"accuracy\", tf.keras.metrics.AUC(name=\"auc\")]\n",
"    )\n",
"    return model"
]
},
{
"cell_type": "markdown",
"id": "5905871b",
"metadata": {},
"source": [
"Cross-Validation"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "90658000",
"metadata": {},
"outputs": [],
"source": [
"# 5-fold grouped CV on the TRAINING split only, so held-out test subjects can\n",
"# never influence model selection (the original looped over the full data set\n",
"# — a leakage bug).\n",
"gkf = GroupKFold(n_splits=5)\n",
"cv_histories = []\n",
"cv_results = []\n",
"fold_subjects = []\n",
"all_conf_matrices = []\n",
"\n",
"for fold, (tr_idx, val_idx) in enumerate(gkf.split(feature_columns_train, y_train, groups_train)):\n",
"    train_subjects = np.unique(groups_train[tr_idx])\n",
"    val_subjects = np.unique(groups_train[val_idx])\n",
"    fold_subjects.append({\"Fold\": fold+1,\n",
"                          \"Train_Subjects\": train_subjects,\n",
"                          \"Val_Subjects\": val_subjects})\n",
"\n",
"    print(f\"\\n--- Fold {fold+1} ---\")\n",
"    print(\"Train-Subjects:\", train_subjects)\n",
"    print(\"Val-Subjects:\", val_subjects)\n",
"\n",
"    # Fold-local names so the outer split's X/y_train are NOT clobbered\n",
"    # (the original overwrote y_train, corrupting the final-model cell)\n",
"    X_tr, X_val = feature_columns_train[tr_idx], feature_columns_train[val_idx]\n",
"    y_tr, y_val = y_train[tr_idx], y_train[val_idx]\n",
"\n",
"    # Scaler fitted on this fold's training data only; name kept as `scaler`\n",
"    # because a later cell reuses the last fold's fitted scaler\n",
"    scaler = StandardScaler()\n",
"    X_tr = scaler.fit_transform(X_tr.reshape(len(X_tr), -1)).reshape(X_tr.shape)\n",
"    X_val = scaler.transform(X_val.reshape(len(X_val), -1)).reshape(X_val.shape)\n",
"\n",
"    # Plausibility check: train features should be ~zero-mean, unit-std\n",
"    print(\"Train Mittelwerte (erste 5 Features):\", X_tr.mean(axis=0)[:5])\n",
"    print(\"Train Std (erste 5 Features):\", X_tr.std(axis=0)[:5])\n",
"    print(\"Val Mittelwerte (erste 5 Features):\", X_val.mean(axis=0)[:5])\n",
"    print(\"Val Std (erste 5 Features):\", X_val.std(axis=0)[:5])\n",
"\n",
"    # BUGFIX: input_shape must be (n_features, 1); the original passed the\n",
"    # number of training SAMPLES via len(feature_columns_train)\n",
"    model = build_model(input_shape=(X_tr.shape[1], 1), lr=1e-4)\n",
"    model.summary()\n",
"\n",
"    callbacks = [\n",
"        tf.keras.callbacks.EarlyStopping(monitor=\"val_loss\", patience=10, restore_best_weights=True),\n",
"        tf.keras.callbacks.ReduceLROnPlateau(monitor=\"val_loss\", factor=0.5, patience=5, min_lr=1e-6)\n",
"    ]\n",
"\n",
"    history = model.fit(\n",
"        X_tr, y_tr,\n",
"        validation_data=(X_val, y_val),\n",
"        epochs=100,\n",
"        batch_size=16,\n",
"        callbacks=callbacks,\n",
"        verbose=0\n",
"    )\n",
"\n",
"    cv_histories.append(history.history)\n",
"    scores = model.evaluate(X_val, y_val, verbose=0)\n",
"    cv_results.append(scores)\n",
"    print(f\"Fold {fold+1} - Val Loss: {scores[0]:.4f}, Val Acc: {scores[1]:.4f}, Val AUC: {scores[2]:.4f}\")\n",
"\n",
"    # Per-fold confusion matrix at the default 0.5 threshold\n",
"    y_pred = (model.predict(X_val) > 0.5).astype(int)\n",
"    cm = confusion_matrix(y_val, y_pred)\n",
"    all_conf_matrices.append(cm)\n",
"\n",
"    print(f\"Konfusionsmatrix Fold {fold+1}:\\n{cm}\\n\")\n",
"\n",
"# Confusion matrices summed over all folds\n",
"agg_cm = sum(all_conf_matrices)\n",
"print(\"Aggregierte Konfusionsmatrix über alle Folds:\")\n",
"print(agg_cm)\n"
]
},
{
"cell_type": "markdown",
"id": "d10b7e78",
"metadata": {},
"source": [
"Results"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9aeba7f4",
"metadata": {},
"outputs": [],
"source": [
"# Aggregate CV scores into a summary and a per-fold table\n",
"cv_results = np.array(cv_results)\n",
"\n",
"print(\"\\n=== Cross-Validation Ergebnisse ===\")\n",
"print(f\"Durchschnittlicher Val-Loss: {cv_results[:,0].mean():.4f}\")\n",
"print(f\"Durchschnittliche Val-Accuracy: {cv_results[:,1].mean():.4f}\")\n",
"print(f\"Durchschnittliche Val-AUC: {cv_results[:,2].mean():.4f}\")\n",
"\n",
"# Per-fold rows plus a trailing mean row (\"Ø\")\n",
"results_table = pd.DataFrame({\n",
"    \"Fold\": np.arange(1, len(cv_results) + 1),\n",
"    \"Val Loss\": cv_results[:, 0],\n",
"    \"Val Accuracy\": cv_results[:, 1],\n",
"    \"Val AUC\": cv_results[:, 2]\n",
"})\n",
"avg_row = pd.DataFrame({\n",
"    \"Fold\": [\"Ø\"],\n",
"    \"Val Loss\": [cv_results[:, 0].mean()],\n",
"    \"Val Accuracy\": [cv_results[:, 1].mean()],\n",
"    \"Val AUC\": [cv_results[:, 2].mean()]\n",
"})\n",
"results_table = pd.concat([results_table, avg_row], ignore_index=True)\n",
"\n",
"print(\"\\n=== Ergebnis-Tabelle ===\")\n",
"print(results_table)\n",
"\n",
"# Persist the table next to the notebook\n",
"results_table.to_csv(\"cnn_crossVal_results.csv\", index=False)\n",
"print(\"Ergebnisse gespeichert als 'cnn_crossVal_results.csv'\")"
]
},
{
"cell_type": "markdown",
"id": "fae5df7a",
"metadata": {},
"source": [
"Finales Modell trainieren"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5b3eab61",
"metadata": {},
"outputs": [],
"source": [
"# Re-derive the hold-out indices deterministically instead of relying on\n",
"# `y_train`, which the original CV loop overwrote with the LAST FOLD's labels\n",
"# (a length-mismatched array relative to feature_columns_train)\n",
"gss_final = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=SEED)\n",
"train_idx_full, _ = next(gss_final.split(X, y, groups))\n",
"y_train_full = y[train_idx_full]\n",
"\n",
"# Fit the final scaler on the complete training split\n",
"scaler_final = StandardScaler()\n",
"X_scaled = scaler_final.fit_transform(feature_columns_train.reshape(len(feature_columns_train), -1)).reshape(feature_columns_train.shape)\n",
"\n",
"# BUGFIX: input_shape is (n_features, 1), not (n_samples, 1)\n",
"final_model = build_model(input_shape=(feature_columns_train.shape[1], 1), lr=1e-4)\n",
"final_model.summary()\n",
"\n",
"final_model.fit(\n",
"    X_scaled, y_train_full,\n",
"    epochs=150,\n",
"    batch_size=16,\n",
"    verbose=1\n",
")"
]
},
{
"cell_type": "markdown",
"id": "7c7f9cc4",
"metadata": {},
"source": [
"Speichern des Modells"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2d3af5be",
"metadata": {},
"outputs": [],
"source": [
"# final_model.save(\"cnn_crossVal_EarlyFusion_V2.keras\") \n",
"# joblib.dump(scaler_final, \"scaler_crossVal_EarlyFusion_V2.joblib\") \n",
"\n",
"# print(\"Finales Modell und Scaler gespeichert als 'cnn_crossVal_EarlyFusion_V2.keras' und 'scaler_crossVal_EarlyFusion_V2.joblib'\")"
]
},
{
"cell_type": "markdown",
"id": "c11891e0",
"metadata": {},
"source": [
"Plots"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9f6a8584",
"metadata": {},
"outputs": [],
"source": [
"def plot_cv_histories(cv_histories, metric):\n",
"    \"\"\"Overlay the train and validation curves of `metric` for every CV fold.\"\"\"\n",
"    plt.figure(figsize=(10, 6))\n",
"    for i, hist in enumerate(cv_histories):\n",
"        plt.plot(hist[metric], label=f\"Fold {i+1} Train\", alpha=0.7)\n",
"        plt.plot(hist[f\"val_{metric}\"], label=f\"Fold {i+1} Val\", linestyle=\"--\", alpha=0.7)\n",
"    plt.xlabel(\"Epochs\")\n",
"    plt.ylabel(metric.capitalize())\n",
"    plt.title(f\"Cross-Validation {metric.capitalize()} Verläufe\")\n",
"    plt.legend()\n",
"    plt.grid(True)\n",
"    plt.show()\n",
"\n",
"# One figure per tracked metric\n",
"for curve in (\"loss\", \"accuracy\", \"auc\"):\n",
"    plot_cv_histories(cv_histories, curve)"
]
},
{
"cell_type": "markdown",
"id": "4aebe6c6",
"metadata": {},
"source": [
"Test"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0d34d6b7",
"metadata": {},
"outputs": [],
"source": [
"# BUGFIX: evaluate with the FINAL model and its scaler; the original reused\n",
"# the last CV fold's `scaler` and `model` by accident\n",
"X_test_scaled = scaler_final.transform(\n",
"    feature_columns_test.reshape(len(feature_columns_test), -1)\n",
").reshape(feature_columns_test.shape)\n",
"\n",
"# Predicted probabilities and hard labels at the 0.5 threshold\n",
"y_prob_test = final_model.predict(X_test_scaled).flatten()\n",
"y_pred_test = (y_prob_test > 0.5).astype(int)\n",
"\n",
"# Confusion matrix on the held-out subjects\n",
"cm_test = confusion_matrix(y_test, y_pred_test)\n",
"\n",
"plt.figure(figsize=(6,5))\n",
"sns.heatmap(cm_test, annot=True, fmt=\"d\", cmap=\"Greens\",\n",
"            xticklabels=[\"Pred 0\", \"Pred 1\"],\n",
"            yticklabels=[\"True 0\", \"True 1\"])\n",
"plt.title(\"Konfusionsmatrix - Testdaten\")\n",
"plt.show()\n",
"\n",
"# ROC curve and AUC\n",
"fpr, tpr, _ = roc_curve(y_test, y_prob_test)\n",
"roc_auc = auc(fpr, tpr)\n",
"\n",
"plt.figure(figsize=(7,6))\n",
"plt.plot(fpr, tpr, label=f\"AUC = {roc_auc:.3f}\")\n",
"plt.plot([0,1], [0,1], \"k--\")\n",
"plt.title(\"ROC - Testdaten\")\n",
"plt.legend()\n",
"plt.grid(True)\n",
"plt.show()\n",
"\n",
"# Precision-recall curve\n",
"precision, recall, _ = precision_recall_curve(y_test, y_prob_test)\n",
"plt.figure(figsize=(7,6))\n",
"plt.plot(recall, precision)\n",
"plt.title(\"Precision-Recall - Testdaten\")\n",
"plt.grid(True)\n",
"plt.show()\n",
"\n",
"# Scalar metrics at the 0.5 threshold\n",
"print(\"Accuracy:\", accuracy_score(y_test, y_pred_test))\n",
"print(\"F1-Score:\", f1_score(y_test, y_pred_test))\n",
"print(\"Balanced Accuracy:\", balanced_accuracy_score(y_test, y_pred_test))\n",
"print(\"Precision:\", precision_score(y_test, y_pred_test))\n",
"print(\"Recall:\", recall_score(y_test, y_pred_test))\n",
"print(\"AUC:\", roc_auc)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

File diff suppressed because one or more lines are too long