tool function for performance based split; new model training notebooks

2026-01-02 12:02:48 +01:00 · 2026-01-02 12:02:48 +01:00 · a3119e4fc3
commit a3119e4fc3
parent f5796a5cdd
3 changed files with 1187 additions and 0 deletions
--- a/model_training/DeepSVDD/deepSVDD.ipynb
+++ b/model_training/DeepSVDD/deepSVDD.ipynb
@ -0,0 +1,130 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "cf894f6f",
+   "metadata": {},
+   "source": [
+    "# Intermediate Fusion mit Deep SVDD"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "494626b1",
+   "metadata": {},
+   "source": [
+    "* Input: gemeinsames Dataset aus EYE Tracking und Action Units mit selber Abtastfrequenz\n",
+    "* Verarbeitung: Intermediate Fusion\n",
+    "* Modell: Deep SVDD --> Ein neuronales Netz erlernt eine Kugel, die die Normaldaten einschließt"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "bef91203",
+   "metadata": {},
+   "source": [
+    "### Imports"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f0b8274a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from pathlib import Path\n",
+    "import sys\n",
+    "import os\n",
+    "\n",
+    "base_dir = os.path.abspath(os.path.join(os.getcwd(), \"..\"))\n",
+    "sys.path.append(base_dir)\n",
+    "print(base_dir)\n",
+    "\n",
+    "from Fahrsimulator_MSY2526_AI.model_training.tools import evaluation_tools, scaler, mad_outlier_removal\n",
+    "from sklearn.preprocessing import StandardScaler, MinMaxScaler\n",
+    "from sklearn.svm import OneClassSVM\n",
+    "from sklearn.model_selection import GridSearchCV, KFold, ParameterGrid, train_test_split, GroupKFold\n",
+    "import matplotlib.pyplot as plt\n",
+    "import tensorflow as tf\n",
+    "import pickle\n",
+    "from sklearn.metrics import (roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, balanced_accuracy_score, ConfusionMatrixDisplay) "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f00a477c",
+   "metadata": {},
+   "source": [
+    "### Data Preprocessing"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "504c1df7",
+   "metadata": {},
+   "source": [
+    "Laden der Daten"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6482542b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset_path = Path(r\"/home/jovyan/data-paulusjafahrsimulator-gpu/first_AU_dataset/output_windowed.parquet\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ce8ab464",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.read_parquet(path=dataset_path)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b736bc58",
+   "metadata": {},
+   "source": [
+    "### Modell Training"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "aa11faf3",
+   "metadata": {},
+   "source": [
+    "Vor-Training der Gewichte mit Autoencoder, Loss: MSE"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/model_training/VAE_SVM/AEdannSVM.ipynb
+++ b/model_training/VAE_SVM/AEdannSVM.ipynb
@ -0,0 +1,918 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "708c9745",
+   "metadata": {},
+   "source": [
+    "### Imports"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "53b10294",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from pathlib import Path\n",
+    "import sys\n",
+    "import os\n",
+    "\n",
+    "base_dir = os.path.abspath(os.path.join(os.getcwd(), \"..\"))\n",
+    "sys.path.append(base_dir)\n",
+    "print(base_dir)\n",
+    "\n",
+    "from Fahrsimulator_MSY2526_AI.model_training.tools import evaluation_tools, scaler, mad_outlier_removal\n",
+    "from sklearn.preprocessing import StandardScaler, MinMaxScaler\n",
+    "from sklearn.svm import OneClassSVM\n",
+    "from sklearn.model_selection import GridSearchCV, KFold, ParameterGrid, train_test_split, GroupKFold\n",
+    "import matplotlib.pyplot as plt\n",
+    "import tensorflow as tf\n",
+    "import pickle\n",
+    "from sklearn.metrics import (roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, balanced_accuracy_score, ConfusionMatrixDisplay) "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "68101229",
+   "metadata": {},
+   "source": [
+    "### load Dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "24a765e8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset_path = Path(r\"/home/jovyan/data-paulusjafahrsimulator-gpu/first_AU_dataset/output_windowed.parquet\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "471001b0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.read_parquet(path=dataset_path)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0fdecdaa",
+   "metadata": {},
+   "source": [
+    "### Load Performance data and Subject Split"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "692d1b47",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "performance_path =  Path(r\"/home/jovyan/data-paulusjafahrsimulator-gpu/subject_performance/3new_au_performance.csv\")\n",
+    "performance_df = pd.read_csv(performance_path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ea617e3f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Subject IDs aus dem Haupt-Dataset nehmen\n",
+    "subjects_from_df = df[\"subjectID\"].unique()\n",
+    "\n",
+    "# Performance-Subset nur für vorhandene Subjects\n",
+    "perf_filtered = performance_df[\n",
+    "    performance_df[\"subjectID\"].isin(subjects_from_df)\n",
+    "][[\"subjectID\", \"overall_score\"]]\n",
+    "\n",
+    "# Merge: nur Subjects, die sowohl im df als auch im Performance-CSV vorkommen\n",
+    "merged = (\n",
+    "    pd.DataFrame({\"subjectID\": subjects_from_df})\n",
+    "    .merge(perf_filtered, on=\"subjectID\", how=\"inner\")\n",
+    ")\n",
+    "\n",
+    "# Sicherstellen, dass keine Scores fehlen\n",
+    "if merged[\"overall_score\"].isna().any():\n",
+    "    raise ValueError(\"Es fehlen Score-Werte für manche Subjects.\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ae43df8d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Split the subjects into a small (~1/3) and a large (~2/3) group whose mean\n",
+    "# 'overall_score' values are as close as possible, using greedy random swaps.\n",
+    "SPLIT_SEED = 42  # fixed seed so the train/test subject split is reproducible across runs\n",
+    "rng = np.random.default_rng(SPLIT_SEED)\n",
+    "\n",
+    "merged_sorted = merged.sort_values(\"overall_score\", ascending=False).reset_index(drop=True)\n",
+    "\n",
+    "scores = merged_sorted[\"overall_score\"].values\n",
+    "n_total = len(merged_sorted)\n",
+    "n_small = n_total // 3\n",
+    "n_large = n_total - n_small\n",
+    "\n",
+    "# With fewer than 3 subjects the small group would be empty and its mean NaN\n",
+    "if n_small == 0:\n",
+    "    raise ValueError(\"Zu wenige Subjects für einen Split (mindestens 3 benötigt).\")\n",
+    "\n",
+    "# Step 1: random initial partition\n",
+    "idx = rng.permutation(n_total)\n",
+    "\n",
+    "small_idx = idx[:n_small]\n",
+    "large_idx = idx[n_small:]\n",
+    "\n",
+    "def score_diff(small_idx, large_idx):\n",
+    "    \"\"\"Absolute difference between the mean scores of the two index sets.\"\"\"\n",
+    "    return abs(scores[small_idx].mean() - scores[large_idx].mean())\n",
+    "\n",
+    "diff = score_diff(small_idx, large_idx)\n",
+    "threshold = 0.01\n",
+    "max_iter = 100\n",
+    "count = 0\n",
+    "\n",
+    "# Step 2: random swaps until the difference is small enough or max_iter is reached\n",
+    "while diff > threshold and count < max_iter:\n",
+    "    # Pick one random member of each group\n",
+    "    si = rng.choice(small_idx)\n",
+    "    li = rng.choice(large_idx)\n",
+    "    \n",
+    "    # Build the candidate partition with the two members exchanged\n",
+    "    new_small_idx = small_idx.copy()\n",
+    "    new_large_idx = large_idx.copy()\n",
+    "    \n",
+    "    new_small_idx[new_small_idx == si] = li\n",
+    "    new_large_idx[new_large_idx == li] = si\n",
+    "\n",
+    "    # Score difference of the candidate partition\n",
+    "    new_diff = score_diff(new_small_idx, new_large_idx)\n",
+    "\n",
+    "    # Accept the swap only if it improves the balance\n",
+    "    if new_diff < diff:\n",
+    "        small_idx = new_small_idx\n",
+    "        large_idx = new_large_idx\n",
+    "        diff = new_diff\n",
+    "\n",
+    "    count += 1\n",
+    "\n",
+    "# Final groups (the index was reset above, so labels equal positions)\n",
+    "group_small = merged_sorted.loc[small_idx].reset_index(drop=True)\n",
+    "group_large = merged_sorted.loc[large_idx].reset_index(drop=True)\n",
+    "\n",
+    "print(\"Finale Score-Differenz:\", diff)\n",
+    "print(\"Größe Gruppe 1:\", len(group_small))\n",
+    "print(\"Größe Gruppe 2:\", len(group_large))\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9d1b414e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "group_large['overall_score'].mean()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fa71f9a5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "group_small['overall_score'].mean()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "79ecb4a2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "training_subjects = group_large['subjectID'].values\n",
+    "test_subjects = group_small['subjectID'].values\n",
+    "print(training_subjects)\n",
+    "print(test_subjects)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "87f9fe7d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "au_columns = [col for col in df.columns if col.lower().startswith(\"au\")]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "009d268b",
+   "metadata": {},
+   "source": [
+    "Labeling"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4fa79163",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "low_all = df[\n",
+    "    ((df[\"PHASE\"] == \"baseline\") |\n",
+    "     ((df[\"STUDY\"] == \"n-back\") & (df[\"PHASE\"] != \"baseline\") & (df[\"LEVEL\"].isin([1, 4]))))\n",
+    "]\n",
+    "print(f\"low all: {low_all.shape}\")\n",
+    "\n",
+    "high_nback = df[\n",
+    "    (df[\"STUDY\"]==\"n-back\") &\n",
+    "    (df[\"LEVEL\"].isin([2, 3, 5, 6])) &\n",
+    "    (df[\"PHASE\"].isin([\"train\", \"test\"]))\n",
+    "]\n",
+    "print(f\"high n-back: {high_nback.shape}\")\n",
+    "\n",
+    "high_kdrive = df[\n",
+    "    (df[\"STUDY\"] == \"k-drive\") & (df[\"PHASE\"] != \"baseline\")\n",
+    "]\n",
+    "print(f\"high k-drive: {high_kdrive.shape}\")\n",
+    "\n",
+    "high_all = pd.concat([high_nback, high_kdrive])\n",
+    "print(f\"high all: {high_all.shape}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "82b17d0b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "low = low_all.copy()\n",
+    "high = high_all.copy()\n",
+    "\n",
+    "low[\"label\"] = 0\n",
+    "high[\"label\"] = 1\n",
+    "\n",
+    "data = pd.concat([low, high], ignore_index=True)\n",
+    "df = data.drop_duplicates()\n",
+    "\n",
+    "print(\"Label distribution:\")\n",
+    "print(df[\"label\"].value_counts())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4353f87c",
+   "metadata": {},
+   "source": [
+    "### Data cleaning with mad"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c9afaf61",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# methode CT\n",
+    "def calculate_mad_params(df, columns):\n",
+    "    \"\"\"\n",
+    "    Calculate median and MAD parameters for each column.\n",
+    "    This should be run ONLY on the training data.\n",
+    "    \n",
+    "    Returns a dictionary: {col: (median, mad)}\n",
+    "    \"\"\"\n",
+    "    params = {}\n",
+    "    for col in columns:\n",
+    "        median = df[col].median()\n",
+    "        mad = np.median(np.abs(df[col] - median))\n",
+    "        params[col] = (median, mad)\n",
+    "    return params\n",
+    "\n",
+    "def apply_mad_filter(df, params, threshold=3.5):\n",
+    "    \"\"\"\n",
+    "    Apply MAD-based outlier removal using precomputed parameters.\n",
+    "    Works on training, validation, and test data.\n",
+    "    \n",
+    "    df: DataFrame to filter\n",
+    "    params: dictionary {col: (median, mad)} from training data\n",
+    "    threshold: cutoff for robust Z-score\n",
+    "    \"\"\"\n",
+    "    df_clean = df.copy()\n",
+    "\n",
+    "    for col, (median, mad) in params.items():\n",
+    "        if mad == 0:\n",
+    "            continue  # no spread; nothing to remove for this column\n",
+    "\n",
+    "        robust_z = 0.6745 * (df_clean[col] - median) / mad\n",
+    "        outlier_mask = np.abs(robust_z) > threshold\n",
+    "\n",
+    "        # Remove values only in this specific column\n",
+    "        df_clean.loc[outlier_mask, col] = np.nan\n",
+    "        \n",
+    "    return df_clean"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4a286665",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_df = df[df.subjectID.isin(training_subjects)]\n",
+    "test_df  = df[df.subjectID.isin(test_subjects)]\n",
+    "print(train_df.shape, test_df.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2671e0f4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "params = calculate_mad_params(train_df, au_columns)\n",
+    "\n",
+    "# Step 2: Apply filter consistently\n",
+    "train_outlier_removed = apply_mad_filter(train_df, params, threshold=3.5)\n",
+    "test_outlier_removed  = apply_mad_filter(test_df, params, threshold=3.5)\n",
+    "print(train_outlier_removed.shape, test_outlier_removed.shape)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6c39b37f",
+   "metadata": {},
+   "source": [
+    "Normalisierung der Daten"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5e6c654f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "normalizer = scaler.fit_normalizer(train_df, au_columns=au_columns, method='standard', scope='global')\n",
+    "train_df_normal = scaler.apply_normalizer(train_df, au_columns=au_columns, normalizer_dict=normalizer)\n",
+    "test_df_normal = scaler.apply_normalizer(test_df, au_columns=au_columns, normalizer_dict=normalizer)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b6d25e7b",
+   "metadata": {},
+   "source": [
+    "**TODO:** insert GroupKFold for `train_df_normal`"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e826a998",
+   "metadata": {},
+   "source": [
+    "### AE first"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e6421371",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Beide Klassen für AE und SVM Training\n",
+    "X_train_full = train_outlier_removed[au_columns].dropna()\n",
+    "y_train_full = train_outlier_removed.loc[X_train_full.index, 'label'].values\n",
+    "groups_train = train_outlier_removed.loc[X_train_full.index, 'subjectID'].values\n",
+    "\n",
+    "print(f\"Training data shape: {X_train_full.shape}\")\n",
+    "print(f\"Label distribution in training: {pd.Series(y_train_full).value_counts()}\")\n",
+    "\n",
+    "# Test data\n",
+    "X_test = test_outlier_removed[au_columns].dropna()\n",
+    "y_test = test_outlier_removed.loc[X_test.index, 'label'].values\n",
+    "\n",
+    "print(f\"Test data shape: {X_test.shape}\")\n",
+    "print(f\"Label distribution in test: {pd.Series(y_test).value_counts()}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d982e47a",
+   "metadata": {},
+   "source": [
+    "### Custom SVM Layer (differentiable approximation)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "50fbda1a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class DifferentiableSVM(tf.keras.layers.Layer):\n",
+    "    \"\"\"\n",
+    "    Differentiable SVM Layer using hinge loss.\n",
+    "    This allows backpropagation through the SVM to the encoder.\n",
+    "    \"\"\"\n",
+    "    def __init__(self, C=1.0, **kwargs):\n",
+    "        super(DifferentiableSVM, self).__init__(**kwargs)\n",
+    "        self.C = C\n",
+    "        \n",
+    "    def build(self, input_shape):\n",
+    "        # SVM weights: w and bias b\n",
+    "        self.w = self.add_weight(\n",
+    "            shape=(input_shape[-1],),\n",
+    "            initializer='glorot_uniform',\n",
+    "            trainable=True,\n",
+    "            name='svm_w'\n",
+    "        )\n",
+    "        self.b = self.add_weight(\n",
+    "            shape=(1,),\n",
+    "            initializer='zeros',\n",
+    "            trainable=True,\n",
+    "            name='svm_b'\n",
+    "        )\n",
+    "        \n",
+    "    def call(self, inputs):\n",
+    "        # Decision function: w^T * x + b\n",
+    "        decision = tf.reduce_sum(inputs * self.w, axis=1, keepdims=True) + self.b\n",
+    "        return decision\n",
+    "    \n",
+    "    def compute_loss(self, inputs, labels):\n",
+    "        \"\"\"\n",
+    "        Hinge loss for SVM: max(0, 1 - y * (w^T * x + b))\n",
+    "        labels should be -1 or +1\n",
+    "        \"\"\"\n",
+    "        decision = self.call(inputs)\n",
+    "        \n",
+    "        # Convert labels from 0/1 to -1/+1\n",
+    "        labels_svm = tf.where(labels == 0, -1.0, 1.0)\n",
+    "        labels_svm = tf.cast(labels_svm, tf.float32)\n",
+    "        labels_svm = tf.reshape(labels_svm, (-1, 1))\n",
+    "        \n",
+    "        # Hinge loss\n",
+    "        hinge_loss = tf.reduce_mean(\n",
+    "            tf.maximum(0.0, 1.0 - labels_svm * decision)\n",
+    "        )\n",
+    "        \n",
+    "        # L2 regularization\n",
+    "        l2_loss = 0.5 * tf.reduce_sum(tf.square(self.w))\n",
+    "        \n",
+    "        return self.C * hinge_loss + l2_loss"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e7def811",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class JointAESVM(tf.keras.Model):\n",
+    "    \"\"\"\n",
+    "    Joint Autoencoder + SVM Model\n",
+    "    Loss = reconstruction_loss + svm_loss\n",
+    "    \"\"\"\n",
+    "    def __init__(self, input_dim, latent_dim=5, hidden_dim=16, ae_weight=1.0, \n",
+    "                 svm_weight=1.0, svm_C=1.0, reg=0.0001, **kwargs):\n",
+    "        super(JointAESVM, self).__init__(**kwargs)\n",
+    "        \n",
+    "        self.ae_weight = ae_weight\n",
+    "        self.svm_weight = svm_weight\n",
+    "        \n",
+    "        # Encoder\n",
+    "        self.encoder = tf.keras.Sequential([\n",
+    "            tf.keras.layers.Dense(input_dim, activation='relu', \n",
+    "                                 kernel_regularizer=tf.keras.regularizers.l2(reg)),\n",
+    "            tf.keras.layers.Dense(hidden_dim, activation='relu',\n",
+    "                                 kernel_regularizer=tf.keras.regularizers.l2(reg)),\n",
+    "            tf.keras.layers.Dense(latent_dim, activation='relu',\n",
+    "                                 kernel_regularizer=tf.keras.regularizers.l2(reg))\n",
+    "        ], name='encoder')\n",
+    "        \n",
+    "        # Decoder\n",
+    "        self.decoder = tf.keras.Sequential([\n",
+    "            tf.keras.layers.Dense(latent_dim, activation='relu',\n",
+    "                                 kernel_regularizer=tf.keras.regularizers.l2(reg)),\n",
+    "            tf.keras.layers.Dense(hidden_dim, activation='relu',\n",
+    "                                 kernel_regularizer=tf.keras.regularizers.l2(reg)),\n",
+    "            tf.keras.layers.Dense(input_dim, activation='linear',\n",
+    "                                 kernel_regularizer=tf.keras.regularizers.l2(reg))\n",
+    "        ], name='decoder')\n",
+    "        \n",
+    "        # SVM Layer\n",
+    "        self.svm = DifferentiableSVM(C=svm_C, name='svm')\n",
+    "        \n",
+    "    def call(self, inputs, training=False):\n",
+    "        # Encode\n",
+    "        encoded = self.encoder(inputs, training=training)\n",
+    "        \n",
+    "        # Decode (for reconstruction)\n",
+    "        decoded = self.decoder(encoded, training=training)\n",
+    "        \n",
+    "        # SVM decision (for classification)\n",
+    "        svm_output = self.svm(encoded)\n",
+    "        \n",
+    "        return decoded, svm_output, encoded\n",
+    "    \n",
+    "    def compute_loss(self, x, y_true):\n",
+    "        # Forward pass\n",
+    "        x_reconstructed, svm_decision, encoded = self(x, training=True)\n",
+    "        \n",
+    "        # Reconstruction loss (MSE)\n",
+    "        reconstruction_loss = tf.reduce_mean(\n",
+    "            tf.square(x - x_reconstructed)\n",
+    "        )\n",
+    "        \n",
+    "        # SVM loss (hinge)\n",
+    "        svm_loss = self.svm.compute_loss(encoded, y_true)\n",
+    "        \n",
+    "        # Total loss\n",
+    "        total_loss = (self.ae_weight * reconstruction_loss + \n",
+    "                     self.svm_weight * svm_loss)\n",
+    "        \n",
+    "        return total_loss, reconstruction_loss, svm_loss\n",
+    "\n",
+    "print(\"Joint AE-SVM Model class defined\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "541085f3",
+   "metadata": {},
+   "source": [
+    "Train function"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d0bf18e3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def train_joint_model(X_train, y_train, groups, model_params,\n",
+    "                      epochs=200, batch_size=64, learning_rate=0.0001):\n",
+    "    \"\"\"\n",
+    "    Train a JointAESVM model with a custom mini-batch training loop.\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    X_train : pd.DataFrame\n",
+    "        Feature matrix; its `.shape` and `.values` are read.\n",
+    "    y_train : array-like\n",
+    "        Class labels, one per row of X_train.\n",
+    "    groups : array-like\n",
+    "        Not used inside this function; kept so existing callers keep working.\n",
+    "    model_params : dict\n",
+    "        Must contain 'latent_dim', 'hidden_dim', 'ae_weight', 'svm_weight',\n",
+    "        'svm_C' and 'reg'.\n",
+    "    epochs : int\n",
+    "        Number of passes over the training data.\n",
+    "    batch_size : int\n",
+    "        Mini-batch size.\n",
+    "    learning_rate : float\n",
+    "        Learning rate of the Adam optimizer.\n",
+    "\n",
+    "    Returns\n",
+    "    -------\n",
+    "    model : JointAESVM\n",
+    "        The trained model.\n",
+    "    history : dict\n",
+    "        Per-epoch batch means under 'total_loss', 'recon_loss' and 'svm_loss'.\n",
+    "\n",
+    "    Raises\n",
+    "    ------\n",
+    "    ValueError\n",
+    "        If X_train has no rows.\n",
+    "    \"\"\"\n",
+    "    n_samples = X_train.shape[0]\n",
+    "    if n_samples == 0:\n",
+    "        raise ValueError(\"X_train is empty; cannot train the joint model.\")\n",
+    "\n",
+    "    # Build model\n",
+    "    model = JointAESVM(\n",
+    "        input_dim=X_train.shape[1],\n",
+    "        latent_dim=model_params['latent_dim'],\n",
+    "        hidden_dim=model_params['hidden_dim'],\n",
+    "        ae_weight=model_params['ae_weight'],\n",
+    "        svm_weight=model_params['svm_weight'],\n",
+    "        svm_C=model_params['svm_C'],\n",
+    "        reg=model_params['reg']\n",
+    "    )\n",
+    "    \n",
+    "    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)\n",
+    "    \n",
+    "    # Training history\n",
+    "    history = {\n",
+    "        'total_loss': [],\n",
+    "        'recon_loss': [],\n",
+    "        'svm_loss': []\n",
+    "    }\n",
+    "    \n",
+    "    # Convert to tensors\n",
+    "    X_train_tf = tf.constant(X_train.values, dtype=tf.float32)\n",
+    "    y_train_tf = tf.constant(y_train, dtype=tf.float32)\n",
+    "    \n",
+    "    # Create dataset. The shuffle buffer covers the whole dataset: a smaller buffer\n",
+    "    # only shuffles within a sliding window, and the training rows in this notebook\n",
+    "    # are ordered by label (low rows are concatenated before high rows), which would\n",
+    "    # make most batches contain a single class.\n",
+    "    dataset = tf.data.Dataset.from_tensor_slices((X_train_tf, y_train_tf))\n",
+    "    dataset = dataset.shuffle(buffer_size=n_samples).batch(batch_size)\n",
+    "    \n",
+    "    # Training loop\n",
+    "    for epoch in range(epochs):\n",
+    "        epoch_loss = 0.0\n",
+    "        epoch_recon = 0.0\n",
+    "        epoch_svm = 0.0\n",
+    "        n_batches = 0\n",
+    "        \n",
+    "        for x_batch, y_batch in dataset:\n",
+    "            with tf.GradientTape() as tape:\n",
+    "                total_loss, recon_loss, svm_loss = model.compute_loss(x_batch, y_batch)\n",
+    "            \n",
+    "            # Backpropagation\n",
+    "            gradients = tape.gradient(total_loss, model.trainable_variables)\n",
+    "            optimizer.apply_gradients(zip(gradients, model.trainable_variables))\n",
+    "            \n",
+    "            epoch_loss += total_loss.numpy()\n",
+    "            epoch_recon += recon_loss.numpy()\n",
+    "            epoch_svm += svm_loss.numpy()\n",
+    "            n_batches += 1\n",
+    "        \n",
+    "        # Average losses over the batches of this epoch\n",
+    "        history['total_loss'].append(epoch_loss / n_batches)\n",
+    "        history['recon_loss'].append(epoch_recon / n_batches)\n",
+    "        history['svm_loss'].append(epoch_svm / n_batches)\n",
+    "        \n",
+    "        # Progress report every 20 epochs\n",
+    "        if (epoch + 1) % 20 == 0:\n",
+    "            print(f\"Epoch {epoch+1}/{epochs} - \"\n",
+    "                  f\"Total: {history['total_loss'][-1]:.4f}, \"\n",
+    "                  f\"Recon: {history['recon_loss'][-1]:.4f}, \"\n",
+    "                  f\"SVM: {history['svm_loss'][-1]:.4f}\")\n",
+    "    \n",
+    "    return model, history\n",
+    "\n",
+    "print(\"Training function defined\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b6a04540",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Parameter Grid\n",
+    "param_grid = {\n",
+    "    'latent_dim': [5, 8],\n",
+    "    'hidden_dim': [10, 16],\n",
+    "    'ae_weight': [0.5, 1.0],\n",
+    "    'svm_weight': [0.5, 1.0, 2.0],\n",
+    "    'svm_C': [0.1, 1.0, 10.0],\n",
+    "    'reg': [0.0001, 0.001]\n",
+    "}\n",
+    "\n",
+    "n_splits = 5  # Weniger Splits wegen Rechenzeit\n",
+    "gkf = GroupKFold(n_splits=n_splits)\n",
+    "\n",
+    "print(f\"Starting Grid Search with {n_splits}-fold GroupKFold\")\n",
+    "print(f\"Parameter combinations: {len(list(ParameterGrid(param_grid)))}\")\n",
+    "print(\"This will take a while...\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "228463ce",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def evaluate_model(model, X, y):\n",
+    "    \"\"\"Evaluate joint model\"\"\"\n",
+    "    X_tf = tf.constant(X, dtype=tf.float32)\n",
+    "    _, svm_decision, _ = model(X_tf, training=False)\n",
+    "    \n",
+    "    # Predict: decision > 0 -> class 1, else class 0\n",
+    "    y_pred = (svm_decision.numpy().flatten() > 0).astype(int)\n",
+    "    \n",
+    "    bal_accuracy = balanced_accuracy_score(y, y_pred)\n",
+    "    return bal_accuracy, y_pred\n",
+    "\n",
+    "print(\"Evaluation function defined\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c945fc87",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Grid Search\n",
+    "best_score = -np.inf\n",
+    "best_params = None\n",
+    "best_model = None\n",
+    "all_results = []\n",
+    "\n",
+    "X_train_array = X_train_full.values\n",
+    "y_train_array = y_train_full\n",
+    "\n",
+    "for param_idx, params in enumerate(ParameterGrid(param_grid)):\n",
+    "    print(f\"\\n{'='*60}\")\n",
+    "    print(f\"Testing parameters {param_idx + 1}/{len(list(ParameterGrid(param_grid)))}\")\n",
+    "    print(f\"Params: {params}\")\n",
+    "    print(f\"{'='*60}\")\n",
+    "    \n",
+    "    fold_scores = []\n",
+    "    \n",
+    "    for fold, (train_idx, val_idx) in enumerate(gkf.split(X_train_array, y_train_array, groups_train)):\n",
+    "        print(f\"\\nFold {fold + 1}/{n_splits}\")\n",
+    "        \n",
+    "        X_fold_train = pd.DataFrame(X_train_array[train_idx], columns=X_train_full.columns)\n",
+    "        y_fold_train = y_train_array[train_idx]\n",
+    "        X_fold_val = X_train_array[val_idx]\n",
+    "        y_fold_val = y_train_array[val_idx]\n",
+    "        \n",
+    "        # Train model\n",
+    "        model, history = train_joint_model(\n",
+    "            X_fold_train, y_fold_train, groups_train[train_idx],\n",
+    "            model_params=params,\n",
+    "            epochs=100,  # Weniger Epochen für Grid Search\n",
+    "            batch_size=64,\n",
+    "            learning_rate=0.0001\n",
+    "        )\n",
+    "        \n",
+    "        # Validate\n",
+    "        val_bal_acc, _ = evaluate_model(model, X_fold_val, y_fold_val)\n",
+    "        fold_scores.append(val_bal_acc)\n",
+    "        print(f\"Fold {fold + 1} Validation balanced Accuracy: {val_bal_acc:.4f}\")\n",
+    "    \n",
+    "    mean_score = np.mean(fold_scores)\n",
+    "    std_score = np.std(fold_scores)\n",
+    "    \n",
+    "    result = {\n",
+    "        **params,\n",
+    "        'mean_cv_bal_accuracy': mean_score,\n",
+    "        'std_cv_bal_accuracy': std_score\n",
+    "    }\n",
+    "    all_results.append(result)\n",
+    "    \n",
+    "    print(f\"\\nMean CV bal. Accuracy: {mean_score:.4f} ± {std_score:.4f}\")\n",
+    "    \n",
+    "    if mean_score > best_score:\n",
+    "        best_score = mean_score\n",
+    "        best_params = params\n",
+    "        print(\"*** NEW BEST PARAMETERS ***\")\n",
+    "\n",
+    "print(f\"\\n{'='*60}\")\n",
+    "print(\"GRID SEARCH COMPLETED\")\n",
+    "print(f\"{'='*60}\")\n",
+    "print(f\"Best parameters: {best_params}\")\n",
+    "print(f\"Best CV bal. accuracy: {best_score:.4f}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0a0606f5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Rank all grid-search configurations by mean cross-validated balanced accuracy.\n",
+    "# The grid-search loop stores this metric under the key 'mean_cv_bal_accuracy'.\n",
+    "results_df = pd.DataFrame(all_results)\n",
+    "results_df = results_df.sort_values('mean_cv_bal_accuracy', ascending=False)\n",
+    "\n",
+    "print(\"\\nTop 10 configurations:\")\n",
+    "print(results_df.head(10))\n",
+    "\n",
+    "# Plot the ten best configurations (fewer if the grid is smaller)\n",
+    "n_shown = min(10, len(results_df))\n",
+    "plt.figure(figsize=(12, 6))\n",
+    "plt.barh(range(n_shown), results_df['mean_cv_bal_accuracy'].head(n_shown))\n",
+    "plt.yticks(range(n_shown), [f\"Config {i+1}\" for i in range(n_shown)])\n",
+    "plt.xlabel('Mean CV Balanced Accuracy')\n",
+    "plt.title('Top 10 Configurations')\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "87906b05",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\"Training final model on all training data...\")\n",
+    "print(f\"Best parameters: {best_params}\")\n",
+    "\n",
+    "final_model, final_history = train_joint_model(\n",
+    "    X_train_full, y_train_full, groups_train,\n",
+    "    model_params=best_params,\n",
+    "    epochs=300,  # Mehr Epochen für finales Training\n",
+    "    batch_size=64,\n",
+    "    learning_rate=0.0001\n",
+    ")\n",
+    "\n",
+    "print(\"\\nFinal model training completed!\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "718137a8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig, axes = plt.subplots(1, 3, figsize=(15, 4))\n",
+    "\n",
+    "axes[0].plot(final_history['total_loss'])\n",
+    "axes[0].set_title('Total Loss')\n",
+    "axes[0].set_xlabel('Epoch')\n",
+    "axes[0].set_ylabel('Loss')\n",
+    "axes[0].grid(True, alpha=0.3)\n",
+    "\n",
+    "axes[1].plot(final_history['recon_loss'])\n",
+    "axes[1].set_title('Reconstruction Loss')\n",
+    "axes[1].set_xlabel('Epoch')\n",
+    "axes[1].set_ylabel('Loss')\n",
+    "axes[1].grid(True, alpha=0.3)\n",
+    "\n",
+    "axes[2].plot(final_history['svm_loss'])\n",
+    "axes[2].set_title('SVM Loss')\n",
+    "axes[2].set_xlabel('Epoch')\n",
+    "axes[2].set_ylabel('Loss')\n",
+    "axes[2].grid(True, alpha=0.3)\n",
+    "\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "02fbc5a2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Get predictions\n",
+    "test_acc, y_pred = evaluate_model(final_model, X_test.values, y_test)\n",
+    "\n",
+    "# Get SVM decision values for ROC-AUC\n",
+    "X_test_tf = tf.constant(X_test.values, dtype=tf.float32)\n",
+    "_, svm_decision, _ = final_model(X_test_tf, training=False)\n",
+    "y_pred_decision = svm_decision.numpy().flatten()\n",
+    "\n",
+    "# Metrics\n",
+    "print(\"=\" * 50)\n",
+    "print(\"TEST SET EVALUATION\")\n",
+    "print(\"=\" * 50)\n",
+    "print(f\"\\nAccuracy:  {accuracy_score(y_test, y_pred):.4f}\")\n",
+    "print(f\"Precision: {precision_score(y_test, y_pred):.4f}\")\n",
+    "print(f\"Recall:    {recall_score(y_test, y_pred):.4f}\")\n",
+    "print(f\"F1-Score:  {f1_score(y_test, y_pred):.4f}\")\n",
+    "\n",
+    "# ROC-AUC (decision values as probability proxy)\n",
+    "decision_scaled = MinMaxScaler().fit_transform(y_pred_decision.reshape(-1, 1)).flatten()\n",
+    "print(f\"ROC-AUC:   {roc_auc_score(y_test, decision_scaled):.4f}\")\n",
+    "\n",
+    "print(\"\\nConfusion Matrix:\")\n",
+    "cm = confusion_matrix(y_test, y_pred)\n",
+    "print(cm)\n",
+    "\n",
+    "print(\"\\nClassification Report:\")\n",
+    "print(classification_report(y_test, y_pred))\n",
+    "\n",
+    "# Visualize Confusion Matrix\n",
+    "fig, ax = plt.subplots(figsize=(8, 6))\n",
+    "disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Low Load (0)', 'High Load (1)'])\n",
+    "disp.plot(cmap='Blues', ax=ax, colorbar=True, values_format='d')\n",
+    "ax.set_title('Confusion Matrix - Test Set', fontsize=14, fontweight='bold')\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4c524bce",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save the weights of the full joint model (weights only, not the architecture).\n",
+    "# Keras 3 only accepts HDF5 weight files whose name ends in '.weights.h5';\n",
+    "# the same name also works with older Keras versions.\n",
+    "final_model.save_weights('joint_ae_svm.weights.h5')\n",
+    "print(\"Model weights saved as 'joint_ae_svm.weights.h5'\")\n",
+    "\n",
+    "# Save encoder separately\n",
+    "final_model.encoder.save('encoder_joint.keras')\n",
+    "print(\"Encoder saved as 'encoder_joint.keras'\")\n",
+    "\n",
+    "# Save best parameters\n",
+    "with open('best_params_joint.pkl', 'wb') as f:\n",
+    "    pickle.dump(best_params, f)\n",
+    "print(\"Best parameters saved as 'best_params_joint.pkl'\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "792c658d",
+   "metadata": {},
+   "source": [
+    "* doch mal svm ae pipeline?\n",
+    "* einfach mal mit 20 13 5\n",
+    "* label hinzufügen\n",
+    "* mad von CT verwenden oder wert anpassen, ggf. vergleich welches label wie oft vorkommt vorher und nachher. --> labelling schritt von CT übernehmen\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/model_training/tools/performance_split.py
+++ b/model_training/tools/performance_split.py
@ -0,0 +1,139 @@
+import pandas as pd
+import numpy as np
+
+
+def performance_based_split(
+    subject_ids,
+    performance_df,
+    split_ratio=0.33,
+    threshold=0.01,
+    max_iter=100,
+    random_seed=None
+):
+    """
+    Split subjects into two groups based on performance scores with balanced means.
+    
+    Parameters
+    ----------
+    subject_ids : array-like
+        List or array of subject IDs present in your dataset
+    performance_df : pd.DataFrame
+        DataFrame containing 'subjectID' and 'overall_score' columns
+    split_ratio : float, default=0.33
+        Proportion of subjects for the smaller group (0 < split_ratio < 1)
+    threshold : float, default=0.01
+        Target difference threshold between group means
+    max_iter : int, default=100
+        Maximum number of swap iterations
+    random_seed : int, optional
+        Random seed for reproducibility
+        
+    Returns
+    -------
+    group_small_ids : np.ndarray
+        Subject IDs for the smaller group
+    group_large_ids : np.ndarray
+        Subject IDs for the larger group
+    score_diff : float
+        Final absolute difference between group means
+        
+    Raises
+    ------
+    ValueError
+        If subjects are missing performance scores or no subjects match
+    """
+    if random_seed is not None:
+        np.random.seed(random_seed)
+    
+    # Filter performance data
+    perf_filtered = performance_df[
+        performance_df["subjectID"].isin(subject_ids)
+    ][["subjectID", "overall_score"]]
+    
+    # Merge to get only subjects present in both dataset and performance file
+    merged = (
+        pd.DataFrame({"subjectID": subject_ids})
+        .merge(perf_filtered, on="subjectID", how="inner")
+    )
+    
+    if len(merged) == 0:
+        raise ValueError("No subjects found in both dataset and performance file.")
+    
+    # Check for missing scores
+    if merged["overall_score"].isna().any():
+        raise ValueError("Missing score values for some subjects.")
+    
+    merged_sorted = merged.sort_values("overall_score", ascending=False).reset_index(drop=True)
+    
+    scores = merged_sorted["overall_score"].values
+    n_total = len(merged_sorted)
+    n_small = int(n_total * split_ratio)
+    n_large = n_total - n_small
+    
+    # Initial random split
+    idx = np.arange(n_total)
+    np.random.shuffle(idx)
+    
+    small_idx = idx[:n_small]
+    large_idx = idx[n_small:]
+    
+    def score_diff(small_idx, large_idx):
+        return abs(scores[small_idx].mean() - scores[large_idx].mean())
+    
+    diff = score_diff(small_idx, large_idx)
+    count = 0
+    
+    # Optimize via random swaps
+    while diff > threshold and count < max_iter:
+        si = np.random.choice(small_idx)
+        li = np.random.choice(large_idx)
+        
+        new_small_idx = small_idx.copy()
+        new_large_idx = large_idx.copy()
+        
+        new_small_idx[new_small_idx == si] = li
+        new_large_idx[new_large_idx == li] = si
+        
+        new_diff = score_diff(new_small_idx, new_large_idx)
+        
+        if new_diff < diff:
+            small_idx = new_small_idx
+            large_idx = new_large_idx
+            diff = new_diff
+        
+        count += 1
+    
+    # Extract subject IDs
+    group_small_ids = merged_sorted.loc[small_idx, "subjectID"].values
+    group_large_ids = merged_sorted.loc[large_idx, "subjectID"].values
+    
+    return group_small_ids, group_large_ids, diff
+
+
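+# Example usage for 2-way split (train/test)
+# Note: the first returned group holds int(n_subjects * split_ratio) subjects,
+# so with split_ratio=0.2 the first return value is the (smaller) test set.
+# subjects = df["subjectID"].unique()
+# performance_df = pd.read_csv("performance.csv")
+# 
+# test_ids, train_ids, diff = performance_based_split(
+#     subject_ids=subjects,
+#     performance_df=performance_df,
+#     split_ratio=0.2,
+#     random_seed=42
+# )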
+
+# Example usage for 3-way split (train/val/test)
+# Step 1: Split into train and temp
+# train_ids, temp_ids, diff1 = performance_based_split(
+#     subject_ids=subjects,
+#     performance_df=performance_df,
+#     split_ratio=0.6,  # 60% train, 40% temp
+#     random_seed=42
+# )
+# 
+# Step 2: Split temp into val and test
+# val_ids, test_ids, diff2 = performance_based_split(
+#     subject_ids=temp_ids,
+#     performance_df=performance_df,
+#     split_ratio=0.5,  # 50/50 split of remaining 40%
+#     random_seed=43
+# )