diff --git a/model_training/DeepSVDD/deepSVDD.ipynb b/model_training/DeepSVDD/deepSVDD.ipynb new file mode 100644 index 0000000..d518f89 --- /dev/null +++ b/model_training/DeepSVDD/deepSVDD.ipynb @@ -0,0 +1,130 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "cf894f6f", + "metadata": {}, + "source": [ + "# Intermediate Fusion mit Deep SVDD" + ] + }, + { + "cell_type": "markdown", + "id": "494626b1", + "metadata": {}, + "source": [ + "* Input: gemeinsames Dataset aus Eye-Tracking und Action Units mit derselben Abtastfrequenz\n", + "* Verarbeitung: Intermediate Fusion\n", + "* Modell: Deep SVDD --> Erlernen einer Kugel, die die Normaldaten einschließt, durch ein neuronales Netz" + ] + }, + { + "cell_type": "markdown", + "id": "bef91203", + "metadata": {}, + "source": [ + "### Imports" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f0b8274a", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from pathlib import Path\n", + "import sys\n", + "import os\n", + "\n", + "base_dir = os.path.abspath(os.path.join(os.getcwd(), \"..\"))\n", + "sys.path.append(base_dir)\n", + "print(base_dir)\n", + "\n", + "from Fahrsimulator_MSY2526_AI.model_training.tools import evaluation_tools, scaler, mad_outlier_removal\n", + "from sklearn.preprocessing import StandardScaler, MinMaxScaler\n", + "from sklearn.svm import OneClassSVM\n", + "from sklearn.model_selection import GridSearchCV, KFold, ParameterGrid, train_test_split, GroupKFold\n", + "import matplotlib.pyplot as plt\n", + "import tensorflow as tf\n", + "import pickle\n", + "from sklearn.metrics import (roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, balanced_accuracy_score, ConfusionMatrixDisplay) " + ] + }, + { + "cell_type": "markdown", + "id": "f00a477c", + "metadata": {}, + "source": [ + "### Data Preprocessing" + ] + }, + { + "cell_type": "markdown", + "id": "504c1df7", + 
"metadata": {}, + "source": [ + "Laden der Daten" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6482542b", + "metadata": {}, + "outputs": [], + "source": [ + "dataset_path = Path(r\"/home/jovyan/data-paulusjafahrsimulator-gpu/first_AU_dataset/output_windowed.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce8ab464", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_parquet(path=dataset_path)" + ] + }, + { + "cell_type": "markdown", + "id": "b736bc58", + "metadata": {}, + "source": [ + "### Modell Training" + ] + }, + { + "cell_type": "markdown", + "id": "aa11faf3", + "metadata": {}, + "source": [ + "Vor-Training der Gewichte mit Autoencoder, Loss: MSE" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/model_training/VAE_SVM/AEdannSVM.ipynb b/model_training/VAE_SVM/AEdannSVM.ipynb new file mode 100644 index 0000000..395863d --- /dev/null +++ b/model_training/VAE_SVM/AEdannSVM.ipynb @@ -0,0 +1,918 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "708c9745", + "metadata": {}, + "source": [ + "### Imports" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "53b10294", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from pathlib import Path\n", + "import sys\n", + "import os\n", + "\n", + "base_dir = os.path.abspath(os.path.join(os.getcwd(), \"..\"))\n", + "sys.path.append(base_dir)\n", + "print(base_dir)\n", + "\n", + "from Fahrsimulator_MSY2526_AI.model_training.tools import evaluation_tools, scaler, 
mad_outlier_removal\n", + "from sklearn.preprocessing import StandardScaler, MinMaxScaler\n", + "from sklearn.svm import OneClassSVM\n", + "from sklearn.model_selection import GridSearchCV, KFold, ParameterGrid, train_test_split, GroupKFold\n", + "import matplotlib.pyplot as plt\n", + "import tensorflow as tf\n", + "import pickle\n", + "from sklearn.metrics import (roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, balanced_accuracy_score, ConfusionMatrixDisplay) " + ] + }, + { + "cell_type": "markdown", + "id": "68101229", + "metadata": {}, + "source": [ + "### load Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "24a765e8", + "metadata": {}, + "outputs": [], + "source": [ + "dataset_path = Path(r\"/home/jovyan/data-paulusjafahrsimulator-gpu/first_AU_dataset/output_windowed.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "471001b0", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_parquet(path=dataset_path)" + ] + }, + { + "cell_type": "markdown", + "id": "0fdecdaa", + "metadata": {}, + "source": [ + "### Load Performance data and Subject Split" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "692d1b47", + "metadata": {}, + "outputs": [], + "source": [ + "performance_path = Path(r\"/home/jovyan/data-paulusjafahrsimulator-gpu/subject_performance/3new_au_performance.csv\")\n", + "performance_df = pd.read_csv(performance_path)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ea617e3f", + "metadata": {}, + "outputs": [], + "source": [ + "# Subject IDs aus dem Haupt-Dataset nehmen\n", + "subjects_from_df = df[\"subjectID\"].unique()\n", + "\n", + "# Performance-Subset nur für vorhandene Subjects\n", + "perf_filtered = performance_df[\n", + " performance_df[\"subjectID\"].isin(subjects_from_df)\n", + "][[\"subjectID\", \"overall_score\"]]\n", + "\n", + "# Merge: nur Subjects, die 
sowohl im df als auch im Performance-CSV vorkommen\n", + "merged = (\n", + " pd.DataFrame({\"subjectID\": subjects_from_df})\n", + " .merge(perf_filtered, on=\"subjectID\", how=\"inner\")\n", + ")\n", + "\n", + "# Sicherstellen, dass keine Scores fehlen\n", + "if merged[\"overall_score\"].isna().any():\n", + " raise ValueError(\"Es fehlen Score-Werte für manche Subjects.\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ae43df8d", + "metadata": {}, + "outputs": [], + "source": [ + "merged_sorted = merged.sort_values(\"overall_score\", ascending=False).reset_index(drop=True)\n", + "\n", + "scores = merged_sorted[\"overall_score\"].values\n", + "n_total = len(merged_sorted)\n", + "n_small = n_total // 3\n", + "n_large = n_total - n_small\n", + "\n", + "# Schritt 1: zufällige Start-Aufteilung\n", + "idx = np.arange(n_total)\n", + "np.random.shuffle(idx)\n", + "\n", + "small_idx = idx[:n_small]\n", + "large_idx = idx[n_small:]\n", + "\n", + "def score_diff(small_idx, large_idx):\n", + " return abs(scores[small_idx].mean() - scores[large_idx].mean())\n", + "\n", + "diff = score_diff(small_idx, large_idx)\n", + "threshold = 0.01\n", + "max_iter = 100\n", + "count = 0\n", + "\n", + "# Schritt 2: random swaps bis Differenz klein genug\n", + "while diff > threshold and count < max_iter:\n", + " # Zwei zufällige Elemente auswählen\n", + " si = np.random.choice(small_idx)\n", + " li = np.random.choice(large_idx)\n", + " \n", + " # Tausch durchführen\n", + " new_small_idx = small_idx.copy()\n", + " new_large_idx = large_idx.copy()\n", + " \n", + " new_small_idx[new_small_idx == si] = li\n", + " new_large_idx[new_large_idx == li] = si\n", + "\n", + " # neue Differenz berechnen\n", + " new_diff = score_diff(new_small_idx, new_large_idx)\n", + "\n", + " # Swap akzeptieren, wenn es besser wird\n", + " if new_diff < diff:\n", + " small_idx = new_small_idx\n", + " large_idx = new_large_idx\n", + " diff = new_diff\n", + "\n", + " count += 1\n", + "\n", + 
"# Finalgruppen\n", + "group_small = merged_sorted.loc[small_idx].reset_index(drop=True)\n", + "group_large = merged_sorted.loc[large_idx].reset_index(drop=True)\n", + "\n", + "print(\"Finale Score-Differenz:\", diff)\n", + "print(\"Größe Gruppe 1:\", len(group_small))\n", + "print(\"Größe Gruppe 2:\", len(group_large))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d1b414e", + "metadata": {}, + "outputs": [], + "source": [ + "group_large['overall_score'].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa71f9a5", + "metadata": {}, + "outputs": [], + "source": [ + "group_small['overall_score'].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "79ecb4a2", + "metadata": {}, + "outputs": [], + "source": [ + "training_subjects = group_large['subjectID'].values\n", + "test_subjects = group_small['subjectID'].values\n", + "print(training_subjects)\n", + "print(test_subjects)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "87f9fe7d", + "metadata": {}, + "outputs": [], + "source": [ + "au_columns = [col for col in df.columns if col.lower().startswith(\"au\")]" + ] + }, + { + "cell_type": "markdown", + "id": "009d268b", + "metadata": {}, + "source": [ + "Labeling" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4fa79163", + "metadata": {}, + "outputs": [], + "source": [ + "low_all = df[\n", + " ((df[\"PHASE\"] == \"baseline\") |\n", + " ((df[\"STUDY\"] == \"n-back\") & (df[\"PHASE\"] != \"baseline\") & (df[\"LEVEL\"].isin([1, 4]))))\n", + "]\n", + "print(f\"low all: {low_all.shape}\")\n", + "\n", + "high_nback = df[\n", + " (df[\"STUDY\"]==\"n-back\") &\n", + " (df[\"LEVEL\"].isin([2, 3, 5, 6])) &\n", + " (df[\"PHASE\"].isin([\"train\", \"test\"]))\n", + "]\n", + "print(f\"high n-back: {high_nback.shape}\")\n", + "\n", + "high_kdrive = df[\n", + " (df[\"STUDY\"] == \"k-drive\") & (df[\"PHASE\"] != \"baseline\")\n", + "]\n", + 
"print(f\"high k-drive: {high_kdrive.shape}\")\n", + "\n", + "high_all = pd.concat([high_nback, high_kdrive])\n", + "print(f\"high all: {high_all.shape}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "82b17d0b", + "metadata": {}, + "outputs": [], + "source": [ + "low = low_all.copy()\n", + "high = high_all.copy()\n", + "\n", + "low[\"label\"] = 0\n", + "high[\"label\"] = 1\n", + "\n", + "data = pd.concat([low, high], ignore_index=True)\n", + "df = data.drop_duplicates()\n", + "\n", + "print(\"Label distribution:\")\n", + "print(df[\"label\"].value_counts())" + ] + }, + { + "cell_type": "markdown", + "id": "4353f87c", + "metadata": {}, + "source": [ + "### Data cleaning with mad" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c9afaf61", + "metadata": {}, + "outputs": [], + "source": [ + "# methode CT\n", + "def calculate_mad_params(df, columns):\n", + " \"\"\"\n", + " Calculate median and MAD parameters for each column.\n", + " This should be run ONLY on the training data.\n", + " \n", + " Returns a dictionary: {col: (median, mad)}\n", + " \"\"\"\n", + " params = {}\n", + " for col in columns:\n", + " median = df[col].median()\n", + " mad = np.median(np.abs(df[col] - median))\n", + " params[col] = (median, mad)\n", + " return params\n", + "\n", + "def apply_mad_filter(df, params, threshold=3.5):\n", + " \"\"\"\n", + " Apply MAD-based outlier removal using precomputed parameters.\n", + " Works on training, validation, and test data.\n", + " \n", + " df: DataFrame to filter\n", + " params: dictionary {col: (median, mad)} from training data\n", + " threshold: cutoff for robust Z-score\n", + " \"\"\"\n", + " df_clean = df.copy()\n", + "\n", + " for col, (median, mad) in params.items():\n", + " if mad == 0:\n", + " continue # no spread; nothing to remove for this column\n", + "\n", + " robust_z = 0.6745 * (df_clean[col] - median) / mad\n", + " outlier_mask = np.abs(robust_z) > threshold\n", + "\n", + " # Remove values 
only in this specific column\n", + " df_clean.loc[outlier_mask, col] = np.nan\n", + " \n", + " return df_clean" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4a286665", + "metadata": {}, + "outputs": [], + "source": [ + "train_df = df[df.subjectID.isin(training_subjects)]\n", + "test_df = df[df.subjectID.isin(test_subjects)]\n", + "print(train_df.shape, test_df.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2671e0f4", + "metadata": {}, + "outputs": [], + "source": [ + "params = calculate_mad_params(train_df, au_columns)\n", + "\n", + "# Step 2: Apply filter consistently\n", + "train_outlier_removed = apply_mad_filter(train_df, params, threshold=3.5)\n", + "test_outlier_removed = apply_mad_filter(test_df, params, threshold=3.5)\n", + "print(train_outlier_removed.shape, test_outlier_removed.shape)" + ] + }, + { + "cell_type": "markdown", + "id": "6c39b37f", + "metadata": {}, + "source": [ + "Normalisierung der Daten" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5e6c654f", + "metadata": {}, + "outputs": [], + "source": [ + "normalizer = scaler.fit_normalizer(train_df, au_columns=au_columns, method='standard', scope='global')\n", + "train_df_normal = scaler.apply_normalizer(train_df, au_columns=au_columns, normalizer_dict=normalizer)\n", + "test_df_normal = scaler.apply_normalizer(test_df, au_columns=au_columns, normalizer_dict=normalizer)" + ] + }, + { + "cell_type": "markdown", + "id": "b6d25e7b", + "metadata": {}, + "source": [ + "to do insert group k fold for train_df_normal" + ] + }, + { + "cell_type": "markdown", + "id": "e826a998", + "metadata": {}, + "source": [ + "### AE first" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e6421371", + "metadata": {}, + "outputs": [], + "source": [ + "# Beide Klassen für AE und SVM Training\n", + "X_train_full = train_outlier_removed[au_columns].dropna()\n", + "y_train_full = 
class DifferentiableSVM(tf.keras.layers.Layer):
    """
    Linear SVM head trained with the hinge loss.

    The layer owns a weight vector ``w`` and a bias ``b`` and outputs the
    decision value ``w^T x + b``. Because the loss is built from TensorFlow
    ops, gradients flow back into whatever produced the inputs (here: the
    encoder).

    Parameters
    ----------
    C : float, default=1.0
        Weight of the hinge term relative to the L2 penalty on ``w``.
    """

    def __init__(self, C=1.0, **kwargs):
        super().__init__(**kwargs)
        self.C = C

    def build(self, input_shape):
        # One weight per input feature plus a scalar bias.
        self.w = self.add_weight(
            shape=(input_shape[-1],),
            initializer='glorot_uniform',
            trainable=True,
            name='svm_w'
        )
        self.b = self.add_weight(
            shape=(1,),
            initializer='zeros',
            trainable=True,
            name='svm_b'
        )

    def call(self, inputs):
        """Return the decision values ``w^T x + b`` with shape (batch, 1)."""
        return tf.reduce_sum(inputs * self.w, axis=1, keepdims=True) + self.b

    def compute_loss(self, inputs, labels):
        """
        Return ``C * mean(max(0, 1 - y * f(x))) + 0.5 * ||w||^2``.

        ``labels`` are expected as 0/1 and are mapped to -1/+1 internally.
        """
        # Go through __call__ (not call) so the weights get created if this
        # method is the first thing invoked on the layer.
        decision = self(inputs)

        # Convert labels from 0/1 to -1/+1 and align them with `decision`.
        labels_svm = tf.where(labels == 0, -1.0, 1.0)
        labels_svm = tf.reshape(tf.cast(labels_svm, tf.float32), (-1, 1))

        hinge_loss = tf.reduce_mean(
            tf.maximum(0.0, 1.0 - labels_svm * decision)
        )
        l2_loss = 0.5 * tf.reduce_sum(tf.square(self.w))

        return self.C * hinge_loss + l2_loss


class JointAESVM(tf.keras.Model):
    """
    Autoencoder and linear SVM trained jointly on a shared latent space.

    Total loss = ``ae_weight * reconstruction_mse + svm_weight * svm_loss``
    plus the L2 kernel penalties of the encoder/decoder layers.

    Parameters
    ----------
    input_dim : int
        Number of input features.
    latent_dim : int, default=5
        Size of the latent representation fed to the SVM.
    hidden_dim : int, default=16
        Size of the intermediate encoder/decoder layer.
    ae_weight, svm_weight : float, default=1.0
        Weights of the reconstruction and the SVM loss terms.
    svm_C : float, default=1.0
        ``C`` passed to ``DifferentiableSVM``.
    reg : float, default=0.0001
        L2 factor applied to every Dense kernel.
    """

    def __init__(self, input_dim, latent_dim=5, hidden_dim=16, ae_weight=1.0,
                 svm_weight=1.0, svm_C=1.0, reg=0.0001, **kwargs):
        super().__init__(**kwargs)

        self.ae_weight = ae_weight
        self.svm_weight = svm_weight

        def dense(units, activation):
            # All Dense layers share the same L2 kernel regularisation.
            return tf.keras.layers.Dense(
                units,
                activation=activation,
                kernel_regularizer=tf.keras.regularizers.l2(reg),
            )

        # Encoder: input_dim -> hidden_dim -> latent_dim
        self.encoder = tf.keras.Sequential([
            dense(input_dim, 'relu'),
            dense(hidden_dim, 'relu'),
            dense(latent_dim, 'relu'),
        ], name='encoder')

        # Decoder: latent_dim -> hidden_dim -> input_dim (linear output)
        self.decoder = tf.keras.Sequential([
            dense(latent_dim, 'relu'),
            dense(hidden_dim, 'relu'),
            dense(input_dim, 'linear'),
        ], name='decoder')

        # SVM head operating on the latent code
        self.svm = DifferentiableSVM(C=svm_C, name='svm')

    def call(self, inputs, training=False):
        """Return ``(reconstruction, svm_decision, latent_code)``."""
        encoded = self.encoder(inputs, training=training)
        decoded = self.decoder(encoded, training=training)
        svm_output = self.svm(encoded)
        return decoded, svm_output, encoded

    def compute_loss(self, x, y_true):
        """
        Run a training-mode forward pass and return
        ``(total_loss, reconstruction_loss, svm_loss)``.
        """
        x_reconstructed, _, encoded = self(x, training=True)

        # Reconstruction loss (MSE)
        reconstruction_loss = tf.reduce_mean(tf.square(x - x_reconstructed))

        # SVM loss (hinge + L2 on w)
        svm_loss = self.svm.compute_loss(encoded, y_true)

        total_loss = (self.ae_weight * reconstruction_loss +
                      self.svm_weight * svm_loss)

        # kernel_regularizer penalties are only collected in `self.losses`;
        # a custom training loop has to add them itself, otherwise the `reg`
        # hyper-parameter has no effect at all.
        reg_losses = self.losses
        if reg_losses:
            total_loss = total_loss + tf.add_n(reg_losses)

        return total_loss, reconstruction_loss, svm_loss

print("Joint AE-SVM Model class defined")


def train_joint_model(X_train, y_train, groups, model_params,
                      epochs=200, batch_size=64, learning_rate=0.0001):
    """
    Train a ``JointAESVM`` with a custom Adam training loop.

    Parameters
    ----------
    X_train : pd.DataFrame or array-like, shape (n_samples, n_features)
        Training features.
    y_train : array-like, shape (n_samples,)
        Binary labels (0/1).
    groups : array-like
        Accepted for call-site symmetry with the cross-validation code;
        not used by the training itself.
    model_params : dict
        Must provide 'latent_dim', 'hidden_dim', 'ae_weight', 'svm_weight',
        'svm_C' and 'reg'.
    epochs, batch_size, learning_rate
        Usual optimisation settings.

    Returns
    -------
    model : JointAESVM
        The trained model.
    history : dict
        Per-epoch mean of 'total_loss', 'recon_loss' and 'svm_loss'.

    Raises
    ------
    ValueError
        If ``X_train`` contains no samples.
    """
    features = np.asarray(X_train, dtype=np.float32)
    labels = np.asarray(y_train, dtype=np.float32)
    n_samples = features.shape[0]
    if n_samples == 0:
        raise ValueError("X_train must contain at least one sample.")

    # Build model
    model = JointAESVM(
        input_dim=features.shape[1],
        latent_dim=model_params['latent_dim'],
        hidden_dim=model_params['hidden_dim'],
        ae_weight=model_params['ae_weight'],
        svm_weight=model_params['svm_weight'],
        svm_C=model_params['svm_C'],
        reg=model_params['reg']
    )

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    # Training history
    history = {
        'total_loss': [],
        'recon_loss': [],
        'svm_loss': []
    }

    # The shuffle buffer must span the whole dataset: the notebook
    # concatenates all low-load rows before all high-load rows, so a small
    # buffer would yield batches dominated by a single class.
    dataset = tf.data.Dataset.from_tensor_slices((features, labels))
    dataset = dataset.shuffle(buffer_size=n_samples).batch(batch_size)

    # Training loop
    for epoch in range(epochs):
        epoch_loss = 0.0
        epoch_recon = 0.0
        epoch_svm = 0.0
        n_batches = 0

        for x_batch, y_batch in dataset:
            with tf.GradientTape() as tape:
                total_loss, recon_loss, svm_loss = model.compute_loss(x_batch, y_batch)

            # Backpropagation
            gradients = tape.gradient(total_loss, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients, model.trainable_variables))

            epoch_loss += float(total_loss.numpy())
            epoch_recon += float(recon_loss.numpy())
            epoch_svm += float(svm_loss.numpy())
            n_batches += 1

        # Average losses over the batches of this epoch
        history['total_loss'].append(epoch_loss / n_batches)
        history['recon_loss'].append(epoch_recon / n_batches)
        history['svm_loss'].append(epoch_svm / n_batches)

        if (epoch + 1) % 20 == 0:
            print(f"Epoch {epoch+1}/{epochs} - "
                  f"Total: {history['total_loss'][-1]:.4f}, "
                  f"Recon: {history['recon_loss'][-1]:.4f}, "
                  f"SVM: {history['svm_loss'][-1]:.4f}")

    return model, history

print("Training function defined")
# --- Hyper-parameter grid ----------------------------------------------------
param_grid = {
    'latent_dim': [5, 8],
    'hidden_dim': [10, 16],
    'ae_weight': [0.5, 1.0],
    'svm_weight': [0.5, 1.0, 2.0],
    'svm_C': [0.1, 1.0, 10.0],
    'reg': [0.0001, 0.001]
}

n_splits = 5  # fewer splits to keep the compute time manageable
gkf = GroupKFold(n_splits=n_splits)

# Materialise the grid once instead of rebuilding it for every progress message.
param_combinations = list(ParameterGrid(param_grid))

print(f"Starting Grid Search with {n_splits}-fold GroupKFold")
print(f"Parameter combinations: {len(param_combinations)}")
print("This will take a while...")


def evaluate_model(model, X, y):
    """
    Evaluate a joint AE-SVM model.

    Parameters
    ----------
    model : callable
        Called as ``model(X, training=False)``; must return a 3-tuple whose
        second element holds the SVM decision values.
    X : array-like
        Feature matrix, converted to a float32 tensor.
    y : array-like
        True 0/1 labels.

    Returns
    -------
    bal_accuracy : float
        Balanced accuracy of the predictions.
    y_pred : np.ndarray
        Predicted labels (decision > 0 -> 1, else 0).
    """
    X_tf = tf.constant(X, dtype=tf.float32)
    _, svm_decision, _ = model(X_tf, training=False)

    # Predict: decision > 0 -> class 1, else class 0
    y_pred = (svm_decision.numpy().flatten() > 0).astype(int)

    bal_accuracy = balanced_accuracy_score(y, y_pred)
    return bal_accuracy, y_pred

print("Evaluation function defined")


# --- Grid search with subject-wise cross-validation --------------------------
best_score = -np.inf
best_params = None
all_results = []

X_train_array = X_train_full.values
y_train_array = y_train_full

for param_idx, params in enumerate(param_combinations):
    print(f"\n{'='*60}")
    print(f"Testing parameters {param_idx + 1}/{len(param_combinations)}")
    print(f"Params: {params}")
    print(f"{'='*60}")

    fold_scores = []

    for fold, (train_idx, val_idx) in enumerate(gkf.split(X_train_array, y_train_array, groups_train)):
        print(f"\nFold {fold + 1}/{n_splits}")

        X_fold_train = pd.DataFrame(X_train_array[train_idx], columns=X_train_full.columns)
        y_fold_train = y_train_array[train_idx]
        X_fold_val = X_train_array[val_idx]
        y_fold_val = y_train_array[val_idx]

        # Train model
        model, history = train_joint_model(
            X_fold_train, y_fold_train, groups_train[train_idx],
            model_params=params,
            epochs=100,  # fewer epochs during the grid search
            batch_size=64,
            learning_rate=0.0001
        )

        # Validate
        val_bal_acc, _ = evaluate_model(model, X_fold_val, y_fold_val)
        fold_scores.append(val_bal_acc)
        print(f"Fold {fold + 1} Validation balanced Accuracy: {val_bal_acc:.4f}")

    mean_score = np.mean(fold_scores)
    std_score = np.std(fold_scores)

    all_results.append({
        **params,
        'mean_cv_bal_accuracy': mean_score,
        'std_cv_bal_accuracy': std_score
    })

    print(f"\nMean CV bal. Accuracy: {mean_score:.4f} ± {std_score:.4f}")

    if mean_score > best_score:
        best_score = mean_score
        best_params = params
        print("*** NEW BEST PARAMETERS ***")

print(f"\n{'='*60}")
print("GRID SEARCH COMPLETED")
print(f"{'='*60}")
print(f"Best parameters: {best_params}")
print(f"Best CV bal. accuracy: {best_score:.4f}")


# --- Grid search summary -----------------------------------------------------
# The results are stored under 'mean_cv_bal_accuracy'; sorting/plotting by
# 'mean_cv_accuracy' raised a KeyError.
results_df = pd.DataFrame(all_results)
results_df = results_df.sort_values('mean_cv_bal_accuracy', ascending=False)

print("\nTop 10 configurations:")
print(results_df.head(10))

n_shown = min(10, len(results_df))
plt.figure(figsize=(12, 6))
plt.barh(range(n_shown), results_df['mean_cv_bal_accuracy'].head(10))
plt.yticks(range(n_shown), [f"Config {i+1}" for i in range(n_shown)])
plt.xlabel('Mean CV Balanced Accuracy')
plt.title('Top 10 Configurations')
plt.tight_layout()
plt.show()


# --- Final training on all training subjects ---------------------------------
print("Training final model on all training data...")
print(f"Best parameters: {best_params}")

final_model, final_history = train_joint_model(
    X_train_full, y_train_full, groups_train,
    model_params=best_params,
    epochs=300,  # more epochs for the final fit
    batch_size=64,
    learning_rate=0.0001
)

print("\nFinal model training completed!")


# --- Loss curves ---------------------------------------------------------------
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

loss_panels = [
    ('total_loss', 'Total Loss'),
    ('recon_loss', 'Reconstruction Loss'),
    ('svm_loss', 'SVM Loss'),
]
for ax, (history_key, panel_title) in zip(axes, loss_panels):
    ax.plot(final_history[history_key])
    ax.set_title(panel_title)
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


# --- Test set evaluation -------------------------------------------------------
test_bal_acc, y_pred = evaluate_model(final_model, X_test.values, y_test)

# Raw SVM decision values serve as ranking scores for ROC-AUC
X_test_tf = tf.constant(X_test.values, dtype=tf.float32)
_, svm_decision, _ = final_model(X_test_tf, training=False)
y_pred_decision = svm_decision.numpy().flatten()

print("=" * 50)
print("TEST SET EVALUATION")
print("=" * 50)
print(f"\nAccuracy: {accuracy_score(y_test, y_pred):.4f}")
# Balanced accuracy is the model-selection metric, so report it on the test set too.
print(f"Balanced Accuracy: {test_bal_acc:.4f}")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"Recall: {recall_score(y_test, y_pred):.4f}")
print(f"F1-Score: {f1_score(y_test, y_pred):.4f}")

# ROC-AUC only depends on the ranking of the scores, so the decision values
# can be used directly; min-max scaling them first does not change the result.
print(f"ROC-AUC: {roc_auc_score(y_test, y_pred_decision):.4f}")

print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(cm)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Visualize confusion matrix
fig, ax = plt.subplots(figsize=(8, 6))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Low Load (0)', 'High Load (1)'])
disp.plot(cmap='Blues', ax=ax, colorbar=True, values_format='d')
ax.set_title('Confusion Matrix - Test Set', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()


# --- Persist model and parameters --------------------------------------------
# Keras 3 only accepts weight files ending in '.weights.h5'; the previous name
# 'joint_ae_svm_weights.h5' is rejected there. Older Keras versions treat any
# '.h5' suffix as HDF5, so this name works for both.
final_model.save_weights('joint_ae_svm.weights.h5')
print("Model weights saved as 'joint_ae_svm.weights.h5'")

# Save encoder separately
final_model.encoder.save('encoder_joint.keras')
print("Encoder saved as 'encoder_joint.keras'")

# Save best parameters
with open('best_params_joint.pkl', 'wb') as f:
    pickle.dump(best_params, f)
print("Best parameters saved as 'best_params_joint.pkl'")

# Open notes from the notebook (translated):
# * maybe try a sequential SVM/AE pipeline after all?
# * simply try 20 / 13 / 5 (presumably layer sizes - confirm)
# * add label
# * use CT's MAD or adjust the threshold; possibly compare how often each
#   label occurs before and after --> adopt CT's labelling step
"source": [ + "# Save entire model\n", + "final_model.save_weights('joint_ae_svm_weights.h5')\n", + "print(\"Model weights saved as 'joint_ae_svm_weights.h5'\")\n", + "\n", + "# Save encoder separately\n", + "final_model.encoder.save('encoder_joint.keras')\n", + "print(\"Encoder saved as 'encoder_joint.keras'\")\n", + "\n", + "# Save best parameters\n", + "with open('best_params_joint.pkl', 'wb') as f:\n", + " pickle.dump(best_params, f)\n", + "print(\"Best parameters saved as 'best_params_joint.pkl'\")" + ] + }, + { + "cell_type": "markdown", + "id": "792c658d", + "metadata": {}, + "source": [ + "* doch mal svm ae pipeline?\n", + "* einfach mal mit 20 13 5\n", + "* label hinzufügen\n", + "* mad von CT verwenden oder wert anpassen, ggf. vergleich welches label wie oft vorkommt vorher und nachher. --> labelling schritt von CT übernehmen\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/model_training/tools/performance_split.py b/model_training/tools/performance_split.py new file mode 100644 index 0000000..e40b564 --- /dev/null +++ b/model_training/tools/performance_split.py @@ -0,0 +1,139 @@ +import pandas as pd +import numpy as np + + +def performance_based_split( + subject_ids, + performance_df, + split_ratio=0.33, + threshold=0.01, + max_iter=100, + random_seed=None +): + """ + Split subjects into two groups based on performance scores with balanced means. 
+ + Parameters + ---------- + subject_ids : array-like + List or array of subject IDs present in your dataset + performance_df : pd.DataFrame + DataFrame containing 'subjectID' and 'overall_score' columns + split_ratio : float, default=0.33 + Proportion of subjects for the smaller group (0 < split_ratio < 1) + threshold : float, default=0.01 + Target difference threshold between group means + max_iter : int, default=100 + Maximum number of swap iterations + random_seed : int, optional + Random seed for reproducibility + + Returns + ------- + group_small_ids : np.ndarray + Subject IDs for the smaller group + group_large_ids : np.ndarray + Subject IDs for the larger group + score_diff : float + Final absolute difference between group means + + Raises + ------ + ValueError + If subjects are missing performance scores or no subjects match + """ + if random_seed is not None: + np.random.seed(random_seed) + + # Filter performance data + perf_filtered = performance_df[ + performance_df["subjectID"].isin(subject_ids) + ][["subjectID", "overall_score"]] + + # Merge to get only subjects present in both dataset and performance file + merged = ( + pd.DataFrame({"subjectID": subject_ids}) + .merge(perf_filtered, on="subjectID", how="inner") + ) + + if len(merged) == 0: + raise ValueError("No subjects found in both dataset and performance file.") + + # Check for missing scores + if merged["overall_score"].isna().any(): + raise ValueError("Missing score values for some subjects.") + + merged_sorted = merged.sort_values("overall_score", ascending=False).reset_index(drop=True) + + scores = merged_sorted["overall_score"].values + n_total = len(merged_sorted) + n_small = int(n_total * split_ratio) + n_large = n_total - n_small + + # Initial random split + idx = np.arange(n_total) + np.random.shuffle(idx) + + small_idx = idx[:n_small] + large_idx = idx[n_small:] + + def score_diff(small_idx, large_idx): + return abs(scores[small_idx].mean() - scores[large_idx].mean()) + + diff = 
score_diff(small_idx, large_idx) + count = 0 + + # Optimize via random swaps + while diff > threshold and count < max_iter: + si = np.random.choice(small_idx) + li = np.random.choice(large_idx) + + new_small_idx = small_idx.copy() + new_large_idx = large_idx.copy() + + new_small_idx[new_small_idx == si] = li + new_large_idx[new_large_idx == li] = si + + new_diff = score_diff(new_small_idx, new_large_idx) + + if new_diff < diff: + small_idx = new_small_idx + large_idx = new_large_idx + diff = new_diff + + count += 1 + + # Extract subject IDs + group_small_ids = merged_sorted.loc[small_idx, "subjectID"].values + group_large_ids = merged_sorted.loc[large_idx, "subjectID"].values + + return group_small_ids, group_large_ids, diff + + +# Example usage for 2-way split (train/test) +# subjects = df["subjectID"].unique() +# performance_df = pd.read_csv("performance.csv") +# +# train_ids, test_ids, diff = performance_based_split( +# subject_ids=subjects, +# performance_df=performance_df, +# split_ratio=0.2, +# random_seed=42 +# ) + +# Example usage for 3-way split (train/val/test) +# Step 1: Split into train and temp +# train_ids, temp_ids, diff1 = performance_based_split( +# subject_ids=subjects, +# performance_df=performance_df, +# split_ratio=0.6, # 60% train, 40% temp +# random_seed=42 +# ) +# +# Step 2: Split temp into val and test +# val_ids, test_ids, diff2 = performance_based_split( +# subject_ids=temp_ids, +# performance_df=performance_df, +# split_ratio=0.5, # 50/50 split of remaining 40% +# random_seed=43 +# ) \ No newline at end of file