diff --git a/model_training/VAE_SVM/AEdannSVM.ipynb b/model_training/VAE_SVM/AEdannSVM.ipynb deleted file mode 100644 index 88a7ce6..0000000 --- a/model_training/VAE_SVM/AEdannSVM.ipynb +++ /dev/null @@ -1,1312 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "bcbd4937", - "metadata": {}, - "source": [ - "### Imports" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7670c30e", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "from pathlib import Path\n", - "import sys\n", - "import os\n", - "\n", - "base_dir = os.path.abspath(os.path.join(os.getcwd(), \"..\"))\n", - "sys.path.append(base_dir)\n", - "print(base_dir)\n", - "print(os.getcwd())\n", - "# from Fahrsimulator_MSY2526_AI.model_training.tools import evaluation_tools, scaler, mad_outlier_removal\n", - "\n", - "from tools import evaluation_tools, scaler, mad_outlier_removal\n", - "from sklearn.preprocessing import StandardScaler, MinMaxScaler\n", - "from sklearn.svm import OneClassSVM\n", - "from sklearn.model_selection import GridSearchCV, KFold, ParameterGrid, train_test_split, GroupKFold\n", - "import matplotlib.pyplot as plt\n", - "import tensorflow as tf\n", - "import pickle\n", - "from sklearn.metrics import (roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, balanced_accuracy_score, ConfusionMatrixDisplay) " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "59b2b100", - "metadata": {}, - "outputs": [], - "source": [ - "# Check GPU availability\n", - "print(\"TensorFlow version:\", tf.__version__)\n", - "print(\"GPU Available:\", tf.config.list_physical_devices('GPU'))\n", - "print(\"CUDA Available:\", tf.test.is_built_with_cuda())\n", - "\n", - "# Get detailed GPU info\n", - "gpus = tf.config.list_physical_devices('GPU')\n", - "if gpus:\n", - " print(f\"\\nNumber of GPUs: {len(gpus)}\")\n", - " for gpu in gpus:\n", - " 
print(f\"GPU: {gpu}\")\n", - " \n", - " # Enable memory growth to prevent TF from allocating all GPU memory\n", - " try:\n", - " for gpu in gpus:\n", - " tf.config.experimental.set_memory_growth(gpu, True)\n", - " print(\"\\nGPU memory growth enabled\")\n", - " except RuntimeError as e:\n", - " print(e)\n", - "else:\n", - " print(\"\\nNo GPU found - running on CPU\")" - ] - }, - { - "cell_type": "markdown", - "id": "b002d3c8", - "metadata": {}, - "source": [ - "### load Dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1620827e", - "metadata": {}, - "outputs": [], - "source": [ - "dataset_path = Path(r\"/home/jovyan/data-paulusjafahrsimulator-gpu/first_AU_dataset/output_windowed.parquet\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "854240b8", - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.read_parquet(path=dataset_path)" - ] - }, - { - "cell_type": "markdown", - "id": "69b21772", - "metadata": {}, - "source": [ - "### Load Performance data and Subject Split" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ff894fda", - "metadata": {}, - "outputs": [], - "source": [ - "performance_path = Path(r\"/home/jovyan/data-paulusjafahrsimulator-gpu/subject_performance/3new_au_performance.csv\")\n", - "performance_df = pd.read_csv(performance_path)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4b2b789b", - "metadata": {}, - "outputs": [], - "source": [ - "# Subject IDs aus dem Haupt-Dataset nehmen\n", - "subjects_from_df = df[\"subjectID\"].unique()\n", - "\n", - "# Performance-Subset nur für vorhandene Subjects\n", - "perf_filtered = performance_df[\n", - " performance_df[\"subjectID\"].isin(subjects_from_df)\n", - "][[\"subjectID\", \"overall_score\"]]\n", - "\n", - "# Merge: nur Subjects, die sowohl im df als auch im Performance-CSV vorkommen\n", - "merged = (\n", - " pd.DataFrame({\"subjectID\": subjects_from_df})\n", - " .merge(perf_filtered, 
# Balanced subject split: assign roughly one third of the subjects to a small
# (test) group and the rest to a large (training) group, then randomly swap
# members until both groups have nearly the same mean ``overall_score``.

# Sort by score and reset the index so positions 0..n-1 address the rows.
merged_sorted = merged.sort_values("overall_score", ascending=False).reset_index(drop=True)

scores = merged_sorted["overall_score"].values
n_total = len(merged_sorted)
n_small = n_total // 3
n_large = n_total - n_small

if n_small == 0:
    # With fewer than three subjects the small group would be empty and the
    # mean difference below would be NaN, which silently skips the swap loop.
    raise ValueError(
        f"Need at least 3 subjects for a train/test split, got {n_total}."
    )

# Dedicated, seeded generator: the subject split (and therefore every
# downstream metric) is identical after Restart & Run All.
SPLIT_SEED = 42
split_rng = np.random.default_rng(SPLIT_SEED)

# Step 1: random initial assignment
idx = split_rng.permutation(n_total)

small_idx = idx[:n_small]
large_idx = idx[n_small:]


def score_diff(small_idx, large_idx):
    """Return the absolute difference between the mean scores of two index groups."""
    return abs(scores[small_idx].mean() - scores[large_idx].mean())


diff = score_diff(small_idx, large_idx)
threshold = 0.01   # stop once the group means differ by at most this much
max_iter = 100     # upper bound on swap attempts
count = 0

# Step 2: random swaps until the difference is small enough
while diff > threshold and count < max_iter:
    # Pick one random member of each group
    si = split_rng.choice(small_idx)
    li = split_rng.choice(large_idx)

    # Build the candidate assignment with the two members exchanged
    new_small_idx = small_idx.copy()
    new_large_idx = large_idx.copy()

    new_small_idx[new_small_idx == si] = li
    new_large_idx[new_large_idx == li] = si

    new_diff = score_diff(new_small_idx, new_large_idx)

    # Greedy acceptance: keep the swap only if it reduces the difference
    if new_diff < diff:
        small_idx = new_small_idx
        large_idx = new_large_idx
        diff = new_diff

    count += 1

# Final groups (positional lookup; the index was reset above)
group_small = merged_sorted.iloc[small_idx].reset_index(drop=True)
group_large = merged_sorted.iloc[large_idx].reset_index(drop=True)

print("Finale Score-Differenz:", diff)
print("Größe Gruppe 1:", len(group_small))
print("Größe Gruppe 2:", len(group_large))
# methode CT
def calculate_mad_params(df, columns):
    """
    Calculate the median and the median absolute deviation (MAD) per column.

    This should be run ONLY on the training data; the returned parameters are
    then reused unchanged for every other split.

    Missing values are skipped for both statistics. (Mixing the NaN-skipping
    ``Series.median`` with the NaN-propagating ``np.median`` would yield
    ``mad = NaN`` for any column containing a missing value, and the filter
    would then silently remove nothing for that column.)

    Parameters
    ----------
    df : pandas.DataFrame
        Data to derive the statistics from.
    columns : iterable of str
        Column names to process.

    Returns
    -------
    dict
        ``{col: (median, mad)}``. Both values are NaN for a column without
        any non-missing entry.
    """
    params = {}
    for col in columns:
        median = df[col].median()
        # pandas' median ignores NaN, matching how `median` itself was computed.
        mad = (df[col] - median).abs().median()
        params[col] = (median, mad)
    return params


def apply_mad_filter(df, params, threshold=3.5):
    """
    Apply MAD-based outlier removal using precomputed parameters.

    Works on training, validation, and test data. Outliers are not dropped as
    rows; only the offending cell is replaced by NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame to filter. It is not modified; a copy is returned.
    params : dict
        ``{col: (median, mad)}`` as produced by ``calculate_mad_params``.
    threshold : float, default 3.5
        Cutoff for the absolute robust z-score ``0.6745 * (x - median) / mad``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with outlier cells set to NaN.
    """
    df_clean = df.copy()

    for col, (median, mad) in params.items():
        if mad == 0 or pd.isna(mad):
            continue  # no spread (or no data); nothing to remove for this column

        robust_z = 0.6745 * (df_clean[col] - median) / mad
        # Comparisons against NaN are False, so missing cells are left untouched.
        outlier_mask = robust_z.abs() > threshold

        # Remove values only in this specific column
        df_clean.loc[outlier_mask, col] = np.nan

    return df_clean
- }, - { - "cell_type": "code", - "execution_count": null, - "id": "8aa01ada", - "metadata": {}, - "outputs": [], - "source": [ - "train_df = df[df.subjectID.isin(training_subjects)]\n", - "test_df = df[df.subjectID.isin(test_subjects)]\n", - "print(train_df.shape, test_df.shape)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "857c0ffd", - "metadata": {}, - "outputs": [], - "source": [ - "params = calculate_mad_params(train_df, au_columns)\n", - "\n", - "# Step 2: Apply filter consistently\n", - "train_outlier_removed = apply_mad_filter(train_df, params, threshold=7)\n", - "test_outlier_removed = apply_mad_filter(test_df, params, threshold=7)\n", - "print(train_outlier_removed.shape, test_outlier_removed.shape)" - ] - }, - { - "cell_type": "markdown", - "id": "f9c5b562", - "metadata": {}, - "source": [ - "Normalisierung der Daten" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "162163ae", - "metadata": {}, - "outputs": [], - "source": [ - "normalizer = scaler.fit_normalizer(train_df, au_columns=au_columns, method='standard', scope='global')\n", - "train_df_normal = scaler.apply_normalizer(train_df, au_columns=au_columns, normalizer_dict=normalizer)\n", - "test_df_normal = scaler.apply_normalizer(test_df, au_columns=au_columns, normalizer_dict=normalizer)" - ] - }, - { - "cell_type": "markdown", - "id": "ec1548c2", - "metadata": {}, - "source": [ - "to do insert group k fold for train_df_normal" - ] - }, - { - "cell_type": "markdown", - "id": "be77010e", - "metadata": {}, - "source": [ - "### AE first" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "462d33eb", - "metadata": {}, - "outputs": [], - "source": [ - "# Beide Klassen für AE und SVM Training\n", - "X_train_full = train_outlier_removed[au_columns].dropna()\n", - "y_train_full = train_outlier_removed.loc[X_train_full.index, 'label'].values\n", - "groups_train = train_outlier_removed.loc[X_train_full.index, 'subjectID'].values\n", - "\n", 
- "print(f\"Training data shape (before balancing): {X_train_full.shape}\")\n", - "print(f\"Label distribution (before balancing): {pd.Series(y_train_full).value_counts()}\")\n", - "\n", - "# Test data\n", - "X_test = test_outlier_removed[au_columns].dropna()\n", - "y_test = test_outlier_removed.loc[X_test.index, 'label'].values\n", - "\n", - "print(f\"Test data shape: {X_test.shape}\")\n", - "print(f\"Label distribution in test: {pd.Series(y_test).value_counts()}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dc757b7d", - "metadata": {}, - "outputs": [], - "source": [ - "# Class balancing durch Undersampling der Mehrheitsklasse\n", - "from sklearn.utils import resample\n", - "\n", - "# Separate nach Labels\n", - "X_train_class0 = X_train_full[y_train_full == 0]\n", - "X_train_class1 = X_train_full[y_train_full == 1]\n", - "groups_class0 = groups_train[y_train_full == 0]\n", - "groups_class1 = groups_train[y_train_full == 1]\n", - "\n", - "print(f\"\\nBefore balancing - Class 0: {len(X_train_class0)}, Class 1: {len(X_train_class1)}\")\n", - "\n", - "# Undersample der Mehrheitsklasse (class 1)\n", - "n_samples = min(len(X_train_class0), len(X_train_class1))\n", - "\n", - "X_class1_downsampled, groups_class1_downsampled = resample(\n", - " X_train_class1, \n", - " groups_class1,\n", - " n_samples=n_samples,\n", - " random_state=42,\n", - " replace=False\n", - ")\n", - "\n", - "# Kombiniere balanced Daten\n", - "X_train_full = pd.concat([X_train_class0, X_class1_downsampled]).reset_index(drop=True)\n", - "y_train_full = np.concatenate([\n", - " np.zeros(len(X_train_class0)),\n", - " np.ones(len(X_class1_downsampled))\n", - "])\n", - "groups_train = np.concatenate([groups_class0, groups_class1_downsampled])\n", - "\n", - "# Shuffle\n", - "shuffle_idx = np.random.permutation(len(X_train_full))\n", - "X_train_full = X_train_full.iloc[shuffle_idx].reset_index(drop=True)\n", - "y_train_full = y_train_full[shuffle_idx]\n", - "groups_train = 
class DifferentiableSVM(tf.keras.layers.Layer):
    """
    Linear SVM head trained with hinge loss.

    The decision function ``w^T x + b`` is built from ordinary trainable
    weights, so gradients of the hinge loss flow back into whatever produced
    the inputs (here: the encoder).

    Parameters
    ----------
    C : float, default 1.0
        Weight of the hinge term relative to the L2 penalty on ``w``.
    """
    def __init__(self, C=1.0, **kwargs):
        super().__init__(**kwargs)
        self.C = C

    def build(self, input_shape):
        # SVM weights: one coefficient per input feature plus a scalar bias
        self.w = self.add_weight(
            shape=(input_shape[-1],),
            initializer='glorot_uniform',
            trainable=True,
            name='svm_w'
        )
        self.b = self.add_weight(
            shape=(1,),
            initializer='zeros',
            trainable=True,
            name='svm_b'
        )

    def call(self, inputs):
        """Return the decision values ``w^T x + b`` with a trailing axis of size 1."""
        decision = tf.reduce_sum(inputs * self.w, axis=1, keepdims=True) + self.b
        return decision

    def compute_loss(self, inputs, labels):
        """
        Hinge loss for SVM: ``C * mean(max(0, 1 - y * (w^T x + b))) + 0.5 * ||w||^2``.

        ``labels`` are expected as 0/1; 0 is mapped to -1 and every other
        value to +1 before the hinge term is evaluated.
        """
        decision = self.call(inputs)

        # Convert labels from 0/1 to -1/+1
        labels_svm = tf.where(labels == 0, -1.0, 1.0)
        labels_svm = tf.cast(labels_svm, tf.float32)
        labels_svm = tf.reshape(labels_svm, (-1, 1))

        # Hinge loss
        hinge_loss = tf.reduce_mean(
            tf.maximum(0.0, 1.0 - labels_svm * decision)
        )

        # L2 regularization
        l2_loss = 0.5 * tf.reduce_sum(tf.square(self.w))

        return self.C * hinge_loss + l2_loss


class JointAESVM(tf.keras.Model):
    """
    Joint Autoencoder + SVM Model with Batch Normalization and Dropout.

    Encoder: Dense(input_dim) -> Dense(hidden_dim) -> Dense(latent_dim),
    each followed by optional BatchNorm and ReLU; dropout follows the first
    two blocks only. Decoder: Dense(latent_dim) -> Dense(hidden_dim) (same
    block layout, both with dropout) -> linear Dense(input_dim). The SVM head
    operates on the encoder output.

    Parameters
    ----------
    input_dim : int
        Number of input features (also the width of the first encoder layer).
    latent_dim, hidden_dim : int
        Widths of the latent and hidden layers.
    ae_weight, svm_weight : float
        Weights of the reconstruction and SVM terms in the total loss.
    svm_C : float
        ``C`` of the SVM head.
    reg : float
        L2 factor attached to every Dense kernel. It only takes effect when
        the training loop adds ``model.losses`` to the objective.
    use_batchnorm : bool
        Insert BatchNormalization before each ReLU.
    dropout_rate : float
        Dropout rate; values <= 0 disable dropout.
    """
    def __init__(self, input_dim, latent_dim=5, hidden_dim=16, ae_weight=1.0,
                 svm_weight=1.0, svm_C=1.0, reg=0.0001,
                 use_batchnorm=True, dropout_rate=0.3, **kwargs):
        super().__init__(**kwargs)

        self.ae_weight = ae_weight
        self.svm_weight = svm_weight
        self.use_batchnorm = use_batchnorm
        self.dropout_rate = dropout_rate

        # Layers are created strictly in forward order so that the variable
        # order (and thus saved weight files) is unaffected by the helper.
        encoder_layers = (
            self._dense_block(input_dim, reg, use_batchnorm, dropout_rate)
            + self._dense_block(hidden_dim, reg, use_batchnorm, dropout_rate)
            # No dropout on the latent layer!
            + self._dense_block(latent_dim, reg, use_batchnorm, 0.0)
        )
        self.encoder = tf.keras.Sequential(encoder_layers, name='encoder')

        decoder_layers = (
            self._dense_block(latent_dim, reg, use_batchnorm, dropout_rate)
            + self._dense_block(hidden_dim, reg, use_batchnorm, dropout_rate)
            + [tf.keras.layers.Dense(
                input_dim,
                activation='linear',
                kernel_regularizer=tf.keras.regularizers.l2(reg)
            )]
        )
        self.decoder = tf.keras.Sequential(decoder_layers, name='decoder')

        # SVM Layer
        self.svm = DifferentiableSVM(C=svm_C, name='svm')

    @staticmethod
    def _dense_block(units, reg, use_batchnorm, dropout_rate):
        """Build Dense -> [BatchNorm] -> ReLU -> [Dropout] as a list of layers."""
        block = [tf.keras.layers.Dense(
            units,
            activation=None,
            kernel_regularizer=tf.keras.regularizers.l2(reg)
        )]
        if use_batchnorm:
            block.append(tf.keras.layers.BatchNormalization())
        block.append(tf.keras.layers.Activation('relu'))
        if dropout_rate > 0:
            block.append(tf.keras.layers.Dropout(dropout_rate))
        return block

    def call(self, inputs, training=False):
        """Return ``(reconstruction, svm_decision, latent_code)`` for ``inputs``."""
        encoded = self.encoder(inputs, training=training)
        decoded = self.decoder(encoded, training=training)
        svm_output = self.svm(encoded)

        return decoded, svm_output, encoded

    def compute_loss(self, x, y_true):
        """
        Run a training-mode forward pass and return
        ``(total_loss, reconstruction_loss, svm_loss)``.

        ``total_loss`` is ``ae_weight * MSE(x, reconstruction) + svm_weight *
        svm_loss``; kernel regularization losses are NOT included here and
        have to be added by the caller via ``self.losses``.

        NOTE(review): this name shadows ``tf.keras.Model.compute_loss`` with a
        different signature, so the built-in ``fit()`` path presumably cannot
        be used with this model - confirm against the installed Keras version.
        """
        x_reconstructed, svm_decision, encoded = self(x, training=True)

        reconstruction_loss = tf.reduce_mean(
            tf.square(x - x_reconstructed)
        )

        svm_loss = self.svm.compute_loss(encoded, y_true)

        total_loss = (self.ae_weight * reconstruction_loss +
                      self.svm_weight * svm_loss)

        return total_loss, reconstruction_loss, svm_loss

print("Joint AE-SVM Model class defined (with BatchNorm + Dropout)")


def train_joint_model(X_train, y_train, groups, model_params,
                      epochs=200, batch_size=64, learning_rate=0.0001,
                      use_batchnorm=True, dropout_rate=0.3):
    """
    Train a fresh ``JointAESVM`` on the given data with a custom Adam loop.

    Parameters
    ----------
    X_train : pandas.DataFrame or array-like, shape (n_samples, n_features)
        Training features.
    y_train : array-like, shape (n_samples,)
        Labels, 0 or 1.
    groups : array-like
        Subject IDs of the samples. Accepted so call sites can pass the fold's
        groups along, but not used by the training itself.
    model_params : dict
        Must provide ``latent_dim``, ``hidden_dim``, ``ae_weight``,
        ``svm_weight``, ``svm_C`` and ``reg``.
    epochs, batch_size, learning_rate : training hyperparameters.
    use_batchnorm, dropout_rate : forwarded to ``JointAESVM``.

    Returns
    -------
    model : JointAESVM
        The trained model.
    history : dict
        Per-epoch batch means under ``'total_loss'``, ``'recon_loss'`` and
        ``'svm_loss'``. ``'total_loss'`` is the weighted data loss without the
        kernel regularization term.

    Raises
    ------
    ValueError
        If ``X_train`` contains no samples.
    """
    # Accept DataFrames as well as plain arrays.
    features = np.asarray(X_train, dtype=np.float32)
    targets = np.asarray(y_train, dtype=np.float32)
    n_samples = len(features)
    if n_samples == 0:
        # Otherwise the epoch averages below would divide by zero batches.
        raise ValueError("train_joint_model received an empty training set.")

    # Build model
    model = JointAESVM(
        input_dim=features.shape[1],
        latent_dim=model_params['latent_dim'],
        hidden_dim=model_params['hidden_dim'],
        ae_weight=model_params['ae_weight'],
        svm_weight=model_params['svm_weight'],
        svm_C=model_params['svm_C'],
        reg=model_params['reg'],
        use_batchnorm=use_batchnorm,
        dropout_rate=dropout_rate
    )

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    history = {
        'total_loss': [],
        'recon_loss': [],
        'svm_loss': []
    }

    X_train_tf = tf.constant(features, dtype=tf.float32)
    y_train_tf = tf.constant(targets, dtype=tf.float32)

    dataset = tf.data.Dataset.from_tensor_slices((X_train_tf, y_train_tf))
    dataset = dataset.shuffle(buffer_size=min(10000, n_samples),
                              reshuffle_each_iteration=True)
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)

    @tf.function
    def train_step(x_batch, y_batch):
        with tf.GradientTape() as tape:
            total_loss, recon_loss, svm_loss = model.compute_loss(x_batch, y_batch)
            # A custom loop has to add the layer regularization losses itself;
            # without this the `reg` hyperparameter would have no effect.
            objective = total_loss + sum(model.losses)

        gradients = tape.gradient(objective, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        # Report the data terms only so histories stay comparable across `reg`.
        return total_loss, recon_loss, svm_loss

    for epoch in range(epochs):
        epoch_loss = 0.0
        epoch_recon = 0.0
        epoch_svm = 0.0
        n_batches = 0

        for x_batch, y_batch in dataset:
            total_loss, recon_loss, svm_loss = train_step(x_batch, y_batch)

            epoch_loss += total_loss.numpy()
            epoch_recon += recon_loss.numpy()
            epoch_svm += svm_loss.numpy()
            n_batches += 1

        history['total_loss'].append(epoch_loss / n_batches)
        history['recon_loss'].append(epoch_recon / n_batches)
        history['svm_loss'].append(epoch_svm / n_batches)

        if (epoch + 1) % 25 == 0:
            print(f"Epoch {epoch+1}/{epochs} - "
                  f"Total: {history['total_loss'][-1]:.4f}, "
                  f"Recon: {history['recon_loss'][-1]:.4f}, "
                  f"SVM: {history['svm_loss'][-1]:.4f}")

    return model, history

print("Training function defined (with Dropout)")
def evaluate_model(model, X, y):
    """
    Score a joint model on labelled data.

    Parameters
    ----------
    model : callable
        Called as ``model(X, training=False)``; must return a 3-tuple whose
        second element holds the SVM decision values.
    X : pandas.DataFrame or array-like, shape (n_samples, n_features)
        Features to predict on; evaluated in a single forward pass.
    y : array-like, shape (n_samples,)
        True labels, 0 or 1.

    Returns
    -------
    bal_accuracy : float
        Balanced accuracy of the predictions (not plain accuracy).
    y_pred : numpy.ndarray of int
        Predicted labels: 1 where the decision value is > 0, else 0.
    """
    # np.asarray lets callers pass a DataFrame or an array.
    X_tf = tf.constant(np.asarray(X), dtype=tf.float32)
    _, svm_decision, _ = model(X_tf, training=False)

    # Predict: decision > 0 -> class 1, else class 0
    y_pred = (svm_decision.numpy().flatten() > 0).astype(int)

    bal_accuracy = balanced_accuracy_score(y, y_pred)
    return bal_accuracy, y_pred

print("Evaluation function defined")
y_train_array[train_idx]\n", - " X_fold_val = X_train_array[val_idx]\n", - " y_fold_val = y_train_array[val_idx]\n", - " \n", - " # Train model\n", - " model, history = train_joint_model(\n", - " X_fold_train, y_fold_train, groups_train[train_idx],\n", - " model_params=params,\n", - " epochs=100,\n", - " batch_size=BATCH_SIZE,\n", - " learning_rate=0.0001,\n", - " use_batchnorm=True, # HINZUFÜGEN!\n", - " dropout_rate=0.3)\n", - " \n", - " # Validate\n", - " val_bal_acc, _ = evaluate_model(model, X_fold_val, y_fold_val)\n", - " fold_scores.append(val_bal_acc)\n", - " print(f\"Fold {fold + 1} Validation balanced Accuracy: {val_bal_acc:.4f}\")\n", - " \n", - " mean_score = np.mean(fold_scores)\n", - " std_score = np.std(fold_scores)\n", - " \n", - " result = {\n", - " **params,\n", - " 'mean_cv_bal_accuracy': mean_score,\n", - " 'std_cv_bal_accuracy': std_score\n", - " }\n", - " all_results.append(result)\n", - " \n", - " print(f\"\\nMean CV bal. Accuracy: {mean_score:.4f} ± {std_score:.4f}\")\n", - " \n", - " if mean_score > best_score:\n", - " best_score = mean_score\n", - " best_params = params\n", - " print(\"*** NEW BEST PARAMETERS ***\")\n", - "\n", - "print(f\"\\n{'='*60}\")\n", - "print(\"GRID SEARCH COMPLETED\")\n", - "print(f\"{'='*60}\")\n", - "print(f\"Best parameters: {best_params}\")\n", - "print(f\"Best CV bal. 
accuracy: {best_score:.4f}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "91182740", - "metadata": {}, - "outputs": [], - "source": [ - "results_df = pd.DataFrame(all_results)\n", - "results_df = results_df.sort_values('mean_cv_bal_accuracy', ascending=False)\n", - "\n", - "print(\"\\nTop 10 configurations:\")\n", - "print(results_df.head(10))\n", - "\n", - "# Plot\n", - "plt.figure(figsize=(12, 6))\n", - "plt.barh(range(min(10, len(results_df))), \n", - " results_df['mean_cv_bal_accuracy'].head(10))\n", - "plt.yticks(range(min(10, len(results_df))), \n", - " [f\"Config {i+1}\" for i in range(min(10, len(results_df)))])\n", - "plt.xlabel('Mean CV bal Accuracy')\n", - "plt.title('Top 10 Configurations')\n", - "plt.tight_layout()\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c6769a88", - "metadata": {}, - "outputs": [], - "source": [ - "print(\"Training final model on all training data...\")\n", - "print(f\"Best parameters: {best_params}\")\n", - "\n", - "final_model, final_history = train_joint_model(\n", - " X_train_full, y_train_full, groups_train,\n", - " model_params=best_params,\n", - " epochs=400, # 2000 ist zu viel, 300 reicht!\n", - " batch_size=BATCH_SIZE,\n", - " learning_rate=0.0001,\n", - " use_batchnorm=True, # HINZUFÜGEN!\n", - " dropout_rate=0.3 # HINZUFÜGEN!\n", - ")\n", - "\n", - "print(\"\\nFinal model training completed!\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6dfeaa54", - "metadata": {}, - "outputs": [], - "source": [ - "fig, axes = plt.subplots(1, 3, figsize=(15, 4))\n", - "\n", - "axes[0].plot(final_history['total_loss'])\n", - "axes[0].set_title('Total Loss')\n", - "axes[0].set_xlabel('Epoch')\n", - "axes[0].set_ylabel('Loss')\n", - "axes[0].grid(True, alpha=0.3)\n", - "\n", - "axes[1].plot(final_history['recon_loss'])\n", - "axes[1].set_title('Reconstruction Loss')\n", - "axes[1].set_xlabel('Epoch')\n", - 
"axes[1].set_ylabel('Loss')\n", - "axes[1].grid(True, alpha=0.3)\n", - "\n", - "axes[2].plot(final_history['svm_loss'])\n", - "axes[2].set_title('SVM Loss')\n", - "axes[2].set_xlabel('Epoch')\n", - "axes[2].set_ylabel('Loss')\n", - "axes[2].grid(True, alpha=0.3)\n", - "\n", - "plt.tight_layout()\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b14b63f3", - "metadata": {}, - "outputs": [], - "source": [ - "# Get predictions\n", - "test_acc, y_pred = evaluate_model(final_model, X_test.values, y_test)\n", - "\n", - "# Get SVM decision values for ROC-AUC\n", - "X_test_tf = tf.constant(X_test.values, dtype=tf.float32)\n", - "_, svm_decision, _ = final_model(X_test_tf, training=False)\n", - "y_pred_decision = svm_decision.numpy().flatten()\n", - "\n", - "# Metrics\n", - "print(\"=\" * 50)\n", - "print(\"TEST SET EVALUATION\")\n", - "print(\"=\" * 50)\n", - "print(f\"\\nAccuracy: {accuracy_score(y_test, y_pred):.4f}\")\n", - "print(f\"Precision: {precision_score(y_test, y_pred):.4f}\")\n", - "print(f\"Recall: {recall_score(y_test, y_pred):.4f}\")\n", - "print(f\"F1-Score: {f1_score(y_test, y_pred):.4f}\")\n", - "\n", - "# ROC-AUC (decision values as probability proxy)\n", - "decision_scaled = MinMaxScaler().fit_transform(y_pred_decision.reshape(-1, 1)).flatten()\n", - "print(f\"ROC-AUC: {roc_auc_score(y_test, decision_scaled):.4f}\")\n", - "\n", - "print(\"\\nConfusion Matrix:\")\n", - "cm = confusion_matrix(y_test, y_pred, normalize='true')\n", - "print(cm)\n", - "\n", - "print(\"\\nClassification Report:\")\n", - "print(classification_report(y_test, y_pred))\n", - "\n", - "# Visualize Confusion Matrix\n", - "fig, ax = plt.subplots(figsize=(8, 6))\n", - "disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Low Load (0)', 'High Load (1)'])\n", - "disp.plot(cmap='Blues', ax=ax, colorbar=True)\n", - "ax.set_title('Confusion Matrix - Test Set', fontsize=14, fontweight='bold')\n", - "plt.tight_layout()\n", - 
plt.show()

# %% Persist the trained model, its configuration and the grid-search results
import json
from datetime import datetime


def _json_default(obj):
    """Fallback encoder for `json.dump`: turn NumPy scalars/arrays into plain Python.

    Hyper-parameter dicts frequently carry `np.int64` / `np.float64` values,
    which the standard `json` encoder rejects.
    """
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


# Timestamp for unique file names
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# 1. Save model weights
# Keras 3 only accepts weight files ending in `.weights.h5`; the suffix still
# ends in `.h5`, so older tf.keras versions keep writing HDF5 as before.
weights_path = f'joint_ae_svm_{timestamp}.weights.h5'
final_model.save_weights(weights_path)
print(f"Model weights saved as '{weights_path}'")

# 2. Save encoder separately
encoder_path = f'encoder_joint_{timestamp}.keras'
final_model.encoder.save(encoder_path)
print(f"Encoder saved as '{encoder_path}'")

# 3. Save best parameters + model architecture info
model_config = {
    'best_params': best_params,
    'input_dim': X_train_full.shape[1],
    'au_columns': au_columns,
    'timestamp': timestamp,
    'training_samples': len(X_train_full),
    'test_samples': len(X_test)
}

config_pkl_path = f'model_config_joint_{timestamp}.pkl'
with open(config_pkl_path, 'wb') as f:
    pickle.dump(model_config, f)
print(f"Model config saved as '{config_pkl_path}'")

# 4. Also store the config as human-readable JSON
config_json_path = f'model_config_joint_{timestamp}.json'
json_data = {k: v for k, v in model_config.items() if k != 'au_columns'}
json_data['au_columns'] = list(au_columns)  # plain list is JSON-serializable
with open(config_json_path, 'w') as f:
    # `default` also converts NumPy values nested inside `best_params`.
    json.dump(json_data, f, indent=2, default=_json_default)
print(f"Model config (JSON) saved as '{config_json_path}'")

# 5. Save SVM weights separately
svm_weights_path = f'svm_weights_joint_{timestamp}.pkl'
svm_weights = {
    'w': final_model.svm.w.numpy(),
    'b': final_model.svm.b.numpy()
}
with open(svm_weights_path, 'wb') as f:
    pickle.dump(svm_weights, f)
print(f"SVM weights saved as '{svm_weights_path}'")

# 6. Save Grid Search Results
results_path = f'grid_search_results_{timestamp}.pkl'
with open(results_path, 'wb') as f:
    pickle.dump(all_results, f)
print(f"Grid search results saved as '{results_path}'")

print(f"\n✓ All models and configs saved with timestamp: {timestamp}")
print(f"\nTo load this model later, use:")
print(f" load_joint_model('{weights_path}', '{config_pkl_path}')")

# %% [markdown]
# * maybe try a (sequential) SVM-AE pipeline after all?
# * simply try layer sizes 20 / 13 / 5
# * add label
# * use CT's MAD or adjust the value; possibly compare how often each label
#   occurs before and after.
# --> adopt the labelling step from CT  (end of the markdown note above)

# %%
def load_joint_model(weights_path='joint_ae_svm.weights.h5',
                     config_path='model_config_joint.pkl'):
    """
    Load the trained joint AE-SVM model.

    Args:
        weights_path: Weight file written by `Model.save_weights`.
        config_path: Pickled config dict; must contain `best_params`
            (with keys latent_dim, hidden_dim, ae_weight, svm_weight,
            svm_C, reg) and `input_dim`.

    Returns:
        (model, config): the rebuilt `JointAESVM` with restored weights and
        the unpickled config dict.
    """
    # NOTE: pickle executes arbitrary code on load -- only open config files
    # that were produced by this notebook / a trusted source.
    with open(config_path, 'rb') as f:
        config = pickle.load(f)

    params = config['best_params']
    input_dim = config['input_dim']

    # Rebuild model with the same architecture the weights were trained with
    model = JointAESVM(
        input_dim=input_dim,
        latent_dim=params['latent_dim'],
        hidden_dim=params['hidden_dim'],
        ae_weight=params['ae_weight'],
        svm_weight=params['svm_weight'],
        svm_C=params['svm_C'],
        reg=params['reg']
    )

    # Dummy forward pass so that all variables exist before loading weights
    dummy_input = tf.random.normal((1, input_dim))
    _ = model(dummy_input, training=False)

    # Load weights
    model.load_weights(weights_path)
    print(f"✓ Model loaded from {weights_path}")

    return model, config

print("Load function defined")

# %% Smoke test: reload the model that was just saved
# The save cell writes timestamped files, so the function defaults would point
# at files that do not exist; pass the paths produced by the save cell instead.
loaded_model, loaded_config = load_joint_model(weights_path, config_pkl_path)

# Test prediction
test_sample = X_test.values[:5]
test_sample_tf = tf.constant(test_sample, dtype=tf.float32)
_, svm_out, encoded = loaded_model(test_sample_tf, training=False)

print("Test prediction successful!")
print(f"Encoded shape: {encoded.shape}")
print(f"SVM decisions: {svm_out.numpy().flatten()}")
# %%
def predict_flexible(model, X_data, use_gpu=None):
    """
    Run the joint model on CPU or GPU.

    Args:
        model: The loaded model; called as `model(x, training=False)` and
            expected to return (reconstruction, svm_decision, encoded).
        X_data: Input data (numpy array or DataFrame).
        use_gpu: True/False/None (None = auto-detect). If True is requested
            but TensorFlow sees no GPU, the CPU is used instead.

    Returns:
        predictions, decision_values, encoded_features
    """
    gpu_available = len(tf.config.list_physical_devices('GPU')) > 0

    if use_gpu is None:
        # Auto-detect
        use_gpu = gpu_available
    elif use_gpu and not gpu_available:
        # Without this the log line would claim '/GPU:0' while TensorFlow
        # silently places the ops elsewhere (or raises, depending on settings).
        print("GPU requested but none available - falling back to CPU")
        use_gpu = False

    device = '/GPU:0' if use_gpu else '/CPU:0'
    print(f"Running prediction on: {device}")

    if isinstance(X_data, pd.DataFrame):
        X_data = X_data.values

    with tf.device(device):
        X_tf = tf.constant(X_data, dtype=tf.float32)
        _, svm_decision, encoded = model(X_tf, training=False)

    decisions = svm_decision.numpy().flatten()
    # Predictions: decision > 0 -> class 1
    y_pred = (decisions > 0).astype(int)

    return y_pred, decisions, encoded.numpy()

# Test on CPU
y_pred_cpu, decisions_cpu, encoded_cpu = predict_flexible(
    loaded_model, X_test.values[:10], use_gpu=False
)
print(f"CPU Predictions: {y_pred_cpu}")

# Test on GPU (falls back to CPU when no GPU is present)
y_pred_gpu, decisions_gpu, encoded_gpu = predict_flexible(
    loaded_model, X_test.values[:10], use_gpu=True
)
print(f"GPU Predictions: {y_pred_gpu}")

# Verify that both runs agree
print(f"\nResults identical: {np.allclose(decisions_cpu, decisions_gpu)}")

# %% Diagnosis: what does the model learn?
print("=== MODEL DIAGNOSIS ===\n")

# Check SVM weights
print("SVM Weights (w):", final_model.svm.w.numpy()[:10], "...")
print("SVM Bias (b):", final_model.svm.b.numpy())
print("SVM weight norm:", np.linalg.norm(final_model.svm.w.numpy()))

# Check predictions distribution
X_train_tf = tf.constant(X_train_full.values, dtype=tf.float32)
_, train_decisions, train_encoded = final_model(X_train_tf, training=False)
train_decisions = train_decisions.numpy().flatten()
train_encoded = train_encoded.numpy()  # convert to NumPy so boolean masks work below

print(f"\nTraining set decisions:")
print(f" Min: {train_decisions.min():.4f}")
print(f" Max: {train_decisions.max():.4f}")
print(f" Mean: {train_decisions.mean():.4f}")
print(f" Std: {train_decisions.std():.4f}")

train_pred = (train_decisions > 0).astype(int)
print(f"\nTraining predictions distribution:")
print(pd.Series(train_pred).value_counts())
print(f"Training balanced accuracy: {balanced_accuracy_score(y_train_full, train_pred):.4f}")

# Check encoded features
print(f"\nEncoded features stats:")
print(f" Mean: {train_encoded.mean():.4f}")
print(f" Std: {train_encoded.std():.4f}")
print(f" Min: {train_encoded.min():.4f}")
print(f" Max: {train_encoded.max():.4f}")

# Check per class
print(f"\nEncoded features per class:")
for label in [0, 1]:
    mask = y_train_full == label
    enc_class = train_encoded[mask]
    print(f" Class {label}: mean={enc_class.mean():.4f}, std={enc_class.std():.4f}")

# Test set diagnosis
print("\n=== TEST SET DIAGNOSIS ===\n")
X_test_tf = tf.constant(X_test.values, dtype=tf.float32)
_, test_decisions, test_encoded = final_model(X_test_tf, training=False)
test_decisions = test_decisions.numpy().flatten()
test_encoded = test_encoded.numpy()

print(f"Test set decisions:")
print(f" Min: {test_decisions.min():.4f}")
print(f" Max: {test_decisions.max():.4f}")
print(f" Mean: {test_decisions.mean():.4f}")
print(f" Std: {test_decisions.std():.4f}")

test_pred = (test_decisions > 0).astype(int)
print(f"\nTest predictions distribution:")
print(pd.Series(test_pred).value_counts())
print(f"Test balanced accuracy: {balanced_accuracy_score(y_test, test_pred):.4f}")

# Compare train vs test encoded features
print(f"\n=== TRAIN vs TEST Encoded Features ===")
print(f"Train encoded - Mean: {train_encoded.mean():.4f}, Std: {train_encoded.std():.4f}")
print(f"Test encoded - Mean: {test_encoded.mean():.4f}, Std: {test_encoded.std():.4f}")

# (end of AEdannSVM.ipynb -- kernel: Python 3 (ipykernel), Python 3.12.10)

# =============================================================================
# Notebook: model_training/VAE_SVM/vaesvm.ipynb
# =============================================================================

# %% [markdown]
# ### Imports

# %%
import pandas as pd
import numpy as np
from pathlib import Path
import sys
import os

# Make the parent directory importable so the project package can be found.
base_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.append(base_dir)
print(base_dir)

from Fahrsimulator_MSY2526_AI.model_training.tools import evaluation_tools, scaler, mad_outlier_removal
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import OneClassSVM
from sklearn.model_selection import GridSearchCV, KFold, ParameterGrid, train_test_split
import matplotlib.pyplot as plt
import tensorflow as tf
import pickle
from sklearn.metrics import (roc_auc_score, accuracy_score, precision_score,
                             recall_score, f1_score, confusion_matrix, classification_report)

# %% [markdown]
# ### load Dataset

# %%
dataset_path = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/first_AU_dataset/output_windowed.parquet")

# %%
df = pd.read_parquet(path=dataset_path)

# %% [markdown]
# ### Load Performance data and Subject Split

# %%
performance_path = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/subject_performance/3new_au_performance.csv")
performance_df = pd.read_csv(performance_path)

# %%
# Take the subject IDs from the main dataset
subjects_from_df = df["subjectID"].unique()

# Performance subset restricted to subjects that are present in the dataset
perf_filtered = performance_df[
    performance_df["subjectID"].isin(subjects_from_df)
][["subjectID", "overall_score"]]

# Merge: only subjects that occur both in df and in the performance CSV
# (inner merge keeps only subjects present in both tables)
merged = (
    pd.DataFrame({"subjectID": subjects_from_df})
    .merge(perf_filtered, on="subjectID", how="inner")
)

# Make sure no scores are missing
if merged["overall_score"].isna().any():
    raise ValueError("Es fehlen Score-Werte für manche Subjects.")

# %% Score-balanced subject split (~1/3 test, ~2/3 train)
# A dedicated, seeded generator makes the train/test subject assignment
# reproducible across kernel restarts; change SPLIT_SEED to draw another split.
SPLIT_SEED = 42
rng = np.random.default_rng(SPLIT_SEED)

merged_sorted = merged.sort_values("overall_score", ascending=False).reset_index(drop=True)

scores = merged_sorted["overall_score"].values
n_total = len(merged_sorted)
n_small = n_total // 3
n_large = n_total - n_small

if n_small == 0:
    # With fewer than three subjects the small group would be empty and the
    # mean/choice calls below would fail in a less obvious way.
    raise ValueError(f"Need at least 3 subjects for the split, got {n_total}.")

# Step 1: random initial assignment
idx = np.arange(n_total)
rng.shuffle(idx)

small_idx = idx[:n_small]
large_idx = idx[n_small:]

def score_diff(small_idx, large_idx):
    """Absolute difference between the mean scores of the two index groups."""
    return abs(scores[small_idx].mean() - scores[large_idx].mean())

diff = score_diff(small_idx, large_idx)
threshold = 0.01   # stop once the group means differ by at most this much
max_iter = 100     # upper bound on attempted swaps
count = 0

# Step 2: random swaps until the difference is small enough
while diff > threshold and count < max_iter:
    # Pick one random member of each group
    si = rng.choice(small_idx)
    li = rng.choice(large_idx)

    # Try the swap on copies so it can be discarded
    new_small_idx = small_idx.copy()
    new_large_idx = large_idx.copy()

    new_small_idx[new_small_idx == si] = li
    new_large_idx[new_large_idx == li] = si

    # Difference after the swap
    new_diff = score_diff(new_small_idx, new_large_idx)

    # Accept the swap only if it improves the balance
    if new_diff < diff:
        small_idx = new_small_idx
        large_idx = new_large_idx
        diff = new_diff

    count += 1

# Final groups
group_small = merged_sorted.loc[small_idx].reset_index(drop=True)
group_large = merged_sorted.loc[large_idx].reset_index(drop=True)

print("Finale Score-Differenz:", diff)
print("Größe Gruppe 1:", len(group_small))
print("Größe Gruppe 2:", len(group_large))

# %%
group_large['overall_score'].mean()

# %%
group_small['overall_score'].mean()

# %%
training_subjects = group_large['subjectID'].values
test_subjects = group_small['subjectID'].values
print(training_subjects)
print(test_subjects)

# %% [markdown]
# ### Data cleaning with mad

# %%
# Settings
threshold_mad = 5
column_praefix ='AU'

au_columns = [col for col in df.columns if col.startswith(column_praefix)]
cleaned_df = mad_outlier_removal.mad_outlier_removal(df,columns=au_columns, threshold=threshold_mad)
print(cleaned_df.shape)
print(df.shape)

# %% [markdown]
# #### TO DO
#  * pipeline of autoencoder and SVM
#  * group k fold
#  * check the AE, document the loss

# %%
### Variational Autoencoder with Classifier Head
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Model
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report
)
import matplotlib.pyplot as plt
from collections import defaultdict

# ============================================================================
# 1. CREATE LABELS
# ============================================================================

# Row masks on the MAD-cleaned frame, named once and combined below.
is_baseline = cleaned_df["PHASE"] == "baseline"
is_nback = cleaned_df["STUDY"] == "n-back"
is_kdrive = cleaned_df["STUDY"] == "k-drive"

# Low workload: every baseline phase + n-back levels 1 and 4
low_mask = is_baseline | (is_nback & ~is_baseline & cleaned_df["LEVEL"].isin([1, 4]))
low_all = cleaned_df[low_mask].assign(label=0)
print(f"Low workload samples: {len(low_all)}")

# High workload n-back: levels 2, 3, 5, 6 in the train/test phases
high_nback_mask = (
    is_nback
    & cleaned_df["LEVEL"].isin([2, 3, 5, 6])
    & cleaned_df["PHASE"].isin(["train", "test"])
)
high_nback = cleaned_df[high_nback_mask].assign(label=1)
print(f"High n-back samples: {len(high_nback)}")

# High workload k-drive: everything that is not a baseline phase
high_kdrive = cleaned_df[is_kdrive & ~is_baseline].assign(label=1)
print(f"High k-drive samples: {len(high_kdrive)}")

# Combine all high workload
high_all = pd.concat([high_nback, high_kdrive])
print(f"Total high workload samples: {len(high_all)}")

# Complete labeled dataset (low rows first, then high rows)
labeled_df = pd.concat([low_all, high_all]).reset_index(drop=True)
print(f"\nTotal labeled samples: {len(labeled_df)}")
print(f"Class distribution:\n{labeled_df['label'].value_counts()}")

# ============================================================================
# 2. TRAIN/TEST SPLIT BY SUBJECTS
# ============================================================================

in_train = labeled_df['subjectID'].isin(training_subjects)
in_test = labeled_df['subjectID'].isin(test_subjects)
train_df = labeled_df[in_train].copy()
test_df = labeled_df[in_test].copy()

print(f"\nTraining subjects: {training_subjects}")
print(f"Test subjects: {test_subjects}")
print(f"Train samples: {len(train_df)}, Test samples: {len(test_df)}")

# Extract features and labels
au_columns = [name for name in labeled_df.columns if name.startswith('AU')]
print(f"\nUsing {len(au_columns)} AU features: {au_columns}")

X_train, y_train = train_df[au_columns].values, train_df['label'].values
groups_train = train_df['subjectID'].values

X_test, y_test = test_df[au_columns].values, test_df['label'].values

# Normalize features: statistics are fitted on the training subjects only.
# NOTE(review): this rebinds `scaler`, which the first import cell also uses
# for the project's `tools.scaler` module -- confirm nothing later needs the module.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"\nTrain class distribution: {np.bincount(y_train)}")
print(f"Test class distribution: {np.bincount(y_test)}")

# ============================================================================
# 3.
# VAE WITH CLASSIFIER HEAD MODEL
# ============================================================================

class Sampling(layers.Layer):
    """Reparameterization trick for the VAE: z = mean + exp(0.5 * log_var) * eps."""

    def call(self, inputs):
        z_mean, z_log_var = inputs
        batch = tf.shape(z_mean)[0]
        dim = tf.shape(z_mean)[1]
        # Draw the noise in the dtype of z_mean so the arithmetic below does
        # not mix dtypes (identical to the float32 default in the usual case).
        epsilon = tf.random.normal(shape=(batch, dim), dtype=z_mean.dtype)
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon


def build_vae_classifier(input_dim, latent_dim, encoder_dims=(32, 16),
                         decoder_dims=(16, 32), classifier_dims=(16,)):
    """
    Build a VAE with an additional classifier head on the latent space.

    Args:
        input_dim: Number of input features (20 AUs)
        latent_dim: Dimension of latent space (2-5)
        encoder_dims: Hidden layer sizes for encoder (any sequence of ints)
        decoder_dims: Hidden layer sizes for decoder
        classifier_dims: Hidden layer sizes for classifier

    Returns:
        (model, encoder, decoder, classifier) where
        * encoder maps inputs -> [z_mean, z_log_var, z],
        * decoder maps a latent vector -> reconstruction (linear output),
        * classifier maps a latent vector -> sigmoid probability,
        * model chains encoder -> (decoder, classifier) on the sampled z.

    The defaults are tuples rather than lists so that no mutable object is
    shared between calls; lists passed by callers work unchanged.
    """

    # ---- ENCODER ----
    encoder_inputs = keras.Input(shape=(input_dim,), name='encoder_input')
    x = encoder_inputs

    for i, dim in enumerate(encoder_dims):
        x = layers.Dense(dim, activation='relu', name=f'encoder_dense_{i}')(x)
        x = layers.BatchNormalization(name=f'encoder_bn_{i}')(x)
        x = layers.Dropout(0.2, name=f'encoder_dropout_{i}')(x)

    z_mean = layers.Dense(latent_dim, name='z_mean')(x)
    z_log_var = layers.Dense(latent_dim, name='z_log_var')(x)
    z = Sampling()([z_mean, z_log_var])

    encoder = Model(encoder_inputs, [z_mean, z_log_var, z], name='encoder')

    # ---- DECODER ----
    latent_inputs = keras.Input(shape=(latent_dim,), name='latent_input')
    x = latent_inputs

    for i, dim in enumerate(decoder_dims):
        x = layers.Dense(dim, activation='relu', name=f'decoder_dense_{i}')(x)
        x = layers.BatchNormalization(name=f'decoder_bn_{i}')(x)

    decoder_outputs = layers.Dense(input_dim, activation='linear', name='decoder_output')(x)
    decoder = Model(latent_inputs, decoder_outputs, name='decoder')

    # ---- CLASSIFIER HEAD ----
    # Shares the latent input tensor with the decoder but is its own Model.
    x = latent_inputs
    for i, dim in enumerate(classifier_dims):
        x = layers.Dense(dim, activation='relu', name=f'classifier_dense_{i}')(x)
        x = layers.Dropout(0.3, name=f'classifier_dropout_{i}')(x)

    classifier_output = layers.Dense(1, activation='sigmoid', name='classifier_output')(x)
    classifier = Model(latent_inputs, classifier_output, name='classifier')

    # ---- FULL MODEL ----
    inputs = keras.Input(shape=(input_dim,), name='vae_input')
    z_mean, z_log_var, z = encoder(inputs)
    reconstructed = decoder(z)
    classification = classifier(z)

    model = Model(inputs, [reconstructed, classification], name='vae_classifier')

    return model, encoder, decoder, classifier

# ============================================================================
# 4.
# CUSTOM TRAINING LOOP WITH COMBINED LOSS
# ============================================================================

class VAEClassifier(keras.Model):
    """Trains encoder, decoder and classifier head jointly.

    Total loss = recon_weight * MSE reconstruction
               + kl_weight    * KL divergence to the unit Gaussian
               + clf_weight   * binary cross-entropy of the classifier head.

    All weights default to 1.0, which reproduces the plain sum of the three terms.
    """

    def __init__(self, encoder, decoder, classifier,
                 recon_weight=1.0, kl_weight=1.0, clf_weight=1.0, **kwargs):
        super().__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        self.classifier = classifier
        self.recon_weight = recon_weight
        self.kl_weight = kl_weight
        self.clf_weight = clf_weight
        self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
        self.reconstruction_loss_tracker = keras.metrics.Mean(name="reconstruction_loss")
        self.kl_loss_tracker = keras.metrics.Mean(name="kl_loss")
        self.classification_loss_tracker = keras.metrics.Mean(name="classification_loss")
        self.accuracy_tracker = keras.metrics.BinaryAccuracy(name="accuracy")

    @property
    def metrics(self):
        # Listing the trackers here lets Keras reset them between epochs.
        return [
            self.total_loss_tracker,
            self.reconstruction_loss_tracker,
            self.kl_loss_tracker,
            self.classification_loss_tracker,
            self.accuracy_tracker,
        ]

    def _compute_losses(self, x, y, training):
        """Forward pass plus all loss terms; shared by train_step and test_step.

        Returns (total, reconstruction, kl, classification_loss, class_probs).
        """
        z_mean, z_log_var, z = self.encoder(x, training=training)
        reconstruction = self.decoder(z, training=training)
        classification = self.classifier(z, training=training)

        # Reconstruction loss (MSE)
        reconstruction_loss = tf.reduce_mean(
            keras.losses.mse(x, reconstruction))

        # KL divergence loss: summed over latent dimensions, averaged over the batch
        kl_loss = -0.5 * tf.reduce_mean(
            tf.reduce_sum(
                1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var),
                axis=1
            )
        )

        # Classification loss (binary crossentropy); y is expanded to match
        # the (batch, 1) classifier output.
        classification_loss = tf.reduce_mean(
            keras.losses.binary_crossentropy(tf.expand_dims(y, -1), classification)
        )

        # Combined loss with weights
        total_loss = (self.recon_weight * reconstruction_loss
                      + self.kl_weight * kl_loss
                      + self.clf_weight * classification_loss)

        return total_loss, reconstruction_loss, kl_loss, classification_loss, classification

    def _update_metrics(self, y, total_loss, reconstruction_loss, kl_loss,
                        classification_loss, classification):
        """Feed one batch into the trackers and return the running results."""
        self.total_loss_tracker.update_state(total_loss)
        self.reconstruction_loss_tracker.update_state(reconstruction_loss)
        self.kl_loss_tracker.update_state(kl_loss)
        self.classification_loss_tracker.update_state(classification_loss)
        self.accuracy_tracker.update_state(y, classification)

        return {
            "total_loss": self.total_loss_tracker.result(),
            "reconstruction_loss": self.reconstruction_loss_tracker.result(),
            "kl_loss": self.kl_loss_tracker.result(),
            "classification_loss": self.classification_loss_tracker.result(),
            "accuracy": self.accuracy_tracker.result(),
        }

    def train_step(self, data):
        x, y = data

        with tf.GradientTape() as tape:
            (total_loss, reconstruction_loss, kl_loss,
             classification_loss, classification) = self._compute_losses(x, y, training=True)

        # Backpropagation
        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))

        return self._update_metrics(y, total_loss, reconstruction_loss, kl_loss,
                                    classification_loss, classification)

    def test_step(self, data):
        x, y = data

        (total_loss, reconstruction_loss, kl_loss,
         classification_loss, classification) = self._compute_losses(x, y, training=False)

        return self._update_metrics(y, total_loss, reconstruction_loss, kl_loss,
                                    classification_loss, classification)

# ============================================================================
# 5. GROUP K-FOLD CROSS-VALIDATION WITH GRID SEARCH
# ============================================================================

# Hyperparameter grid
param_grid = {
    'latent_dim': [2, 5],
    'encoder_dims': [[32, 16], [64, 32]],
    'learning_rate': [0.001, 0.005],
    'batch_size': [32, 64],
}

# Generate all combinations
from itertools import product
keys = param_grid.keys()
values = param_grid.values()
param_combinations = [dict(zip(keys, v)) for v in product(*values)]

print(f"\nTotal hyperparameter combinations: {len(param_combinations)}")

# Group K-Fold setup: folds are split by subject, so no subject appears in
# both the training and the validation part of a fold.
n_splits = 5
gkf = GroupKFold(n_splits=n_splits)

# Store results
cv_results = []

# Grid search with cross-validation
for idx, params in enumerate(param_combinations):
    print(f"\n{'='*80}")
    print(f"Testing combination {idx+1}/{len(param_combinations)}: {params}")
    print(f"{'='*80}")

    fold_results = []

    for fold, (train_idx, val_idx) in enumerate(gkf.split(X_train_scaled, y_train, groups_train)):
        print(f"\nFold {fold+1}/{n_splits}")

        X_fold_train, X_fold_val = X_train_scaled[train_idx], X_train_scaled[val_idx]
        y_fold_train, y_fold_val = y_train[train_idx], y_train[val_idx]

        # Build model
        model, encoder, decoder, classifier = build_vae_classifier(
            input_dim=len(au_columns),
            latent_dim=params['latent_dim'],
            encoder_dims=params['encoder_dims'],
            decoder_dims=list(reversed(params['encoder_dims'])),
            classifier_dims=[16]
        )

        vae_classifier = VAEClassifier(encoder, decoder, classifier)
        vae_classifier.compile(optimizer=keras.optimizers.Adam(params['learning_rate']))

        # Early stopping
        early_stop = keras.callbacks.EarlyStopping(
            monitor='val_total_loss',
            patience=10,
            restore_best_weights=True,
            mode='min'
        )

        # Train
        history = vae_classifier.fit(
            X_fold_train, y_fold_train,
            validation_data=(X_fold_val, y_fold_val),
            epochs=60,
            batch_size=params['batch_size'],
            callbacks=[early_stop],
            verbose=0
        )

        # Evaluate on validation fold using the deterministic latent mean
        z_mean_val, _, _ = encoder.predict(X_fold_val, verbose=0)
        y_pred_proba = classifier.predict(z_mean_val, verbose=0).flatten()
        y_pred = (y_pred_proba > 0.5).astype(int)

        # Report the losses of the best epoch (the one EarlyStopping restores)
        # rather than the last epoch, so they describe the evaluated weights.
        best_epoch = int(np.argmin(history.history['val_total_loss']))

        # ROC-AUC is undefined when a fold contains a single class.
        if len(np.unique(y_fold_val)) > 1:
            fold_auc = roc_auc_score(y_fold_val, y_pred_proba)
        else:
            fold_auc = np.nan

        fold_metrics = {
            'accuracy': accuracy_score(y_fold_val, y_pred),
            'precision': precision_score(y_fold_val, y_pred, zero_division=0),
            'recall': recall_score(y_fold_val, y_pred, zero_division=0),
            'f1': f1_score(y_fold_val, y_pred, zero_division=0),
            'roc_auc': fold_auc,
            'final_recon_loss': history.history['val_reconstruction_loss'][best_epoch],
            'final_kl_loss': history.history['val_kl_loss'][best_epoch],
            'final_class_loss': history.history['val_classification_loss'][best_epoch],
        }

        fold_results.append(fold_metrics)
        print(f" Accuracy: {fold_metrics['accuracy']:.4f}, F1: {fold_metrics['f1']:.4f}, AUC: {fold_metrics['roc_auc']:.4f}")

        # Clear session to free memory
        keras.backend.clear_session()

    # Average across folds (nan-aware for AUC, see guard above)
    avg_results = {
        'params': params,
        'mean_accuracy': np.mean([r['accuracy'] for r in fold_results]),
        'std_accuracy': np.std([r['accuracy'] for r in fold_results]),
        'mean_f1': np.mean([r['f1'] for r in fold_results]),
        'std_f1': np.std([r['f1'] for r in fold_results]),
        'mean_roc_auc': np.nanmean([r['roc_auc'] for r in fold_results]),
        'std_roc_auc': np.nanstd([r['roc_auc'] for r in fold_results]),
        'mean_recon_loss': np.mean([r['final_recon_loss'] for r in fold_results]),
        'mean_kl_loss': np.mean([r['final_kl_loss'] for r in fold_results]),
        'mean_class_loss': np.mean([r['final_class_loss'] for r in fold_results]),
        'fold_results': fold_results
    }

    cv_results.append(avg_results)

    print(f"\nMean CV Accuracy: {avg_results['mean_accuracy']:.4f} ± {avg_results['std_accuracy']:.4f}")
    print(f"Mean CV F1: {avg_results['mean_f1']:.4f} ± {avg_results['std_f1']:.4f}")
    print(f"Mean CV AUC: {avg_results['mean_roc_auc']:.4f} ± {avg_results['std_roc_auc']:.4f}")

# ============================================================================
# 6.
# SELECT BEST MODEL AND EVALUATE ON TEST SET
# ============================================================================

# Find best hyperparameters based on mean F1 score
best_idx = np.argmax([r['mean_f1'] for r in cv_results])
best_params = cv_results[best_idx]['params']

print(f"\n{'='*80}")
print("BEST HYPERPARAMETERS (based on CV F1 score):")
print(f"{'='*80}")
for key, value in best_params.items():
    print(f"{key}: {value}")
print(f"\nCV Performance:")
print(f" Accuracy: {cv_results[best_idx]['mean_accuracy']:.4f} ± {cv_results[best_idx]['std_accuracy']:.4f}")
print(f" F1 Score: {cv_results[best_idx]['mean_f1']:.4f} ± {cv_results[best_idx]['std_f1']:.4f}")
print(f" ROC-AUC: {cv_results[best_idx]['mean_roc_auc']:.4f} ± {cv_results[best_idx]['std_roc_auc']:.4f}")

# Train final model on all training data
print(f"\n{'='*80}")
print("TRAINING FINAL MODEL ON ALL TRAINING DATA")
print(f"{'='*80}")

final_model, final_encoder, final_decoder, final_classifier = build_vae_classifier(
    input_dim=len(au_columns),
    latent_dim=best_params['latent_dim'],
    encoder_dims=best_params['encoder_dims'],
    decoder_dims=list(reversed(best_params['encoder_dims'])),
    classifier_dims=[16]
)

final_vae_classifier = VAEClassifier(final_encoder, final_decoder, final_classifier)
final_vae_classifier.compile(optimizer=keras.optimizers.Adam(best_params['learning_rate']))

# Validation data for early stopping.
# `validation_split=0.2` would take the LAST 20 % of the rows before shuffling.
# The training rows are ordered "all low-workload, then all high-workload",
# so that slice would contain (almost) only class 1 and would share subjects
# with the training part. Hold out whole subjects instead.
from sklearn.model_selection import GroupShuffleSplit

val_splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
final_fit_idx, final_val_idx = next(
    val_splitter.split(X_train_scaled, y_train, groups_train)
)

final_history = final_vae_classifier.fit(
    X_train_scaled[final_fit_idx], y_train[final_fit_idx],
    validation_data=(X_train_scaled[final_val_idx], y_train[final_val_idx]),
    epochs=100,
    batch_size=best_params['batch_size'],
    callbacks=[keras.callbacks.EarlyStopping(monitor='val_total_loss', patience=15, restore_best_weights=True, mode='min')],
    verbose=1
)

# Evaluate on held-out test set
print(f"\n{'='*80}")
print("EVALUATION ON HELD-OUT TEST SET")
print(f"{'='*80}")

# Classify from the deterministic latent mean (no sampling noise at inference)
z_mean_test, _, _ = final_encoder.predict(X_test_scaled, verbose=0)
y_test_pred_proba = final_classifier.predict(z_mean_test, verbose=0).flatten()
y_test_pred = (y_test_pred_proba > 0.5).astype(int)

test_metrics = {
    'accuracy': accuracy_score(y_test, y_test_pred),
    'precision': precision_score(y_test, y_test_pred, zero_division=0),
    'recall': recall_score(y_test, y_test_pred, zero_division=0),
    'f1': f1_score(y_test, y_test_pred, zero_division=0),
    'roc_auc': roc_auc_score(y_test, y_test_pred_proba),
}

print("\nTest Set Performance:")
for metric, value in test_metrics.items():
    print(f" {metric.capitalize()}: {value:.4f}")

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_test_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_test_pred, target_names=['Low Workload', 'High Workload']))

# ============================================================================
# 7.
# VISUALIZATION
# ============================================================================

# Training history: one panel per tracked quantity, train vs. validation.
# Each entry is (history key, panel title, y-axis label); the validation curve
# is read from the same key prefixed with 'val_'.
history_panels = [
    ('reconstruction_loss', 'Reconstruction Loss', 'Loss'),
    ('kl_loss', 'KL Divergence Loss', 'Loss'),
    ('classification_loss', 'Classification Loss', 'Loss'),
    ('accuracy', 'Classification Accuracy', 'Accuracy'),
]

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# axes.flat walks the 2x2 grid row by row, matching the order of the list above.
for panel_ax, (history_key, panel_title, y_label) in zip(axes.flat, history_panels):
    panel_ax.plot(final_history.history[history_key], label='Train')
    panel_ax.plot(final_history.history[f'val_{history_key}'], label='Val')
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Epoch')
    panel_ax.set_ylabel(y_label)
    panel_ax.legend()
    panel_ax.grid(True)

plt.tight_layout()
plt.show()


def _plot_latent_2d(z_mean, labels, title):
    """Scatter the first two latent coordinates, coloured by workload label.

    Args:
        z_mean: 2-D array indexed as [sample, latent coordinate]; columns 0
            and 1 are drawn on the x and y axes.
        labels: Per-sample values used for the point colours.
        title: Figure title.
    """
    plt.figure(figsize=(10, 8))
    scatter = plt.scatter(
        z_mean[:, 0], z_mean[:, 1],
        c=labels, cmap='RdYlBu', alpha=0.6, edgecolors='k',
    )
    plt.colorbar(scatter, label='Workload (0=Low, 1=High)')
    plt.xlabel('Latent Dimension 1')
    plt.ylabel('Latent Dimension 2')
    plt.title(title)
    plt.grid(True, alpha=0.3)
    plt.show()


# The latent space is only drawn when it is exactly two-dimensional; other
# sizes are skipped.
if best_params['latent_dim'] == 2:
    z_mean_train, _, _ = final_encoder.predict(X_train_scaled, verbose=0)
    _plot_latent_2d(z_mean_train, y_train, '2D Latent Space Representation (Training Data)')
    # z_mean_test was computed during the test-set evaluation earlier in this cell.
    _plot_latent_2d(z_mean_test, y_test, '2D Latent Space Representation (Test Data)')

print("\n" + "=" * 80)
print("TRAINING COMPLETE!")
print("=" * 80)

### Save Trained VAE Classifier Model
# Path is already imported in the notebook's first cell; datetime is not.
from datetime import datetime

# Destination directory for trained models (created on first use).
model_dir = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/trained_models")
model_dir.mkdir(parents=True, exist_ok=True)

# Timestamped file name so repeated runs do not overwrite earlier models.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
model_path = model_dir / f"vae_classifier_{timestamp}.keras"

# Save the complete model
final_vae_classifier.save(model_path)

print(f"Model saved to: {model_path}")

### Plot Confusion Matrix for Final Model
# The model is NOT reloaded from disk here: the in-memory final model is
# evaluated directly. The previous version printed "Loading model ..." /
# "Model loaded successfully" and pointed at a hard-coded file from an older
# run although nothing was loaded; the messages now describe what happens and
# `x` refers to the file written by the save step above.
# NOTE(review): reloading via tf.keras.models.load_model(x) presumably needs
# VAEClassifier to be registered / passed as a custom object - confirm before
# switching to a real reload.
x = model_path
loaded_vae_classifier = final_vae_classifier
print(f"Evaluating in-memory final model (saved copy: {x})")

# Pull the two sub-models needed for inference off the combined model.
loaded_encoder = loaded_vae_classifier.encoder
loaded_classifier = loaded_vae_classifier.classifier

# Test-set predictions: first encoder output -> classifier probability ->
# hard label at a 0.5 threshold.
z_mean_test, _, _ = loaded_encoder.predict(X_test_scaled, verbose=0)
y_test_pred_proba = loaded_classifier.predict(z_mean_test, verbose=0).flatten()
y_test_pred = (y_test_pred_proba > 0.5).astype(int)

# Create and plot confusion matrix
cm = confusion_matrix(y_test, y_test_pred)
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=['Low Workload', 'High Workload'],
)

fig, ax = plt.subplots(figsize=(8, 6))
disp.plot(ax=ax, cmap='Blues', values_format='d')
plt.title('Confusion Matrix - Test Set (Final Model)')
plt.tight_layout()
plt.show()

# Print metrics
print("\nTest Set Performance (Final Model):")
print(f" Accuracy: {accuracy_score(y_test, y_test_pred):.4f}")
print(f" Precision: {precision_score(y_test, y_test_pred):.4f}")
print(f" Recall: {recall_score(y_test, y_test_pred):.4f}")
print(f" F1 Score: {f1_score(y_test, y_test_pred):.4f}")
print(f" ROC-AUC: {roc_auc_score(y_test, y_test_pred_proba):.4f}")

# TO DO (carried over from the notebook's closing markdown cell, translated):
#  * start slowly with the autoencoder, using 19 layers
#  * then AE and SVM with hybrid training, as in the Claude suggestion?!
#  * use the dataset from eye tracking?
- ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.10" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}