diff --git a/dataset_creation/open_parquet_test.ipynb b/dataset_creation/open_parquet_test.ipynb index 38145c0..b72c1f7 100644 --- a/dataset_creation/open_parquet_test.ipynb +++ b/dataset_creation/open_parquet_test.ipynb @@ -17,9 +17,8 @@ "metadata": {}, "outputs": [], "source": [ - "df= pd.read_parquet(r\"C:\\Users\\micha\\FAUbox\\WS2526_Fahrsimulator_MSY (Celina Korzer)\\AU_dataset\\output_windowed.parquet\")\n", - "print(df.shape)\n", - "\n" + "df= pd.read_parquet(r\" \")\n", + "print(df.shape)" ] }, { diff --git a/model_training/DeepSVDD/deepSVDD.ipynb b/model_training/DeepSVDD/deepSVDD.ipynb index a7e3f52..4a664d6 100644 --- a/model_training/DeepSVDD/deepSVDD.ipynb +++ b/model_training/DeepSVDD/deepSVDD.ipynb @@ -107,7 +107,8 @@ "metadata": {}, "outputs": [], "source": [ - "dataset_path = Path(r\"data-paulusjafahrsimulator-gpu/new_datasets/combined_dataset_25hz.parquet\")" + "dataset_path = Path(r\"data-paulusjafahrsimulator-gpu/new_datasets/combined_dataset_25hz.parquet\")\n", + "# dataset_path = Path(r\"/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/120s_combined_dataset_25hz.parquet\")" ] }, { @@ -475,7 +476,7 @@ "metadata": {}, "outputs": [], "source": [ - "normalizer_path=Path('data-paulusjafahrsimulator-gpu/saved_models/deepsvdd_save/normalizer.pkl')" + "normalizer_path=Path('data-paulusjafahrsimulator-gpu/saved_models/deepsvdd_save/normalizer_min_max_global.pkl')" ] }, { @@ -494,7 +495,7 @@ "print(len(eye_cols))\n", "all_signal_columns = face_au_cols+eye_cols\n", "print(len(all_signal_columns))\n", - "normalizer = fit_normalizer(train_df, all_signal_columns, method='standard', scope='subject')\n", + "normalizer = fit_normalizer(train_df, all_signal_columns, method='minmax', scope='global')\n", "save_normalizer(normalizer, normalizer_path )" ] }, @@ -691,10 +692,10 @@ "model = build_intermediate_fusion_autoencoder(\n", " input_dim_mod1=len(face_au_cols),\n", " input_dim_mod2=len(eye_cols),\n", - " encoder_hidden_dim_mod1=15, # individuell\n", - " encoder_hidden_dim_mod2=10, # individuell\n", - " latent_dim=8,\n", - " dropout_rate=0.3, # einstellbar\n", + " encoder_hidden_dim_mod1=12, # individuell\n", + " encoder_hidden_dim_mod2=8, # individuell\n", + " latent_dim=4,\n", + " dropout_rate=0.7, # einstellbar\n", " neg_slope=0.1,\n", " weight_decay=1e-3\n", ")\n", @@ -708,7 +709,7 @@ " \"recon_modality_1\": 1.0,\n", " \"recon_modality_2\": 1.0,\n", " },\n", - " optimizer=tf.keras.optimizers.Adam(1e-2)\n", + " optimizer=tf.keras.optimizers.Adam(1e-3)\n", " \n", ")\n", "\n", @@ -739,7 +740,7 @@ " \"recon_modality_1\": 1.0,\n", " \"recon_modality_2\": 1.0,\n", " },\n", - " optimizer=tf.keras.optimizers.Adam(1e-5),\n", + " optimizer=tf.keras.optimizers.Adam(1e-4),\n", ")\n", "model.fit(\n", " x=[X_face, X_eye],\n", @@ -779,7 +780,7 @@ "metadata": {}, "outputs": [], "source": [ - "encoder_save_path =Path('data-paulusjafahrsimulator-gpu/saved_models/deepsvdd_save/encoder_6_deep.keras')\n", + "encoder_save_path =Path('data-paulusjafahrsimulator-gpu/saved_models/deepsvdd_save/encoder_8_deep.keras')\n", "encoder.save(encoder_save_path)" ] }, @@ -943,7 +944,7 @@ " return get_radius_from_arrays(nu, X_face, X_eye)\n", "\n", "\n", - "nu = 0.05\n", + "nu = 0.25\n", "\n", "train_dataset = tf.data.Dataset.from_tensor_slices((X_face, X_eye)).shuffle(64).batch(64)\n", "# train_dataset = tf.data.Dataset.from_tensor_slices((X_face, X_eye))\n", @@ -1018,7 +1019,7 @@ "metadata": {}, "outputs": [], "source": [ - "deep_svdd_save_path 
=Path('data-paulusjafahrsimulator-gpu/saved_models/deepsvdd_save/deep_svdd_05.keras')\n", + "deep_svdd_save_path =Path('data-paulusjafahrsimulator-gpu/saved_models/deepsvdd_save/deep_svdd_06.keras')\n", "deep_svdd_net.save(deep_svdd_save_path)" ] }, @@ -1075,6 +1076,18 @@ "test_predictions = (test_scores > 0).astype(int)\n" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "575dddcf", + "metadata": {}, + "outputs": [], + "source": [ + "normal_acc = np.mean(test_predictions[y_test == 0] == 0)\n", + "anomaly_acc = np.mean(test_predictions[y_test == 1] == 1)\n", + "print(f'Accuracy on Test set: {accuracy_score(y_test, test_predictions)}')" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/model_training/VAE_SVM/vaesvm.ipynb b/model_training/VAE_SVM/vaesvm.ipynb index 1572d80..dd5ac15 100644 --- a/model_training/VAE_SVM/vaesvm.ipynb +++ b/model_training/VAE_SVM/vaesvm.ipynb @@ -220,14 +220,637 @@ "outputs": [], "source": [ "# SET\n", - "threshold_mad = 100\n", + "threshold_mad = 5\n", "column_praefix ='AU'\n", "\n", "au_columns = [col for col in df.columns if col.startswith(column_praefix)]\n", - "cleaned_df = mad_outlier_removal(df,columns=au_columns, threshold=threshold_mad)\n", + "cleaned_df = mad_outlier_removal.mad_outlier_removal(df,columns=au_columns, threshold=threshold_mad)\n", "print(cleaned_df.shape)\n", "print(df.shape)" ] + }, + { + "cell_type": "markdown", + "id": "9a6c1732", + "metadata": {}, + "source": [ + "#### TO DO\n", + " * pipeline aus Autoencoder und SVM\n", + " * group k fold\n", + " * AE überpüfen, loss dokumentieren" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "877309d9", + "metadata": {}, + "outputs": [], + "source": [ + "### Variational Autoencoder with Classifier Head\n", + "import pandas as pd\n", + "import numpy as np\n", + "import tensorflow as tf\n", + "from tensorflow import keras\n", + "from tensorflow.keras import layers, Model\n", + "from sklearn.model_selection import GroupKFold\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.metrics import (\n", + " accuracy_score, precision_score, recall_score, f1_score, \n", + " roc_auc_score, confusion_matrix, classification_report\n", + ")\n", + "import matplotlib.pyplot as plt\n", + "from collections import defaultdict\n", + "\n", + "# ============================================================================\n", + "# 1. 
CREATE LABELS\n", + "# ============================================================================\n", + "\n", + "# Low workload: baseline + n-back level 1,4\n", + "low_all = cleaned_df[\n", + " ((cleaned_df[\"PHASE\"] == \"baseline\") |\n", + " ((cleaned_df[\"STUDY\"] == \"n-back\") & (cleaned_df[\"PHASE\"] != \"baseline\") & (cleaned_df[\"LEVEL\"].isin([1,4]))))\n", + "].copy()\n", + "low_all['label'] = 0\n", + "print(f\"Low workload samples: {low_all.shape[0]}\")\n", + "\n", + "# High workload n-back: level 2,3,5,6\n", + "high_nback = cleaned_df[\n", + " (cleaned_df[\"STUDY\"]==\"n-back\") &\n", + " (cleaned_df[\"LEVEL\"].isin([2, 3, 5, 6])) &\n", + " (cleaned_df[\"PHASE\"].isin([\"train\", \"test\"]))\n", + "].copy()\n", + "high_nback['label'] = 1\n", + "print(f\"High n-back samples: {high_nback.shape[0]}\")\n", + "\n", + "# High workload k-drive\n", + "high_kdrive = cleaned_df[\n", + " (cleaned_df[\"STUDY\"] == \"k-drive\") & (cleaned_df[\"PHASE\"] != \"baseline\")\n", + "].copy()\n", + "high_kdrive['label'] = 1\n", + "print(f\"High k-drive samples: {high_kdrive.shape[0]}\")\n", + "\n", + "# Combine all high workload\n", + "high_all = pd.concat([high_nback, high_kdrive])\n", + "print(f\"Total high workload samples: {high_all.shape[0]}\")\n", + "\n", + "# Complete labeled dataset\n", + "labeled_df = pd.concat([low_all, high_all]).reset_index(drop=True)\n", + "print(f\"\\nTotal labeled samples: {labeled_df.shape[0]}\")\n", + "print(f\"Class distribution:\\n{labeled_df['label'].value_counts()}\")\n", + "\n", + "# ============================================================================\n", + "# 2. TRAIN/TEST SPLIT BY SUBJECTS\n", + "# ============================================================================\n", + "\n", + "train_df = labeled_df[labeled_df['subjectID'].isin(training_subjects)].copy()\n", + "test_df = labeled_df[labeled_df['subjectID'].isin(test_subjects)].copy()\n", + "\n", + "print(f\"\\nTraining subjects: {training_subjects}\")\n", + "print(f\"Test subjects: {test_subjects}\")\n", + "print(f\"Train samples: {train_df.shape[0]}, Test samples: {test_df.shape[0]}\")\n", + "\n", + "# Extract features and labels\n", + "au_columns = [col for col in labeled_df.columns if col.startswith('AU')]\n", + "print(f\"\\nUsing {len(au_columns)} AU features: {au_columns}\")\n", + "\n", + "X_train = train_df[au_columns].values\n", + "y_train = train_df['label'].values\n", + "groups_train = train_df['subjectID'].values\n", + "\n", + "X_test = test_df[au_columns].values\n", + "y_test = test_df['label'].values\n", + "\n", + "# Normalize features\n", + "scaler = StandardScaler()\n", + "X_train_scaled = scaler.fit_transform(X_train)\n", + "X_test_scaled = scaler.transform(X_test)\n", + "\n", + "print(f\"\\nTrain class distribution: {np.bincount(y_train)}\")\n", + "print(f\"Test class distribution: {np.bincount(y_test)}\")\n", + "\n", + "# ============================================================================\n", + "# 3. 
VAE WITH CLASSIFIER HEAD MODEL\n", + "# ============================================================================\n", + "\n", + "class Sampling(layers.Layer):\n", + " \"\"\"Reparameterization trick for VAE\"\"\"\n", + " def call(self, inputs):\n", + " z_mean, z_log_var = inputs\n", + " batch = tf.shape(z_mean)[0]\n", + " dim = tf.shape(z_mean)[1]\n", + " epsilon = tf.random.normal(shape=(batch, dim))\n", + " return z_mean + tf.exp(0.5 * z_log_var) * epsilon\n", + "\n", + "def build_vae_classifier(input_dim, latent_dim, encoder_dims=[32, 16], \n", + " decoder_dims=[16, 32], classifier_dims=[16]):\n", + " \"\"\"\n", + " Build VAE with classifier head\n", + " \n", + " Args:\n", + " input_dim: Number of input features (20 AUs)\n", + " latent_dim: Dimension of latent space (2-5)\n", + " encoder_dims: Hidden layer sizes for encoder\n", + " decoder_dims: Hidden layer sizes for decoder\n", + " classifier_dims: Hidden layer sizes for classifier\n", + " \"\"\"\n", + " \n", + " # ---- ENCODER ----\n", + " encoder_inputs = keras.Input(shape=(input_dim,), name='encoder_input')\n", + " x = encoder_inputs\n", + " \n", + " for i, dim in enumerate(encoder_dims):\n", + " x = layers.Dense(dim, activation='relu', name=f'encoder_dense_{i}')(x)\n", + " x = layers.BatchNormalization(name=f'encoder_bn_{i}')(x)\n", + " x = layers.Dropout(0.2, name=f'encoder_dropout_{i}')(x)\n", + " \n", + " z_mean = layers.Dense(latent_dim, name='z_mean')(x)\n", + " z_log_var = layers.Dense(latent_dim, name='z_log_var')(x)\n", + " z = Sampling()([z_mean, z_log_var])\n", + " \n", + " encoder = Model(encoder_inputs, [z_mean, z_log_var, z], name='encoder')\n", + " \n", + " # ---- DECODER ----\n", + " latent_inputs = keras.Input(shape=(latent_dim,), name='latent_input')\n", + " x = latent_inputs\n", + " \n", + " for i, dim in enumerate(decoder_dims):\n", + " x = layers.Dense(dim, activation='relu', name=f'decoder_dense_{i}')(x)\n", + " x = layers.BatchNormalization(name=f'decoder_bn_{i}')(x)\n", + " \n", + " decoder_outputs = layers.Dense(input_dim, activation='linear', name='decoder_output')(x)\n", + " decoder = Model(latent_inputs, decoder_outputs, name='decoder')\n", + " \n", + " # ---- CLASSIFIER HEAD ----\n", + " x = latent_inputs\n", + " for i, dim in enumerate(classifier_dims):\n", + " x = layers.Dense(dim, activation='relu', name=f'classifier_dense_{i}')(x)\n", + " x = layers.Dropout(0.3, name=f'classifier_dropout_{i}')(x)\n", + " \n", + " classifier_output = layers.Dense(1, activation='sigmoid', name='classifier_output')(x)\n", + " classifier = Model(latent_inputs, classifier_output, name='classifier')\n", + " \n", + " # ---- FULL MODEL ----\n", + " inputs = keras.Input(shape=(input_dim,), name='vae_input')\n", + " z_mean, z_log_var, z = encoder(inputs)\n", + " reconstructed = decoder(z)\n", + " classification = classifier(z)\n", + " \n", + " model = Model(inputs, [reconstructed, classification], name='vae_classifier')\n", + " \n", + " return model, encoder, decoder, classifier\n", + "\n", + "# ============================================================================\n", + "# 4. 
CUSTOM TRAINING LOOP WITH COMBINED LOSS\n", + "# ============================================================================\n", + "\n", + "class VAEClassifier(keras.Model):\n", + " def __init__(self, encoder, decoder, classifier, **kwargs):\n", + " super().__init__(**kwargs)\n", + " self.encoder = encoder\n", + " self.decoder = decoder\n", + " self.classifier = classifier\n", + " self.total_loss_tracker = keras.metrics.Mean(name=\"total_loss\")\n", + " self.reconstruction_loss_tracker = keras.metrics.Mean(name=\"reconstruction_loss\")\n", + " self.kl_loss_tracker = keras.metrics.Mean(name=\"kl_loss\")\n", + " self.classification_loss_tracker = keras.metrics.Mean(name=\"classification_loss\")\n", + " self.accuracy_tracker = keras.metrics.BinaryAccuracy(name=\"accuracy\")\n", + " \n", + " @property\n", + " def metrics(self):\n", + " return [\n", + " self.total_loss_tracker,\n", + " self.reconstruction_loss_tracker,\n", + " self.kl_loss_tracker,\n", + " self.classification_loss_tracker,\n", + " self.accuracy_tracker,\n", + " ]\n", + " \n", + " def train_step(self, data):\n", + " x, y = data\n", + " \n", + " with tf.GradientTape() as tape:\n", + " # Forward pass\n", + " z_mean, z_log_var, z = self.encoder(x, training=True)\n", + " reconstruction = self.decoder(z, training=True)\n", + " classification = self.classifier(z, training=True)\n", + " \n", + " # Reconstruction loss (MSE)\n", + " reconstruction_loss = tf.reduce_mean(\n", + " keras.losses.mse(x, reconstruction))\n", + " \n", + " # KL divergence loss\n", + " kl_loss = -0.5 * tf.reduce_mean(\n", + " tf.reduce_sum(\n", + " 1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var),\n", + " axis=1\n", + " )\n", + " )\n", + " \n", + " # Classification loss (binary crossentropy)\n", + " # Classification loss (binary crossentropy)\n", + " classification_loss = tf.reduce_mean(\n", + " keras.losses.binary_crossentropy(tf.expand_dims(y, -1), classification)\n", + " )\n", + " \n", + " # Combined loss with weights\n", + " total_loss = reconstruction_loss + kl_loss + classification_loss\n", + " \n", + " # Backpropagation\n", + " grads = tape.gradient(total_loss, self.trainable_weights)\n", + " self.optimizer.apply_gradients(zip(grads, self.trainable_weights))\n", + " \n", + " # Update metrics\n", + " self.total_loss_tracker.update_state(total_loss)\n", + " self.reconstruction_loss_tracker.update_state(reconstruction_loss)\n", + " self.kl_loss_tracker.update_state(kl_loss)\n", + " self.classification_loss_tracker.update_state(classification_loss)\n", + " self.accuracy_tracker.update_state(y, classification)\n", + " \n", + " return {\n", + " \"total_loss\": self.total_loss_tracker.result(),\n", + " \"reconstruction_loss\": self.reconstruction_loss_tracker.result(),\n", + " \"kl_loss\": self.kl_loss_tracker.result(),\n", + " \"classification_loss\": self.classification_loss_tracker.result(),\n", + " \"accuracy\": self.accuracy_tracker.result(),\n", + " }\n", + " \n", + " def test_step(self, data):\n", + " x, y = data\n", + " \n", + " z_mean, z_log_var, z = self.encoder(x, training=False)\n", + " reconstruction = self.decoder(z, training=False)\n", + " classification = self.classifier(z, training=False)\n", + " \n", + " # Reconstruction loss (MSE)\n", + " reconstruction_loss = tf.reduce_mean(\n", + " keras.losses.mse(x, reconstruction))\n", + " kl_loss = -0.5 * tf.reduce_mean(\n", + " tf.reduce_sum(1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var), axis=1)\n", + " )\n", + " # Classification loss (binary crossentropy)\n", + " classification_loss = 
tf.reduce_mean(\n", + " keras.losses.binary_crossentropy(tf.expand_dims(y, -1), classification)\n", + " )\n", + " total_loss = reconstruction_loss + kl_loss + classification_loss\n", + " \n", + " self.total_loss_tracker.update_state(total_loss)\n", + " self.reconstruction_loss_tracker.update_state(reconstruction_loss)\n", + " self.kl_loss_tracker.update_state(kl_loss)\n", + " self.classification_loss_tracker.update_state(classification_loss)\n", + " self.accuracy_tracker.update_state(y, classification)\n", + " \n", + " return {\n", + " \"total_loss\": self.total_loss_tracker.result(),\n", + " \"reconstruction_loss\": self.reconstruction_loss_tracker.result(),\n", + " \"kl_loss\": self.kl_loss_tracker.result(),\n", + " \"classification_loss\": self.classification_loss_tracker.result(),\n", + " \"accuracy\": self.accuracy_tracker.result(),\n", + " }\n", + "\n", + "# ============================================================================\n", + "# 5. GROUP K-FOLD CROSS-VALIDATION WITH GRID SEARCH\n", + "# ============================================================================\n", + "\n", + "# Hyperparameter grid\n", + "param_grid = {\n", + " 'latent_dim': [2, 5],\n", + " 'encoder_dims': [[32, 16], [64, 32]],\n", + " 'learning_rate': [0.001, 0.005],\n", + " 'batch_size': [32, 64],\n", + "}\n", + "\n", + "# Generate all combinations\n", + "from itertools import product\n", + "keys = param_grid.keys()\n", + "values = param_grid.values()\n", + "param_combinations = [dict(zip(keys, v)) for v in product(*values)]\n", + "\n", + "print(f\"\\nTotal hyperparameter combinations: {len(param_combinations)}\")\n", + "\n", + "# Group K-Fold setup\n", + "n_splits = 5\n", + "gkf = GroupKFold(n_splits=n_splits)\n", + "\n", + "# Store results\n", + "cv_results = []\n", + "\n", + "# Grid search with cross-validation\n", + "for idx, params in enumerate(param_combinations):\n", + " print(f\"\\n{'='*80}\")\n", + " print(f\"Testing combination {idx+1}/{len(param_combinations)}: {params}\")\n", + " print(f\"{'='*80}\")\n", + " \n", + " fold_results = []\n", + " \n", + " for fold, (train_idx, val_idx) in enumerate(gkf.split(X_train_scaled, y_train, groups_train)):\n", + " print(f\"\\nFold {fold+1}/{n_splits}\")\n", + " \n", + " X_fold_train, X_fold_val = X_train_scaled[train_idx], X_train_scaled[val_idx]\n", + " y_fold_train, y_fold_val = y_train[train_idx], y_train[val_idx]\n", + " \n", + " # Build model\n", + " model, encoder, decoder, classifier = build_vae_classifier(\n", + " input_dim=len(au_columns),\n", + " latent_dim=params['latent_dim'],\n", + " encoder_dims=params['encoder_dims'],\n", + " decoder_dims=list(reversed(params['encoder_dims'])),\n", + " classifier_dims=[16]\n", + " )\n", + " \n", + " vae_classifier = VAEClassifier(encoder, decoder, classifier)\n", + " vae_classifier.compile(optimizer=keras.optimizers.Adam(params['learning_rate']))\n", + " \n", + " # Early stopping\n", + " early_stop = keras.callbacks.EarlyStopping(\n", + " monitor='val_total_loss',\n", + " patience=10,\n", + " restore_best_weights=True,\n", + " mode='min'\n", + " )\n", + " \n", + " # Train\n", + " history = vae_classifier.fit(\n", + " X_fold_train, y_fold_train,\n", + " validation_data=(X_fold_val, y_fold_val),\n", + " epochs=60,\n", + " batch_size=params['batch_size'],\n", + " callbacks=[early_stop],\n", + " verbose=0\n", + " )\n", + " \n", + " # Evaluate on validation fold\n", + " z_mean_val, _, _ = encoder.predict(X_fold_val, verbose=0)\n", + " y_pred_proba = classifier.predict(z_mean_val, verbose=0).flatten()\n", + " 
y_pred = (y_pred_proba > 0.5).astype(int)\n", + " \n", + " fold_metrics = {\n", + " 'accuracy': accuracy_score(y_fold_val, y_pred),\n", + " 'precision': precision_score(y_fold_val, y_pred, zero_division=0),\n", + " 'recall': recall_score(y_fold_val, y_pred, zero_division=0),\n", + " 'f1': f1_score(y_fold_val, y_pred, zero_division=0),\n", + " 'roc_auc': roc_auc_score(y_fold_val, y_pred_proba),\n", + " 'final_recon_loss': history.history['val_reconstruction_loss'][-1],\n", + " 'final_kl_loss': history.history['val_kl_loss'][-1],\n", + " 'final_class_loss': history.history['val_classification_loss'][-1],\n", + " }\n", + " \n", + " fold_results.append(fold_metrics)\n", + " print(f\" Accuracy: {fold_metrics['accuracy']:.4f}, F1: {fold_metrics['f1']:.4f}, AUC: {fold_metrics['roc_auc']:.4f}\")\n", + " \n", + " # Clear session to free memory\n", + " keras.backend.clear_session()\n", + " \n", + " # Average across folds\n", + " avg_results = {\n", + " 'params': params,\n", + " 'mean_accuracy': np.mean([r['accuracy'] for r in fold_results]),\n", + " 'std_accuracy': np.std([r['accuracy'] for r in fold_results]),\n", + " 'mean_f1': np.mean([r['f1'] for r in fold_results]),\n", + " 'std_f1': np.std([r['f1'] for r in fold_results]),\n", + " 'mean_roc_auc': np.mean([r['roc_auc'] for r in fold_results]),\n", + " 'std_roc_auc': np.std([r['roc_auc'] for r in fold_results]),\n", + " 'mean_recon_loss': np.mean([r['final_recon_loss'] for r in fold_results]),\n", + " 'mean_kl_loss': np.mean([r['final_kl_loss'] for r in fold_results]),\n", + " 'mean_class_loss': np.mean([r['final_class_loss'] for r in fold_results]),\n", + " 'fold_results': fold_results\n", + " }\n", + " \n", + " cv_results.append(avg_results)\n", + " \n", + " print(f\"\\nMean CV Accuracy: {avg_results['mean_accuracy']:.4f} ± {avg_results['std_accuracy']:.4f}\")\n", + " print(f\"Mean CV F1: {avg_results['mean_f1']:.4f} ± {avg_results['std_f1']:.4f}\")\n", + " print(f\"Mean CV AUC: {avg_results['mean_roc_auc']:.4f} ± {avg_results['std_roc_auc']:.4f}\")\n", + "\n", + "# ============================================================================\n", + "# 6. 
SELECT BEST MODEL AND EVALUATE ON TEST SET\n", + "# ============================================================================\n", + "\n", + "# Find best hyperparameters based on mean F1 score\n", + "best_idx = np.argmax([r['mean_f1'] for r in cv_results])\n", + "best_params = cv_results[best_idx]['params']\n", + "\n", + "print(f\"\\n{'='*80}\")\n", + "print(\"BEST HYPERPARAMETERS (based on CV F1 score):\")\n", + "print(f\"{'='*80}\")\n", + "for key, value in best_params.items():\n", + " print(f\"{key}: {value}\")\n", + "print(f\"\\nCV Performance:\")\n", + "print(f\" Accuracy: {cv_results[best_idx]['mean_accuracy']:.4f} ± {cv_results[best_idx]['std_accuracy']:.4f}\")\n", + "print(f\" F1 Score: {cv_results[best_idx]['mean_f1']:.4f} ± {cv_results[best_idx]['std_f1']:.4f}\")\n", + "print(f\" ROC-AUC: {cv_results[best_idx]['mean_roc_auc']:.4f} ± {cv_results[best_idx]['std_roc_auc']:.4f}\")\n", + "\n", + "# Train final model on all training data\n", + "print(f\"\\n{'='*80}\")\n", + "print(\"TRAINING FINAL MODEL ON ALL TRAINING DATA\")\n", + "print(f\"{'='*80}\")\n", + "\n", + "final_model, final_encoder, final_decoder, final_classifier = build_vae_classifier(\n", + " input_dim=len(au_columns),\n", + " latent_dim=best_params['latent_dim'],\n", + " encoder_dims=best_params['encoder_dims'],\n", + " decoder_dims=list(reversed(best_params['encoder_dims'])),\n", + " classifier_dims=[16]\n", + ")\n", + "\n", + "final_vae_classifier = VAEClassifier(final_encoder, final_decoder, final_classifier)\n", + "final_vae_classifier.compile(optimizer=keras.optimizers.Adam(best_params['learning_rate']))\n", + "\n", + "final_history = final_vae_classifier.fit(\n", + " X_train_scaled, y_train,\n", + " validation_split=0.2,\n", + " epochs=100,\n", + " batch_size=best_params['batch_size'],\n", + " callbacks=[keras.callbacks.EarlyStopping(monitor='val_total_loss', patience=15, restore_best_weights=True, mode='min')],\n", + " verbose=1\n", + ")\n", + "\n", + "# Evaluate on held-out test set\n", + "print(f\"\\n{'='*80}\")\n", + "print(\"EVALUATION ON HELD-OUT TEST SET\")\n", + "print(f\"{'='*80}\")\n", + "\n", + "z_mean_test, _, _ = final_encoder.predict(X_test_scaled, verbose=0)\n", + "y_test_pred_proba = final_classifier.predict(z_mean_test, verbose=0).flatten()\n", + "y_test_pred = (y_test_pred_proba > 0.5).astype(int)\n", + "\n", + "test_metrics = {\n", + " 'accuracy': accuracy_score(y_test, y_test_pred),\n", + " 'precision': precision_score(y_test, y_test_pred),\n", + " 'recall': recall_score(y_test, y_test_pred),\n", + " 'f1': f1_score(y_test, y_test_pred),\n", + " 'roc_auc': roc_auc_score(y_test, y_test_pred_proba),\n", + "}\n", + "\n", + "print(\"\\nTest Set Performance:\")\n", + "for metric, value in test_metrics.items():\n", + " print(f\" {metric.capitalize()}: {value:.4f}\")\n", + "\n", + "print(\"\\nConfusion Matrix:\")\n", + "print(confusion_matrix(y_test, y_test_pred))\n", + "\n", + "print(\"\\nClassification Report:\")\n", + "print(classification_report(y_test, y_test_pred, target_names=['Low Workload', 'High Workload']))\n", + "\n", + "# ============================================================================\n", + "# 7. 
VISUALIZATION\n", + "# ============================================================================\n", + "\n", + "# Plot training history\n", + "fig, axes = plt.subplots(2, 2, figsize=(15, 10))\n", + "\n", + "axes[0, 0].plot(final_history.history['reconstruction_loss'], label='Train')\n", + "axes[0, 0].plot(final_history.history['val_reconstruction_loss'], label='Val')\n", + "axes[0, 0].set_title('Reconstruction Loss')\n", + "axes[0, 0].set_xlabel('Epoch')\n", + "axes[0, 0].set_ylabel('Loss')\n", + "axes[0, 0].legend()\n", + "axes[0, 0].grid(True)\n", + "\n", + "axes[0, 1].plot(final_history.history['kl_loss'], label='Train')\n", + "axes[0, 1].plot(final_history.history['val_kl_loss'], label='Val')\n", + "axes[0, 1].set_title('KL Divergence Loss')\n", + "axes[0, 1].set_xlabel('Epoch')\n", + "axes[0, 1].set_ylabel('Loss')\n", + "axes[0, 1].legend()\n", + "axes[0, 1].grid(True)\n", + "\n", + "axes[1, 0].plot(final_history.history['classification_loss'], label='Train')\n", + "axes[1, 0].plot(final_history.history['val_classification_loss'], label='Val')\n", + "axes[1, 0].set_title('Classification Loss')\n", + "axes[1, 0].set_xlabel('Epoch')\n", + "axes[1, 0].set_ylabel('Loss')\n", + "axes[1, 0].legend()\n", + "axes[1, 0].grid(True)\n", + "\n", + "axes[1, 1].plot(final_history.history['accuracy'], label='Train')\n", + "axes[1, 1].plot(final_history.history['val_accuracy'], label='Val')\n", + "axes[1, 1].set_title('Classification Accuracy')\n", + "axes[1, 1].set_xlabel('Epoch')\n", + "axes[1, 1].set_ylabel('Accuracy')\n", + "axes[1, 1].legend()\n", + "axes[1, 1].grid(True)\n", + "\n", + "plt.tight_layout()\n", + "plt.show()\n", + "\n", + "# Visualize latent space (if 2D or 3D)\n", + "if best_params['latent_dim'] == 2:\n", + " z_mean_train, _, _ = final_encoder.predict(X_train_scaled, verbose=0)\n", + " \n", + " plt.figure(figsize=(10, 8))\n", + " scatter = plt.scatter(z_mean_train[:, 0], z_mean_train[:, 1], \n", + " c=y_train, cmap='RdYlBu', alpha=0.6, edgecolors='k')\n", + " plt.colorbar(scatter, label='Workload (0=Low, 1=High)')\n", + " plt.xlabel('Latent Dimension 1')\n", + " plt.ylabel('Latent Dimension 2')\n", + " plt.title('2D Latent Space Representation (Training Data)')\n", + " plt.grid(True, alpha=0.3)\n", + " plt.show()\n", + " \n", + " # Test set latent space\n", + " plt.figure(figsize=(10, 8))\n", + " scatter = plt.scatter(z_mean_test[:, 0], z_mean_test[:, 1], \n", + " c=y_test, cmap='RdYlBu', alpha=0.6, edgecolors='k')\n", + " plt.colorbar(scatter, label='Workload (0=Low, 1=High)')\n", + " plt.xlabel('Latent Dimension 1')\n", + " plt.ylabel('Latent Dimension 2')\n", + " plt.title('2D Latent Space Representation (Test Data)')\n", + " plt.grid(True, alpha=0.3)\n", + " plt.show()\n", + "\n", + "print(\"\\n\" + \"=\"*80)\n", + "print(\"TRAINING COMPLETE!\")\n", + "print(\"=\"*80)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "79bcfc58", + "metadata": {}, + "outputs": [], + "source": [ + "### Save Trained VAE Classifier Model\n", + "from pathlib import Path\n", + "from datetime import datetime\n", + "\n", + "# Define save path\n", + "model_dir = Path(\"/home/jovyan/data-paulusjafahrsimulator-gpu/trained_models\")\n", + "model_dir.mkdir(parents=True, exist_ok=True)\n", + "\n", + "timestamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n", + "model_path = model_dir / f\"vae_classifier_{timestamp}.keras\"\n", + "\n", + "# Save the complete model\n", + "final_vae_classifier.save(model_path)\n", + "\n", + "print(f\"Model saved to: {model_path}\")" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "id": "d700e517", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "30d8d100", + "metadata": {}, + "outputs": [], + "source": [ + "### Plot Confusion Matrix for Final Model\n", + "from sklearn.metrics import ConfusionMatrixDisplay\n", + "x = Path(\"/home/jovyan/data-paulusjafahrsimulator-gpu/trained_models/vae_classifier_20251210_230121.keras\")\n", + "# Load the saved model\n", + "print(f\"Loading model from: {x}\")\n", + "# loaded_vae_classifier = tf.keras.models.load_model(x)\n", + "loaded_vae_classifier = final_vae_classifier\n", + "print(\"✓ Model loaded successfully!\")\n", + "\n", + "# Extract encoder and classifier from loaded model\n", + "loaded_encoder = loaded_vae_classifier.encoder\n", + "loaded_classifier = loaded_vae_classifier.classifier\n", + "\n", + "# Get predictions on test set\n", + "z_mean_test, _, _ = loaded_encoder.predict(X_test_scaled, verbose=0)\n", + "y_test_pred_proba = loaded_classifier.predict(z_mean_test, verbose=0).flatten()\n", + "y_test_pred = (y_test_pred_proba > 0.5).astype(int)\n", + "\n", + "# Create and plot confusion matrix\n", + "cm = confusion_matrix(y_test, y_test_pred)\n", + "disp = ConfusionMatrixDisplay(confusion_matrix=cm, \n", + " display_labels=['Low Workload', 'High Workload'])\n", + "\n", + "fig, ax = plt.subplots(figsize=(8, 6))\n", + "disp.plot(ax=ax, cmap='Blues', values_format='d')\n", + "plt.title('Confusion Matrix - Test Set (Loaded Model)')\n", + "plt.tight_layout()\n", + "plt.show()\n", + "\n", + "# Print metrics\n", + "print(f\"\\nTest Set Performance (Loaded Model):\")\n", + "print(f\" Accuracy: {accuracy_score(y_test, y_test_pred):.4f}\")\n", + "print(f\" Precision: {precision_score(y_test, y_test_pred):.4f}\")\n", + "print(f\" Recall: {recall_score(y_test, y_test_pred):.4f}\")\n", + "print(f\" F1 Score: {f1_score(y_test, y_test_pred):.4f}\")\n", + "print(f\" ROC-AUC: {roc_auc_score(y_test, y_test_pred_proba):.4f}\")" + ] + }, + { + "cell_type": "markdown", + "id": "e826a998", + "metadata": {}, + "source": [ + "TO DO\n", + " * autoencoder langsam anfangen mit 19 schichten\n", + " * dann AE und SVM mit hybridem training wie bei claude?!\n", + " * dataset aus eyetracking verwenden?" 
+ ] } ], "metadata": { diff --git a/model_training/tools/scaler.py b/model_training/tools/scaler.py index 7449c9e..e5bf5e2 100644 --- a/model_training/tools/scaler.py +++ b/model_training/tools/scaler.py @@ -1,5 +1,7 @@ -from sklearn.preprocessing import MinMaxScaler, StandardScaler -import pandas as pd +import pickle +from sklearn.preprocessing import StandardScaler, MinMaxScaler +import numpy as np +import os def fit_normalizer(train_data, au_columns, method='standard', scope='global'): """ @@ -19,9 +21,8 @@ def fit_normalizer(train_data, au_columns, method='standard', scope='global'): Returns: -------- dict - Dictionary containing fitted scalers + Dictionary containing fitted scalers and statistics for new subjects """ - # Select scaler based on method if method == 'standard': Scaler = StandardScaler elif method == 'minmax': @@ -30,19 +31,54 @@ def fit_normalizer(train_data, au_columns, method='standard', scope='global'): raise ValueError("method must be 'standard' or 'minmax'") scalers = {} - if scope == 'subject': # Fit one scaler per subject + subject_stats = [] + for subject in train_data['subjectID'].unique(): subject_mask = train_data['subjectID'] == subject scaler = Scaler() - scaler.fit(train_data.loc[subject_mask, au_columns]) + scaler.fit(train_data.loc[subject_mask, au_columns].values) scalers[subject] = scaler + + # Store statistics for averaging + if method == 'standard': + subject_stats.append({ + 'mean': scaler.mean_, + 'std': scaler.scale_ + }) + elif method == 'minmax': + subject_stats.append({ + 'min': scaler.data_min_, + 'max': scaler.data_max_ + }) + + # Calculate average statistics for new subjects + if method == 'standard': + avg_mean = np.mean([s['mean'] for s in subject_stats], axis=0) + avg_std = np.mean([s['std'] for s in subject_stats], axis=0) + fallback_scaler = StandardScaler() + fallback_scaler.mean_ = avg_mean + fallback_scaler.scale_ = avg_std + fallback_scaler.var_ = avg_std ** 2 + fallback_scaler.n_features_in_ = len(au_columns) + elif method == 'minmax': + avg_min = np.mean([s['min'] for s in subject_stats], axis=0) + avg_max = np.mean([s['max'] for s in subject_stats], axis=0) + fallback_scaler = MinMaxScaler() + fallback_scaler.data_min_ = avg_min + fallback_scaler.data_max_ = avg_max + fallback_scaler.data_range_ = avg_max - avg_min + fallback_scaler.scale_ = 1.0 / fallback_scaler.data_range_ + fallback_scaler.min_ = -avg_min * fallback_scaler.scale_ + fallback_scaler.n_features_in_ = len(au_columns) + + scalers['_fallback'] = fallback_scaler elif scope == 'global': # Fit one scaler for all subjects scaler = Scaler() - scaler.fit(train_data[au_columns]) + scaler.fit(train_data[au_columns].values) scalers['global'] = scaler else: @@ -50,7 +86,7 @@ def fit_normalizer(train_data, au_columns, method='standard', scope='global'): return {'scalers': scalers, 'method': method, 'scope': scope} -def apply_normalizer(data, au_columns, normalizer_dict): +def apply_normalizer(data, columns, normalizer_dict): """ Apply fitted normalization scalers to data. 
@@ -71,28 +107,70 @@ def apply_normalizer(data, au_columns, normalizer_dict): normalized_data = data.copy() scalers = normalizer_dict['scalers'] scope = normalizer_dict['scope'] - + normalized_data[columns] = normalized_data[columns].astype(np.float64) + if scope == 'subject': # Apply per-subject normalization for subject in data['subjectID'].unique(): subject_mask = data['subjectID'] == subject - # Use the subject's scaler if available, otherwise use a fitted scaler from training + # Use the subject's scaler if available, otherwise use fallback if subject in scalers: scaler = scalers[subject] else: - # For new subjects not seen in training, use the first available scaler - # (This is a fallback - ideally all test subjects should be in training for subject-level normalization) - print(f"Warning: Subject {subject} not found in training data. Using fallback scaler.") - scaler = list(scalers.values())[0] + # Use averaged scaler for new subjects + scaler = scalers['_fallback'] + print(f"Info: Subject {subject} not in training data. Using averaged scaler from training subjects.") - normalized_data.loc[subject_mask, au_columns] = scaler.transform( - data.loc[subject_mask, au_columns] + normalized_data.loc[subject_mask, columns] = scaler.transform( + data.loc[subject_mask, columns].values ) elif scope == 'global': # Apply global normalization scaler = scalers['global'] - normalized_data[au_columns] = scaler.transform(data[au_columns]) + normalized_data[columns] = scaler.transform(data[columns].values) - return normalized_data \ No newline at end of file + return normalized_data + + + +def save_normalizer(normalizer_dict, filepath): + """ + Save fitted normalizer to disk. + + Parameters: + ----------- + normalizer_dict : dict + Dictionary containing fitted scalers from fit_normalizer() + filepath : str + Path to save the normalizer (e.g., 'normalizer.pkl') + """ + # Create directory if it does not exist + dirpath = os.path.dirname(filepath) + if dirpath: + os.makedirs(dirpath, exist_ok=True) + + with open(filepath, 'wb') as f: + pickle.dump(normalizer_dict, f) + + print(f"Normalizer saved to {filepath}") + +def load_normalizer(filepath): + """ + Load fitted normalizer from disk. + + Parameters: + ----------- + filepath : str + Path to the saved normalizer file + + Returns: + -------- + dict + Dictionary containing fitted scalers + """ + with open(filepath, 'rb') as f: + normalizer_dict = pickle.load(f) + print(f"Normalizer loaded from {filepath}") + return normalizer_dict \ No newline at end of file
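
Note on the scaler.py changes above: the module now persists normalizers and, for scope='subject', stores an averaged '_fallback' scaler so subjects unseen during training are transformed with averaged training statistics rather than an arbitrary subject's scaler. The following is a minimal usage sketch only; the import path, parquet file names, and AU column prefix are illustrative assumptions and not part of this diff.

```python
# Sketch of how the updated helpers are meant to be used together.
# Assumptions (not from the diff): the module is importable as
# model_training.tools.scaler, and the DataFrames contain a 'subjectID'
# column plus feature columns prefixed with 'AU'.
import pandas as pd
from model_training.tools.scaler import (
    fit_normalizer, apply_normalizer, save_normalizer, load_normalizer
)

train_df = pd.read_parquet("train_windows.parquet")   # illustrative path
test_df = pd.read_parquet("test_windows.parquet")     # illustrative path
au_columns = [c for c in train_df.columns if c.startswith("AU")]

# Fit one scaler per training subject; fit_normalizer also builds a
# '_fallback' scaler from the averaged per-subject statistics.
normalizer = fit_normalizer(train_df, au_columns, method="standard", scope="subject")
save_normalizer(normalizer, "saved_models/normalizer_standard_subject.pkl")

# Later, e.g. at evaluation time: reload and apply. Test subjects not seen
# during training are transformed with the averaged fallback scaler
# (apply_normalizer prints an info message for those subjects).
normalizer = load_normalizer("saved_models/normalizer_standard_subject.pkl")
train_scaled = apply_normalizer(train_df, au_columns, normalizer)
test_scaled = apply_normalizer(test_df, au_columns, normalizer)
```

Design note, as reflected in the diff: averaging the per-subject means/stds (or mins/maxs for minmax) gives new subjects a neutral default transform, which is why the old "use the first available scaler" fallback was replaced.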