From af3f9d16b28dbc7f8171b5d7bdbf3fa9dd06d5a4 Mon Sep 17 00:00:00 2001 From: Michael Date: Wed, 4 Mar 2026 12:25:07 +0100 Subject: [PATCH] minor fixes to new paths / dataset with all columns --- EDA/histogramms.ipynb | 67 ++++++++++++++++--- .../IsolationForest/iforest_training.ipynb | 62 +++++++++-------- 2 files changed, 89 insertions(+), 40 deletions(-) diff --git a/EDA/histogramms.ipynb b/EDA/histogramms.ipynb index e35055f..35b9c04 100644 --- a/EDA/histogramms.ipynb +++ b/EDA/histogramms.ipynb @@ -1,5 +1,13 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "cc08936c", + "metadata": {}, + "source": [ + "## Insights into the dataset with histogramms and scatter plots" + ] + }, { "cell_type": "markdown", "id": "1014c5e0", @@ -17,7 +25,8 @@ "source": [ "import pandas as pd\n", "import numpy as np\n", - "import matplotlib.pyplot as plt" + "import matplotlib.pyplot as plt\n", + "from pathlib import Path" ] }, { @@ -27,7 +36,7 @@ "metadata": {}, "outputs": [], "source": [ - "path =r\"C:\\Users\\micha\\FAUbox\\WS2526_Fahrsimulator_MSY (Celina Korzer)\\AU_dataset\\output_windowed.parquet\"\n", + "path = Path(r\"/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/50s_25Hz_dataset.parquet\")\n", "df = pd.read_parquet(path=path)" ] }, @@ -104,21 +113,27 @@ "metadata": {}, "outputs": [], "source": [ - "# Get all columns that start with 'AU'\n", - "au_columns = [col for col in low_all.columns if col.startswith('AU')]\n", + "face_au_cols = [c for c in low_all.columns if c.startswith(\"FACE_AU\")]\n", + "eye_cols = ['Fix_count_short_66_150', 'Fix_count_medium_300_500',\n", + " 'Fix_count_long_gt_1000', 'Fix_count_100', 'Fix_mean_duration',\n", + " 'Fix_median_duration', 'Sac_count', 'Sac_mean_amp', 'Sac_mean_dur',\n", + " 'Sac_median_dur', 'Blink_count', 'Blink_mean_dur', 'Blink_median_dur',\n", + " 'Pupil_mean', 'Pupil_IPA']\n", + "\n", + "cols = face_au_cols+eye_cols\n", "\n", "# Calculate number of rows and columns for subplots\n", - "n_cols = len(au_columns)\n", - "n_rows = 4\n", + "n_cols = len(cols)\n", + "n_rows = 7\n", "n_cols_subplot = 5\n", "\n", "# Create figure with subplots\n", "fig, axes = plt.subplots(n_rows, n_cols_subplot, figsize=(20, 16))\n", "axes = axes.flatten()\n", - "fig.suptitle('Action Unit (AU) Distributions: Low vs High', fontsize=20, fontweight='bold', y=0.995)\n", + "fig.suptitle('Feature Distributions: Low vs High', fontsize=20, fontweight='bold', y=0.995)\n", "\n", "# Create histogram for each AU column\n", - "for idx, col in enumerate(au_columns):\n", + "for idx, col in enumerate(cols):\n", " ax = axes[idx]\n", " \n", " # Plot overlapping histograms\n", @@ -133,18 +148,48 @@ " ax.grid(True, alpha=0.3)\n", "\n", "# Hide any unused subplots\n", - "for idx in range(len(au_columns), len(axes)):\n", + "for idx in range(len(cols), len(axes)):\n", " axes[idx].set_visible(False)\n", "\n", "# Adjust layout\n", "plt.tight_layout()\n", "plt.show()" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6cd53cdb", + "metadata": {}, + "outputs": [], + "source": [ + "# Create figure with subplots\n", + "fig, axes = plt.subplots(n_rows, n_cols_subplot, figsize=(20, 16))\n", + "axes = axes.flatten()\n", + "fig.suptitle('Feature Scatter: Low vs High', fontsize=20, fontweight='bold', y=0.995)\n", + "\n", + "for idx, col in enumerate(cols):\n", + " ax = axes[idx]\n", + "\n", + " # Scatterplots\n", + " ax.scatter(range(len(low_all[col])), low_all[col], alpha=0.6, color='blue', label='low_all', s=10)\n", + " ax.scatter(range(len(high_all[col])), high_all[col], alpha=0.6, color='red', label='high_all', s=10)\n", + "\n", + " ax.set_title(col, fontsize=10, fontweight='bold')\n", + " ax.set_xlabel('Sample index', fontsize=8)\n", + " ax.set_ylabel('Value', fontsize=8)\n", + " ax.legend(fontsize=8)\n", + " ax.grid(True, alpha=0.3)\n", + "\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] } ], "metadata": { "kernelspec": { - "display_name": "base", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -158,7 +203,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.5" + "version": "3.12.10" } }, "nbformat": 4, diff --git a/model_training/IsolationForest/iforest_training.ipynb b/model_training/IsolationForest/iforest_training.ipynb index 22f27ac..c98320b 100644 --- a/model_training/IsolationForest/iforest_training.ipynb +++ b/model_training/IsolationForest/iforest_training.ipynb @@ -28,7 +28,7 @@ "sys.path.append(base_dir)\n", "print(base_dir)\n", "\n", - "from tools import evaluation_tools\n", + "from Fahrsimulator_MSY2526_AI.model_training.tools import evaluation_tools\n", "from sklearn.preprocessing import StandardScaler, MinMaxScaler\n", "from sklearn.ensemble import IsolationForest\n", "from sklearn.model_selection import GridSearchCV, KFold\n", @@ -52,7 +52,7 @@ "metadata": {}, "outputs": [], "source": [ - "data_path = Path(r\"C:\\Users\\micha\\FAUbox\\WS2526_Fahrsimulator_MSY (Celina Korzer)\\AU_dataset\\output_windowed.parquet\")" + "data_path = Path(r\"/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/50s_25Hz_dataset.parquet\")" ] }, { @@ -301,20 +301,26 @@ "metadata": {}, "outputs": [], "source": [ - "# Cell 2: Get AU columns and prepare datasets\n", - "# Get all column names that start with 'AU'\n", - "au_columns = [col for col in low_all.columns if col.startswith('AU')]\n", + "au_columns = [col for col in low_all.columns if \"face\" in col.lower()] \n", "\n", + "eye_columns = [ \n", + " 'Fix_count_short_66_150','Fix_count_medium_300_500','Fix_count_long_gt_1000', \n", + " 'Fix_count_100','Fix_mean_duration','Fix_median_duration', \n", + " 'Sac_count','Sac_mean_amp','Sac_mean_dur','Sac_median_dur', \n", + " 'Blink_count','Blink_mean_dur','Blink_median_dur', \n", + " 'Pupil_mean','Pupil_IPA' \n", + "] \n", + "cols = au_columns +eye_columns\n", "# Prepare training data (only normal/low data)\n", - "train_data = low_all[low_all['subjectID'].isin(train_subjects)][['subjectID'] + au_columns].copy()\n", + "train_data = low_all[low_all['subjectID'].isin(train_subjects)][['subjectID'] + cols].copy()\n", "\n", "# Prepare validation data (normal and anomaly)\n", - "val_normal_data = low_all[low_all['subjectID'].isin(val_subjects)][['subjectID'] + au_columns].copy()\n", - "val_high_data = high_all[high_all['subjectID'].isin(val_subjects)][['subjectID'] + au_columns].copy()\n", + "val_normal_data = low_all[low_all['subjectID'].isin(val_subjects)][['subjectID'] + cols].copy()\n", + "val_high_data = high_all[high_all['subjectID'].isin(val_subjects)][['subjectID'] + cols].copy()\n", "\n", "# Prepare test data (normal and anomaly)\n", - "test_normal_data = low_all[low_all['subjectID'].isin(test_subjects)][['subjectID'] + au_columns].copy()\n", - "test_high_data = high_all[high_all['subjectID'].isin(test_subjects)][['subjectID'] + au_columns].copy()\n", + "test_normal_data = low_all[low_all['subjectID'].isin(test_subjects)][['subjectID'] + cols].copy()\n", + "test_high_data = high_all[high_all['subjectID'].isin(test_subjects)][['subjectID'] + cols].copy()\n", "\n", "print(f\"Train samples: {len(train_data)}\")\n", "print(f\"Val normal samples: {len(val_normal_data)}, Val high samples: {len(val_high_data)}\")\n", @@ -328,8 +334,8 @@ "metadata": {}, "outputs": [], "source": [ - "# Cell 3: Fit normalizer on training data\n", - "normalizer = fit_normalizer(train_data, au_columns, method='minmax', scope='global')\n", + "# Fit normalizer on training data\n", + "normalizer = fit_normalizer(train_data, cols, method='minmax', scope='global')\n", "print(\"Normalizer fitted on training data\")" ] }, @@ -340,12 +346,12 @@ "metadata": {}, "outputs": [], "source": [ - "# Cell 4: Apply normalization to all datasets\n", - "train_normalized = apply_normalizer(train_data, au_columns, normalizer)\n", - "val_normal_normalized = apply_normalizer(val_normal_data, au_columns, normalizer)\n", - "val_high_normalized = apply_normalizer(val_high_data, au_columns, normalizer)\n", - "test_normal_normalized = apply_normalizer(test_normal_data, au_columns, normalizer)\n", - "test_high_normalized = apply_normalizer(test_high_data, au_columns, normalizer)\n", + "# Apply normalization to all datasets\n", + "train_normalized = apply_normalizer(train_data, cols, normalizer)\n", + "val_normal_normalized = apply_normalizer(val_normal_data, cols, normalizer)\n", + "val_high_normalized = apply_normalizer(val_high_data, cols, normalizer)\n", + "test_normal_normalized = apply_normalizer(test_normal_data, cols, normalizer)\n", + "test_high_normalized = apply_normalizer(test_high_data, cols, normalizer)\n", "\n", "print(\"Normalization applied to all datasets\")" ] @@ -357,11 +363,9 @@ "metadata": {}, "outputs": [], "source": [ - "# Cell 5: Extract AU columns and create labels for grid search\n", - "# Extract only AU columns (drop subjectID)\n", - "X_train = train_normalized[au_columns].copy()\n", - "X_val_normal = val_normal_normalized[au_columns].copy()\n", - "X_val_high = val_high_normalized[au_columns].copy()\n", + "X_train = train_normalized[cols].copy()\n", + "X_val_normal = val_normal_normalized[cols].copy()\n", + "X_val_high = val_high_normalized[cols].copy()\n", "\n", "# Combine train and validation sets for grid search\n", "X_grid_search = pd.concat([X_train, X_val_normal, X_val_high], ignore_index=True)\n", @@ -416,7 +420,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Cell 7: Train final model with best parameters on training data\n", + "# Train final model with best parameters on training data\n", "final_model = IsolationForest(**best_params, random_state=42)\n", "final_model.fit(X_train.values)\n", "\n", @@ -430,9 +434,9 @@ "metadata": {}, "outputs": [], "source": [ - "# Cell 8: Prepare independent test set\n", - "X_test_normal = test_normal_normalized[au_columns].copy()\n", - "X_test_high = test_high_normalized[au_columns].copy()\n", + "# Prepare independent test set\n", + "X_test_normal = test_normal_normalized[cols].copy()\n", + "X_test_high = test_high_normalized[cols].copy()\n", "\n", "# Combine test sets\n", "X_test = pd.concat([X_test_normal, X_test_high], ignore_index=True)\n", @@ -483,7 +487,7 @@ ], "metadata": { "kernelspec": { - "display_name": "base", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -497,7 +501,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.5" + "version": "3.12.10" } }, "nbformat": 4,