iforest training

2025-11-05 21:31:16 +01:00 · 2025-11-05 21:31:16 +01:00 · fb29a1333e
commit fb29a1333e
parent 77dbc4a109
2 changed files with 258 additions and 1 deletions
--- a/model_training/IsolationForest/iforest_training.ipynb
+++ b/model_training/IsolationForest/iforest_training.ipynb
@ -0,0 +1,235 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "74bda07c",
+   "metadata": {},
+   "source": [
+    "Imports"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a265b02c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from sklearn.ensemble import IsolationForest\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from pathlib import Path\n",
+    "import sys\n",
+    "import os\n",
+    "from sklearn.metrics import classification_report\n",
+    "\n",
+    "base_dir = os.path.abspath(os.path.join(os.getcwd(), \"..\"))\n",
+    "sys.path.append(base_dir)\n",
+    "print(base_dir)\n",
+    "\n",
+    "from  tools import evaluation_tools"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "092835af",
+   "metadata": {},
+   "source": [
+    "Load dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dc788eb4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data_path = Path(r\"C:\\Users\\micha\\FAUbox\\WS2526_Fahrsimulator_MSY (Celina Korzer)\\AU_dataset\\output_windowed.parquet\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a4d39a6e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.read_parquet(path=data_path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a8f36561",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "subjects = df['subjectID'].unique()\n",
+    "print(subjects)\n",
+    "print(len(subjects))\n",
+    "print(len(subjects)*0.66)\n",
+    "print(len(subjects)*0.33)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a46e47dd",
+   "metadata": {},
+   "source": [
+    "Split of subjects and labeling"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3fb6dc4b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "low_all = df[\n",
+    "    ((df[\"PHASE\"] == \"baseline\") |\n",
+    "     ((df[\"STUDY\"] == \"n-back\") & (df[\"PHASE\"] != \"baseline\") & (df[\"LEVEL\"].isin([1,4]))))\n",
+    "]\n",
+    "print(f\"low all: {low_all.shape}\")\n",
+    "\n",
+    "high_nback = df[\n",
+    "    (df[\"STUDY\"]==\"n-back\") &\n",
+    "    (df[\"LEVEL\"].isin([2, 3, 5, 6])) &\n",
+    "    (df[\"PHASE\"].isin([\"train\", \"test\"]))\n",
+    "]\n",
+    "print(f\"high n-back: {high_nback.shape}\")\n",
+    "high_kdrive = df[\n",
+    "    (df[\"STUDY\"] == \"k-drive\") & (df[\"PHASE\"] != \"baseline\")\n",
+    "]\n",
+    "print(f\"high k-drive: {high_kdrive.shape}\")\n",
+    "\n",
+    "high_all = pd.concat([high_nback, high_kdrive])\n",
+    "print(f\"high all: {high_all.shape}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "20394aca",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_subjects, test_subjects = train_test_split(\n",
+    "    subjects, \n",
+    "    train_size=12, \n",
+    "    test_size=6, \n",
+    "    random_state=42\n",
+    ")\n",
+    "\n",
+    "# Get all column names that start with 'AU'\n",
+    "au_columns = [col for col in low_all.columns if col.startswith('AU')]\n",
+    "\n",
+    "# Create train set: only normal samples from train subjects, only AU columns\n",
+    "X_train = low_all[low_all['subjectID'].isin(train_subjects)][au_columns].copy()\n",
+    "y_train = np.ones(len(X_train))  # Label 1 for normal samples\n",
+    "\n",
+    "# Create test set: both normal and high load from test subjects, only AU columns\n",
+    "X_test_normal = low_all[low_all['subjectID'].isin(test_subjects)][au_columns].copy()\n",
+    "X_test_high = high_all[high_all['subjectID'].isin(test_subjects)][au_columns].copy()\n",
+    "\n",
+    "# Combine test sets\n",
+    "X_test = pd.concat([X_test_normal, X_test_high], ignore_index=True)\n",
+    "\n",
+    "# Create labels for test set\n",
+    "y_test_normal = np.ones(len(X_test_normal))  # 1 for normal\n",
+    "y_test_high = -np.ones(len(X_test_high))     # -1 for anomalies\n",
+    "y_test = np.concatenate([y_test_normal, y_test_high])\n",
+    "\n",
+    "\n",
+    "print(f\"Number of AU features: {len(au_columns)}\")\n",
+    "print(f\"AU columns: {au_columns}\")\n",
+    "print(f\"\\nTrain set: {len(X_train)} normal samples\")\n",
+    "print(f\"Test set: {len(X_test_normal)} normal + {len(X_test_high)} high load = {len(X_test)} total samples\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "697b3cf7",
+   "metadata": {},
+   "source": [
+    "Training of Isolation Forest"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b5cd4ac6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "iforest = IsolationForest(random_state=42)\n",
+    "iforest.fit(X_train)\n",
+    "iforest_scores = iforest.score_samples(X_test)\n",
+    "iforest_predictions = iforest.predict(X_test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "15c45f66",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "evaluation_tools.plot_confusion_matrix(true_labels=y_test, predictions=iforest_predictions, label_names=['high load', 'low load'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "326fcb47",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "evaluation_tools.plot_roc_curve_IF(y_test, iforest_scores)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "141267e4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "iforest.offset_"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4bf81d7b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(classification_report(y_test, iforest_predictions))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/model_training/tools/evaluation_tools.py
+++ b/model_training/tools/evaluation_tools.py
@ -1 +1,23 @@
-import pandas as pd
+import pandas as pd
+from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc
+import matplotlib.pyplot as plt
+
+def plot_confusion_matrix(true_labels, predictions, label_names):
+    for normalize in [None, 'true']:
+      cm = confusion_matrix(true_labels, predictions, normalize=normalize)
+      cm_disp = ConfusionMatrixDisplay(cm,  display_labels=label_names)
+      cm_disp.plot(cmap="Blues")
+
+
+def plot_roc_curve_IF(true_labels, scores):
+    fpr, tpr, thr = roc_curve(true_labels, -scores, pos_label=-1)
+    auc_score = auc(fpr, tpr)
+
+    plt.figure()
+    plt.plot(fpr, tpr, '-')
+    plt.text(0.5, 0.5, f'AUC: {auc_score:.4f}')
+    plt.xlabel('False positive rate')
+    plt.ylabel('True positive rate')
+    plt.show()
+
+