From fb29a1333e803b5ea691d64a810c22574667e2b2 Mon Sep 17 00:00:00 2001
From: Michael
Date: Wed, 5 Nov 2025 21:31:16 +0100
Subject: [PATCH] Add Isolation Forest training notebook and evaluation tools

---
 .../IsolationForest/iforest_training.ipynb | 235 ++++++++++++++++++
 model_training/tools/evaluation_tools.py   |  24 +-
 2 files changed, 258 insertions(+), 1 deletion(-)

diff --git a/model_training/IsolationForest/iforest_training.ipynb b/model_training/IsolationForest/iforest_training.ipynb
index e69de29..3fbc561 100644
--- a/model_training/IsolationForest/iforest_training.ipynb
+++ b/model_training/IsolationForest/iforest_training.ipynb
@@ -0,0 +1,235 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "74bda07c",
+   "metadata": {},
+   "source": [
+    "Imports"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a265b02c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from sklearn.ensemble import IsolationForest\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from pathlib import Path\n",
+    "import sys\n",
+    "import os\n",
+    "from sklearn.metrics import classification_report\n",
+    "\n",
+    "base_dir = os.path.abspath(os.path.join(os.getcwd(), \"..\"))\n",
+    "sys.path.append(base_dir)\n",
+    "print(base_dir)\n",
+    "\n",
+    "from tools import evaluation_tools"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "092835af",
+   "metadata": {},
+   "source": [
+    "Load dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dc788eb4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data_path = Path(r\"C:\\Users\\micha\\FAUbox\\WS2526_Fahrsimulator_MSY (Celina Korzer)\\AU_dataset\\output_windowed.parquet\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a4d39a6e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.read_parquet(path=data_path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a8f36561",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "subjects = df['subjectID'].unique()\n",
+    "print(subjects)\n",
+    "print(len(subjects))\n",
+    "print(len(subjects)*0.66)\n",
+    "print(len(subjects)*0.33)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a46e47dd",
+   "metadata": {},
+   "source": [
+    "Split of subjects into train/test and labeling of low vs. high load"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3fb6dc4b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "low_all = df[\n",
+    "    ((df[\"PHASE\"] == \"baseline\") |\n",
+    "    ((df[\"STUDY\"] == \"n-back\") & (df[\"PHASE\"] != \"baseline\") & (df[\"LEVEL\"].isin([1,4]))))\n",
+    "]\n",
+    "print(f\"low all: {low_all.shape}\")\n",
+    "\n",
+    "high_nback = df[\n",
+    "    (df[\"STUDY\"]==\"n-back\") &\n",
+    "    (df[\"LEVEL\"].isin([2, 3, 5, 6])) &\n",
+    "    (df[\"PHASE\"].isin([\"train\", \"test\"]))\n",
+    "]\n",
+    "print(f\"high n-back: {high_nback.shape}\")\n",
+    "high_kdrive = df[\n",
+    "    (df[\"STUDY\"] == \"k-drive\") & (df[\"PHASE\"] != \"baseline\")\n",
+    "]\n",
+    "print(f\"high k-drive: {high_kdrive.shape}\")\n",
+    "\n",
+    "high_all = pd.concat([high_nback, high_kdrive])\n",
+    "print(f\"high all: {high_all.shape}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "20394aca",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_subjects, test_subjects = train_test_split(\n",
+    "    subjects, \n",
+    "    train_size=12, \n",
+    "    test_size=6, \n",
+    "    random_state=42\n",
+    ")\n",
+    "\n",
+    "# Get all column names that start with 'AU'\n",
+    "au_columns = [col for col in low_all.columns if col.startswith('AU')]\n",
+    "\n",
+    "# Create train set: only normal samples from train subjects, only AU columns\n",
+    "X_train = low_all[low_all['subjectID'].isin(train_subjects)][au_columns].copy()\n",
+    "y_train = np.ones(len(X_train)) # Label 1 for normal samples\n",
+    "\n",
+    "# Create test set: both normal and high load from test subjects, only AU columns\n",
+    "X_test_normal = low_all[low_all['subjectID'].isin(test_subjects)][au_columns].copy()\n",
+    "X_test_high = high_all[high_all['subjectID'].isin(test_subjects)][au_columns].copy()\n",
+    "\n",
+    "# Combine test sets\n",
+    "X_test = pd.concat([X_test_normal, X_test_high], ignore_index=True)\n",
+    "\n",
+    "# Create labels for test set\n",
+    "y_test_normal = np.ones(len(X_test_normal)) # 1 for normal\n",
+    "y_test_high = -np.ones(len(X_test_high)) # -1 for anomalies\n",
+    "y_test = np.concatenate([y_test_normal, y_test_high])\n",
+    "\n",
+    "\n",
+    "print(f\"Number of AU features: {len(au_columns)}\")\n",
+    "print(f\"AU columns: {au_columns}\")\n",
+    "print(f\"\\nTrain set: {len(X_train)} normal samples\")\n",
+    "print(f\"Test set: {len(X_test_normal)} normal + {len(X_test_high)} high load = {len(X_test)} total samples\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "697b3cf7",
+   "metadata": {},
+   "source": [
+    "Training of Isolation Forest"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b5cd4ac6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "iforest = IsolationForest(random_state=42)\n",
+    "iforest.fit(X_train)\n",
+    "iforest_scores = iforest.score_samples(X_test)\n",
+    "iforest_predictions = iforest.predict(X_test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "15c45f66",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "evaluation_tools.plot_confusion_matrix(true_labels=y_test, predictions=iforest_predictions, label_names=['high load', 'low load'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "326fcb47",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "evaluation_tools.plot_roc_curve_IF(y_test, iforest_scores)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "141267e4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "iforest.offset_"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4bf81d7b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(classification_report(y_test, iforest_predictions))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/model_training/tools/evaluation_tools.py b/model_training/tools/evaluation_tools.py
index 386860a..7fe8103 100644
--- a/model_training/tools/evaluation_tools.py
+++ b/model_training/tools/evaluation_tools.py
@@ -1 +1,23 @@
-import pandas as pd
\ No newline at end of file
+import pandas as pd
+from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc
+import matplotlib.pyplot as plt
+
+def plot_confusion_matrix(true_labels, predictions, label_names):
+    for normalize in [None, 'true']:
+        cm = confusion_matrix(true_labels, predictions, normalize=normalize)
+        cm_disp = ConfusionMatrixDisplay(cm, display_labels=label_names)
+        cm_disp.plot(cmap="Blues")
+
+
+def plot_roc_curve_IF(true_labels, scores):
+    fpr, tpr, thr = roc_curve(true_labels, -scores, pos_label=-1)
+    auc_score = auc(fpr, tpr)
+
+    plt.figure()
+    plt.plot(fpr, tpr, '-')
+    plt.text(0.5, 0.5, f'AUC: {auc_score:.4f}')
+    plt.xlabel('False positive rate')
+    plt.ylabel('True positive rate')
+    plt.show()
+
+    
\ No newline at end of file
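
Note on thresholding (an illustrative sketch, not part of the patch above): the notebook inspects iforest.offset_ but does not show how it enters the prediction rule. In scikit-learn, decision_function(X) equals score_samples(X) minus offset_, and predict returns -1 wherever that value is negative. The sketch below assumes the notebook's iforest, X_test and y_test are in scope; the names default_pred, best_thr and custom_pred are introduced here purely for illustration. It reproduces the default rule and shows one possible way to derive an alternative score threshold from the same ROC data that plot_roc_curve_IF computes, using Youden's J statistic.

import numpy as np
from sklearn.metrics import roc_curve

# Assumes iforest, X_test and y_test from the notebook are in scope.
scores = iforest.score_samples(X_test)  # higher score = more "normal"

# Default rule used by predict(): a sample is flagged as -1 (anomalous /
# high load) when score_samples - offset_ drops below zero.
default_pred = np.where(scores - iforest.offset_ < 0, -1, 1)
print(np.array_equal(default_pred, iforest.predict(X_test)))  # expected: True

# Alternative (illustrative only): pick the threshold that maximizes
# Youden's J on the ROC computed exactly as in plot_roc_curve_IF
# (anomaly = positive class, hence the sign flip on the scores).
fpr, tpr, thresholds = roc_curve(y_test, -scores, pos_label=-1)
best_thr = thresholds[np.argmax(tpr - fpr)]
custom_pred = np.where(-scores >= best_thr, -1, 1)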