iforest training

2025-11-05 21:31:16 +01:00 · 2025-11-05 21:31:16 +01:00 · fb29a1333e
commit fb29a1333e
parent 77dbc4a109
2 changed files with 258 additions and 1 deletions
--- a/model_training/IsolationForest/iforest_training.ipynb
+++ b/model_training/IsolationForest/iforest_training.ipynb
@ -0,0 +1,235 @@
 {
 "cells": [
  {
   "cell_type": "markdown",
   "id": "74bda07c",
   "metadata": {},
   "source": [
    "Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a265b02c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.ensemble import IsolationForest\n",
    "from sklearn.model_selection import train_test_split\n",
    "from pathlib import Path\n",
    "import sys\n",
    "import os\n",
    "from sklearn.metrics import classification_report\n",
    "\n",
    "base_dir = os.path.abspath(os.path.join(os.getcwd(), \"..\"))\n",
    "sys.path.append(base_dir)\n",
    "print(base_dir)\n",
    "\n",
    "from  tools import evaluation_tools"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "092835af",
   "metadata": {},
   "source": [
    "Load dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dc788eb4",
   "metadata": {},
   "outputs": [],
   "source": [
    "data_path = Path(r\"C:\\Users\\micha\\FAUbox\\WS2526_Fahrsimulator_MSY (Celina Korzer)\\AU_dataset\\output_windowed.parquet\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a4d39a6e",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_parquet(path=data_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a8f36561",
   "metadata": {},
   "outputs": [],
   "source": [
    "subjects = df['subjectID'].unique()\n",
    "print(subjects)\n",
    "print(len(subjects))\n",
    "print(len(subjects)*0.66)\n",
    "print(len(subjects)*0.33)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a46e47dd",
   "metadata": {},
   "source": [
    "Labeling of low-/high-load windows and subject-wise train/test split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3fb6dc4b",
   "metadata": {},
   "outputs": [],
   "source": [
    "low_all = df[\n",
    "    ((df[\"PHASE\"] == \"baseline\") |\n",
    "     ((df[\"STUDY\"] == \"n-back\") & (df[\"PHASE\"] != \"baseline\") & (df[\"LEVEL\"].isin([1,4]))))\n",
    "]\n",
    "print(f\"low all: {low_all.shape}\")\n",
    "\n",
    "high_nback = df[\n",
    "    (df[\"STUDY\"]==\"n-back\") &\n",
    "    (df[\"LEVEL\"].isin([2, 3, 5, 6])) &\n",
    "    (df[\"PHASE\"].isin([\"train\", \"test\"]))\n",
    "]\n",
    "print(f\"high n-back: {high_nback.shape}\")\n",
    "high_kdrive = df[\n",
    "    (df[\"STUDY\"] == \"k-drive\") & (df[\"PHASE\"] != \"baseline\")\n",
    "]\n",
    "print(f\"high k-drive: {high_kdrive.shape}\")\n",
    "\n",
    "high_all = pd.concat([high_nback, high_kdrive])\n",
    "print(f\"high all: {high_all.shape}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "20394aca",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_subjects, test_subjects = train_test_split(\n",
    "    subjects, \n",
    "    train_size=12, \n",
    "    test_size=6, \n",
    "    random_state=42\n",
    ")\n",
    "\n",
    "# Get all column names that start with 'AU'\n",
    "au_columns = [col for col in low_all.columns if col.startswith('AU')]\n",
    "\n",
    "# Create train set: only normal samples from train subjects, only AU columns\n",
    "X_train = low_all[low_all['subjectID'].isin(train_subjects)][au_columns].copy()\n",
    "y_train = np.ones(len(X_train))  # Label 1 for normal samples\n",
    "\n",
    "# Create test set: both normal and high load from test subjects, only AU columns\n",
    "X_test_normal = low_all[low_all['subjectID'].isin(test_subjects)][au_columns].copy()\n",
    "X_test_high = high_all[high_all['subjectID'].isin(test_subjects)][au_columns].copy()\n",
    "\n",
    "# Combine test sets\n",
    "X_test = pd.concat([X_test_normal, X_test_high], ignore_index=True)\n",
    "\n",
    "# Create labels for test set\n",
    "y_test_normal = np.ones(len(X_test_normal))  # 1 for normal\n",
    "y_test_high = -np.ones(len(X_test_high))     # -1 for anomalies\n",
    "y_test = np.concatenate([y_test_normal, y_test_high])\n",
    "\n",
    "\n",
    "print(f\"Number of AU features: {len(au_columns)}\")\n",
    "print(f\"AU columns: {au_columns}\")\n",
    "print(f\"\\nTrain set: {len(X_train)} normal samples\")\n",
    "print(f\"Test set: {len(X_test_normal)} normal + {len(X_test_high)} high load = {len(X_test)} total samples\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "697b3cf7",
   "metadata": {},
   "source": [
    "Training of Isolation Forest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b5cd4ac6",
   "metadata": {},
   "outputs": [],
   "source": [
    "iforest = IsolationForest(random_state=42)\n",
    "iforest.fit(X_train)\n",
    "iforest_scores = iforest.score_samples(X_test)\n",
    "iforest_predictions = iforest.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "15c45f66",
   "metadata": {},
   "outputs": [],
   "source": [
    "evaluation_tools.plot_confusion_matrix(true_labels=y_test, predictions=iforest_predictions, label_names=['high load', 'low load'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "326fcb47",
   "metadata": {},
   "outputs": [],
   "source": [
    "evaluation_tools.plot_roc_curve_IF(y_test, iforest_scores)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "141267e4",
   "metadata": {},
   "outputs": [],
   "source": [
    "iforest.offset_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4bf81d7b",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(classification_report(y_test, iforest_predictions))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
--- a/model_training/tools/evaluation_tools.py
+++ b/model_training/tools/evaluation_tools.py
@ -1 +1,23 @@
-import pandas as pd
+import pandas as pd
 from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc
 import matplotlib.pyplot as plt
 def plot_confusion_matrix(true_labels, predictions, label_names):
     """Draw the confusion matrix twice: absolute counts, then normalized.

     Args:
         true_labels: Ground-truth class labels.
         predictions: Predicted class labels, aligned with ``true_labels``.
         label_names: Names forwarded to ``ConfusionMatrixDisplay`` as
             ``display_labels``.
     """
     # First pass (None) shows raw counts; second pass ('true') asks
     # scikit-learn to normalize with respect to the true labels.
     for norm_mode in (None, 'true'):
         matrix = confusion_matrix(true_labels, predictions, normalize=norm_mode)
         ConfusionMatrixDisplay(
             confusion_matrix=matrix,
             display_labels=label_names,
         ).plot(cmap="Blues")
 def plot_roc_curve_IF(true_labels, scores):
     """Plot the ROC curve for anomaly scores and annotate it with the AUC.

     The label ``-1`` is treated as the positive class. The scores are
     negated before being handed to ``roc_curve``, so a *lower* input score
     ranks a sample as *more* likely to belong to the ``-1`` class (e.g. the
     output of ``IsolationForest.score_samples``).

     Args:
         true_labels: Ground-truth labels; ``-1`` marks the positive class.
         scores: Array-like of per-sample scores (list, ndarray, Series, ...),
             aligned with ``true_labels``.
     """
     # Function-scope import: this module does not import numpy at top level.
     import numpy as np

     # Coerce first so that plain Python lists work too; unary minus is not
     # defined for ``list`` and would raise TypeError.
     negated_scores = -np.asarray(scores)

     fpr, tpr, _thresholds = roc_curve(true_labels, negated_scores, pos_label=-1)
     auc_score = auc(fpr, tpr)

     plt.figure()
     plt.plot(fpr, tpr, '-')
     # AUC annotation is placed in data coordinates at the plot centre.
     plt.text(0.5, 0.5, f'AUC: {auc_score:.4f}')
     plt.xlabel('False positive rate')
     plt.ylabel('True positive rate')
     plt.show()