iforest training

This commit is contained in:
Michael Weig 2025-11-05 21:31:16 +01:00
parent 77dbc4a109
commit fb29a1333e
2 changed files with 258 additions and 1 deletions

View File

@ -0,0 +1,235 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "74bda07c",
"metadata": {},
"source": [
"Imports"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a265b02c",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.ensemble import IsolationForest\n",
"from sklearn.model_selection import train_test_split\n",
"from pathlib import Path\n",
"import sys\n",
"import os\n",
"from sklearn.metrics import classification_report\n",
"\n",
"base_dir = os.path.abspath(os.path.join(os.getcwd(), \"..\"))\n",
"sys.path.append(base_dir)\n",
"print(base_dir)\n",
"\n",
"from tools import evaluation_tools"
]
},
{
"cell_type": "markdown",
"id": "092835af",
"metadata": {},
"source": [
"Load dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dc788eb4",
"metadata": {},
"outputs": [],
"source": [
"data_path = Path(r\"C:\\Users\\micha\\FAUbox\\WS2526_Fahrsimulator_MSY (Celina Korzer)\\AU_dataset\\output_windowed.parquet\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a4d39a6e",
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_parquet(path=data_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a8f36561",
"metadata": {},
"outputs": [],
"source": [
"subjects = df['subjectID'].unique()\n",
"print(subjects)\n",
"print(len(subjects))\n",
"print(len(subjects)*0.66)\n",
"print(len(subjects)*0.33)"
]
},
{
"cell_type": "markdown",
"id": "a46e47dd",
"metadata": {},
"source": [
"Split of subjects and labeling "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3fb6dc4b",
"metadata": {},
"outputs": [],
"source": [
"low_all = df[\n",
" ((df[\"PHASE\"] == \"baseline\") |\n",
" ((df[\"STUDY\"] == \"n-back\") & (df[\"PHASE\"] != \"baseline\") & (df[\"LEVEL\"].isin([1,4]))))\n",
"]\n",
"print(f\"low all: {low_all.shape}\")\n",
"\n",
"high_nback = df[\n",
" (df[\"STUDY\"]==\"n-back\") &\n",
" (df[\"LEVEL\"].isin([2, 3, 5, 6])) &\n",
" (df[\"PHASE\"].isin([\"train\", \"test\"]))\n",
"]\n",
"print(f\"high n-back: {high_nback.shape}\")\n",
"high_kdrive = df[\n",
" (df[\"STUDY\"] == \"k-drive\") & (df[\"PHASE\"] != \"baseline\")\n",
"]\n",
"print(f\"high k-drive: {high_kdrive.shape}\")\n",
"\n",
"high_all = pd.concat([high_nback, high_kdrive])\n",
"print(f\"high all: {high_all.shape}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "20394aca",
"metadata": {},
"outputs": [],
"source": [
"train_subjects, test_subjects = train_test_split(\n",
" subjects, \n",
" train_size=12, \n",
" test_size=6, \n",
" random_state=42\n",
")\n",
"\n",
"# Get all column names that start with 'AU'\n",
"au_columns = [col for col in low_all.columns if col.startswith('AU')]\n",
"\n",
"# Create train set: only normal samples from train subjects, only AU columns\n",
"X_train = low_all[low_all['subjectID'].isin(train_subjects)][au_columns].copy()\n",
"y_train = np.ones(len(X_train)) # Label 1 for normal samples\n",
"\n",
"# Create test set: both normal and high load from test subjects, only AU columns\n",
"X_test_normal = low_all[low_all['subjectID'].isin(test_subjects)][au_columns].copy()\n",
"X_test_high = high_all[high_all['subjectID'].isin(test_subjects)][au_columns].copy()\n",
"\n",
"# Combine test sets\n",
"X_test = pd.concat([X_test_normal, X_test_high], ignore_index=True)\n",
"\n",
"# Create labels for test set\n",
"y_test_normal = np.ones(len(X_test_normal)) # 1 for normal\n",
"y_test_high = -np.ones(len(X_test_high)) # -1 for anomalies\n",
"y_test = np.concatenate([y_test_normal, y_test_high])\n",
"\n",
"\n",
"print(f\"Number of AU features: {len(au_columns)}\")\n",
"print(f\"AU columns: {au_columns}\")\n",
"print(f\"\\nTrain set: {len(X_train)} normal samples\")\n",
"print(f\"Test set: {len(X_test_normal)} normal + {len(X_test_high)} high load = {len(X_test)} total samples\")\n"
]
},
{
"cell_type": "markdown",
"id": "697b3cf7",
"metadata": {},
"source": [
"Training of Isolation Forest"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b5cd4ac6",
"metadata": {},
"outputs": [],
"source": [
"iforest = IsolationForest(random_state=42)\n",
"iforest.fit(X_train)\n",
"iforest_scores = iforest.score_samples(X_test)\n",
"iforest_predictions = iforest.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "15c45f66",
"metadata": {},
"outputs": [],
"source": [
"evaluation_tools.plot_confusion_matrix(true_labels=y_test, predictions=iforest_predictions, label_names=['high load', 'low load'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "326fcb47",
"metadata": {},
"outputs": [],
"source": [
"evaluation_tools.plot_roc_curve_IF(y_test, iforest_scores)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "141267e4",
"metadata": {},
"outputs": [],
"source": [
"iforest.offset_"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4bf81d7b",
"metadata": {},
"outputs": [],
"source": [
"print(classification_report(y_test, iforest_predictions))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -1 +1,23 @@
import pandas as pd import pandas as pd
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc
import matplotlib.pyplot as plt
def plot_confusion_matrix(true_labels, predictions, label_names):
    """Plot the confusion matrix twice: raw counts and normalized per true class.

    Args:
        true_labels: Ground-truth class labels.
        predictions: Predicted class labels, aligned with ``true_labels``.
        label_names: Display names for the classes, in the order in which
            ``sklearn.metrics.confusion_matrix`` arranges them (sorted labels).
    """
    titles = {
        None: "Confusion matrix (counts)",
        'true': "Confusion matrix (normalized per true class)",
    }
    for normalize, title in titles.items():
        cm = confusion_matrix(true_labels, predictions, normalize=normalize)
        cm_disp = ConfusionMatrixDisplay(cm, display_labels=label_names)
        cm_disp.plot(cmap="Blues")
        # Without a title the two figures cannot be told apart at a glance.
        cm_disp.ax_.set_title(title)
def plot_roc_curve_IF(true_labels, scores):
    """Plot the ROC curve for Isolation Forest scores and annotate its AUC.

    Args:
        true_labels: Ground-truth labels in which -1 marks the positive
            (anomalous) class.
        scores: Array-like of scores where lower means more anomalous; they
            are negated before being passed to ``roc_curve``.
    """
    import numpy as np  # local import keeps this block self-contained

    # Negate so that higher values indicate the positive class (pos_label=-1).
    # np.asarray also accepts plain lists, on which unary minus would raise.
    anomaly_scores = -np.asarray(scores, dtype=float)
    fpr, tpr, _ = roc_curve(true_labels, anomaly_scores, pos_label=-1)
    auc_score = auc(fpr, tpr)

    plt.figure()
    plt.plot(fpr, tpr, '-')
    plt.text(0.5, 0.5, f'AUC: {auc_score:.4f}')
    plt.title('ROC curve')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.show()