iforest training
This commit is contained in:
parent
77dbc4a109
commit
fb29a1333e
@ -0,0 +1,235 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "74bda07c",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Imports"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a265b02c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"from sklearn.ensemble import IsolationForest\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"from pathlib import Path\n",
|
||||
"import sys\n",
|
||||
"import os\n",
|
||||
"from sklearn.metrics import classification_report\n",
|
||||
"\n",
|
||||
"base_dir = os.path.abspath(os.path.join(os.getcwd(), \"..\"))\n",
|
||||
"sys.path.append(base_dir)\n",
|
||||
"print(base_dir)\n",
|
||||
"\n",
|
||||
"from tools import evaluation_tools"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "092835af",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Load dataset"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "dc788eb4",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data_path = Path(r\"C:\\Users\\micha\\FAUbox\\WS2526_Fahrsimulator_MSY (Celina Korzer)\\AU_dataset\\output_windowed.parquet\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a4d39a6e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df = pd.read_parquet(path=data_path)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a8f36561",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"subjects = df['subjectID'].unique()\n",
|
||||
"print(subjects)\n",
|
||||
"print(len(subjects))\n",
|
||||
"print(len(subjects)*0.66)\n",
|
||||
"print(len(subjects)*0.33)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a46e47dd",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Split of subjects and labeling "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3fb6dc4b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"low_all = df[\n",
|
||||
" ((df[\"PHASE\"] == \"baseline\") |\n",
|
||||
" ((df[\"STUDY\"] == \"n-back\") & (df[\"PHASE\"] != \"baseline\") & (df[\"LEVEL\"].isin([1,4]))))\n",
|
||||
"]\n",
|
||||
"print(f\"low all: {low_all.shape}\")\n",
|
||||
"\n",
|
||||
"high_nback = df[\n",
|
||||
" (df[\"STUDY\"]==\"n-back\") &\n",
|
||||
" (df[\"LEVEL\"].isin([2, 3, 5, 6])) &\n",
|
||||
" (df[\"PHASE\"].isin([\"train\", \"test\"]))\n",
|
||||
"]\n",
|
||||
"print(f\"high n-back: {high_nback.shape}\")\n",
|
||||
"high_kdrive = df[\n",
|
||||
" (df[\"STUDY\"] == \"k-drive\") & (df[\"PHASE\"] != \"baseline\")\n",
|
||||
"]\n",
|
||||
"print(f\"high k-drive: {high_kdrive.shape}\")\n",
|
||||
"\n",
|
||||
"high_all = pd.concat([high_nback, high_kdrive])\n",
|
||||
"print(f\"high all: {high_all.shape}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "20394aca",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train_subjects, test_subjects = train_test_split(\n",
|
||||
" subjects, \n",
|
||||
" train_size=12, \n",
|
||||
" test_size=6, \n",
|
||||
" random_state=42\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Get all column names that start with 'AU'\n",
|
||||
"au_columns = [col for col in low_all.columns if col.startswith('AU')]\n",
|
||||
"\n",
|
||||
"# Create train set: only normal samples from train subjects, only AU columns\n",
|
||||
"X_train = low_all[low_all['subjectID'].isin(train_subjects)][au_columns].copy()\n",
|
||||
"y_train = np.ones(len(X_train)) # Label 1 for normal samples\n",
|
||||
"\n",
|
||||
"# Create test set: both normal and high load from test subjects, only AU columns\n",
|
||||
"X_test_normal = low_all[low_all['subjectID'].isin(test_subjects)][au_columns].copy()\n",
|
||||
"X_test_high = high_all[high_all['subjectID'].isin(test_subjects)][au_columns].copy()\n",
|
||||
"\n",
|
||||
"# Combine test sets\n",
|
||||
"X_test = pd.concat([X_test_normal, X_test_high], ignore_index=True)\n",
|
||||
"\n",
|
||||
"# Create labels for test set\n",
|
||||
"y_test_normal = np.ones(len(X_test_normal)) # 1 for normal\n",
|
||||
"y_test_high = -np.ones(len(X_test_high)) # -1 for anomalies\n",
|
||||
"y_test = np.concatenate([y_test_normal, y_test_high])\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"print(f\"Number of AU features: {len(au_columns)}\")\n",
|
||||
"print(f\"AU columns: {au_columns}\")\n",
|
||||
"print(f\"\\nTrain set: {len(X_train)} normal samples\")\n",
|
||||
"print(f\"Test set: {len(X_test_normal)} normal + {len(X_test_high)} high load = {len(X_test)} total samples\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "697b3cf7",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Training of Isolation Forest"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b5cd4ac6",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"iforest = IsolationForest(random_state=42)\n",
|
||||
"iforest.fit(X_train)\n",
|
||||
"iforest_scores = iforest.score_samples(X_test)\n",
|
||||
"iforest_predictions = iforest.predict(X_test)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "15c45f66",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"evaluation_tools.plot_confusion_matrix(true_labels=y_test, predictions=iforest_predictions, label_names=['high load', 'low load'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "326fcb47",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"evaluation_tools.plot_roc_curve_IF(y_test, iforest_scores)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "141267e4",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"iforest.offset_"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4bf81d7b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(classification_report(y_test, iforest_predictions))"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "base",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@ -1 +1,23 @@
|
||||
import pandas as pd
|
||||
import pandas as pd
|
||||
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
def plot_confusion_matrix(true_labels, predictions, label_names, labels=None):
    """Plot the confusion matrix twice: as raw counts and normalized per true class.

    Args:
        true_labels: Ground-truth class labels.
        predictions: Predicted class labels, aligned with ``true_labels``.
        label_names: Tick labels for the classes, in the same order as the
            rows/columns of the matrix.
        labels: Optional list of label values that fixes the row/column order
            (and keeps a class in the matrix even if it does not occur in the
            data). With the default ``None`` the call behaves as before and
            scikit-learn derives the order from the labels present in the
            data (sorted, per the ``confusion_matrix`` documentation).
    """
    # One figure per normalization mode; the title is what tells them apart.
    figure_titles = {
        None: "Confusion matrix (counts)",
        "true": "Confusion matrix (normalized per true class)",
    }
    for normalize, title in figure_titles.items():
        cm = confusion_matrix(
            true_labels, predictions, labels=labels, normalize=normalize
        )
        cm_disp = ConfusionMatrixDisplay(cm, display_labels=label_names)
        cm_disp.plot(cmap="Blues")
        # ``ax_`` is populated by ``plot()``.
        cm_disp.ax_.set_title(title)
|
||||
|
||||
|
||||
def plot_roc_curve_IF(true_labels, scores):
    """Plot the ROC curve and its AUC, treating label ``-1`` as the positive class.

    The scores are negated before being handed to ``roc_curve`` so that a
    *lower* input score ranks a sample as *more* likely to be the positive
    (``-1``) class. This matches scores such as those returned by
    ``IsolationForest.score_samples``, which is what the accompanying notebook
    passes in.

    Args:
        true_labels: Ground-truth labels; ``-1`` marks the positive class.
        scores: Array-like of per-sample scores (list, tuple, pandas Series or
            numpy array), aligned with ``true_labels``.
    """
    # Local import: numpy is not among this module's visible file-level imports.
    import numpy as np

    # Convert first so that plain sequences can be negated, and cast to float
    # so that unsigned integer input cannot wrap around on negation.
    negated_scores = -np.asarray(scores, dtype=float)

    # Thresholds are not needed for the plot.
    fpr, tpr, _ = roc_curve(true_labels, negated_scores, pos_label=-1)
    auc_score = auc(fpr, tpr)

    plt.figure()
    plt.plot(fpr, tpr, '-')
    plt.text(0.5, 0.5, f'AUC: {auc_score:.4f}')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.show()
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user