From 58d92fb423bae9ff757ceb6f39f0fc8ab28f3b78 Mon Sep 17 00:00:00 2001 From: Michael Date: Wed, 19 Nov 2025 17:31:52 +0100 Subject: [PATCH] ae notebook --- model_training/OCSVM/ocsvm_with_AE.ipynb | 1080 ++++++++++++++++++++++ 1 file changed, 1080 insertions(+) create mode 100644 model_training/OCSVM/ocsvm_with_AE.ipynb diff --git a/model_training/OCSVM/ocsvm_with_AE.ipynb b/model_training/OCSVM/ocsvm_with_AE.ipynb new file mode 100644 index 0000000..f78a544 --- /dev/null +++ b/model_training/OCSVM/ocsvm_with_AE.ipynb @@ -0,0 +1,1080 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "4ac35e21", + "metadata": {}, + "source": [ + "### Import" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "id": "87513def", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from pathlib import Path\n", + "import sys\n", + "import os\n", + "\n", + "base_dir = os.path.abspath(os.path.join(os.getcwd(), \"..\"))\n", + "sys.path.append(base_dir)\n", + "print(base_dir)\n", + "\n", + "# from tools import evaluation_tools\n", + "from sklearn.preprocessing import StandardScaler, MinMaxScaler\n", + "from sklearn.svm import OneClassSVM\n", + "from sklearn.model_selection import GridSearchCV, KFold, ParameterGrid, train_test_split\n", + "import matplotlib.pyplot as plt\n", + "import tensorflow as tf\n", + "import pickle\n", + "import evaluation_tools\n", + "from sklearn.metrics import (roc_auc_score, accuracy_score, precision_score, \n", + " recall_score, f1_score, confusion_matrix, classification_report) " + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "1c303823", + "metadata": {}, + "outputs": [], + "source": [ + "print(tf.__version__)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "46e29f47", + "metadata": {}, + "outputs": [], + "source": [ + "os.getcwd()" + ] + }, + { + "cell_type": "markdown", + "id": "83dc3a63", + "metadata": {}, + "source": [ + "### Load data" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "0eef0bc8", + "metadata": {}, + "outputs": [], + "source": [ + "# data_path = Path(r\"C:\\Users\\micha\\FAUbox\\WS2526_Fahrsimulator_MSY (Celina Korzer)\\AU_dataset\\output_windowed.parquet\")\n", + "data_path = Path(r\"output_windowed.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "597880bb", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_parquet(path=data_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "fa0f0fec", + "metadata": {}, + "outputs": [], + "source": [ + "subjects = df['subjectID'].unique()\n", + "print(subjects)\n", + "print(len(subjects))\n", + "print(len(subjects)*0.66)\n", + "print(len(subjects)*0.33)" + ] + }, + { + "cell_type": "markdown", + "id": "e80506dc", + "metadata": {}, + "source": [ + "### Preprocessing" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "897a2342", + "metadata": {}, + "outputs": [], + "source": [ + "low_all = df[\n", + " ((df[\"PHASE\"] == \"baseline\") |\n", + " ((df[\"STUDY\"] == \"n-back\") & (df[\"PHASE\"] != \"baseline\") & (df[\"LEVEL\"].isin([1,4]))))\n", + "]\n", + "print(f\"low all: {low_all.shape}\")\n", + "\n", + "high_nback = df[\n", + " (df[\"STUDY\"]==\"n-back\") &\n", + " (df[\"LEVEL\"].isin([2, 3, 5, 6])) &\n", + " (df[\"PHASE\"].isin([\"train\", \"test\"]))\n", + "]\n", + "print(f\"high n-back: {high_nback.shape}\")\n", + "high_kdrive = df[\n", + " (df[\"STUDY\"] == \"k-drive\") & (df[\"PHASE\"] != 
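\"baseline\")\n", + "]\n", + "print(f\"high k-drive: {high_kdrive.shape}\")\n", + "\n", + "high_all = pd.concat([high_nback, high_kdrive])\n", + "print(f\"high all: {high_all.shape}\")" + ] + }, + { + "cell_type": "markdown", + "id": "b7e1c2a0", + "metadata": {}, + "source": [ + "Quick sanity check (sketch): inspect the composition of the low/high pools before splitting by subject, using the same `STUDY`/`PHASE`/`LEVEL` columns as the filters above." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b7e1c2a1", + "metadata": {}, + "outputs": [], + "source": [ + "# Sketch: composition of the low/high pools (same columns as the filters above)\n", + "print(low_all.groupby(['STUDY', 'PHASE']).size())\n", + "print(high_all.groupby(['STUDY', 'LEVEL']).size())" + ] + },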
\"baseline\")\n", + "]\n", + "print(f\"high k-drive: {high_kdrive.shape}\")\n", + "\n", + "high_all = pd.concat([high_nback, high_kdrive])\n", + "print(f\"high all: {high_all.shape}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "f6f59455", + "metadata": {}, + "outputs": [], + "source": [ + "# First split: separate test set\n", + "train_val_subjects, test_subjects = train_test_split(\n", + " subjects, \n", + " train_size=14, \n", + " test_size=4, \n", + " random_state=42\n", + ")\n", + "\n", + "# Second split: separate train and validation from the remaining subjects\n", + "# Adjust these numbers based on your total subject count\n", + "train_subjects, val_subjects = train_test_split(\n", + " train_val_subjects,\n", + " train_size=8,\n", + " test_size=6,\n", + " random_state=42\n", + ")\n", + "\n", + "print(f\"Train subjects: {len(train_subjects)}\")\n", + "print(f\"Validation subjects: {len(val_subjects)}\")\n", + "print(f\"Test subjects: {len(test_subjects)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "f215beb5", + "metadata": {}, + "outputs": [], + "source": [ + "au_columns = [col for col in low_all.columns if col.startswith('AU')]\n", + "\n", + "# Prepare training data (only normal/low data)\n", + "train_data = low_all[low_all['subjectID'].isin(train_subjects)][['subjectID'] + au_columns].copy()\n", + "\n", + "# Prepare validation data (normal and anomaly) - 500 samples each\n", + "val_normal_data = low_all[low_all['subjectID'].isin(val_subjects)][['subjectID'] + au_columns].copy()\n", + "val_high_data = high_all[high_all['subjectID'].isin(val_subjects)][['subjectID'] + au_columns].copy()\n", + "val_normal_data = val_normal_data.sample(n=1000, random_state=42)\n", + "val_high_data = val_high_data.sample(n=1000, random_state=42)\n", + "\n", + "# Prepare test data (normal and anomaly) - 1000 samples each\n", + "test_normal_data = low_all[low_all['subjectID'].isin(test_subjects)][['subjectID'] + au_columns].copy()\n", + "test_high_data = high_all[high_all['subjectID'].isin(test_subjects)][['subjectID'] + au_columns].copy()\n", + "test_normal_data = test_normal_data.sample(n=500, random_state=42)\n", + "test_high_data = test_high_data.sample(n=500, random_state=42)\n", + "\n", + "print(f\"Train samples: {len(train_data)}\")\n", + "print(f\"Val normal samples: {len(val_normal_data)}, Val high samples: {len(val_high_data)}\")\n", + "print(f\"Test normal samples: {len(test_normal_data)}, Test high samples: {len(test_high_data)}\")\n", + "\n", + "# Track subject split\n", + "print(f\"\\nSubject split:\")\n", + "print(f\"Train subjects ({len(train_subjects)}): {sorted(train_subjects)}\")\n", + "print(f\"Val subjects ({len(val_subjects)}): {sorted(val_subjects)}\")\n", + "print(f\"Test subjects ({len(test_subjects)}): {sorted(test_subjects)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "d3ebae31", + "metadata": {}, + "outputs": [], + "source": [ + "def fit_normalizer(train_data, au_columns, method='standard', scope='global'):\n", + " \"\"\"\n", + " Fit normalization scalers on training data.\n", + " \n", + " Parameters:\n", + " -----------\n", + " train_data : pd.DataFrame\n", + " Training dataframe with AU columns and subjectID\n", + " au_columns : list\n", + " List of AU column names to normalize\n", + " method : str, default='standard'\n", + " Normalization method: 'standard' for StandardScaler or 'minmax' for MinMaxScaler\n", + " scope : str, default='global'\n", + " Normalization scope: 'subject' for 
per-subject or 'global' for across all subjects\n", + " \n", + " Returns:\n", + " --------\n", + " dict\n", + " Dictionary containing fitted scalers\n", + " \"\"\"\n", + " # Select scaler based on method\n", + " if method == 'standard':\n", + " Scaler = StandardScaler\n", + " elif method == 'minmax':\n", + " Scaler = MinMaxScaler\n", + " else:\n", + " raise ValueError(\"method must be 'standard' or 'minmax'\")\n", + " \n", + " scalers = {}\n", + " \n", + " if scope == 'subject':\n", + " # Fit one scaler per subject\n", + " for subject in train_data['subjectID'].unique():\n", + " subject_mask = train_data['subjectID'] == subject\n", + " scaler = Scaler()\n", + " scaler.fit(train_data.loc[subject_mask, au_columns])\n", + " scalers[subject] = scaler\n", + " \n", + " elif scope == 'global':\n", + " # Fit one scaler for all subjects\n", + " scaler = Scaler()\n", + " scaler.fit(train_data[au_columns])\n", + " scalers['global'] = scaler\n", + " \n", + " else:\n", + " raise ValueError(\"scope must be 'subject' or 'global'\")\n", + " \n", + " return {'scalers': scalers, 'method': method, 'scope': scope}" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "f586a99b", + "metadata": {}, + "outputs": [], + "source": [ + "def apply_normalizer(data, au_columns, normalizer_dict):\n", + " \"\"\"\n", + " Apply fitted normalization scalers to data.\n", + " \n", + " Parameters:\n", + " -----------\n", + " data : pd.DataFrame\n", + " Dataframe with AU columns and subjectID\n", + " au_columns : list\n", + " List of AU column names to normalize\n", + " normalizer_dict : dict\n", + " Dictionary containing fitted scalers from fit_normalizer()\n", + " \n", + " Returns:\n", + " --------\n", + " pd.DataFrame\n", + " DataFrame with normalized AU columns\n", + " \"\"\"\n", + " normalized_data = data.copy()\n", + " scalers = normalizer_dict['scalers']\n", + " scope = normalizer_dict['scope']\n", + " \n", + " if scope == 'subject':\n", + " # Apply per-subject normalization\n", + " for subject in data['subjectID'].unique():\n", + " subject_mask = data['subjectID'] == subject\n", + " \n", + " # Use the subject's scaler if available, otherwise use a fitted scaler from training\n", + " if subject in scalers:\n", + " scaler = scalers[subject]\n", + " else:\n", + " # For new subjects not seen in training, use the first available scaler\n", + " # (This is a fallback - ideally all test subjects should be in training for subject-level normalization)\n", + " print(f\"Warning: Subject {subject} not found in training data. 
Using fallback scaler.\")\n", + " scaler = list(scalers.values())[0]\n", + " \n", + " normalized_data.loc[subject_mask, au_columns] = scaler.transform(\n", + " data.loc[subject_mask, au_columns]\n", + " )\n", + " \n", + " elif scope == 'global':\n", + " # Apply global normalization\n", + " scaler = scalers['global']\n", + " normalized_data[au_columns] = scaler.transform(data[au_columns])\n", + " \n", + " return normalized_data" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "74e363aa", + "metadata": {}, + "outputs": [], + "source": [ + "# Cell 3: Fit normalizer on training data\n", + "normalizer = fit_normalizer(train_data, au_columns, method='standard', scope='global')\n", + "print(\"Normalizer fitted on training data\")" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "5dd6c4b7", + "metadata": {}, + "outputs": [], + "source": [ + "# Cell 4: Apply normalization to all datasets\n", + "train_normalized = apply_normalizer(train_data, au_columns, normalizer)\n", + "val_normal_normalized = apply_normalizer(val_normal_data, au_columns, normalizer)\n", + "val_high_normalized = apply_normalizer(val_high_data, au_columns, normalizer)\n", + "test_normal_normalized = apply_normalizer(test_normal_data, au_columns, normalizer)\n", + "test_high_normalized = apply_normalizer(test_high_data, au_columns, normalizer)\n", + "\n", + "print(\"Normalization applied to all datasets\")" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "id": "624a5374", + "metadata": {}, + "outputs": [], + "source": [ + "# Cell 5: Extract AU columns and create labels for grid search\n", + "# Extract only AU columns (drop subjectID)\n", + "X_train = train_normalized[au_columns].copy()\n", + "X_val_normal = val_normal_normalized[au_columns].copy()\n", + "X_val_high = val_high_normalized[au_columns].copy()\n", + "X_test_high = test_normal_normalized[au_columns].copy()\n", + "X_test_normal = test_high_normalized[au_columns].copy()\n", + "# Combine train and validation sets for grid search\n", + "# X_grid_search = pd.concat([X_train, X_val_normal, X_val_high], ignore_index=True)\n", + "\n", + "# Create labels for grid search\n", + "y_train = np.ones(len(X_train)) # 1 for normal (training)\n", + "y_val_normal = np.ones(len(X_val_normal)) # 1 for normal (validation)\n", + "y_val_high = -np.ones(len(X_val_high)) # -1 for anomalies (validation)\n", + "y_test_normal = np.ones(len(X_test_normal))\n", + "y_test_high = -np.ones(len(X_test_high))\n", + "\n", + "# y_grid_search = np.concatenate([y_train, y_val_normal, y_val_high])\n", + "\n", + "# print(f\"Grid search data shape: {X_grid_search.shape}\")\n", + "# print(f\"Labels distribution: Normal={np.sum(y_grid_search==1)}, Anomaly={np.sum(y_grid_search==-1)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "9bc3f4d3", + "metadata": {}, + "outputs": [], + "source": [ + "X_train.shape" + ] + }, + { + "cell_type": "markdown", + "id": "362c0a6f", + "metadata": {}, + "source": [ + "AE training" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "id": "da6eab4f", + "metadata": {}, + "outputs": [], + "source": [ + "reg = 0.0001\n", + "encoder = tf.keras.Sequential(\n", + " [\n", + " tf.keras.layers.Dense(units=X_train.shape[1], activation='relu', kernel_regularizer=tf.keras.regularizers.l2(reg)),\n", + " tf.keras.layers.Dense(units=10, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(reg)),\n", + " tf.keras.layers.Dense(units=5, activation='relu', 
kernel_regularizer=tf.keras.regularizers.l2(reg))\n", + " \n", + " ]\n", + ")\n", + "\n", + "decoder = tf.keras.Sequential(\n", + " [\n", + " tf.keras.layers.Dense(units=5,activation='relu', kernel_regularizer=tf.keras.regularizers.l2(reg)),\n", + " tf.keras.layers.Dense(units=10, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(reg)),\n", + " tf.keras.layers.Dense(units=X_train.shape[1], activation='linear', kernel_regularizer=tf.keras.regularizers.l2(reg))\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "id": "d60c566a", + "metadata": {}, + "outputs": [], + "source": [ + "autoencoder = tf.keras.Sequential([encoder, decoder])" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "id": "1bb4321d", + "metadata": {}, + "outputs": [], + "source": [ + "autoencoder.compile(\n", + " optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),\n", + " loss='mse',\n", + " \n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "id": "ffb09ce6", + "metadata": {}, + "outputs": [], + "source": [ + "history = autoencoder.fit(\n", + " X_train, X_train, # Input and target are the same for autoencoder\n", + " epochs=500,\n", + " batch_size=64,\n", + " validation_data=(X_val_normal, X_val_normal),\n", + " verbose=1\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "211e4720", + "metadata": {}, + "source": [ + "save of encoder" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "id": "096b858c", + "metadata": {}, + "outputs": [], + "source": [ + "# Save\n", + "encoder.save('encoder_model.keras')\n", + "\n", + "# Load later\n", + "encoder_loaded = tf.keras.models.load_model('encoder_model.keras')" + ] + }, + { + "cell_type": "markdown", + "id": "1da405de", + "metadata": {}, + "source": [ + "OCSVM Training with Encoder" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "id": "cd70f560", + "metadata": {}, + "outputs": [], + "source": [ + "X_train_encoded = encoder.predict(X_train)\n", + "X_val_normal_encoded = encoder.predict(X_val_normal)\n", + "X_val_high_encoded = encoder.predict(X_val_high)\n", + "X_test_normal_encoded = encoder.predict(X_test_normal)\n", + "X_test_high_encoded = encoder.predict(X_test_high)" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "id": "0a292d06", + "metadata": {}, + "outputs": [], + "source": [ + "ocsvm = OneClassSVM(kernel='rbf', gamma='auto', nu=0.1)\n", + "ocsvm.fit(X_train_encoded)\n", + "\n", + "# Predict on validation/test sets\n", + "val_predictions = ocsvm.predict(np.vstack([X_val_normal_encoded, X_val_high_encoded]))\n", + "test_predictions = ocsvm.predict(np.vstack([X_test_normal_encoded, X_test_high_encoded]))\n", + "test_predictions.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "id": "e1e08cc3", + "metadata": {}, + "outputs": [], + "source": [ + "print(classification_report(y_true=np.concatenate([y_test_normal, y_test_high]), y_pred=test_predictions))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "552828b7", + "metadata": {}, + "outputs": [], + "source": [ + "evaluation_tools.plot_confusion_matrix(true_labels=np.concatenate([y_test_normal, y_test_high]), predictions=test_predictions, label_names=['high', 'low'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9c9c78dd", + "metadata": {}, + "outputs": [], + "source": [ + "# Save\n", + "with open('ocsvm_model.pkl', 'wb') as f:\n", + " pickle.dump(ocsvm, f)\n", + "\n", + "# Load later\n", + "with open('ocsvm_model.pkl', 'rb') 
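as f:\n", + " ocsvm_loaded = pickle.load(f)" + ] + }, + { + "cell_type": "markdown", + "id": "e5f60718", + "metadata": {}, + "source": [ + "Sanity check (sketch): plot the autoencoder's training and validation reconstruction loss from the `history` object above; diverging curves would indicate overfitting to the low-workload windows." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e5f60719", + "metadata": {}, + "outputs": [], + "source": [ + "# Sketch: autoencoder convergence from the Keras History object\n", + "plt.figure(figsize=(8, 4))\n", + "plt.plot(history.history['loss'], label='train loss (MSE)')\n", + "plt.plot(history.history['val_loss'], label='val loss (MSE)')\n", + "plt.xlabel('Epoch')\n", + "plt.ylabel('Loss')\n", + "plt.legend()\n", + "plt.grid(True)\n", + "plt.show()" + ] + },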
+ { + "cell_type": "markdown", + "id": "dbad13e2", + "metadata": {}, + "source": [ + "### Complete grid search and evaluation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "45137e53", + "metadata": {}, + "outputs": [], + "source": [ + "def simple_one_class_grid_search(estimator, param_grid, X_train, X_val_normal, X_val_high):\n", + " \"\"\"\n", + " Grid search with fixed train/validation split for novelty detection.\n", + " \n", + " Parameters:\n", + " -----------\n", + " estimator : OneClassSVM\n", + " The OCSVM estimator\n", + " param_grid : dict\n", + " Parameter grid to search\n", + " X_train : array-like\n", + " Normal training samples only (encoded features)\n", + " X_val_normal : array-like \n", + " Normal validation samples (encoded features)\n", + " X_val_high : array-like\n", + " High/anomaly validation samples (encoded features)\n", + " \"\"\"\n", + " \n", + " # Combine validation data\n", + " X_val = np.concatenate([X_val_normal, X_val_high])\n", + " y_val = np.array([1] * len(X_val_normal) + [-1] * len(X_val_high))\n", + " \n", + " results = []\n", + " \n", + " for params in ParameterGrid(param_grid):\n", + " # Set parameters\n", + " estimator.set_params(**params)\n", + " \n", + " # Fit on training normal samples\n", + " estimator.fit(X_train)\n", + " \n", + " # Score on validation set\n", + " val_scores = estimator.score_samples(X_val)\n", + " roc_auc = roc_auc_score(y_val, val_scores)\n", + " \n", + " results.append({\n", + " 'params': params,\n", + " 'score': roc_auc\n", + " })\n", + " print(f\"Params {params}: ROC-AUC = {roc_auc:.4f}\")\n", + " \n", + " # Find best\n", + " best_idx = np.argmax([r['score'] for r in results])\n", + " best_params = results[best_idx]['params']\n", + " best_score = results[best_idx]['score']\n", + " \n", + " print(f\"\\nBest params: {best_params}\")\n", + " print(f\"Best ROC-AUC: {best_score:.4f}\")\n", + " \n", + " return best_params, best_score, results\n", + "\n", + "\n", + "def evaluate_ocsvm(ocsvm, X_normal, X_anomaly, set_name=\"Test\"):\n", + " \"\"\"\n", + " Evaluate OCSVM performance with comprehensive metrics.\n", + " \n", + " Parameters:\n", + " -----------\n", + " ocsvm : fitted OneClassSVM\n", + " Trained OCSVM model\n", + " X_normal : array-like\n", + " Normal samples\n", + " X_anomaly : array-like\n", + " Anomaly samples\n", + " set_name : str\n", + " Name of the dataset (e.g., \"Validation\", \"Test\")\n", + " \"\"\"\n", + " \n", + " # Combine data\n", + " X = np.vstack([X_normal, X_anomaly])\n", + " y_true = np.array([1] * len(X_normal) + [-1] * len(X_anomaly))\n", + " \n", + " # Predictions\n", + " y_pred = ocsvm.predict(X)\n", + " scores = ocsvm.score_samples(X)\n", + " \n", + " # Calculate metrics\n", + " roc_auc = roc_auc_score(y_true, scores)\n", + " accuracy = accuracy_score(y_true, y_pred)\n", + " \n", + " # For precision, recall, F1: treat normal (1) as positive class\n", + " precision = precision_score(y_true, y_pred, pos_label=1)\n", + " recall = recall_score(y_true, y_pred, pos_label=1)\n", + " f1 = f1_score(y_true, y_pred, pos_label=1)\n", + " \n", + " # Confusion matrix\n", + " cm = confusion_matrix(y_true, y_pred, labels=[1, -1])\n", + " tn, fp, fn, tp = cm[1, 1], cm[1, 0], cm[0, 1], cm[0, 0]\n", + " \n", + " # Print results\n", + " print(f\"\\n{'='*50}\")\n", + " print(f\"{set_name} Set Evaluation\")\n", + " print(f\"{'='*50}\")\n", + " print(f\"ROC-AUC Score: {roc_auc:.4f}\")\n", + " print(f\"Accuracy: {accuracy:.4f}\")\n", + " print(f\"Precision: 
{precision:.4f}\")\n", + " print(f\"Recall: {recall:.4f}\")\n", + " print(f\"F1-Score: {f1:.4f}\")\n", + " print(f\"\\nConfusion Matrix:\")\n", + " print(f\" Predicted\")\n", + " print(f\" Normal Anomaly\")\n", + " print(f\"Actual Normal {tp:6d} {fn:6d}\")\n", + " print(f\"Actual Anomaly {fp:6d} {tn:6d}\")\n", + " print(f\"\\nTrue Positives (Normal detected as Normal): {tp}\")\n", + " print(f\"False Negatives (Normal detected as Anomaly): {fn}\")\n", + " print(f\"False Positives (Anomaly detected as Normal): {fp}\")\n", + " print(f\"True Negatives (Anomaly detected as Anomaly): {tn}\")\n", + " \n", + " # Detailed classification report\n", + " print(f\"\\nClassification Report:\")\n", + " print(classification_report(y_true, y_pred, target_names=['Normal', 'Anomaly'], \n", + " labels=[1, -1]))\n", + " \n", + " return {\n", + " 'roc_auc': roc_auc,\n", + " 'accuracy': accuracy,\n", + " 'precision': precision,\n", + " 'recall': recall,\n", + " 'f1': f1,\n", + " 'confusion_matrix': cm,\n", + " 'predictions': y_pred,\n", + " 'scores': scores\n", + " }\n" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "id": "e54c2ebd", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Usage example:\n", + "# 1. Grid search\n", + "best_params, best_score, all_results = simple_one_class_grid_search(\n", + " estimator=OneClassSVM(),\n", + " param_grid={\n", + " 'nu': np.linspace(0.01, 0.3, 10),\n", + " 'gamma': [*np.logspace(-2, 2, 10).tolist(), 'scale', 'auto'],\n", + " 'kernel': ['rbf']\n", + " },\n", + " X_train=X_train_encoded,\n", + " X_val_normal=X_val_normal_encoded,\n", + " X_val_high=X_val_high_encoded\n", + ")\n", + "\n", + "# 2. Train final model with best params\n", + "final_ocsvm = OneClassSVM(**best_params)\n", + "final_ocsvm.fit(X_train_encoded)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "id": "f7637b60", + "metadata": {}, + "outputs": [], + "source": [ + "# 3. Evaluate on validation set\n", + "val_metrics = evaluate_ocsvm(\n", + " final_ocsvm, \n", + " X_val_normal_encoded, \n", + " X_val_high_encoded, \n", + " set_name=\"Validation\"\n", + ")\n", + "\n", + "# 4. 
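Evaluate on test set\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9576230f", + "metadata": {}, + "outputs": [], + "source": [ + "test_metrics = evaluate_ocsvm(\n", + " final_ocsvm, \n", + " X_test_normal_encoded, \n", + " X_test_high_encoded, \n", + " set_name=\"Test\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "f7a8b9c0", + "metadata": {}, + "source": [ + "How well do the encoded test scores separate? A sketch, relying on `evaluate_ocsvm` stacking normal samples before anomalies in its returned `scores`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f7a8b9c1", + "metadata": {}, + "outputs": [], + "source": [ + "# Sketch: distribution of OCSVM decision scores on the test set\n", + "# (scores are ordered normal-first, matching np.vstack in evaluate_ocsvm)\n", + "n_norm = len(X_test_normal_encoded)\n", + "scores = test_metrics['scores']\n", + "plt.figure(figsize=(8, 4))\n", + "plt.hist(scores[:n_norm], bins=50, alpha=0.6, label='normal (low)')\n", + "plt.hist(scores[n_norm:], bins=50, alpha=0.6, label='anomaly (high)')\n", + "plt.xlabel('score_samples')\n", + "plt.ylabel('count')\n", + "plt.legend()\n", + "plt.show()" + ] + },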
+ { + "cell_type": "markdown", + "id": "55109590", + "metadata": {}, + "source": [ + "## OneClass GridSearch (old)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "671b47a1", + "metadata": {}, + "outputs": [], + "source": [ + "def supervised_one_class_grid_search(estimator, param_grid, data, labels, seed=None):\n", + " np.random.seed(seed)\n", + " idx = np.arange(data.shape[0])\n", + " anomaly_idx = idx[labels==-1]\n", + " normal_idx = idx[labels!=-1]\n", + "\n", + " np.random.shuffle(normal_idx)\n", + "\n", + " # Each fold: fit on a normal-only split, validate on held-out normals plus all anomalies\n", + " cv = [(normal_idx[pair[0]], np.concatenate([normal_idx[pair[1]], anomaly_idx], axis=0)) for pair in KFold().split(normal_idx)]\n", + " print(f\"CV Folds created:\")\n", + " for i, (train_idx, val_idx) in enumerate(cv):\n", + " print(f\" Fold {i+1}: Train={len(train_idx)} normal, Val={len(val_idx)} total ({len(val_idx)-len(anomaly_idx)} normal + {len(anomaly_idx)} anomaly)\")\n", + " grid_search = GridSearchCV(estimator=estimator,\n", + " param_grid=param_grid,\n", + " scoring=lambda est, X, y: roc_auc_score(y_true=y, y_score=est.score_samples(X)),\n", + " n_jobs=-2,\n", + " cv=cv,\n", + " verbose=1,\n", + " refit=False)\n", + " \n", + " grid_search.fit(data, labels)\n", + "\n", + " return grid_search" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8fabf322", + "metadata": {}, + "outputs": [], + "source": [ + "# Note: requires X_grid_search / y_grid_search, currently commented out in 'Cell 5' above\n", + "estimator = OneClassSVM()\n", + "grid_search = supervised_one_class_grid_search(estimator=estimator,\n", + " param_grid={'nu': np.linspace(0.01, 0.2, 10),\n", + " 'gamma': [*np.logspace(-2, 2, 10).tolist(), 'scale']},\n", + " data=X_grid_search,\n", + " labels=y_grid_search,\n", + " seed=42)\n", + "print(f'Best results with {grid_search.best_params_}:')\n", + "print(f'Validation results: {grid_search.cv_results_[\"mean_test_score\"][grid_search.best_index_]} +- {grid_search.cv_results_[\"std_test_score\"][grid_search.best_index_]}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "651fc483", + "metadata": {}, + "outputs": [], + "source": [ + "xticks = [f'({param_dict[\"nu\"]:.3f}, {param_dict[\"gamma\"]})' for param_dict in grid_search.cv_results_['params']]\n", + "\n", + "plt.figure(figsize=(30,10))\n", + "plt.grid('on')\n", + "plt.title('Mean scores +- std.')\n", + "plt.ylabel('Mean ROC-AUC')\n", + "plt.xlabel(r'($\\nu$, $\\gamma$)')\n", + "plt.errorbar(np.arange(grid_search.cv_results_['mean_test_score'].shape[0]), grid_search.cv_results_['mean_test_score'], grid_search.cv_results_['std_test_score'])\n", + "plt.xticks(np.arange(grid_search.cv_results_['mean_test_score'].shape[0]), xticks, rotation='vertical')\n", + "plt.show()\n", + "\n", + "estimator.set_params(**grid_search.best_params_)\n", + "estimator.fit(X_grid_search[y_grid_search == 1].to_numpy())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "587f2301", + "metadata": {}, + "outputs": [], + "source": [ + "# Cell 8: Prepare independent test set\n", + "X_test_normal = test_normal_normalized[au_columns].copy()\n", + "X_test_high = test_high_normalized[au_columns].copy()\n", + "\n", + "# Combine test sets\n", + "X_test = pd.concat([X_test_normal, X_test_high], ignore_index=True)\n", + "\n", + "# 
Create labels for test set\n", + "y_test_normal = np.ones(len(X_test_normal)) # 1 for normal\n", + "y_test_high = -np.ones(len(X_test_high)) # -1 for anomalies\n", + "y_test = np.concatenate([y_test_normal, y_test_high])\n", + "\n", + "print(f\"Test set shape: {X_test.shape}\")\n", + "print(f\"Test labels distribution: Normal={np.sum(y_test==1)}, Anomaly={np.sum(y_test==-1)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "40703231", + "metadata": {}, + "outputs": [], + "source": [ + "# Get anomaly scores\n", + "y_scores = estimator.score_samples(X_test.values)\n", + "# Get predictions (-1 for anomaly, 1 for normal)\n", + "y_pred = estimator.predict(X_test.values)\n", + "print(classification_report(y_test, y_pred, target_names=['Anomaly', 'Normal']))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "815fbd78", + "metadata": {}, + "outputs": [], + "source": [ + "evaluation_tools.plot_confusion_matrix(y_test, y_pred, label_names=['Anomaly', 'Normal'])" + ] + }, + { + "cell_type": "markdown", + "id": "8bc13284", + "metadata": {}, + "source": [ + "### DEBUG" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1f0aa77a", + "metadata": {}, + "outputs": [], + "source": [ + "# Test on validation normal samples (should predict as normal)\n", + "val_normal_pred = estimator.predict(X_val_normal.values)\n", + "print(f\"Validation normal predictions: {np.sum(val_normal_pred==1)} normal, {np.sum(val_normal_pred==-1)} anomaly\")\n", + "\n", + "# Test on validation anomaly samples (should predict as anomaly)\n", + "val_high_pred = estimator.predict(X_val_high.values)\n", + "print(f\"Validation anomaly predictions: {np.sum(val_high_pred==1)} normal, {np.sum(val_high_pred==-1)} anomaly\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "514cc22f", + "metadata": {}, + "outputs": [], + "source": [ + "# The model should predict most training samples as normal\n", + "train_pred = estimator.predict(X_train.values)\n", + "print(f\"Training predictions: {np.sum(train_pred==1)} normal, {np.sum(train_pred==-1)} anomaly\")\n", + "print(f\"Training anomaly rate: {np.sum(train_pred==-1)/len(train_pred)*100:.2f}%\")" + ] + }, + { + "cell_type": "markdown", + "id": "3c9e1e12", + "metadata": {}, + "source": [ + "### to delete" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "60d69a97", + "metadata": {}, + "outputs": [], + "source": [ + "# Cell 5: Split data properly for grid search\n", + "X_train = train_normalized[au_columns].copy()\n", + "X_val_normal = val_normal_normalized[au_columns].copy()\n", + "X_val_high = val_high_normalized[au_columns].copy()\n", + "\n", + "# Grid search ONLY on train + val_normal\n", + "# Keep val_high completely separate for validation scoring\n", + "X_grid_search_train = X_train.copy()\n", + "X_grid_search_val = pd.concat([X_val_normal, X_val_high], ignore_index=True)\n", + "\n", + "y_grid_search_train = np.ones(len(X_train))\n", + "y_grid_search_val = np.concatenate([\n", + " np.ones(len(X_val_normal)),\n", + " -np.ones(len(X_val_high))\n", + "])\n", + "\n", + "print(f\"Grid search train shape: {X_grid_search_train.shape}\")\n", + "print(f\"Grid search val shape: {X_grid_search_val.shape}\")\n", + "print(f\"Val labels: Normal={np.sum(y_grid_search_val==1)}, Anomaly={np.sum(y_grid_search_val==-1)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2d53d03e", + "metadata": {}, + "outputs": [], + "source": [ + "# Modified grid search function for 
simple train/val split\n", + "def simple_one_class_grid_search(estimator, param_grid, X_train, X_val, y_val):\n", + " \"\"\"\n", + " Grid search with fixed train/validation split.\n", + " \n", + " Parameters:\n", + " -----------\n", + " X_train : array-like\n", + " Normal training samples only\n", + " X_val : array-like \n", + " Validation samples (normal + anomalies)\n", + " y_val : array-like\n", + " Labels for validation samples (1=normal, -1=anomaly)\n", + " \"\"\"\n", + " from sklearn.model_selection import ParameterGrid\n", + " from sklearn.metrics import roc_auc_score\n", + " \n", + " results = []\n", + " \n", + " for params in ParameterGrid(param_grid):\n", + " # Set parameters\n", + " estimator.set_params(**params)\n", + " \n", + " # Fit on training normal samples\n", + " estimator.fit(X_train)\n", + " \n", + " # Score on validation set\n", + " val_scores = estimator.score_samples(X_val)\n", + " roc_auc = roc_auc_score(y_val, val_scores)\n", + " \n", + " results.append({\n", + " 'params': params,\n", + " 'score': roc_auc\n", + " })\n", + " print(f\"Params {params}: ROC-AUC = {roc_auc:.4f}\")\n", + " \n", + " # Find best\n", + " best_idx = np.argmax([r['score'] for r in results])\n", + " best_params = results[best_idx]['params']\n", + " best_score = results[best_idx]['score']\n", + " \n", + " return best_params, best_score, results\n", + "\n", + "# Run grid search\n", + "best_params, best_score, all_results = simple_one_class_grid_search(\n", + " estimator=OneClassSVM(),\n", + " param_grid={\n", + " 'nu': np.linspace(0.01, 0.3, 10),\n", + " 'gamma': [*np.logspace(-2, 2, 10).tolist(), 'scale']\n", + " },\n", + " X_train=X_grid_search_train.to_numpy(),\n", + " X_val=X_grid_search_val.to_numpy(),\n", + " y_val=y_grid_search_val\n", + ")\n", + "\n", + "print(f\"\\nBest params: {best_params}\")\n", + "print(f\"Best validation ROC-AUC: {best_score:.4f}\")\n", + "\n", + "# Fit final model on training data with best params\n", + "estimator = OneClassSVM(**best_params)\n", + "estimator.fit(X_grid_search_train.to_numpy())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "03bcef39", + "metadata": {}, + "outputs": [], + "source": [ + "# Get anomaly scores\n", + "y_scores = estimator.score_samples(X_test.values)\n", + "# Get predictions (-1 for anomaly, 1 for normal)\n", + "y_pred = estimator.predict(X_test.values)\n", + "print(classification_report(y_test, y_pred, target_names=['Anomaly', 'Normal']))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "acfb34aa", + "metadata": {}, + "outputs": [], + "source": [ + "evaluation_tools.plot_confusion_matrix(y_test, y_pred, label_names=['Anomaly', 'Normal'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7d4bf434", + "metadata": {}, + "outputs": [], + "source": [ + "# Test on validation normal samples (should predict as normal)\n", + "val_normal_pred = estimator.predict(X_val_normal.values)\n", + "print(f\"Validation normal predictions: {np.sum(val_normal_pred==1)} normal, {np.sum(val_normal_pred==-1)} anomaly\")\n", + "\n", + "# Test on validation anomaly samples (should predict as anomaly)\n", + "val_high_pred = estimator.predict(X_val_high.values)\n", + "print(f\"Validation anomaly predictions: {np.sum(val_high_pred==1)} normal, {np.sum(val_high_pred==-1)} anomaly\")\n", + "# The model should predict most training samples as normal\n", + "train_pred = estimator.predict(X_train.values)\n", + "print(f\"Training predictions: {np.sum(train_pred==1)} normal, {np.sum(train_pred==-1)} 
anomaly\")\n", + "print(f\"Training anomaly rate: {np.sum(train_pred==-1)/len(train_pred)*100:.2f}%\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}