From 58d92fb423bae9ff757ceb6f39f0fc8ab28f3b78 Mon Sep 17 00:00:00 2001 From: Michael Date: Wed, 19 Nov 2025 17:31:52 +0100 Subject: [PATCH] ae notebook --- model_training/OCSVM/ocsvm_with_AE.ipynb | 1080 ++++++++++++++++++++++ 1 file changed, 1080 insertions(+) create mode 100644 model_training/OCSVM/ocsvm_with_AE.ipynb diff --git a/model_training/OCSVM/ocsvm_with_AE.ipynb b/model_training/OCSVM/ocsvm_with_AE.ipynb new file mode 100644 index 0000000..f78a544 --- /dev/null +++ b/model_training/OCSVM/ocsvm_with_AE.ipynb @@ -0,0 +1,1080 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "4ac35e21", + "metadata": {}, + "source": [ + "### Import" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "id": "87513def", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from pathlib import Path\n", + "import sys\n", + "import os\n", + "\n", + "base_dir = os.path.abspath(os.path.join(os.getcwd(), \"..\"))\n", + "sys.path.append(base_dir)\n", + "print(base_dir)\n", + "\n", + "# from tools import evaluation_tools\n", + "from sklearn.preprocessing import StandardScaler, MinMaxScaler\n", + "from sklearn.svm import OneClassSVM\n", + "from sklearn.model_selection import GridSearchCV, KFold, ParameterGrid, train_test_split\n", + "import matplotlib.pyplot as plt\n", + "import tensorflow as tf\n", + "import pickle\n", + "import evaluation_tools\n", + "from sklearn.metrics import (roc_auc_score, accuracy_score, precision_score, \n", + " recall_score, f1_score, confusion_matrix, classification_report) " + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "1c303823", + "metadata": {}, + "outputs": [], + "source": [ + "print(tf.__version__)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "46e29f47", + "metadata": {}, + "outputs": [], + "source": [ + "os.getcwd()" + ] + }, + { + "cell_type": "markdown", + "id": "83dc3a63", + "metadata": {}, + "source": [ + "### Load data" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "0eef0bc8", + "metadata": {}, + "outputs": [], + "source": [ + "# data_path = Path(r\"C:\\Users\\micha\\FAUbox\\WS2526_Fahrsimulator_MSY (Celina Korzer)\\AU_dataset\\output_windowed.parquet\")\n", + "data_path = Path(r\"output_windowed.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "597880bb", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_parquet(path=data_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "fa0f0fec", + "metadata": {}, + "outputs": [], + "source": [ + "subjects = df['subjectID'].unique()\n", + "print(subjects)\n", + "print(len(subjects))\n", + "print(len(subjects)*0.66)\n", + "print(len(subjects)*0.33)" + ] + }, + { + "cell_type": "markdown", + "id": "e80506dc", + "metadata": {}, + "source": [ + "### Preprocessing" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "897a2342", + "metadata": {}, + "outputs": [], + "source": [ + "low_all = df[\n", + " ((df[\"PHASE\"] == \"baseline\") |\n", + " ((df[\"STUDY\"] == \"n-back\") & (df[\"PHASE\"] != \"baseline\") & (df[\"LEVEL\"].isin([1,4]))))\n", + "]\n", + "print(f\"low all: {low_all.shape}\")\n", + "\n", + "high_nback = df[\n", + " (df[\"STUDY\"]==\"n-back\") &\n", + " (df[\"LEVEL\"].isin([2, 3, 5, 6])) &\n", + " (df[\"PHASE\"].isin([\"train\", \"test\"]))\n", + "]\n", + "print(f\"high n-back: {high_nback.shape}\")\n", + "high_kdrive = df[\n", + " (df[\"STUDY\"] == \"k-drive\") & (df[\"PHASE\"] != 
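\"baseline\")\n", + "]\n", + "print(f\"high k-drive: {high_kdrive.shape}\")\n", + "\n", + "high_all = pd.concat([high_nback, high_kdrive])\n", + "print(f\"high all: {high_all.shape}\")" + ] + }, + { + "cell_type": "markdown", + "id": "b7e1c2a0", + "metadata": {}, + "source": [ + "Quick sanity check (sketch): inspect the composition of the low/high pools before splitting by subject, using the same `STUDY`/`PHASE`/`LEVEL` columns as the filters above." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b7e1c2a1", + "metadata": {}, + "outputs": [], + "source": [ + "# Sketch: composition of the low/high pools (same columns as the filters above)\n", + "print(low_all.groupby(['STUDY', 'PHASE']).size())\n", + "print(high_all.groupby(['STUDY', 'LEVEL']).size())" + ] + },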
\"baseline\")\n", + "]\n", + "print(f\"high k-drive: {high_kdrive.shape}\")\n", + "\n", + "high_all = pd.concat([high_nback, high_kdrive])\n", + "print(f\"high all: {high_all.shape}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "f6f59455", + "metadata": {}, + "outputs": [], + "source": [ + "# First split: separate test set\n", + "train_val_subjects, test_subjects = train_test_split(\n", + " subjects, \n", + " train_size=14, \n", + " test_size=4, \n", + " random_state=42\n", + ")\n", + "\n", + "# Second split: separate train and validation from the remaining subjects\n", + "# Adjust these numbers based on your total subject count\n", + "train_subjects, val_subjects = train_test_split(\n", + " train_val_subjects,\n", + " train_size=8,\n", + " test_size=6,\n", + " random_state=42\n", + ")\n", + "\n", + "print(f\"Train subjects: {len(train_subjects)}\")\n", + "print(f\"Validation subjects: {len(val_subjects)}\")\n", + "print(f\"Test subjects: {len(test_subjects)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "f215beb5", + "metadata": {}, + "outputs": [], + "source": [ + "au_columns = [col for col in low_all.columns if col.startswith('AU')]\n", + "\n", + "# Prepare training data (only normal/low data)\n", + "train_data = low_all[low_all['subjectID'].isin(train_subjects)][['subjectID'] + au_columns].copy()\n", + "\n", + "# Prepare validation data (normal and anomaly) - 500 samples each\n", + "val_normal_data = low_all[low_all['subjectID'].isin(val_subjects)][['subjectID'] + au_columns].copy()\n", + "val_high_data = high_all[high_all['subjectID'].isin(val_subjects)][['subjectID'] + au_columns].copy()\n", + "val_normal_data = val_normal_data.sample(n=1000, random_state=42)\n", + "val_high_data = val_high_data.sample(n=1000, random_state=42)\n", + "\n", + "# Prepare test data (normal and anomaly) - 1000 samples each\n", + "test_normal_data = low_all[low_all['subjectID'].isin(test_subjects)][['subjectID'] + au_columns].copy()\n", + "test_high_data = high_all[high_all['subjectID'].isin(test_subjects)][['subjectID'] + au_columns].copy()\n", + "test_normal_data = test_normal_data.sample(n=500, random_state=42)\n", + "test_high_data = test_high_data.sample(n=500, random_state=42)\n", + "\n", + "print(f\"Train samples: {len(train_data)}\")\n", + "print(f\"Val normal samples: {len(val_normal_data)}, Val high samples: {len(val_high_data)}\")\n", + "print(f\"Test normal samples: {len(test_normal_data)}, Test high samples: {len(test_high_data)}\")\n", + "\n", + "# Track subject split\n", + "print(f\"\\nSubject split:\")\n", + "print(f\"Train subjects ({len(train_subjects)}): {sorted(train_subjects)}\")\n", + "print(f\"Val subjects ({len(val_subjects)}): {sorted(val_subjects)}\")\n", + "print(f\"Test subjects ({len(test_subjects)}): {sorted(test_subjects)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "d3ebae31", + "metadata": {}, + "outputs": [], + "source": [ + "def fit_normalizer(train_data, au_columns, method='standard', scope='global'):\n", + " \"\"\"\n", + " Fit normalization scalers on training data.\n", + " \n", + " Parameters:\n", + " -----------\n", + " train_data : pd.DataFrame\n", + " Training dataframe with AU columns and subjectID\n", + " au_columns : list\n", + " List of AU column names to normalize\n", + " method : str, default='standard'\n", + " Normalization method: 'standard' for StandardScaler or 'minmax' for MinMaxScaler\n", + " scope : str, default='global'\n", + " Normalization scope: 'subject' for 
per-subject or 'global' for across all subjects\n", + " \n", + " Returns:\n", + " --------\n", + " dict\n", + " Dictionary containing fitted scalers\n", + " \"\"\"\n", + " # Select scaler based on method\n", + " if method == 'standard':\n", + " Scaler = StandardScaler\n", + " elif method == 'minmax':\n", + " Scaler = MinMaxScaler\n", + " else:\n", + " raise ValueError(\"method must be 'standard' or 'minmax'\")\n", + " \n", + " scalers = {}\n", + " \n", + " if scope == 'subject':\n", + " # Fit one scaler per subject\n", + " for subject in train_data['subjectID'].unique():\n", + " subject_mask = train_data['subjectID'] == subject\n", + " scaler = Scaler()\n", + " scaler.fit(train_data.loc[subject_mask, au_columns])\n", + " scalers[subject] = scaler\n", + " \n", + " elif scope == 'global':\n", + " # Fit one scaler for all subjects\n", + " scaler = Scaler()\n", + " scaler.fit(train_data[au_columns])\n", + " scalers['global'] = scaler\n", + " \n", + " else:\n", + " raise ValueError(\"scope must be 'subject' or 'global'\")\n", + " \n", + " return {'scalers': scalers, 'method': method, 'scope': scope}" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "f586a99b", + "metadata": {}, + "outputs": [], + "source": [ + "def apply_normalizer(data, au_columns, normalizer_dict):\n", + " \"\"\"\n", + " Apply fitted normalization scalers to data.\n", + " \n", + " Parameters:\n", + " -----------\n", + " data : pd.DataFrame\n", + " Dataframe with AU columns and subjectID\n", + " au_columns : list\n", + " List of AU column names to normalize\n", + " normalizer_dict : dict\n", + " Dictionary containing fitted scalers from fit_normalizer()\n", + " \n", + " Returns:\n", + " --------\n", + " pd.DataFrame\n", + " DataFrame with normalized AU columns\n", + " \"\"\"\n", + " normalized_data = data.copy()\n", + " scalers = normalizer_dict['scalers']\n", + " scope = normalizer_dict['scope']\n", + " \n", + " if scope == 'subject':\n", + " # Apply per-subject normalization\n", + " for subject in data['subjectID'].unique():\n", + " subject_mask = data['subjectID'] == subject\n", + " \n", + " # Use the subject's scaler if available, otherwise use a fitted scaler from training\n", + " if subject in scalers:\n", + " scaler = scalers[subject]\n", + " else:\n", + " # For new subjects not seen in training, use the first available scaler\n", + " # (This is a fallback - ideally all test subjects should be in training for subject-level normalization)\n", + " print(f\"Warning: Subject {subject} not found in training data. 
Using fallback scaler.\")\n", + " scaler = list(scalers.values())[0]\n", + " \n", + " normalized_data.loc[subject_mask, au_columns] = scaler.transform(\n", + " data.loc[subject_mask, au_columns]\n", + " )\n", + " \n", + " elif scope == 'global':\n", + " # Apply global normalization\n", + " scaler = scalers['global']\n", + " normalized_data[au_columns] = scaler.transform(data[au_columns])\n", + " \n", + " return normalized_data" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "74e363aa", + "metadata": {}, + "outputs": [], + "source": [ + "# Cell 3: Fit normalizer on training data\n", + "normalizer = fit_normalizer(train_data, au_columns, method='standard', scope='global')\n", + "print(\"Normalizer fitted on training data\")" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "5dd6c4b7", + "metadata": {}, + "outputs": [], + "source": [ + "# Cell 4: Apply normalization to all datasets\n", + "train_normalized = apply_normalizer(train_data, au_columns, normalizer)\n", + "val_normal_normalized = apply_normalizer(val_normal_data, au_columns, normalizer)\n", + "val_high_normalized = apply_normalizer(val_high_data, au_columns, normalizer)\n", + "test_normal_normalized = apply_normalizer(test_normal_data, au_columns, normalizer)\n", + "test_high_normalized = apply_normalizer(test_high_data, au_columns, normalizer)\n", + "\n", + "print(\"Normalization applied to all datasets\")" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "id": "624a5374", + "metadata": {}, + "outputs": [], + "source": [ + "# Cell 5: Extract AU columns and create labels for grid search\n", + "# Extract only AU columns (drop subjectID)\n", + "X_train = train_normalized[au_columns].copy()\n", + "X_val_normal = val_normal_normalized[au_columns].copy()\n", + "X_val_high = val_high_normalized[au_columns].copy()\n", + "X_test_high = test_normal_normalized[au_columns].copy()\n", + "X_test_normal = test_high_normalized[au_columns].copy()\n", + "# Combine train and validation sets for grid search\n", + "# X_grid_search = pd.concat([X_train, X_val_normal, X_val_high], ignore_index=True)\n", + "\n", + "# Create labels for grid search\n", + "y_train = np.ones(len(X_train)) # 1 for normal (training)\n", + "y_val_normal = np.ones(len(X_val_normal)) # 1 for normal (validation)\n", + "y_val_high = -np.ones(len(X_val_high)) # -1 for anomalies (validation)\n", + "y_test_normal = np.ones(len(X_test_normal))\n", + "y_test_high = -np.ones(len(X_test_high))\n", + "\n", + "# y_grid_search = np.concatenate([y_train, y_val_normal, y_val_high])\n", + "\n", + "# print(f\"Grid search data shape: {X_grid_search.shape}\")\n", + "# print(f\"Labels distribution: Normal={np.sum(y_grid_search==1)}, Anomaly={np.sum(y_grid_search==-1)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "9bc3f4d3", + "metadata": {}, + "outputs": [], + "source": [ + "X_train.shape" + ] + }, + { + "cell_type": "markdown", + "id": "362c0a6f", + "metadata": {}, + "source": [ + "AE training" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "id": "da6eab4f", + "metadata": {}, + "outputs": [], + "source": [ + "reg = 0.0001\n", + "encoder = tf.keras.Sequential(\n", + " [\n", + " tf.keras.layers.Dense(units=X_train.shape[1], activation='relu', kernel_regularizer=tf.keras.regularizers.l2(reg)),\n", + " tf.keras.layers.Dense(units=10, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(reg)),\n", + " tf.keras.layers.Dense(units=5, activation='relu', 
kernel_regularizer=tf.keras.regularizers.l2(reg))\n", + " \n", + " ]\n", + ")\n", + "\n", + "decoder = tf.keras.Sequential(\n", + " [\n", + " tf.keras.layers.Dense(units=5,activation='relu', kernel_regularizer=tf.keras.regularizers.l2(reg)),\n", + " tf.keras.layers.Dense(units=10, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(reg)),\n", + " tf.keras.layers.Dense(units=X_train.shape[1], activation='linear', kernel_regularizer=tf.keras.regularizers.l2(reg))\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "id": "d60c566a", + "metadata": {}, + "outputs": [], + "source": [ + "autoencoder = tf.keras.Sequential([encoder, decoder])" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "id": "1bb4321d", + "metadata": {}, + "outputs": [], + "source": [ + "autoencoder.compile(\n", + " optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),\n", + " loss='mse',\n", + " \n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "id": "ffb09ce6", + "metadata": {}, + "outputs": [], + "source": [ + "history = autoencoder.fit(\n", + " X_train, X_train, # Input and target are the same for autoencoder\n", + " epochs=500,\n", + " batch_size=64,\n", + " validation_data=(X_val_normal, X_val_normal),\n", + " verbose=1\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "211e4720", + "metadata": {}, + "source": [ + "save of encoder" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "id": "096b858c", + "metadata": {}, + "outputs": [], + "source": [ + "# Save\n", + "encoder.save('encoder_model.keras')\n", + "\n", + "# Load later\n", + "encoder_loaded = tf.keras.models.load_model('encoder_model.keras')" + ] + }, + { + "cell_type": "markdown", + "id": "1da405de", + "metadata": {}, + "source": [ + "OCSVM Training with Encoder" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "id": "cd70f560", + "metadata": {}, + "outputs": [], + "source": [ + "X_train_encoded = encoder.predict(X_train)\n", + "X_val_normal_encoded = encoder.predict(X_val_normal)\n", + "X_val_high_encoded = encoder.predict(X_val_high)\n", + "X_test_normal_encoded = encoder.predict(X_test_normal)\n", + "X_test_high_encoded = encoder.predict(X_test_high)" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "id": "0a292d06", + "metadata": {}, + "outputs": [], + "source": [ + "ocsvm = OneClassSVM(kernel='rbf', gamma='auto', nu=0.1)\n", + "ocsvm.fit(X_train_encoded)\n", + "\n", + "# Predict on validation/test sets\n", + "val_predictions = ocsvm.predict(np.vstack([X_val_normal_encoded, X_val_high_encoded]))\n", + "test_predictions = ocsvm.predict(np.vstack([X_test_normal_encoded, X_test_high_encoded]))\n", + "test_predictions.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "id": "e1e08cc3", + "metadata": {}, + "outputs": [], + "source": [ + "print(classification_report(y_true=np.concatenate([y_test_normal, y_test_high]), y_pred=test_predictions))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "552828b7", + "metadata": {}, + "outputs": [], + "source": [ + "evaluation_tools.plot_confusion_matrix(true_labels=np.concatenate([y_test_normal, y_test_high]), predictions=test_predictions, label_names=['high', 'low'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9c9c78dd", + "metadata": {}, + "outputs": [], + "source": [ + "# Save\n", + "with open('ocsvm_model.pkl', 'wb') as f:\n", + " pickle.dump(ocsvm, f)\n", + "\n", + "# Load later\n", + "with open('ocsvm_model.pkl', 'rb') 
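as f:\n", + " ocsvm_loaded = pickle.load(f)" + ] + }, + { + "cell_type": "markdown", + "id": "e5f60718", + "metadata": {}, + "source": [ + "Sanity check (sketch): plot the autoencoder's training and validation reconstruction loss from the `history` object above; diverging curves would indicate overfitting to the low-workload windows." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e5f60719", + "metadata": {}, + "outputs": [], + "source": [ + "# Sketch: autoencoder convergence from the Keras History object\n", + "plt.figure(figsize=(8, 4))\n", + "plt.plot(history.history['loss'], label='train loss (MSE)')\n", + "plt.plot(history.history['val_loss'], label='val loss (MSE)')\n", + "plt.xlabel('Epoch')\n", + "plt.ylabel('Loss')\n", + "plt.legend()\n", + "plt.grid(True)\n", + "plt.show()" + ] + },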
+ { + "cell_type": "markdown", + "id": "dbad13e2", + "metadata": {}, + "source": [ + "### Complete grid search and evaluation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "45137e53", + "metadata": {}, + "outputs": [], + "source": [ + "def simple_one_class_grid_search(estimator, param_grid, X_train, X_val_normal, X_val_high):\n", + " \"\"\"\n", + " Grid search with fixed train/validation split for novelty detection.\n", + " \n", + " Parameters:\n", + " -----------\n", + " estimator : OneClassSVM\n", + " The OCSVM estimator\n", + " param_grid : dict\n", + " Parameter grid to search\n", + " X_train : array-like\n", + " Normal training samples only (encoded features)\n", + " X_val_normal : array-like \n", + " Normal validation samples (encoded features)\n", + " X_val_high : array-like\n", + " High/anomaly validation samples (encoded features)\n", + " \"\"\"\n", + " \n", + " # Combine validation data\n", + " X_val = np.concatenate([X_val_normal, X_val_high])\n", + " y_val = np.array([1] * len(X_val_normal) + [-1] * len(X_val_high))\n", + " \n", + " results = []\n", + " \n", + " for params in ParameterGrid(param_grid):\n", + " # Set parameters\n", + " estimator.set_params(**params)\n", + " \n", + " # Fit on training normal samples\n", + " estimator.fit(X_train)\n", + " \n", + " # Score on validation set\n", + " val_scores = estimator.score_samples(X_val)\n", + " roc_auc = roc_auc_score(y_val, val_scores)\n", + " \n", + " results.append({\n", + " 'params': params,\n", + " 'score': roc_auc\n", + " })\n", + " print(f\"Params {params}: ROC-AUC = {roc_auc:.4f}\")\n", + " \n", + " # Find best\n", + " best_idx = np.argmax([r['score'] for r in results])\n", + " best_params = results[best_idx]['params']\n", + " best_score = results[best_idx]['score']\n", + " \n", + " print(f\"\\nBest params: {best_params}\")\n", + " print(f\"Best ROC-AUC: {best_score:.4f}\")\n", + " \n", + " return best_params, best_score, results\n", + "\n", + "\n", + "def evaluate_ocsvm(ocsvm, X_normal, X_anomaly, set_name=\"Test\"):\n", + " \"\"\"\n", + " Evaluate OCSVM performance with comprehensive metrics.\n", + " \n", + " Parameters:\n", + " -----------\n", + " ocsvm : fitted OneClassSVM\n", + " Trained OCSVM model\n", + " X_normal : array-like\n", + " Normal samples\n", + " X_anomaly : array-like\n", + " Anomaly samples\n", + " set_name : str\n", + " Name of the dataset (e.g., \"Validation\", \"Test\")\n", + " \"\"\"\n", + " \n", + " # Combine data\n", + " X = np.vstack([X_normal, X_anomaly])\n", + " y_true = np.array([1] * len(X_normal) + [-1] * len(X_anomaly))\n", + " \n", + " # Predictions\n", + " y_pred = ocsvm.predict(X)\n", + " scores = ocsvm.score_samples(X)\n", + " \n", + " # Calculate metrics\n", + " roc_auc = roc_auc_score(y_true, scores)\n", + " accuracy = accuracy_score(y_true, y_pred)\n", + " \n", + " # For precision, recall, F1: treat normal (1) as positive class\n", + " precision = precision_score(y_true, y_pred, pos_label=1)\n", + " recall = recall_score(y_true, y_pred, pos_label=1)\n", + " f1 = f1_score(y_true, y_pred, pos_label=1)\n", + " \n", + " # Confusion matrix\n", + " cm = confusion_matrix(y_true, y_pred, labels=[1, -1])\n", + " tn, fp, fn, tp = cm[1, 1], cm[1, 0], cm[0, 1], cm[0, 0]\n", + " \n", + " # Print results\n", + " print(f\"\\n{'='*50}\")\n", + " print(f\"{set_name} Set Evaluation\")\n", + " print(f\"{'='*50}\")\n", + " print(f\"ROC-AUC Score: {roc_auc:.4f}\")\n", + " print(f\"Accuracy: {accuracy:.4f}\")\n", + " print(f\"Precision: 
{precision:.4f}\")\n", + " print(f\"Recall: {recall:.4f}\")\n", + " print(f\"F1-Score: {f1:.4f}\")\n", + " print(f\"\\nConfusion Matrix:\")\n", + " print(f\" Predicted\")\n", + " print(f\" Normal Anomaly\")\n", + " print(f\"Actual Normal {tp:6d} {fn:6d}\")\n", + " print(f\"Actual Anomaly {fp:6d} {tn:6d}\")\n", + " print(f\"\\nTrue Positives (Normal detected as Normal): {tp}\")\n", + " print(f\"False Negatives (Normal detected as Anomaly): {fn}\")\n", + " print(f\"False Positives (Anomaly detected as Normal): {fp}\")\n", + " print(f\"True Negatives (Anomaly detected as Anomaly): {tn}\")\n", + " \n", + " # Detailed classification report\n", + " print(f\"\\nClassification Report:\")\n", + " print(classification_report(y_true, y_pred, target_names=['Normal', 'Anomaly'], \n", + " labels=[1, -1]))\n", + " \n", + " return {\n", + " 'roc_auc': roc_auc,\n", + " 'accuracy': accuracy,\n", + " 'precision': precision,\n", + " 'recall': recall,\n", + " 'f1': f1,\n", + " 'confusion_matrix': cm,\n", + " 'predictions': y_pred,\n", + " 'scores': scores\n", + " }\n" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "id": "e54c2ebd", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Usage example:\n", + "# 1. Grid search\n", + "best_params, best_score, all_results = simple_one_class_grid_search(\n", + " estimator=OneClassSVM(),\n", + " param_grid={\n", + " 'nu': np.linspace(0.01, 0.3, 10),\n", + " 'gamma': [*np.logspace(-2, 2, 10).tolist(), 'scale', 'auto'],\n", + " 'kernel': ['rbf']\n", + " },\n", + " X_train=X_train_encoded,\n", + " X_val_normal=X_val_normal_encoded,\n", + " X_val_high=X_val_high_encoded\n", + ")\n", + "\n", + "# 2. Train final model with best params\n", + "final_ocsvm = OneClassSVM(**best_params)\n", + "final_ocsvm.fit(X_train_encoded)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "id": "f7637b60", + "metadata": {}, + "outputs": [], + "source": [ + "# 3. Evaluate on validation set\n", + "val_metrics = evaluate_ocsvm(\n", + " final_ocsvm, \n", + " X_val_normal_encoded, \n", + " X_val_high_encoded, \n", + " set_name=\"Validation\"\n", + ")\n", + "\n", + "# 4. 
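Evaluate on test set\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9576230f", + "metadata": {}, + "outputs": [], + "source": [ + "test_metrics = evaluate_ocsvm(\n", + " final_ocsvm, \n", + " X_test_normal_encoded, \n", + " X_test_high_encoded, \n", + " set_name=\"Test\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "f7a8b9c0", + "metadata": {}, + "source": [ + "How well do the encoded test scores separate? A sketch, relying on `evaluate_ocsvm` stacking normal samples before anomalies in its returned `scores`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f7a8b9c1", + "metadata": {}, + "outputs": [], + "source": [ + "# Sketch: distribution of OCSVM decision scores on the test set\n", + "# (scores are ordered normal-first, matching np.vstack in evaluate_ocsvm)\n", + "n_norm = len(X_test_normal_encoded)\n", + "scores = test_metrics['scores']\n", + "plt.figure(figsize=(8, 4))\n", + "plt.hist(scores[:n_norm], bins=50, alpha=0.6, label='normal (low)')\n", + "plt.hist(scores[n_norm:], bins=50, alpha=0.6, label='anomaly (high)')\n", + "plt.xlabel('score_samples')\n", + "plt.ylabel('count')\n", + "plt.legend()\n", + "plt.show()" + ] + },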
+ { + "cell_type": "markdown", + "id": "55109590", + "metadata": {}, + "source": [ + "## OneClass GridSearch (old)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "671b47a1", + "metadata": {}, + "outputs": [], + "source": [ + "def supervised_one_class_grid_search(estimator, param_grid, data, labels, seed=None):\n", + " np.random.seed(seed)\n", + " idx = np.arange(data.shape[0])\n", + " anomaly_idx = idx[labels==-1]\n", + " normal_idx = idx[labels!=-1]\n", + "\n", + " np.random.shuffle(normal_idx)\n", + "\n", + " # Each fold: fit on a normal-only split, validate on held-out normals plus all anomalies\n", + " cv = [(normal_idx[pair[0]], np.concatenate([normal_idx[pair[1]], anomaly_idx], axis=0)) for pair in KFold().split(normal_idx)]\n", + " print(f\"CV Folds created:\")\n", + " for i, (train_idx, val_idx) in enumerate(cv):\n", + " print(f\" Fold {i+1}: Train={len(train_idx)} normal, Val={len(val_idx)} total ({len(val_idx)-len(anomaly_idx)} normal + {len(anomaly_idx)} anomaly)\")\n", + " grid_search = GridSearchCV(estimator=estimator,\n", + " param_grid=param_grid,\n", + " scoring=lambda est, X, y: roc_auc_score(y_true=y, y_score=est.score_samples(X)),\n", + " n_jobs=-2,\n", + " cv=cv,\n", + " verbose=1,\n", + " refit=False)\n", + " \n", + " grid_search.fit(data, labels)\n", + "\n", + " return grid_search" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8fabf322", + "metadata": {}, + "outputs": [], + "source": [ + "# Note: requires X_grid_search / y_grid_search, currently commented out in 'Cell 5' above\n", + "estimator = OneClassSVM()\n", + "grid_search = supervised_one_class_grid_search(estimator=estimator,\n", + " param_grid={'nu': np.linspace(0.01, 0.2, 10),\n", + " 'gamma': [*np.logspace(-2, 2, 10).tolist(), 'scale']},\n", + " data=X_grid_search,\n", + " labels=y_grid_search,\n", + " seed=42)\n", + "print(f'Best results with {grid_search.best_params_}:')\n", + "print(f'Validation results: {grid_search.cv_results_[\"mean_test_score\"][grid_search.best_index_]} +- {grid_search.cv_results_[\"std_test_score\"][grid_search.best_index_]}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "651fc483", + "metadata": {}, + "outputs": [], + "source": [ + "xticks = [f'({param_dict[\"nu\"]:.3f}, {param_dict[\"gamma\"]})' for param_dict in grid_search.cv_results_['params']]\n", + "\n", + "plt.figure(figsize=(30,10))\n", + "plt.grid('on')\n", + "plt.title('Mean scores +- std.')\n", + "plt.ylabel('Mean ROC-AUC')\n", + "plt.xlabel(r'($\\nu$, $\\gamma$)')\n", + "plt.errorbar(np.arange(grid_search.cv_results_['mean_test_score'].shape[0]), grid_search.cv_results_['mean_test_score'], grid_search.cv_results_['std_test_score'])\n", + "plt.xticks(np.arange(grid_search.cv_results_['mean_test_score'].shape[0]), xticks, rotation='vertical')\n", + "plt.show()\n", + "\n", + "estimator.set_params(**grid_search.best_params_)\n", + "estimator.fit(X_grid_search[y_grid_search == 1].to_numpy())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "587f2301", + "metadata": {}, + "outputs": [], + "source": [ + "# Cell 8: Prepare independent test set\n", + "X_test_normal = test_normal_normalized[au_columns].copy()\n", + "X_test_high = test_high_normalized[au_columns].copy()\n", + "\n", + "# Combine test sets\n", + "X_test = pd.concat([X_test_normal, X_test_high], ignore_index=True)\n", + "\n", + "# 
Create labels for test set\n", + "y_test_normal = np.ones(len(X_test_normal)) # 1 for normal\n", + "y_test_high = -np.ones(len(X_test_high)) # -1 for anomalies\n", + "y_test = np.concatenate([y_test_normal, y_test_high])\n", + "\n", + "print(f\"Test set shape: {X_test.shape}\")\n", + "print(f\"Test labels distribution: Normal={np.sum(y_test==1)}, Anomaly={np.sum(y_test==-1)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "40703231", + "metadata": {}, + "outputs": [], + "source": [ + "# Get anomaly scores\n", + "y_scores = estimator.score_samples(X_test.values)\n", + "# Get predictions (-1 for anomaly, 1 for normal)\n", + "y_pred = estimator.predict(X_test.values)\n", + "print(classification_report(y_test, y_pred, target_names=['Anomaly', 'Normal']))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "815fbd78", + "metadata": {}, + "outputs": [], + "source": [ + "evaluation_tools.plot_confusion_matrix(y_test, y_pred, label_names=['Anomaly', 'Normal'])" + ] + }, + { + "cell_type": "markdown", + "id": "8bc13284", + "metadata": {}, + "source": [ + "### DEBUG" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1f0aa77a", + "metadata": {}, + "outputs": [], + "source": [ + "# Test on validation normal samples (should predict as normal)\n", + "val_normal_pred = estimator.predict(X_val_normal.values)\n", + "print(f\"Validation normal predictions: {np.sum(val_normal_pred==1)} normal, {np.sum(val_normal_pred==-1)} anomaly\")\n", + "\n", + "# Test on validation anomaly samples (should predict as anomaly)\n", + "val_high_pred = estimator.predict(X_val_high.values)\n", + "print(f\"Validation anomaly predictions: {np.sum(val_high_pred==1)} normal, {np.sum(val_high_pred==-1)} anomaly\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "514cc22f", + "metadata": {}, + "outputs": [], + "source": [ + "# The model should predict most training samples as normal\n", + "train_pred = estimator.predict(X_train.values)\n", + "print(f\"Training predictions: {np.sum(train_pred==1)} normal, {np.sum(train_pred==-1)} anomaly\")\n", + "print(f\"Training anomaly rate: {np.sum(train_pred==-1)/len(train_pred)*100:.2f}%\")" + ] + }, + { + "cell_type": "markdown", + "id": "3c9e1e12", + "metadata": {}, + "source": [ + "### to delete" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "60d69a97", + "metadata": {}, + "outputs": [], + "source": [ + "# Cell 5: Split data properly for grid search\n", + "X_train = train_normalized[au_columns].copy()\n", + "X_val_normal = val_normal_normalized[au_columns].copy()\n", + "X_val_high = val_high_normalized[au_columns].copy()\n", + "\n", + "# Grid search ONLY on train + val_normal\n", + "# Keep val_high completely separate for validation scoring\n", + "X_grid_search_train = X_train.copy()\n", + "X_grid_search_val = pd.concat([X_val_normal, X_val_high], ignore_index=True)\n", + "\n", + "y_grid_search_train = np.ones(len(X_train))\n", + "y_grid_search_val = np.concatenate([\n", + " np.ones(len(X_val_normal)),\n", + " -np.ones(len(X_val_high))\n", + "])\n", + "\n", + "print(f\"Grid search train shape: {X_grid_search_train.shape}\")\n", + "print(f\"Grid search val shape: {X_grid_search_val.shape}\")\n", + "print(f\"Val labels: Normal={np.sum(y_grid_search_val==1)}, Anomaly={np.sum(y_grid_search_val==-1)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2d53d03e", + "metadata": {}, + "outputs": [], + "source": [ + "# Modified grid search function for 
simple train/val split\n", + "def simple_one_class_grid_search(estimator, param_grid, X_train, X_val, y_val):\n", + " \"\"\"\n", + " Grid search with fixed train/validation split.\n", + " \n", + " Parameters:\n", + " -----------\n", + " X_train : array-like\n", + " Normal training samples only\n", + " X_val : array-like \n", + " Validation samples (normal + anomalies)\n", + " y_val : array-like\n", + " Labels for validation samples (1=normal, -1=anomaly)\n", + " \"\"\"\n", + " from sklearn.model_selection import ParameterGrid\n", + " from sklearn.metrics import roc_auc_score\n", + " \n", + " results = []\n", + " \n", + " for params in ParameterGrid(param_grid):\n", + " # Set parameters\n", + " estimator.set_params(**params)\n", + " \n", + " # Fit on training normal samples\n", + " estimator.fit(X_train)\n", + " \n", + " # Score on validation set\n", + " val_scores = estimator.score_samples(X_val)\n", + " roc_auc = roc_auc_score(y_val, val_scores)\n", + " \n", + " results.append({\n", + " 'params': params,\n", + " 'score': roc_auc\n", + " })\n", + " print(f\"Params {params}: ROC-AUC = {roc_auc:.4f}\")\n", + " \n", + " # Find best\n", + " best_idx = np.argmax([r['score'] for r in results])\n", + " best_params = results[best_idx]['params']\n", + " best_score = results[best_idx]['score']\n", + " \n", + " return best_params, best_score, results\n", + "\n", + "# Run grid search\n", + "best_params, best_score, all_results = simple_one_class_grid_search(\n", + " estimator=OneClassSVM(),\n", + " param_grid={\n", + " 'nu': np.linspace(0.01, 0.3, 10),\n", + " 'gamma': [*np.logspace(-2, 2, 10).tolist(), 'scale']\n", + " },\n", + " X_train=X_grid_search_train.to_numpy(),\n", + " X_val=X_grid_search_val.to_numpy(),\n", + " y_val=y_grid_search_val\n", + ")\n", + "\n", + "print(f\"\\nBest params: {best_params}\")\n", + "print(f\"Best validation ROC-AUC: {best_score:.4f}\")\n", + "\n", + "# Fit final model on training data with best params\n", + "estimator = OneClassSVM(**best_params)\n", + "estimator.fit(X_grid_search_train.to_numpy())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "03bcef39", + "metadata": {}, + "outputs": [], + "source": [ + "# Get anomaly scores\n", + "y_scores = estimator.score_samples(X_test.values)\n", + "# Get predictions (-1 for anomaly, 1 for normal)\n", + "y_pred = estimator.predict(X_test.values)\n", + "print(classification_report(y_test, y_pred, target_names=['Anomaly', 'Normal']))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "acfb34aa", + "metadata": {}, + "outputs": [], + "source": [ + "evaluation_tools.plot_confusion_matrix(y_test, y_pred, label_names=['Anomaly', 'Normal'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7d4bf434", + "metadata": {}, + "outputs": [], + "source": [ + "# Test on validation normal samples (should predict as normal)\n", + "val_normal_pred = estimator.predict(X_val_normal.values)\n", + "print(f\"Validation normal predictions: {np.sum(val_normal_pred==1)} normal, {np.sum(val_normal_pred==-1)} anomaly\")\n", + "\n", + "# Test on validation anomaly samples (should predict as anomaly)\n", + "val_high_pred = estimator.predict(X_val_high.values)\n", + "print(f\"Validation anomaly predictions: {np.sum(val_high_pred==1)} normal, {np.sum(val_high_pred==-1)} anomaly\")\n", + "# The model should predict most training samples as normal\n", + "train_pred = estimator.predict(X_train.values)\n", + "print(f\"Training predictions: {np.sum(train_pred==1)} normal, {np.sum(train_pred==-1)} 
anomaly\")\n", + "print(f\"Training anomaly rate: {np.sum(train_pred==-1)/len(train_pred)*100:.2f}%\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}