diff --git a/model_training/OCSVM/ocsvm_with_AE.ipynb b/model_training/OCSVM/ocsvm_with_AE.ipynb index f78a544..88db26d 100644 --- a/model_training/OCSVM/ocsvm_with_AE.ipynb +++ b/model_training/OCSVM/ocsvm_with_AE.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": 101, + "execution_count": null, "id": "87513def", "metadata": {}, "outputs": [], @@ -25,38 +25,17 @@ "sys.path.append(base_dir)\n", "print(base_dir)\n", "\n", - "# from tools import evaluation_tools\n", - "from sklearn.preprocessing import StandardScaler, MinMaxScaler\n", + "from sklearn.pipeline import Pipeline\n", "from sklearn.svm import OneClassSVM\n", "from sklearn.model_selection import GridSearchCV, KFold, ParameterGrid, train_test_split\n", "import matplotlib.pyplot as plt\n", "import tensorflow as tf\n", "import pickle\n", - "import evaluation_tools\n", - "from sklearn.metrics import (roc_auc_score, accuracy_score, precision_score, \n", + "from tools import evaluation_tools, scaler\n", + "from sklearn.metrics import (balanced_accuracy_score, accuracy_score, precision_score, \n", " recall_score, f1_score, confusion_matrix, classification_report) " ] }, - { - "cell_type": "code", - "execution_count": 13, - "id": "1c303823", - "metadata": {}, - "outputs": [], - "source": [ - "print(tf.__version__)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "46e29f47", - "metadata": {}, - "outputs": [], - "source": [ - "os.getcwd()" - ] - }, { "cell_type": "markdown", "id": "83dc3a63", @@ -67,18 +46,17 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "id": "0eef0bc8", "metadata": {}, "outputs": [], "source": [ - "# data_path = Path(r\"C:\\Users\\micha\\FAUbox\\WS2526_Fahrsimulator_MSY (Celina Korzer)\\AU_dataset\\output_windowed.parquet\")\n", - "data_path = Path(r\"output_windowed.parquet\")" + "data_path = Path(r\"/home/jovyan/data-paulusjafahrsimulator-gpu/first_AU_dataset/output_windowed.parquet\")" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "id": "597880bb", "metadata": {}, "outputs": [], @@ -88,7 +66,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "id": "fa0f0fec", "metadata": {}, "outputs": [], @@ -110,7 +88,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "id": "897a2342", "metadata": {}, "outputs": [], @@ -138,7 +116,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "id": "f6f59455", "metadata": {}, "outputs": [], @@ -167,7 +145,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "id": "f215beb5", "metadata": {}, "outputs": [], @@ -177,7 +155,7 @@ "# Prepare training data (only normal/low data)\n", "train_data = low_all[low_all['subjectID'].isin(train_subjects)][['subjectID'] + au_columns].copy()\n", "\n", - "# Prepare validation data (normal and anomaly) - 500 samples each\n", + "# Prepare validation data (normal and anomaly) \n", "val_normal_data = low_all[low_all['subjectID'].isin(val_subjects)][['subjectID'] + au_columns].copy()\n", "val_high_data = high_all[high_all['subjectID'].isin(val_subjects)][['subjectID'] + au_columns].copy()\n", "val_normal_data = val_normal_data.sample(n=1000, random_state=42)\n", @@ -202,148 +180,36 @@ }, { "cell_type": "code", - "execution_count": 28, - "id": "d3ebae31", - "metadata": {}, - "outputs": [], - "source": [ - "def fit_normalizer(train_data, au_columns, method='standard', scope='global'):\n", - " \"\"\"\n", - " Fit normalization scalers on 
training data.\n", - " \n", - " Parameters:\n", - " -----------\n", - " train_data : pd.DataFrame\n", - " Training dataframe with AU columns and subjectID\n", - " au_columns : list\n", - " List of AU column names to normalize\n", - " method : str, default='standard'\n", - " Normalization method: 'standard' for StandardScaler or 'minmax' for MinMaxScaler\n", - " scope : str, default='global'\n", - " Normalization scope: 'subject' for per-subject or 'global' for across all subjects\n", - " \n", - " Returns:\n", - " --------\n", - " dict\n", - " Dictionary containing fitted scalers\n", - " \"\"\"\n", - " # Select scaler based on method\n", - " if method == 'standard':\n", - " Scaler = StandardScaler\n", - " elif method == 'minmax':\n", - " Scaler = MinMaxScaler\n", - " else:\n", - " raise ValueError(\"method must be 'standard' or 'minmax'\")\n", - " \n", - " scalers = {}\n", - " \n", - " if scope == 'subject':\n", - " # Fit one scaler per subject\n", - " for subject in train_data['subjectID'].unique():\n", - " subject_mask = train_data['subjectID'] == subject\n", - " scaler = Scaler()\n", - " scaler.fit(train_data.loc[subject_mask, au_columns])\n", - " scalers[subject] = scaler\n", - " \n", - " elif scope == 'global':\n", - " # Fit one scaler for all subjects\n", - " scaler = Scaler()\n", - " scaler.fit(train_data[au_columns])\n", - " scalers['global'] = scaler\n", - " \n", - " else:\n", - " raise ValueError(\"scope must be 'subject' or 'global'\")\n", - " \n", - " return {'scalers': scalers, 'method': method, 'scope': scope}" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "f586a99b", - "metadata": {}, - "outputs": [], - "source": [ - "def apply_normalizer(data, au_columns, normalizer_dict):\n", - " \"\"\"\n", - " Apply fitted normalization scalers to data.\n", - " \n", - " Parameters:\n", - " -----------\n", - " data : pd.DataFrame\n", - " Dataframe with AU columns and subjectID\n", - " au_columns : list\n", - " List of AU column names to normalize\n", - " normalizer_dict : dict\n", - " Dictionary containing fitted scalers from fit_normalizer()\n", - " \n", - " Returns:\n", - " --------\n", - " pd.DataFrame\n", - " DataFrame with normalized AU columns\n", - " \"\"\"\n", - " normalized_data = data.copy()\n", - " scalers = normalizer_dict['scalers']\n", - " scope = normalizer_dict['scope']\n", - " \n", - " if scope == 'subject':\n", - " # Apply per-subject normalization\n", - " for subject in data['subjectID'].unique():\n", - " subject_mask = data['subjectID'] == subject\n", - " \n", - " # Use the subject's scaler if available, otherwise use a fitted scaler from training\n", - " if subject in scalers:\n", - " scaler = scalers[subject]\n", - " else:\n", - " # For new subjects not seen in training, use the first available scaler\n", - " # (This is a fallback - ideally all test subjects should be in training for subject-level normalization)\n", - " print(f\"Warning: Subject {subject} not found in training data. 
Using fallback scaler.\")\n", - " scaler = list(scalers.values())[0]\n", - " \n", - " normalized_data.loc[subject_mask, au_columns] = scaler.transform(\n", - " data.loc[subject_mask, au_columns]\n", - " )\n", - " \n", - " elif scope == 'global':\n", - " # Apply global normalization\n", - " scaler = scalers['global']\n", - " normalized_data[au_columns] = scaler.transform(data[au_columns])\n", - " \n", - " return normalized_data" - ] - }, - { - "cell_type": "code", - "execution_count": 29, + "execution_count": null, "id": "74e363aa", "metadata": {}, "outputs": [], "source": [ "# Cell 3: Fit normalizer on training data\n", - "normalizer = fit_normalizer(train_data, au_columns, method='standard', scope='global')\n", + "normalizer = scaler.fit_normalizer(train_data, au_columns, method='minmax', scope='global')\n", "print(\"Normalizer fitted on training data\")" ] }, { "cell_type": "code", - "execution_count": 30, + "execution_count": null, "id": "5dd6c4b7", "metadata": {}, "outputs": [], "source": [ "# Cell 4: Apply normalization to all datasets\n", - "train_normalized = apply_normalizer(train_data, au_columns, normalizer)\n", - "val_normal_normalized = apply_normalizer(val_normal_data, au_columns, normalizer)\n", - "val_high_normalized = apply_normalizer(val_high_data, au_columns, normalizer)\n", - "test_normal_normalized = apply_normalizer(test_normal_data, au_columns, normalizer)\n", - "test_high_normalized = apply_normalizer(test_high_data, au_columns, normalizer)\n", + "train_normalized = scaler.apply_normalizer(train_data, au_columns, normalizer)\n", + "val_normal_normalized = scaler.apply_normalizer(val_normal_data, au_columns, normalizer)\n", + "val_high_normalized = scaler.apply_normalizer(val_high_data, au_columns, normalizer)\n", + "test_normal_normalized = scaler.apply_normalizer(test_normal_data, au_columns, normalizer)\n", + "test_high_normalized = scaler.apply_normalizer(test_high_data, au_columns, normalizer)\n", "\n", "print(\"Normalization applied to all datasets\")" ] }, { "cell_type": "code", - "execution_count": 88, + "execution_count": null, "id": "624a5374", "metadata": {}, "outputs": [], @@ -353,27 +219,21 @@ "X_train = train_normalized[au_columns].copy()\n", "X_val_normal = val_normal_normalized[au_columns].copy()\n", "X_val_high = val_high_normalized[au_columns].copy()\n", - "X_test_high = test_normal_normalized[au_columns].copy()\n", - "X_test_normal = test_high_normalized[au_columns].copy()\n", - "# Combine train and validation sets for grid search\n", - "# X_grid_search = pd.concat([X_train, X_val_normal, X_val_high], ignore_index=True)\n", + "X_test_high = test_high_normalized[au_columns].copy()\n", + "X_test_normal = test_normal_normalized[au_columns].copy()\n", + "\n", "\n", "# Create labels for grid search\n", "y_train = np.ones(len(X_train)) # 1 for normal (training)\n", "y_val_normal = np.ones(len(X_val_normal)) # 1 for normal (validation)\n", "y_val_high = -np.ones(len(X_val_high)) # -1 for anomalies (validation)\n", "y_test_normal = np.ones(len(X_test_normal))\n", - "y_test_high = -np.ones(len(X_test_high))\n", - "\n", - "# y_grid_search = np.concatenate([y_train, y_val_normal, y_val_high])\n", - "\n", - "# print(f\"Grid search data shape: {X_grid_search.shape}\")\n", - "# print(f\"Labels distribution: Normal={np.sum(y_grid_search==1)}, Anomaly={np.sum(y_grid_search==-1)}\")" + "y_test_high = -np.ones(len(X_test_high))" ] }, { "cell_type": "code", - "execution_count": 38, + "execution_count": null, "id": "9bc3f4d3", "metadata": {}, "outputs": [], @@ -381,43 
+241,192 @@ "X_train.shape" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "50fc80dc-fe16-4917-aad6-0dbaa1ce5ef9", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install keras-tuner --quiet # only needed once\n", + "\n", + "import tensorflow as tf\n", + "from tensorflow import keras\n", + "from keras_tuner import HyperModel\n", + "from keras_tuner.tuners import RandomSearch\n", + "\n", + "# 1️⃣ Define the HyperModel\n", + "class AutoencoderHyperModel(HyperModel):\n", + " def __init__(self, input_dim):\n", + " self.input_dim = input_dim\n", + "\n", + " def build(self, hp):\n", + " reg = hp.Float(\"l2_reg\", min_value=1e-5, max_value=0.01, sampling=\"log\")\n", + " lr = hp.Float(\"learning_rate\", 1e-4, 1e-2, sampling=\"log\")\n", + "\n", + " # Encoder\n", + " encoder = keras.Sequential([\n", + " keras.layers.Dense(\n", + " units=hp.Int(\"enc_units1\", min_value=10, max_value=self.input_dim, step=10),\n", + " activation=None,\n", + " kernel_regularizer=keras.regularizers.l2(reg)\n", + " ),\n", + " keras.layers.LeakyReLU(alpha=0.1),\n", + " keras.layers.Dense(\n", + " units=hp.Int(\"enc_units2\", min_value=5, max_value=20, step=1),\n", + " activation=tf.keras.layers.LeakyReLU(alpha=0.1),\n", + " kernel_regularizer=keras.regularizers.l2(reg)\n", + " ),\n", + " keras.layers.Dense(\n", + " units=2, # Bottleneck\n", + " activation='linear',\n", + " kernel_regularizer=keras.regularizers.l2(reg)\n", + " ),\n", + " ])\n", + "\n", + " # Decoder\n", + " decoder = keras.Sequential([\n", + " keras.layers.Dense(\n", + " units=hp.Int(\"dec_units1\", min_value=5, max_value=20, step=1),\n", + " activation=tf.keras.layers.LeakyReLU(alpha=0.1),\n", + " kernel_regularizer=keras.regularizers.l2(reg)\n", + " ),\n", + " keras.layers.Dense(\n", + " units=hp.Int(\"dec_units2\", min_value=10, max_value=self.input_dim, step=10),\n", + " activation=tf.keras.layers.LeakyReLU(alpha=0.1),\n", + " kernel_regularizer=keras.regularizers.l2(reg)\n", + " ),\n", + " keras.layers.Dense(\n", + " units=self.input_dim,\n", + " activation='linear',\n", + " kernel_regularizer=keras.regularizers.l2(reg)\n", + " ),\n", + " ])\n", + "\n", + " # Autoencoder\n", + " inputs = keras.Input(shape=(self.input_dim,))\n", + " encoded = encoder(inputs)\n", + " decoded = decoder(encoded)\n", + " autoencoder = keras.Model(inputs, decoded)\n", + "\n", + " autoencoder.compile(\n", + " optimizer=keras.optimizers.Adam(learning_rate=lr),\n", + " loss='mse'\n", + " )\n", + "\n", + " return autoencoder\n", + "\n", + "# 2️⃣ Set up the RandomSearch tuner\n", + "hypermodel = AutoencoderHyperModel(input_dim=X_train.shape[1])\n", + "\n", + "tuner = RandomSearch(\n", + " hypermodel,\n", + " objective='val_loss',\n", + " max_trials=10, # number of parameter combinations to try\n", + " executions_per_trial=1, # number of training runs per combination\n", + " directory='tuner_dir',\n", + " project_name='oc_ae'\n", + ")\n", + "\n", + "# 3️⃣ Run the hyperparameter search\n", + "tuner.search(\n", + " X_train, X_train,\n", + " epochs=100,\n", + " batch_size=64,\n", + " validation_data=(X_val_normal, X_val_normal),\n", + " verbose=0\n", + ")\n", + "\n", + "# 4️⃣ Retrieve the best architecture\n", + "best_model = tuner.get_best_models(num_models=1)[0]\n", + "best_hyperparameters = tuner.get_best_hyperparameters(1)[0]\n", + "\n", + "print(\"Best hyperparameters:\", best_hyperparameters.values)\n" + ] + }, { "cell_type": "markdown", "id": "362c0a6f", "metadata": {}, "source": [ - "AE training" + "\n", + "Best hyperparameters: {'l2_reg': 1.3757411430582133e-05, 
'learning_rate': 0.007321002854350309, 'enc_units1': 20, 'enc_units2': 16, 'dec_units1': 14, 'dec_units2': 10}" ] }, { "cell_type": "code", - "execution_count": 78, + "execution_count": null, "id": "da6eab4f", "metadata": {}, "outputs": [], "source": [ - "reg = 0.0001\n", - "encoder = tf.keras.Sequential(\n", - " [\n", - " tf.keras.layers.Dense(units=X_train.shape[1], activation='relu', kernel_regularizer=tf.keras.regularizers.l2(reg)),\n", - " tf.keras.layers.Dense(units=10, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(reg)),\n", - " tf.keras.layers.Dense(units=5, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(reg))\n", + "# reg = 0.1\n", + "# encoder = tf.keras.Sequential(\n", + "# [\n", + "# tf.keras.layers.Dense(units=X_train.shape[1], activation='relu', kernel_regularizer=tf.keras.regularizers.l2(reg)),\n", + "# tf.keras.layers.Dense(units=10, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(reg)),\n", + "# tf.keras.layers.Dense(units=5, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(reg)),\n", " \n", - " ]\n", - ")\n", + "# ]\n", + "# )\n", "\n", - "decoder = tf.keras.Sequential(\n", - " [\n", - " tf.keras.layers.Dense(units=5,activation='relu', kernel_regularizer=tf.keras.regularizers.l2(reg)),\n", - " tf.keras.layers.Dense(units=10, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(reg)),\n", - " tf.keras.layers.Dense(units=X_train.shape[1], activation='linear', kernel_regularizer=tf.keras.regularizers.l2(reg))\n", - " ]\n", - ")" + "# decoder = tf.keras.Sequential(\n", + "# [\n", + "# tf.keras.layers.Dense(units=5,activation='relu', kernel_regularizer=tf.keras.regularizers.l2(reg)),\n", + "# tf.keras.layers.Dense(units=10, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(reg)),\n", + "# tf.keras.layers.Dense(units=X_train.shape[1], activation='linear', kernel_regularizer=tf.keras.regularizers.l2(reg))\n", + "# ]\n", + "# )\n", + "\n", + "\n", + "\n", + "reg = 1e-5\n", + "\n", + "# ENCODER\n", + "encoder = tf.keras.Sequential([\n", + " tf.keras.layers.Dense(\n", + " units=X_train.shape[1],\n", + " activation=None,\n", + " kernel_regularizer=tf.keras.regularizers.l2(reg)\n", + " ),\n", + " tf.keras.layers.LeakyReLU(alpha=0.1),\n", + " \n", + " tf.keras.layers.Dense(\n", + " units=12,\n", + " activation=tf.keras.layers.LeakyReLU(negative_slope=0.1),\n", + " kernel_regularizer=tf.keras.regularizers.l2(reg)\n", + " ),\n", + " \n", + " tf.keras.layers.Dense(\n", + " units=8,\n", + " activation='linear', # Bottleneck stays linear\n", + " kernel_regularizer=tf.keras.regularizers.l2(reg)\n", + " ),\n", + "])\n", + "\n", + "# DECODER\n", + "decoder = tf.keras.Sequential([\n", + " tf.keras.layers.Dense(\n", + " units=8,\n", + " activation=tf.keras.layers.LeakyReLU(negative_slope=0.1),\n", + " kernel_regularizer=tf.keras.regularizers.l2(reg)\n", + " ),\n", + " tf.keras.layers.Dense(\n", + " units=12,\n", + " activation=tf.keras.layers.LeakyReLU(negative_slope=0.1),\n", + " kernel_regularizer=tf.keras.regularizers.l2(reg)\n", + " ),\n", + " tf.keras.layers.Dense(\n", + " units=X_train.shape[1],\n", + " activation='linear',\n", + " kernel_regularizer=tf.keras.regularizers.l2(reg)\n", + " ),\n", + "])\n" ] }, { "cell_type": "code", - "execution_count": 79, + "execution_count": null, "id": "d60c566a", "metadata": {}, "outputs": [], @@ -427,54 +436,61 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": null, "id": "1bb4321d", "metadata": {}, "outputs": [], "source": [ 
"autoencoder.compile(\n", " optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),\n", - " loss='mse',\n", - " \n", + " loss='mse'\n", ")" ] }, { "cell_type": "code", - "execution_count": 81, - "id": "ffb09ce6", + "execution_count": null, + "id": "eff138ee-b7b4-41f2-b97e-5dfe573a0e42", "metadata": {}, "outputs": [], "source": [ "history = autoencoder.fit(\n", " X_train, X_train, # Input and target are the same for autoencoder\n", - " epochs=500,\n", + " epochs=200,\n", " batch_size=64,\n", " validation_data=(X_val_normal, X_val_normal),\n", " verbose=1\n", ")" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "01f5691b-d12c-416b-bf60-f351f8c1cd83", + "metadata": {}, + "outputs": [], + "source": [ + "save_path = Path(\"/home/jovyan/data-paulusjafahrsimulator-gpu/saved_models/encoder_model_2_neurons_minmax.keras\")\n", + "encoder.save(save_path)" + ] + }, { "cell_type": "markdown", - "id": "211e4720", + "id": "a74bba5f-c147-43e9-99bd-92a4d63aa624", "metadata": {}, "source": [ - "save of encoder" + "## Load of encoder" ] }, { "cell_type": "code", - "execution_count": 82, + "execution_count": null, "id": "096b858c", "metadata": {}, "outputs": [], "source": [ - "# Save\n", - "encoder.save('encoder_model.keras')\n", - "\n", - "# Load later\n", - "encoder_loaded = tf.keras.models.load_model('encoder_model.keras')" + "load_path = Path(\"/home/jovyan/data-paulusjafahrsimulator-gpu/saved_models/encoder_model_2_neurons_minmax.keras\")\n", + "encoder = tf.keras.models.load_model(load_path)" ] }, { @@ -482,12 +498,12 @@ "id": "1da405de", "metadata": {}, "source": [ - "OCSVM Training with Encoder" + "## OCSVM Training with Encoder" ] }, { "cell_type": "code", - "execution_count": 83, + "execution_count": null, "id": "cd70f560", "metadata": {}, "outputs": [], @@ -501,12 +517,22 @@ }, { "cell_type": "code", - "execution_count": 92, + "execution_count": null, + "id": "cc036af7-5fb8-45aa-9420-07d2b772579e", + "metadata": {}, + "outputs": [], + "source": [ + "X_train_encoded[-20:, :5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, "id": "0a292d06", "metadata": {}, "outputs": [], "source": [ - "ocsvm = OneClassSVM(kernel='rbf', gamma='auto', nu=0.1)\n", + "ocsvm = OneClassSVM(kernel='rbf', gamma='auto', nu=0.2)\n", "ocsvm.fit(X_train_encoded)\n", "\n", "# Predict on validation/test sets\n", @@ -517,7 +543,126 @@ }, { "cell_type": "code", - "execution_count": 93, + "execution_count": null, + "id": "759118d8-989d-489c-9d35-331454b4795e", + "metadata": {}, + "outputs": [], + "source": [ + "all_zero = {\n", + " \"X_train_encoded\": np.all(X_train_encoded == 0),\n", + " \"X_val_normal_encoded\": np.all(X_val_normal_encoded == 0),\n", + " \"X_val_high_encoded\": np.all(X_val_high_encoded == 0),\n", + " \"X_test_normal_encoded\": np.all(X_test_normal_encoded == 0),\n", + " \"X_test_high_encoded\": np.all(X_test_high_encoded == 0),\n", + "}\n", + "\n", + "print(all_zero)\n", + "print(X_train_encoded.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c0ad242-5db5-4346-a718-278c87426322", + "metadata": {}, + "outputs": [], + "source": [ + "# fig, axes = plt.subplots(1, 3, figsize=(18, 5))\n", + "\n", + "# # Subplot A: Normal\n", + "# axes[0].scatter(\n", + "# X_val_normal_encoded[:, 0],\n", + "# X_val_normal_encoded[:, 1],\n", + "# color=\"blue\",\n", + "# label=\"Normal\"\n", + "# )\n", + "# axes[0].set_title(\"Val Normal (encoded)\")\n", + "# axes[0].set_xlabel(\"latent feature 1\")\n", + "# axes[0].set_ylabel(\"latent feature 2\")\n", + "# 
axes[0].legend()\n", + "\n", + "# # Subplot B: High\n", + "# axes[1].scatter(\n", + "# X_val_high_encoded[:, 0],\n", + "# X_val_high_encoded[:, 1],\n", + "# color=\"orange\",\n", + "# label=\"High\"\n", + "# )\n", + "# axes[1].set_title(\"ValHigh (encoded)\")\n", + "# axes[1].set_xlabel(\"latent feature 1\")\n", + "# axes[1].set_ylabel(\"latent feature 2\")\n", + "# axes[1].legend()\n", + "\n", + "# # Subplot C: Both\n", + "# axes[2].scatter(\n", + "# X_val_normal_encoded[:, 0],\n", + "# X_val_normal_encoded[:, 1],\n", + "# color=\"blue\",\n", + "# label=\"Normal\"\n", + "# )\n", + "# axes[2].scatter(\n", + "# X_val_high_encoded[:, 0],\n", + "# X_val_high_encoded[:, 1],\n", + "# color=\"orange\",\n", + "# label=\"High\"\n", + "# )\n", + "# axes[2].set_title(\"Normal vs High (encoded)\")\n", + "# axes[2].set_xlabel(\"latent feature 1\")\n", + "# axes[2].set_ylabel(\"latent feature 2\")\n", + "# axes[2].legend()\n", + "\n", + "\n", + "\n", + "latent_dim = 8\n", + "fig, axes = plt.subplots(2, 4, figsize=(20, 10))\n", + "axes = axes.flatten() # flatten to index easily\n", + "\n", + "for i in range(latent_dim):\n", + " axes[i].scatter(\n", + " X_val_normal_encoded[:, i],\n", + " [0]*X_val_normal_encoded.shape[0], # optional: place on a line for 1D visualization\n", + " color='blue',\n", + " label='Normal',\n", + " alpha=0.6\n", + " )\n", + " axes[i].scatter(\n", + " X_val_high_encoded[:, i],\n", + " [0]*X_val_high_encoded.shape[0], # same for High\n", + " color='orange',\n", + " label='High',\n", + " alpha=0.6\n", + " )\n", + " axes[i].set_title(f'Latent dim {i+1}')\n", + " axes[i].set_xlabel(f'Feature {i+1}')\n", + " axes[i].set_yticks([]) # hide y-axis as it's just a 1D comparison\n", + " axes[i].legend()\n", + " axes[i].grid(True)\n", + "\n", + "plt.tight_layout()\n", + "plt.show()\n", + "\n", + "\n", + "\n", + "plt.tight_layout()\n", + "plt.show()\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "82be2766-5f59-4019-9f29-7c6580ddd2fc", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "print(classification_report(y_true=np.concatenate([y_val_normal, y_val_high]),y_pred=val_predictions))" + ] + }, + { + "cell_type": "code", + "execution_count": null, "id": "e1e08cc3", "metadata": {}, "outputs": [], @@ -525,6 +670,26 @@ "print(classification_report(y_true=np.concatenate([y_test_normal, y_test_high]), y_pred=test_predictions))" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "c16bfeee-ea3f-4e4a-8afe-ac05d36c515b", + "metadata": {}, + "outputs": [], + "source": [ + "f1_score(y_true=np.concatenate([y_test_normal, y_test_high]), y_pred=test_predictions)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4f39eb5d-6189-4219-a4db-49808202fb8b", + "metadata": {}, + "outputs": [], + "source": [ + "balanced_accuracy_score(y_true=np.concatenate([y_test_normal, y_test_high]), y_pred=test_predictions)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -556,516 +721,149 @@ "id": "dbad13e2", "metadata": {}, "source": [ - "### compl" + "### One Class Gridsearch Approach" ] }, { "cell_type": "code", "execution_count": null, - "id": "45137e53", + "id": "b99a07d3-8cc9-46b2-bf33-6d3c40904f28", "metadata": {}, "outputs": [], "source": [ - "def simple_one_class_grid_search(estimator, param_grid, X_train, X_val_normal, X_val_high):\n", - " \"\"\"\n", - " Grid search with fixed train/validation split for novelty detection.\n", - " \n", - " Parameters:\n", - " -----------\n", - " estimator : OneClassSVM\n", - " 
The OCSVM estimator\n", - " param_grid : dict\n", - " Parameter grid to search\n", - " X_train : array-like\n", - " Normal training samples only (encoded features)\n", - " X_val_normal : array-like \n", - " Normal validation samples (encoded features)\n", - " X_val_high : array-like\n", - " High/anomaly validation samples (encoded features)\n", - " \"\"\"\n", - " \n", - " \n", - " # Combine validation data\n", - " X_val = np.concatenate([X_val_normal, X_val_high])\n", - " y_val = np.array([1] * len(X_val_normal) + [-1] * len(X_val_high))\n", - " \n", - " results = []\n", - " \n", - " for params in ParameterGrid(param_grid):\n", - " # Set parameters\n", - " estimator.set_params(**params)\n", - " \n", - " # Fit on training normal samples\n", - " estimator.fit(X_train)\n", - " \n", - " # Score on validation set\n", - " val_scores = estimator.score_samples(X_val)\n", - " roc_auc = roc_auc_score(y_val, val_scores)\n", - " \n", - " results.append({\n", - " 'params': params,\n", - " 'score': roc_auc\n", - " })\n", - " print(f\"Params {params}: ROC-AUC = {roc_auc:.4f}\")\n", - " \n", - " # Find best\n", - " best_idx = np.argmax([r['score'] for r in results])\n", - " best_params = results[best_idx]['params']\n", - " best_score = results[best_idx]['score']\n", - " \n", - " print(f\"\\nBest params: {best_params}\")\n", - " print(f\"Best ROC-AUC: {best_score:.4f}\")\n", - " \n", - " return best_params, best_score, results\n", - "\n", - "\n", - "def evaluate_ocsvm(ocsvm, X_normal, X_anomaly, set_name=\"Test\"):\n", - " \"\"\"\n", - " Evaluate OCSVM performance with comprehensive metrics.\n", - " \n", - " Parameters:\n", - " -----------\n", - " ocsvm : fitted OneClassSVM\n", - " Trained OCSVM model\n", - " X_normal : array-like\n", - " Normal samples\n", - " X_anomaly : array-like\n", - " Anomaly samples\n", - " set_name : str\n", - " Name of the dataset (e.g., \"Validation\", \"Test\")\n", - " \"\"\"\n", - " \n", - " # Combine data\n", - " X = np.vstack([X_normal, X_anomaly])\n", - " y_true = np.array([1] * len(X_normal) + [-1] * len(X_anomaly))\n", - " \n", - " # Predictions\n", - " y_pred = ocsvm.predict(X)\n", - " scores = ocsvm.score_samples(X)\n", - " \n", - " # Calculate metrics\n", - " roc_auc = roc_auc_score(y_true, scores)\n", - " accuracy = accuracy_score(y_true, y_pred)\n", - " \n", - " # For precision, recall, F1: treat normal (1) as positive class\n", - " precision = precision_score(y_true, y_pred, pos_label=1)\n", - " recall = recall_score(y_true, y_pred, pos_label=1)\n", - " f1 = f1_score(y_true, y_pred, pos_label=1)\n", - " \n", - " # Confusion matrix\n", - " cm = confusion_matrix(y_true, y_pred, labels=[1, -1])\n", - " tn, fp, fn, tp = cm[1, 1], cm[1, 0], cm[0, 1], cm[0, 0]\n", - " \n", - " # Print results\n", - " print(f\"\\n{'='*50}\")\n", - " print(f\"{set_name} Set Evaluation\")\n", - " print(f\"{'='*50}\")\n", - " print(f\"ROC-AUC Score: {roc_auc:.4f}\")\n", - " print(f\"Accuracy: {accuracy:.4f}\")\n", - " print(f\"Precision: {precision:.4f}\")\n", - " print(f\"Recall: {recall:.4f}\")\n", - " print(f\"F1-Score: {f1:.4f}\")\n", - " print(f\"\\nConfusion Matrix:\")\n", - " print(f\" Predicted\")\n", - " print(f\" Normal Anomaly\")\n", - " print(f\"Actual Normal {tp:6d} {fn:6d}\")\n", - " print(f\"Actual Anomaly {fp:6d} {tn:6d}\")\n", - " print(f\"\\nTrue Positives (Normal detected as Normal): {tp}\")\n", - " print(f\"False Negatives (Normal detected as Anomaly): {fn}\")\n", - " print(f\"False Positives (Anomaly detected as Normal): {fp}\")\n", - " print(f\"True Negatives (Anomaly 
detected as Anomaly): {tn}\")\n", - " \n", - " # Detailed classification report\n", - " print(f\"\\nClassification Report:\")\n", - " print(classification_report(y_true, y_pred, target_names=['Normal', 'Anomaly'], \n", - " labels=[1, -1]))\n", - " \n", - " return {\n", - " 'roc_auc': roc_auc,\n", - " 'accuracy': accuracy,\n", - " 'precision': precision,\n", - " 'recall': recall,\n", - " 'f1': f1,\n", - " 'confusion_matrix': cm,\n", - " 'predictions': y_pred,\n", - " 'scores': scores\n", - " }\n" - ] - }, - { - "cell_type": "code", - "execution_count": 89, - "id": "e54c2ebd", - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "# Usage example:\n", - "# 1. Grid search\n", - "best_params, best_score, all_results = simple_one_class_grid_search(\n", - " estimator=OneClassSVM(),\n", - " param_grid={\n", - " 'nu': np.linspace(0.01, 0.3, 10),\n", - " 'gamma': [*np.logspace(-2, 2, 10).tolist(), 'scale', 'auto'],\n", - " 'kernel': ['rbf']\n", - " },\n", - " X_train=X_train_encoded,\n", - " X_val_normal=X_val_normal_encoded,\n", - " X_val_high=X_val_high_encoded\n", - ")\n", - "\n", - "# 2. Train final model with best params\n", - "final_ocsvm = OneClassSVM(**best_params)\n", - "final_ocsvm.fit(X_train_encoded)\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 90, - "id": "f7637b60", - "metadata": {}, - "outputs": [], - "source": [ - "# 3. Evaluate on validation set\n", - "val_metrics = evaluate_ocsvm(\n", - " final_ocsvm, \n", - " X_val_normal_encoded, \n", - " X_val_high_encoded, \n", - " set_name=\"Validation\"\n", - ")\n", - "\n", - "# 4. Evaluate on test set\n" + "# X_combined = np.concatenate([X_train_encoded, X_val_normal_encoded, X_val_high_encoded], axis=0)\n", + "# y_combined = np.concatenate([\n", + "# np.ones(X_train_encoded.shape[0]+X_val_normal_encoded.shape[0]), # normal = 1\n", + "# -np.ones(X_val_high_encoded.shape[0]) # anomaly = -1\n", + "# ], axis=0)\n", + "X_combined = np.concatenate([X_train_encoded, X_val_high_encoded], axis=0)\n", + "y_combined = np.concatenate([\n", + " np.ones(X_train_encoded.shape[0]), # normal = 1\n", + " -np.ones(X_val_high_encoded.shape[0]) # anomaly = -1\n", + "], axis=0)" ] }, { "cell_type": "code", "execution_count": null, - "id": "9576230f", + "id": "e52a4859-08e3-4aad-922a-8ec7d4f75b12", "metadata": {}, "outputs": [], "source": [ - "test_metrics = evaluate_ocsvm(\n", - " final_ocsvm, \n", - " X_test_normal_encoded, \n", - " X_test_high_encoded, \n", - " set_name=\"Test\"\n", + "np.random.seed(42)\n", + "idx = np.arange(X_combined.shape[0])\n", + "anomaly_idx = idx[y_combined == -1]\n", + "normal_idx = idx[y_combined != -1]\n", + "\n", + "np.random.shuffle(normal_idx)\n", + "cv_splits = [\n", + " (normal_idx[train], np.concatenate([normal_idx[test], anomaly_idx]))\n", + " for train, test in KFold(n_splits=5).split(normal_idx)\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "02e9e79b-9a26-428b-872a-96120e80e699", + "metadata": {}, + "outputs": [], + "source": [ + "supervised_ocsvm = Pipeline([\n", + " ('ocsvm', OneClassSVM())\n", + "])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e3b14a88-2438-42a7-ab4d-ba4fed4457be", + "metadata": {}, + "outputs": [], + "source": [ + "param_grid = {\n", + " 'ocsvm__nu': [*np.linspace(0.01, 0.2, 10).tolist(), 0.2],\n", + " 'ocsvm__gamma': [*np.logspace(-2, 2, 20).tolist(), 'scale','auto'],\n", + " 'ocsvm__kernel': ['rbf', 'poly']\n", + "}\n", + "grid_search = GridSearchCV(\n", + " estimator=supervised_ocsvm,\n", + " 
param_grid=param_grid,\n", + " scoring=lambda est, X, y: balanced_accuracy_score(y, est.predict(X)),\n", + " cv=cv_splits,\n", + " n_jobs=-1,\n", + " verbose=1,\n", + " refit=False # Set to True to get best estimator\n", ")" ] }, { - "cell_type": "markdown", - "id": "55109590", + "cell_type": "code", + "execution_count": null, + "id": "6b24b9f3-b82a-4e6c-a5d7-d3cebc7c04c4", "metadata": {}, + "outputs": [], "source": [ - "## OneClass GridSearch (old)" + "grid_search.fit(X_combined, y_combined)" ] }, { "cell_type": "code", "execution_count": null, - "id": "671b47a1", + "id": "0a5aaca2-1623-4ad3-a612-d401ef76284e", "metadata": {}, "outputs": [], "source": [ - "def supervised_one_class_grid_search(estimator, param_grid, data, labels, seed=None):\n", - " np.random.seed(seed)\n", - " idx = np.arange(data.shape[0])\n", - " anomaly_idx = idx[labels==-1]\n", - " normal_idx = idx[labels!=-1]\n", + "# Results\n", + "print(f'Best parameters: {grid_search.best_params_}')\n", + "print(f'Best score: {grid_search.best_score_:.4f}')\n", + "print(f'Std: {grid_search.cv_results_[\"std_test_score\"][grid_search.best_index_]:.4f}')\n", "\n", - " np.random.shuffle(normal_idx)\n", + "# Access the best model\n", + "supervised_ocsvm.set_params(**grid_search.best_params_)\n", + "supervised_ocsvm.fit(X_train_encoded)\n", "\n", - " cv = [(normal_idx[pair[0]], np.concatenate([normal_idx[pair[1]], anomaly_idx], axis=0)) for pair in KFold().split(normal_idx)]\n", - " print(f\"CV Folds created:\")\n", - " for i, (train_idx, val_idx) in enumerate(cv):\n", - " print(f\" Fold {i+1}: Train={len(train_idx)} normal, Val={len(val_idx)} total ({len(val_idx)-len(anomaly_idx)} normal + {len(anomaly_idx)} anomaly)\")\n", - " grid_search = GridSearchCV(estimator=estimator,\n", - " param_grid=param_grid,\n", - " scoring=lambda est, X, y: roc_auc_score(y_true=y, y_score=est.score_samples(X)),\n", - " n_jobs=-2,\n", - " cv=cv,\n", - " verbose=1,\n", - " refit=False)\n", - " \n", - " grid_search.fit(data, labels)\n", - "\n", - " return grid_search" + "# Make predictions\n", + "predictions = supervised_ocsvm.predict(np.vstack([X_test_normal_encoded, X_test_high_encoded]))" ] }, { "cell_type": "code", "execution_count": null, - "id": "8fabf322", + "id": "5e02c19e-a2d4-4808-b66e-6fc431c9fa02", "metadata": {}, "outputs": [], "source": [ - "estimator = OneClassSVM()\n", - "grid_search = supervised_one_class_grid_search(estimator=estimator,\n", - " param_grid={'nu': np.linspace(0.01, 0.2, 10),\n", - " 'gamma': [*np.logspace(-2, 2, 10).tolist(), 'scale']},\n", - " data=X_grid_search,\n", - " labels=y_grid_search,\n", - " seed=42)\n", - "print(f'Best results with {grid_search.best_params_}:')\n", - "print(f'Validation results: {grid_search.cv_results_[\"mean_test_score\"][grid_search.best_index_]} +- {grid_search.cv_results_[\"std_test_score\"][grid_search.best_index_]}')" + "evaluation_tools.plot_confusion_matrix(true_labels=np.concatenate([y_test_normal, y_test_high]), predictions=predictions, label_names=['high', 'low'])" ] }, { "cell_type": "code", "execution_count": null, - "id": "651fc483", + "id": "ca037a2b-6dbd-4386-8e9f-f73a92dbb7a8", "metadata": {}, "outputs": [], "source": [ - "xticks = [f'({param_dict[\"nu\"]:.3f}, {param_dict[\"gamma\"]})' for param_dict in grid_search.cv_results_['params']]\n", - "\n", - "plt.figure(figsize=(30,10))\n", - "plt.grid('on')\n", - "plt.title('Means scores +- std.')\n", - "plt.ylabel('Mean balanced accuracy')\n", - "plt.xlabel(r'($\\nu$, $\\gamma$)')\n", - 
"plt.errorbar(np.arange(grid_search.cv_results_['mean_test_score'].shape[0]), grid_search.cv_results_['mean_test_score'], grid_search.cv_results_['std_test_score'])\n", - "plt.xticks(np.arange(grid_search.cv_results_['mean_test_score'].shape[0]), xticks, rotation='vertical')\n", - "plt.show()\n", - "\n", - "estimator.set_params(**grid_search.best_params_)\n", - "estimator.fit(X_grid_search[y_grid_search == 1].to_numpy())" + "balanced_accuracy_score(y_true=np.concatenate([y_test_normal, y_test_high]), y_pred=predictions)" ] }, { "cell_type": "code", "execution_count": null, - "id": "587f2301", + "id": "15a837f4-db3d-4dee-8cd1-e5d9118f82e8", "metadata": {}, "outputs": [], "source": [ - "# Cell 8: Prepare independent test set\n", - "X_test_normal = test_normal_normalized[au_columns].copy()\n", - "X_test_high = test_high_normalized[au_columns].copy()\n", - "\n", - "# Combine test sets\n", - "X_test = pd.concat([X_test_normal, X_test_high], ignore_index=True)\n", - "\n", - "# Create labels for test set\n", - "y_test_normal = np.ones(len(X_test_normal)) # 1 for normal\n", - "y_test_high = -np.ones(len(X_test_high)) # -1 for anomalies\n", - "y_test = np.concatenate([y_test_normal, y_test_high])\n", - "\n", - "print(f\"Test set shape: {X_test.shape}\")\n", - "print(f\"Test labels distribution: Normal={np.sum(y_test==1)}, Anomaly={np.sum(y_test==-1)}\")" + "f1_score(y_true=np.concatenate([y_test_normal, y_test_high]), y_pred=predictions)" ] }, { "cell_type": "code", "execution_count": null, - "id": "40703231", + "id": "9e412041-7534-40fe-8486-ee97349a6168", "metadata": {}, "outputs": [], - "source": [ - "# Get anomaly scores\n", - "y_scores = estimator.score_samples(X_test.values)\n", - "# Get predictions (-1 for anomaly, 1 for normal)\n", - "y_pred = estimator.predict(X_test.values)\n", - "print(classification_report(y_test, y_pred, target_names=['Anomaly', 'Normal']))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "815fbd78", - "metadata": {}, - "outputs": [], - "source": [ - "evaluation_tools.plot_confusion_matrix(y_test, y_pred, label_names=['Anomaly', 'Normal'])" - ] - }, - { - "cell_type": "markdown", - "id": "8bc13284", - "metadata": {}, - "source": [ - "### DEBUG" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1f0aa77a", - "metadata": {}, - "outputs": [], - "source": [ - "# Test on validation normal samples (should predict as normal)\n", - "val_normal_pred = estimator.predict(X_val_normal.values)\n", - "print(f\"Validation normal predictions: {np.sum(val_normal_pred==1)} normal, {np.sum(val_normal_pred==-1)} anomaly\")\n", - "\n", - "# Test on validation anomaly samples (should predict as anomaly)\n", - "val_high_pred = estimator.predict(X_val_high.values)\n", - "print(f\"Validation anomaly predictions: {np.sum(val_high_pred==1)} normal, {np.sum(val_high_pred==-1)} anomaly\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "514cc22f", - "metadata": {}, - "outputs": [], - "source": [ - "# The model should predict most training samples as normal\n", - "train_pred = estimator.predict(X_train.values)\n", - "print(f\"Training predictions: {np.sum(train_pred==1)} normal, {np.sum(train_pred==-1)} anomaly\")\n", - "print(f\"Training anomaly rate: {np.sum(train_pred==-1)/len(train_pred)*100:.2f}%\")" - ] - }, - { - "cell_type": "markdown", - "id": "3c9e1e12", - "metadata": {}, - "source": [ - "### to delete" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "60d69a97", - "metadata": {}, - "outputs": [], 
- "source": [ - "# Cell 5: Split data properly for grid search\n", - "X_train = train_normalized[au_columns].copy()\n", - "X_val_normal = val_normal_normalized[au_columns].copy()\n", - "X_val_high = val_high_normalized[au_columns].copy()\n", - "\n", - "# Grid search ONLY on train + val_normal\n", - "# Keep val_high completely separate for validation scoring\n", - "X_grid_search_train = X_train.copy()\n", - "X_grid_search_val = pd.concat([X_val_normal, X_val_high], ignore_index=True)\n", - "\n", - "y_grid_search_train = np.ones(len(X_train))\n", - "y_grid_search_val = np.concatenate([\n", - " np.ones(len(X_val_normal)),\n", - " -np.ones(len(X_val_high))\n", - "])\n", - "\n", - "print(f\"Grid search train shape: {X_grid_search_train.shape}\")\n", - "print(f\"Grid search val shape: {X_grid_search_val.shape}\")\n", - "print(f\"Val labels: Normal={np.sum(y_grid_search_val==1)}, Anomaly={np.sum(y_grid_search_val==-1)}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2d53d03e", - "metadata": {}, - "outputs": [], - "source": [ - "# Modified grid search function for simple train/val split\n", - "def simple_one_class_grid_search(estimator, param_grid, X_train, X_val, y_val):\n", - " \"\"\"\n", - " Grid search with fixed train/validation split.\n", - " \n", - " Parameters:\n", - " -----------\n", - " X_train : array-like\n", - " Normal training samples only\n", - " X_val : array-like \n", - " Validation samples (normal + anomalies)\n", - " y_val : array-like\n", - " Labels for validation samples (1=normal, -1=anomaly)\n", - " \"\"\"\n", - " from sklearn.model_selection import ParameterGrid\n", - " from sklearn.metrics import roc_auc_score\n", - " \n", - " results = []\n", - " \n", - " for params in ParameterGrid(param_grid):\n", - " # Set parameters\n", - " estimator.set_params(**params)\n", - " \n", - " # Fit on training normal samples\n", - " estimator.fit(X_train)\n", - " \n", - " # Score on validation set\n", - " val_scores = estimator.score_samples(X_val)\n", - " roc_auc = roc_auc_score(y_val, val_scores)\n", - " \n", - " results.append({\n", - " 'params': params,\n", - " 'score': roc_auc\n", - " })\n", - " print(f\"Params {params}: ROC-AUC = {roc_auc:.4f}\")\n", - " \n", - " # Find best\n", - " best_idx = np.argmax([r['score'] for r in results])\n", - " best_params = results[best_idx]['params']\n", - " best_score = results[best_idx]['score']\n", - " \n", - " return best_params, best_score, results\n", - "\n", - "# Run grid search\n", - "best_params, best_score, all_results = simple_one_class_grid_search(\n", - " estimator=OneClassSVM(),\n", - " param_grid={\n", - " 'nu': np.linspace(0.01, 0.3, 10),\n", - " 'gamma': [*np.logspace(-2, 2, 10).tolist(), 'scale']\n", - " },\n", - " X_train=X_grid_search_train.to_numpy(),\n", - " X_val=X_grid_search_val.to_numpy(),\n", - " y_val=y_grid_search_val\n", - ")\n", - "\n", - "print(f\"\\nBest params: {best_params}\")\n", - "print(f\"Best validation ROC-AUC: {best_score:.4f}\")\n", - "\n", - "# Fit final model on training data with best params\n", - "estimator = OneClassSVM(**best_params)\n", - "estimator.fit(X_grid_search_train.to_numpy())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "03bcef39", - "metadata": {}, - "outputs": [], - "source": [ - "# Get anomaly scores\n", - "y_scores = estimator.score_samples(X_test.values)\n", - "# Get predictions (-1 for anomaly, 1 for normal)\n", - "y_pred = estimator.predict(X_test.values)\n", - "print(classification_report(y_test, y_pred, target_names=['Anomaly', 
'Normal']))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "acfb34aa", - "metadata": {}, - "outputs": [], - "source": [ - "evaluation_tools.plot_confusion_matrix(y_test, y_pred, label_names=['Anomaly', 'Normal'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7d4bf434", - "metadata": {}, - "outputs": [], - "source": [ - "# Test on validation normal samples (should predict as normal)\n", - "val_normal_pred = estimator.predict(X_val_normal.values)\n", - "print(f\"Validation normal predictions: {np.sum(val_normal_pred==1)} normal, {np.sum(val_normal_pred==-1)} anomaly\")\n", - "\n", - "# Test on validation anomaly samples (should predict as anomaly)\n", - "val_high_pred = estimator.predict(X_val_high.values)\n", - "print(f\"Validation anomaly predictions: {np.sum(val_high_pred==1)} normal, {np.sum(val_high_pred==-1)} anomaly\")\n", - "# The model should predict most training samples as normal\n", - "train_pred = estimator.predict(X_train.values)\n", - "print(f\"Training predictions: {np.sum(train_pred==1)} normal, {np.sum(train_pred==-1)} anomaly\")\n", - "print(f\"Training anomaly rate: {np.sum(train_pred==-1)/len(train_pred)*100:.2f}%\")" - ] + "source": [] } ], "metadata": { @@ -1073,6 +871,18 @@ "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.10" } }, "nbformat": 4,