minor fixes to new paths / dataset with all columns
This commit is contained in:
parent
3d8c7c6639
commit
af3f9d16b2
@ -1,5 +1,13 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cc08936c",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Insights into the dataset with histograms and scatter plots"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1014c5e0",
|
||||
@ -17,7 +25,8 @@
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"import matplotlib.pyplot as plt"
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"from pathlib import Path"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -27,7 +36,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"path =r\"C:\\Users\\micha\\FAUbox\\WS2526_Fahrsimulator_MSY (Celina Korzer)\\AU_dataset\\output_windowed.parquet\"\n",
|
||||
"path = Path(r\"/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/50s_25Hz_dataset.parquet\")\n",
|
||||
"df = pd.read_parquet(path=path)"
|
||||
]
|
||||
},
|
||||
@ -104,21 +113,27 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Get all columns that start with 'AU'\n",
|
||||
"au_columns = [col for col in low_all.columns if col.startswith('AU')]\n",
|
||||
"face_au_cols = [c for c in low_all.columns if c.startswith(\"FACE_AU\")]\n",
|
||||
"eye_cols = ['Fix_count_short_66_150', 'Fix_count_medium_300_500',\n",
|
||||
" 'Fix_count_long_gt_1000', 'Fix_count_100', 'Fix_mean_duration',\n",
|
||||
" 'Fix_median_duration', 'Sac_count', 'Sac_mean_amp', 'Sac_mean_dur',\n",
|
||||
" 'Sac_median_dur', 'Blink_count', 'Blink_mean_dur', 'Blink_median_dur',\n",
|
||||
" 'Pupil_mean', 'Pupil_IPA']\n",
|
||||
"\n",
|
||||
"cols = face_au_cols+eye_cols\n",
|
||||
"\n",
|
||||
"# Calculate number of rows and columns for subplots\n",
|
||||
"n_cols = len(au_columns)\n",
|
||||
"n_rows = 4\n",
|
||||
"n_cols = len(cols)\n",
|
||||
"n_rows = 7\n",
|
||||
"n_cols_subplot = 5\n",
|
||||
"\n",
|
||||
"# Create figure with subplots\n",
|
||||
"fig, axes = plt.subplots(n_rows, n_cols_subplot, figsize=(20, 16))\n",
|
||||
"axes = axes.flatten()\n",
|
||||
"fig.suptitle('Action Unit (AU) Distributions: Low vs High', fontsize=20, fontweight='bold', y=0.995)\n",
|
||||
"fig.suptitle('Feature Distributions: Low vs High', fontsize=20, fontweight='bold', y=0.995)\n",
|
||||
"\n",
|
||||
"# Create histogram for each AU column\n",
|
||||
"for idx, col in enumerate(au_columns):\n",
|
||||
"for idx, col in enumerate(cols):\n",
|
||||
" ax = axes[idx]\n",
|
||||
" \n",
|
||||
" # Plot overlapping histograms\n",
|
||||
@ -133,18 +148,48 @@
|
||||
" ax.grid(True, alpha=0.3)\n",
|
||||
"\n",
|
||||
"# Hide any unused subplots\n",
|
||||
"for idx in range(len(au_columns), len(axes)):\n",
|
||||
"for idx in range(len(cols), len(axes)):\n",
|
||||
" axes[idx].set_visible(False)\n",
|
||||
"\n",
|
||||
"# Adjust layout\n",
|
||||
"plt.tight_layout()\n",
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "6cd53cdb",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Create figure with subplots\n",
|
||||
"fig, axes = plt.subplots(n_rows, n_cols_subplot, figsize=(20, 16))\n",
|
||||
"axes = axes.flatten()\n",
|
||||
"fig.suptitle('Feature Scatter: Low vs High', fontsize=20, fontweight='bold', y=0.995)\n",
|
||||
"\n",
|
||||
"for idx, col in enumerate(cols):\n",
|
||||
" ax = axes[idx]\n",
|
||||
"\n",
|
||||
" # Scatterplots\n",
|
||||
" ax.scatter(range(len(low_all[col])), low_all[col], alpha=0.6, color='blue', label='low_all', s=10)\n",
|
||||
" ax.scatter(range(len(high_all[col])), high_all[col], alpha=0.6, color='red', label='high_all', s=10)\n",
|
||||
"\n",
|
||||
" ax.set_title(col, fontsize=10, fontweight='bold')\n",
|
||||
" ax.set_xlabel('Sample index', fontsize=8)\n",
|
||||
" ax.set_ylabel('Value', fontsize=8)\n",
|
||||
" ax.legend(fontsize=8)\n",
|
||||
" ax.grid(True, alpha=0.3)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"plt.tight_layout()\n",
|
||||
"plt.show()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "base",
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
@ -158,7 +203,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.5"
|
||||
"version": "3.12.10"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
@ -28,7 +28,7 @@
|
||||
"sys.path.append(base_dir)\n",
|
||||
"print(base_dir)\n",
|
||||
"\n",
|
||||
"from tools import evaluation_tools\n",
|
||||
"from Fahrsimulator_MSY2526_AI.model_training.tools import evaluation_tools\n",
|
||||
"from sklearn.preprocessing import StandardScaler, MinMaxScaler\n",
|
||||
"from sklearn.ensemble import IsolationForest\n",
|
||||
"from sklearn.model_selection import GridSearchCV, KFold\n",
|
||||
@ -52,7 +52,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data_path = Path(r\"C:\\Users\\micha\\FAUbox\\WS2526_Fahrsimulator_MSY (Celina Korzer)\\AU_dataset\\output_windowed.parquet\")"
|
||||
"data_path = Path(r\"/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/50s_25Hz_dataset.parquet\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -301,20 +301,26 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Cell 2: Get AU columns and prepare datasets\n",
|
||||
"# Get all column names that start with 'AU'\n",
|
||||
"au_columns = [col for col in low_all.columns if col.startswith('AU')]\n",
|
||||
"au_columns = [col for col in low_all.columns if \"face\" in col.lower()] \n",
|
||||
"\n",
|
||||
"eye_columns = [ \n",
|
||||
" 'Fix_count_short_66_150','Fix_count_medium_300_500','Fix_count_long_gt_1000', \n",
|
||||
" 'Fix_count_100','Fix_mean_duration','Fix_median_duration', \n",
|
||||
" 'Sac_count','Sac_mean_amp','Sac_mean_dur','Sac_median_dur', \n",
|
||||
" 'Blink_count','Blink_mean_dur','Blink_median_dur', \n",
|
||||
" 'Pupil_mean','Pupil_IPA' \n",
|
||||
"] \n",
|
||||
"cols = au_columns +eye_columns\n",
|
||||
"# Prepare training data (only normal/low data)\n",
|
||||
"train_data = low_all[low_all['subjectID'].isin(train_subjects)][['subjectID'] + au_columns].copy()\n",
|
||||
"train_data = low_all[low_all['subjectID'].isin(train_subjects)][['subjectID'] + cols].copy()\n",
|
||||
"\n",
|
||||
"# Prepare validation data (normal and anomaly)\n",
|
||||
"val_normal_data = low_all[low_all['subjectID'].isin(val_subjects)][['subjectID'] + au_columns].copy()\n",
|
||||
"val_high_data = high_all[high_all['subjectID'].isin(val_subjects)][['subjectID'] + au_columns].copy()\n",
|
||||
"val_normal_data = low_all[low_all['subjectID'].isin(val_subjects)][['subjectID'] + cols].copy()\n",
|
||||
"val_high_data = high_all[high_all['subjectID'].isin(val_subjects)][['subjectID'] + cols].copy()\n",
|
||||
"\n",
|
||||
"# Prepare test data (normal and anomaly)\n",
|
||||
"test_normal_data = low_all[low_all['subjectID'].isin(test_subjects)][['subjectID'] + au_columns].copy()\n",
|
||||
"test_high_data = high_all[high_all['subjectID'].isin(test_subjects)][['subjectID'] + au_columns].copy()\n",
|
||||
"test_normal_data = low_all[low_all['subjectID'].isin(test_subjects)][['subjectID'] + cols].copy()\n",
|
||||
"test_high_data = high_all[high_all['subjectID'].isin(test_subjects)][['subjectID'] + cols].copy()\n",
|
||||
"\n",
|
||||
"print(f\"Train samples: {len(train_data)}\")\n",
|
||||
"print(f\"Val normal samples: {len(val_normal_data)}, Val high samples: {len(val_high_data)}\")\n",
|
||||
@ -328,8 +334,8 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Cell 3: Fit normalizer on training data\n",
|
||||
"normalizer = fit_normalizer(train_data, au_columns, method='minmax', scope='global')\n",
|
||||
"# Fit normalizer on training data\n",
|
||||
"normalizer = fit_normalizer(train_data, cols, method='minmax', scope='global')\n",
|
||||
"print(\"Normalizer fitted on training data\")"
|
||||
]
|
||||
},
|
||||
@ -340,12 +346,12 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Cell 4: Apply normalization to all datasets\n",
|
||||
"train_normalized = apply_normalizer(train_data, au_columns, normalizer)\n",
|
||||
"val_normal_normalized = apply_normalizer(val_normal_data, au_columns, normalizer)\n",
|
||||
"val_high_normalized = apply_normalizer(val_high_data, au_columns, normalizer)\n",
|
||||
"test_normal_normalized = apply_normalizer(test_normal_data, au_columns, normalizer)\n",
|
||||
"test_high_normalized = apply_normalizer(test_high_data, au_columns, normalizer)\n",
|
||||
"# Apply normalization to all datasets\n",
|
||||
"train_normalized = apply_normalizer(train_data, cols, normalizer)\n",
|
||||
"val_normal_normalized = apply_normalizer(val_normal_data, cols, normalizer)\n",
|
||||
"val_high_normalized = apply_normalizer(val_high_data, cols, normalizer)\n",
|
||||
"test_normal_normalized = apply_normalizer(test_normal_data, cols, normalizer)\n",
|
||||
"test_high_normalized = apply_normalizer(test_high_data, cols, normalizer)\n",
|
||||
"\n",
|
||||
"print(\"Normalization applied to all datasets\")"
|
||||
]
|
||||
@ -357,11 +363,9 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Cell 5: Extract AU columns and create labels for grid search\n",
|
||||
"# Extract only AU columns (drop subjectID)\n",
|
||||
"X_train = train_normalized[au_columns].copy()\n",
|
||||
"X_val_normal = val_normal_normalized[au_columns].copy()\n",
|
||||
"X_val_high = val_high_normalized[au_columns].copy()\n",
|
||||
"X_train = train_normalized[cols].copy()\n",
|
||||
"X_val_normal = val_normal_normalized[cols].copy()\n",
|
||||
"X_val_high = val_high_normalized[cols].copy()\n",
|
||||
"\n",
|
||||
"# Combine train and validation sets for grid search\n",
|
||||
"X_grid_search = pd.concat([X_train, X_val_normal, X_val_high], ignore_index=True)\n",
|
||||
@ -416,7 +420,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Cell 7: Train final model with best parameters on training data\n",
|
||||
"# Train final model with best parameters on training data\n",
|
||||
"final_model = IsolationForest(**best_params, random_state=42)\n",
|
||||
"final_model.fit(X_train.values)\n",
|
||||
"\n",
|
||||
@ -430,9 +434,9 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Cell 8: Prepare independent test set\n",
|
||||
"X_test_normal = test_normal_normalized[au_columns].copy()\n",
|
||||
"X_test_high = test_high_normalized[au_columns].copy()\n",
|
||||
"# Prepare independent test set\n",
|
||||
"X_test_normal = test_normal_normalized[cols].copy()\n",
|
||||
"X_test_high = test_high_normalized[cols].copy()\n",
|
||||
"\n",
|
||||
"# Combine test sets\n",
|
||||
"X_test = pd.concat([X_test_normal, X_test_high], ignore_index=True)\n",
|
||||
@ -483,7 +487,7 @@
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "base",
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
@ -497,7 +501,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.5"
|
||||
"version": "3.12.10"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user