outsourcing of the scaler to the shared tools module in iforest and deep svdd, removal of hard-coded paths

This commit is contained in:
Michael Weig 2026-03-04 16:51:22 +01:00
parent 6cc38291df
commit 13bd76631f
2 changed files with 69 additions and 402 deletions

View File

@ -41,7 +41,6 @@
"import time\n", "import time\n",
"base_dir = os.path.abspath(os.path.join(os.getcwd(), \"..\"))\n", "base_dir = os.path.abspath(os.path.join(os.getcwd(), \"..\"))\n",
"sys.path.append(base_dir)\n", "sys.path.append(base_dir)\n",
"print(base_dir)\n",
"\n", "\n",
"from Fahrsimulator_MSY2526_AI.model_training.tools import evaluation_tools, scaler, mad_outlier_removal, performance_split\n", "from Fahrsimulator_MSY2526_AI.model_training.tools import evaluation_tools, scaler, mad_outlier_removal, performance_split\n",
"from sklearn.preprocessing import StandardScaler, MinMaxScaler\n", "from sklearn.preprocessing import StandardScaler, MinMaxScaler\n",
@ -51,7 +50,7 @@
"import tensorflow as tf\n", "import tensorflow as tf\n",
"from tensorflow.keras import layers, models, regularizers\n", "from tensorflow.keras import layers, models, regularizers\n",
"import pickle\n", "import pickle\n",
"from sklearn.metrics import (roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, balanced_accuracy_score, ConfusionMatrixDisplay, auc, roc_curve) " "from sklearn.metrics import (accuracy_score, auc, roc_curve, f1_score) "
] ]
}, },
{ {
@ -89,15 +88,40 @@
"id": "f00a477c", "id": "f00a477c",
"metadata": {}, "metadata": {},
"source": [ "source": [
"### Data Preprocessing" "### Configuration of paths and data preprocessing"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "code",
"id": "504c1df7", "execution_count": null,
"id": "5136fcec",
"metadata": {}, "metadata": {},
"outputs": [],
"source": [ "source": [
"Laden der Daten" "# TODO: set path where to save normalizer\n",
"normalizer_path=Path('.pkl') # TODO: set manually"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c2115f65",
"metadata": {},
"outputs": [],
"source": [
"performance_path = Path(r\".csv\") # TODO: set manually\n",
"performance_df = pd.read_csv(performance_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "559eb8d2",
"metadata": {},
"outputs": [],
"source": [
"encoder_save_path = Path('.keras') # TODO: set manually\n",
"deep_svdd_save_path = Path('.keras') # TODO: set manually"
] ]
}, },
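The path cells above are intentionally left as placeholders after this commit. A hypothetical way to fill them in for a local environment (all paths below are illustrative examples, not the project's actual locations):

    from pathlib import Path

    # Illustrative values only; point these at your own data and model folders.
    normalizer_path = Path("saved_models/deepsvdd/normalizer_min_max_global.pkl")
    performance_path = Path("subject_performance/au_performance.csv")
    encoder_save_path = Path("saved_models/deepsvdd/encoder.keras")
    deep_svdd_save_path = Path("saved_models/deepsvdd/deep_svdd.keras")
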
{ {
@ -107,8 +131,7 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"dataset_path = Path(r\"data-paulusjafahrsimulator-gpu/new_datasets/combined_dataset_25hz.parquet\")\n", "dataset_path = Path(r\".parquet\") # TODO: set manually"
"# dataset_path = Path(r\"/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/120s_combined_dataset_25hz.parquet\")"
] ]
}, },
{ {
@ -121,17 +144,6 @@
"df = pd.read_parquet(path=dataset_path)" "df = pd.read_parquet(path=dataset_path)"
] ]
}, },
{
"cell_type": "code",
"execution_count": null,
"id": "c2115f65",
"metadata": {},
"outputs": [],
"source": [
"performance_path = Path(r\"/home/jovyan/data-paulusjafahrsimulator-gpu/subject_performance/3new_au_performance.csv\")\n",
"performance_df = pd.read_csv(performance_path)"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"id": "c045c46d", "id": "c045c46d",
@ -215,7 +227,7 @@
"\n", "\n",
"data = pd.concat([low, high], ignore_index=True)\n", "data = pd.concat([low, high], ignore_index=True)\n",
"df = data.drop_duplicates()\n", "df = data.drop_duplicates()\n",
"\n", "df = df.dropna()\n",
"print(\"Label distribution:\")\n", "print(\"Label distribution:\")\n",
"print(df[\"label\"].value_counts())" "print(df[\"label\"].value_counts())"
] ]
@ -275,210 +287,6 @@
"Normalization" "Normalization"
] ]
}, },
{
"cell_type": "code",
"execution_count": null,
"id": "acec4a03",
"metadata": {},
"outputs": [],
"source": [
"def fit_normalizer(train_data, au_columns, method='standard', scope='global'):\n",
" \"\"\"\n",
" Fit normalization scalers on training data.\n",
" \n",
" Parameters:\n",
" -----------\n",
" train_data : pd.DataFrame\n",
" Training dataframe with AU columns and subjectID\n",
" au_columns : list\n",
" List of AU column names to normalize\n",
" method : str, default='standard'\n",
" Normalization method: 'standard' for StandardScaler or 'minmax' for MinMaxScaler\n",
" scope : str, default='global'\n",
" Normalization scope: 'subject' for per-subject or 'global' for across all subjects\n",
" \n",
" Returns:\n",
" --------\n",
" dict\n",
" Dictionary containing fitted scalers and statistics for new subjects\n",
" \"\"\"\n",
" if method == 'standard':\n",
" Scaler = StandardScaler\n",
" elif method == 'minmax':\n",
" Scaler = MinMaxScaler\n",
" else:\n",
" raise ValueError(\"method must be 'standard' or 'minmax'\")\n",
" \n",
" scalers = {}\n",
" if scope == 'subject':\n",
" # Fit one scaler per subject\n",
" subject_stats = []\n",
" \n",
" for subject in train_data['subjectID'].unique():\n",
" subject_mask = train_data['subjectID'] == subject\n",
" scaler = Scaler()\n",
" scaler.fit(train_data.loc[subject_mask, au_columns].values)\n",
" scalers[subject] = scaler\n",
" \n",
" # Store statistics for averaging\n",
" if method == 'standard':\n",
" subject_stats.append({\n",
" 'mean': scaler.mean_,\n",
" 'std': scaler.scale_\n",
" })\n",
" elif method == 'minmax':\n",
" subject_stats.append({\n",
" 'min': scaler.data_min_,\n",
" 'max': scaler.data_max_\n",
" })\n",
" \n",
" # Calculate average statistics for new subjects\n",
" if method == 'standard':\n",
" avg_mean = np.mean([s['mean'] for s in subject_stats], axis=0)\n",
" avg_std = np.mean([s['std'] for s in subject_stats], axis=0)\n",
" fallback_scaler = StandardScaler()\n",
" fallback_scaler.mean_ = avg_mean\n",
" fallback_scaler.scale_ = avg_std\n",
" fallback_scaler.var_ = avg_std ** 2\n",
" fallback_scaler.n_features_in_ = len(au_columns)\n",
" elif method == 'minmax':\n",
" avg_min = np.mean([s['min'] for s in subject_stats], axis=0)\n",
" avg_max = np.mean([s['max'] for s in subject_stats], axis=0)\n",
" fallback_scaler = MinMaxScaler()\n",
" fallback_scaler.data_min_ = avg_min\n",
" fallback_scaler.data_max_ = avg_max\n",
" fallback_scaler.data_range_ = avg_max - avg_min\n",
" fallback_scaler.scale_ = 1.0 / fallback_scaler.data_range_\n",
" fallback_scaler.min_ = -avg_min * fallback_scaler.scale_\n",
" fallback_scaler.n_features_in_ = len(au_columns)\n",
" \n",
" scalers['_fallback'] = fallback_scaler\n",
" \n",
" elif scope == 'global':\n",
" # Fit one scaler for all subjects\n",
" scaler = Scaler()\n",
" scaler.fit(train_data[au_columns].values)\n",
" scalers['global'] = scaler\n",
" \n",
" else:\n",
" raise ValueError(\"scope must be 'subject' or 'global'\")\n",
" \n",
" return {'scalers': scalers, 'method': method, 'scope': scope}\n",
"\n",
"def apply_normalizer(data, columns, normalizer_dict):\n",
" \"\"\"\n",
" Apply fitted normalization scalers to data.\n",
" \n",
" Parameters:\n",
" -----------\n",
" data : pd.DataFrame\n",
" Dataframe with AU columns and subjectID\n",
" au_columns : list\n",
" List of AU column names to normalize\n",
" normalizer_dict : dict\n",
" Dictionary containing fitted scalers from fit_normalizer()\n",
" \n",
" Returns:\n",
" --------\n",
" pd.DataFrame\n",
" DataFrame with normalized AU columns\n",
" \"\"\"\n",
" normalized_data = data.copy()\n",
" scalers = normalizer_dict['scalers']\n",
" scope = normalizer_dict['scope']\n",
" normalized_data[columns] = normalized_data[columns].astype(np.float64)\n",
"\n",
" if scope == 'subject':\n",
" # Apply per-subject normalization\n",
" for subject in data['subjectID'].unique():\n",
" subject_mask = data['subjectID'] == subject\n",
" \n",
" # Use the subject's scaler if available, otherwise use fallback\n",
" if subject in scalers:\n",
" scaler = scalers[subject]\n",
" else:\n",
" # Use averaged scaler for new subjects\n",
" scaler = scalers['_fallback']\n",
" print(f\"Info: Subject {subject} not in training data. Using averaged scaler from training subjects.\")\n",
" \n",
" normalized_data.loc[subject_mask, columns] = scaler.transform(\n",
" data.loc[subject_mask, columns].values\n",
" )\n",
" \n",
" elif scope == 'global':\n",
" # Apply global normalization\n",
" scaler = scalers['global']\n",
" normalized_data[columns] = scaler.transform(data[columns].values)\n",
" \n",
" return normalized_data\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "53c6ee6f",
"metadata": {},
"outputs": [],
"source": [
"def save_normalizer(normalizer_dict, filepath):\n",
" \"\"\"\n",
" Save fitted normalizer to disk.\n",
"\n",
" Parameters:\n",
" -----------\n",
" normalizer_dict : dict\n",
" Dictionary containing fitted scalers from fit_normalizer()\n",
" filepath : str\n",
" Path to save the normalizer (e.g., 'normalizer.pkl')\n",
" \"\"\"\n",
" # Create directory if it does not exist\n",
" dirpath = os.path.dirname(filepath)\n",
" if dirpath:\n",
" os.makedirs(dirpath, exist_ok=True)\n",
"\n",
" with open(filepath, 'wb') as f:\n",
" pickle.dump(normalizer_dict, f)\n",
"\n",
" print(f\"Normalizer saved to {filepath}\")\n",
"\n",
"def load_normalizer(filepath):\n",
" \"\"\"\n",
" Load fitted normalizer from disk.\n",
" \n",
" Parameters:\n",
" -----------\n",
" filepath : str\n",
" Path to the saved normalizer file\n",
" \n",
" Returns:\n",
" --------\n",
" dict\n",
" Dictionary containing fitted scalers\n",
" \"\"\"\n",
" with open(filepath, 'rb') as f:\n",
" normalizer_dict = pickle.load(f)\n",
" print(f\"Normalizer loaded from {filepath}\")\n",
" return normalizer_dict"
]
},
{
"cell_type": "markdown",
"id": "7280f64f",
"metadata": {},
"source": [
"save Normalizer"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8420afc2",
"metadata": {},
"outputs": [],
"source": [
"normalizer_path=Path('data-paulusjafahrsimulator-gpu/saved_models/deepsvdd_save/normalizer_min_max_global.pkl')"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
@ -495,8 +303,10 @@
"print(len(eye_cols))\n", "print(len(eye_cols))\n",
"all_signal_columns = face_au_cols+eye_cols\n", "all_signal_columns = face_au_cols+eye_cols\n",
"print(len(all_signal_columns))\n", "print(len(all_signal_columns))\n",
"normalizer = fit_normalizer(train_df, all_signal_columns, method='minmax', scope='global')\n", "\n",
"save_normalizer(normalizer, normalizer_path )" "# fit and save normalizer\n",
"normalizer = scaler.fit_normalizer(train_df, all_signal_columns, method='minmax', scope='global')\n",
"scaler.save_normalizer(normalizer, normalizer_path )"
] ]
}, },
{ {
@ -506,11 +316,11 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"normalizer = load_normalizer(normalizer_path)\n", "normalizer = scaler.load_normalizer(normalizer_path)\n",
"# 3. Apply normalization to all sets\n", "# Apply normalization to all sets\n",
"train_df_norm = apply_normalizer(train_df, all_signal_columns, normalizer)\n", "train_df_norm = scaler.apply_normalizer(train_df, all_signal_columns, normalizer)\n",
"val_df_norm = apply_normalizer(val_df, all_signal_columns, normalizer)\n", "val_df_norm = scaler.apply_normalizer(val_df, all_signal_columns, normalizer)\n",
"test_df_norm = apply_normalizer(test_df, all_signal_columns, normalizer)" "test_df_norm = scaler.apply_normalizer(test_df, all_signal_columns, normalizer)"
] ]
}, },
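The normalizer helpers removed above now live in the shared tools module and are called as scaler.fit_normalizer / save_normalizer / load_normalizer / apply_normalizer. As a usage sketch of the other supported scope (per-subject scaling with a fallback for unseen subjects, per the removed implementation; the module path and signatures are assumed to match the calls in this commit):

    from Fahrsimulator_MSY2526_AI.model_training.tools import scaler

    # scope='subject' fits one scaler per subjectID and also builds an averaged
    # fallback scaler for subjects that never appear in the training split.
    normalizer = scaler.fit_normalizer(train_df, all_signal_columns,
                                       method='standard', scope='subject')
    scaler.save_normalizer(normalizer, normalizer_path)

    # apply_normalizer() picks each subject's own scaler and prints an info
    # message whenever it has to fall back to the averaged training scaler.
    test_df_norm = scaler.apply_normalizer(test_df, all_signal_columns, normalizer)
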
{ {
@ -566,13 +376,13 @@
"def build_intermediate_fusion_autoencoder(\n", "def build_intermediate_fusion_autoencoder(\n",
" input_dim_mod1=15,\n", " input_dim_mod1=15,\n",
" input_dim_mod2=20,\n", " input_dim_mod2=20,\n",
" encoder_hidden_dim_mod1=12, # individuell\n", " encoder_hidden_dim_mod1=12, # TODO: set manually\n",
" encoder_hidden_dim_mod2=20, # individuell\n", " encoder_hidden_dim_mod2=20, # TODO: set manually\n",
" latent_dim=6, # Änderung: Bottleneck vergrößert für stabilere Repräsentation\n", " latent_dim=6, # TODO: set manually\n",
" dropout_rate=0.4, # Dropout in Hidden Layers\n", " dropout_rate=0.4, # TODO: set manually\n",
" neg_slope=0.1,\n", " neg_slope=0.1, # TODO: set manually\n",
" weight_decay=1e-4,\n", " weight_decay=1e-4, # TODO: set manually\n",
" decoder_hidden_dims=[16, 32] # Änderung: Decoder größer für bessere Rekonstruktion\n", " decoder_hidden_dims=[16, 32] # TODO: set manually\n",
"):\n", "):\n",
" \"\"\"\n", " \"\"\"\n",
" Verbesserter Intermediate-Fusion Autoencoder für Deep SVDD.\n", " Verbesserter Intermediate-Fusion Autoencoder für Deep SVDD.\n",
@ -597,10 +407,10 @@
" kernel_regularizer=l2\n", " kernel_regularizer=l2\n",
" )(x1_in)\n", " )(x1_in)\n",
" e1 = act(e1)\n", " e1 = act(e1)\n",
" e1 = layers.Dropout(dropout_rate)(e1) # Dropout nur hier\n", " e1 = layers.Dropout(dropout_rate)(e1) \n",
"\n", "\n",
" e1 = layers.Dense(\n", " e1 = layers.Dense(\n",
" 16, # Änderung: Hidden Layer größer für stabilere Fusion\n", " 16, \n",
" use_bias=False,\n", " use_bias=False,\n",
" kernel_regularizer=l2\n", " kernel_regularizer=l2\n",
" )(e1)\n", " )(e1)\n",
@ -613,20 +423,20 @@
" kernel_regularizer=l2\n", " kernel_regularizer=l2\n",
" )(x2_in)\n", " )(x2_in)\n",
" e2 = act(e2)\n", " e2 = act(e2)\n",
" e2 = layers.Dropout(dropout_rate)(e2) # Dropout nur hier\n", " e2 = layers.Dropout(dropout_rate)(e2) \n",
"\n", "\n",
" e2 = layers.Dense(\n", " e2 = layers.Dense(\n",
" 16, # Änderung: Hidden Layer größer\n", " 16, \n",
" use_bias=False,\n", " use_bias=False,\n",
" kernel_regularizer=l2\n", " kernel_regularizer=l2\n",
" )(e2)\n", " )(e2)\n",
" e2 = act(e2)\n", " e2 = act(e2)\n",
"\n", "\n",
" # -------- Intermediate Fusion --------\n", " # -------- Intermediate Fusion --------\n",
" fused = layers.Concatenate(name=\"fusion\")([e1, e2]) # 16+16=32 Dimensionen\n", " fused = layers.Concatenate(name=\"fusion\")([e1, e2]) # 16+16=32 dimensions\n",
"\n", "\n",
" # -------- Joint Encoder / Bottleneck --------\n", " # -------- Joint Encoder / Bottleneck --------\n",
" # sinnvoll kleiner als Fusion\n", "\n",
" h = layers.Dense(\n", " h = layers.Dense(\n",
" latent_dim,\n", " latent_dim,\n",
" use_bias=False,\n", " use_bias=False,\n",
@ -637,16 +447,16 @@
"\n", "\n",
" z = layers.Dense(\n", " z = layers.Dense(\n",
" latent_dim,\n", " latent_dim,\n",
" activation=None, # linear, für Deep SVDD\n", " activation=None, # linear for Deep SVDD\n",
" use_bias=False,\n", " use_bias=False,\n",
" kernel_regularizer=l2,\n", " kernel_regularizer=l2,\n",
" name=\"latent\"\n", " name=\"latent\"\n",
" )(h)\n", " )(h)\n",
" # Dropout entfernt direkt vor Bottleneck\n", "\n",
"\n", "\n",
" # -------- Decoder --------\n", " # -------- Decoder --------\n",
" d = layers.Dense(\n", " d = layers.Dense(\n",
" decoder_hidden_dims[0], # größerer Decoder\n", " decoder_hidden_dims[0], \n",
" use_bias=False,\n", " use_bias=False,\n",
" kernel_regularizer=l2\n", " kernel_regularizer=l2\n",
" )(z)\n", " )(z)\n",
@ -692,10 +502,10 @@
"model = build_intermediate_fusion_autoencoder(\n", "model = build_intermediate_fusion_autoencoder(\n",
" input_dim_mod1=len(face_au_cols),\n", " input_dim_mod1=len(face_au_cols),\n",
" input_dim_mod2=len(eye_cols),\n", " input_dim_mod2=len(eye_cols),\n",
" encoder_hidden_dim_mod1=12, # individuell\n", " encoder_hidden_dim_mod1=12, # TODO: set manually\n",
" encoder_hidden_dim_mod2=8, # individuell\n", " encoder_hidden_dim_mod2=8, # TODO: set manually\n",
" latent_dim=4,\n", " latent_dim=4,\n",
" dropout_rate=0.7, # einstellbar\n", " dropout_rate=0.7, # TODO: set manually\n",
" neg_slope=0.1,\n", " neg_slope=0.1,\n",
" weight_decay=1e-3\n", " weight_decay=1e-3\n",
")\n", ")\n",
@ -780,7 +590,6 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"encoder_save_path =Path('data-paulusjafahrsimulator-gpu/saved_models/deepsvdd_save/encoder_8_deep.keras')\n",
"encoder.save(encoder_save_path)" "encoder.save(encoder_save_path)"
] ]
}, },
@ -876,22 +685,6 @@
"center = get_center(deep_svdd_net, [X_face, X_eye])" "center = get_center(deep_svdd_net, [X_face, X_eye])"
] ]
}, },
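get_center's body is not part of this diff; in the Deep SVDD formulation the center c is usually the mean of the network's latent outputs over the normal training data, with components nudged away from zero to avoid a collapsed solution. A hypothetical sketch of that convention (not necessarily the notebook's exact implementation):

    import numpy as np

    def get_center(model, inputs, eps=0.1):
        # Mean latent representation over the normal training data.
        z = model.predict(inputs)
        c = np.mean(z, axis=0)
        # Keep small components away from zero so the objective cannot collapse.
        c[(np.abs(c) < eps) & (c < 0)] = -eps
        c[(np.abs(c) < eps) & (c >= 0)] = eps
        return c
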
{
"cell_type": "code",
"execution_count": null,
"id": "da140072",
"metadata": {},
"outputs": [],
"source": [
"# def get_radius(nu, dataset):\n",
"# x_face, x_eye = dataset # <-- zwingend entpacken\n",
"\n",
"# dataset_tuple=[x_face, x_eye]\n",
"\n",
"# dists = dist_per_sample(deep_svdd_net.predict(dataset_tuple), center)\n",
"# return np.quantile(np.sqrt(dists), 1-nu).astype(np.float32)"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
@ -944,10 +737,9 @@
" return get_radius_from_arrays(nu, X_face, X_eye)\n", " return get_radius_from_arrays(nu, X_face, X_eye)\n",
"\n", "\n",
"\n", "\n",
"nu = 0.25\n", "nu = 0.05 # Set nu respectively\n",
"\n", "\n",
"train_dataset = tf.data.Dataset.from_tensor_slices((X_face, X_eye)).shuffle(64).batch(64)\n", "train_dataset = tf.data.Dataset.from_tensor_slices((X_face, X_eye)).shuffle(64).batch(64)\n",
"# train_dataset = tf.data.Dataset.from_tensor_slices((X_face, X_eye))\n",
"\n", "\n",
"optimizer = tf.keras.optimizers.Adam(1e-3)\n", "optimizer = tf.keras.optimizers.Adam(1e-3)\n",
"train(train_dataset, epochs=150, nu=nu)\n", "train(train_dataset, epochs=150, nu=nu)\n",
@ -1019,7 +811,6 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"deep_svdd_save_path =Path('data-paulusjafahrsimulator-gpu/saved_models/deepsvdd_save/deep_svdd_06.keras')\n",
"deep_svdd_net.save(deep_svdd_save_path)" "deep_svdd_net.save(deep_svdd_save_path)"
] ]
}, },

View File

@ -28,7 +28,7 @@
"sys.path.append(base_dir)\n", "sys.path.append(base_dir)\n",
"print(base_dir)\n", "print(base_dir)\n",
"\n", "\n",
"from Fahrsimulator_MSY2526_AI.model_training.tools import evaluation_tools\n", "from Fahrsimulator_MSY2526_AI.model_training.tools import evaluation_tools, scaler\n",
"from sklearn.preprocessing import StandardScaler, MinMaxScaler\n", "from sklearn.preprocessing import StandardScaler, MinMaxScaler\n",
"from sklearn.ensemble import IsolationForest\n", "from sklearn.ensemble import IsolationForest\n",
"from sklearn.model_selection import GridSearchCV, KFold\n", "from sklearn.model_selection import GridSearchCV, KFold\n",
@ -52,7 +52,7 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"data_path = Path(r\"/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/50s_25Hz_dataset.parquet\")" "data_path = Path(r\".parquet\") # TODO: set manually"
] ]
}, },
{ {
@ -115,118 +115,6 @@
"print(f\"high all: {high_all.shape}\")" "print(f\"high all: {high_all.shape}\")"
] ]
}, },
{
"cell_type": "code",
"execution_count": null,
"id": "47a0f44d",
"metadata": {},
"outputs": [],
"source": [
"def fit_normalizer(train_data, au_columns, method='standard', scope='global'):\n",
" \"\"\"\n",
" Fit normalization scalers on training data.\n",
" \n",
" Parameters:\n",
" -----------\n",
" train_data : pd.DataFrame\n",
" Training dataframe with AU columns and subjectID\n",
" au_columns : list\n",
" List of AU column names to normalize\n",
" method : str, default='standard'\n",
" Normalization method: 'standard' for StandardScaler or 'minmax' for MinMaxScaler\n",
" scope : str, default='global'\n",
" Normalization scope: 'subject' for per-subject or 'global' for across all subjects\n",
" \n",
" Returns:\n",
" --------\n",
" dict\n",
" Dictionary containing fitted scalers\n",
" \"\"\"\n",
" # Select scaler based on method\n",
" if method == 'standard':\n",
" Scaler = StandardScaler\n",
" elif method == 'minmax':\n",
" Scaler = MinMaxScaler\n",
" else:\n",
" raise ValueError(\"method must be 'standard' or 'minmax'\")\n",
" \n",
" scalers = {}\n",
" \n",
" if scope == 'subject':\n",
" # Fit one scaler per subject\n",
" for subject in train_data['subjectID'].unique():\n",
" subject_mask = train_data['subjectID'] == subject\n",
" scaler = Scaler()\n",
" scaler.fit(train_data.loc[subject_mask, au_columns])\n",
" scalers[subject] = scaler\n",
" \n",
" elif scope == 'global':\n",
" # Fit one scaler for all subjects\n",
" scaler = Scaler()\n",
" scaler.fit(train_data[au_columns])\n",
" scalers['global'] = scaler\n",
" \n",
" else:\n",
" raise ValueError(\"scope must be 'subject' or 'global'\")\n",
" \n",
" return {'scalers': scalers, 'method': method, 'scope': scope}"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "642d0017",
"metadata": {},
"outputs": [],
"source": [
"def apply_normalizer(data, au_columns, normalizer_dict):\n",
" \"\"\"\n",
" Apply fitted normalization scalers to data.\n",
" \n",
" Parameters:\n",
" -----------\n",
" data : pd.DataFrame\n",
" Dataframe with AU columns and subjectID\n",
" au_columns : list\n",
" List of AU column names to normalize\n",
" normalizer_dict : dict\n",
" Dictionary containing fitted scalers from fit_normalizer()\n",
" \n",
" Returns:\n",
" --------\n",
" pd.DataFrame\n",
" DataFrame with normalized AU columns\n",
" \"\"\"\n",
" normalized_data = data.copy()\n",
" scalers = normalizer_dict['scalers']\n",
" scope = normalizer_dict['scope']\n",
" \n",
" if scope == 'subject':\n",
" # Apply per-subject normalization\n",
" for subject in data['subjectID'].unique():\n",
" subject_mask = data['subjectID'] == subject\n",
" \n",
" # Use the subject's scaler if available, otherwise use a fitted scaler from training\n",
" if subject in scalers:\n",
" scaler = scalers[subject]\n",
" else:\n",
" # For new subjects not seen in training, use the first available scaler\n",
" # (This is a fallback - ideally all test subjects should be in training for subject-level normalization)\n",
" print(f\"Warning: Subject {subject} not found in training data. Using fallback scaler.\")\n",
" scaler = list(scalers.values())[0]\n",
" \n",
" normalized_data.loc[subject_mask, au_columns] = scaler.transform(\n",
" data.loc[subject_mask, au_columns]\n",
" )\n",
" \n",
" elif scope == 'global':\n",
" # Apply global normalization\n",
" scaler = scalers['global']\n",
" normalized_data[au_columns] = scaler.transform(data[au_columns])\n",
" \n",
" return normalized_data"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"id": "697b3cf7", "id": "697b3cf7",
@ -335,7 +223,7 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"# Fit normalizer on training data\n", "# Fit normalizer on training data\n",
"normalizer = fit_normalizer(train_data, cols, method='minmax', scope='global')\n", "normalizer = scaler.fit_normalizer(train_data, cols, method='minmax', scope='global')\n",
"print(\"Normalizer fitted on training data\")" "print(\"Normalizer fitted on training data\")"
] ]
}, },
@ -347,11 +235,11 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"# Apply normalization to all datasets\n", "# Apply normalization to all datasets\n",
"train_normalized = apply_normalizer(train_data, cols, normalizer)\n", "train_normalized = scaler.apply_normalizer(train_data, cols, normalizer)\n",
"val_normal_normalized = apply_normalizer(val_normal_data, cols, normalizer)\n", "val_normal_normalized = scaler.apply_normalizer(val_normal_data, cols, normalizer)\n",
"val_high_normalized = apply_normalizer(val_high_data, cols, normalizer)\n", "val_high_normalized = scaler.apply_normalizer(val_high_data, cols, normalizer)\n",
"test_normal_normalized = apply_normalizer(test_normal_data, cols, normalizer)\n", "test_normal_normalized = scaler.apply_normalizer(test_normal_data, cols, normalizer)\n",
"test_high_normalized = apply_normalizer(test_high_data, cols, normalizer)\n", "test_high_normalized = scaler.apply_normalizer(test_high_data, cols, normalizer)\n",
"\n", "\n",
"print(\"Normalization applied to all datasets\")" "print(\"Normalization applied to all datasets\")"
] ]
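With the splits normalized through the shared scaler module, the IsolationForest imported at the top of this notebook is typically fit on the normal (low-workload) training data only and then scored on the validation splits. A minimal sketch with illustrative hyperparameters (not the notebook's tuned values):

    from sklearn.ensemble import IsolationForest

    # contamination and n_estimators are example values, not tuned settings.
    iforest = IsolationForest(n_estimators=200, contamination=0.05, random_state=42)
    iforest.fit(train_normalized[cols])

    # score_samples: higher means more "normal"; decision_function < 0 flags anomalies.
    scores_val_normal = iforest.score_samples(val_normal_normalized[cols])
    scores_val_high = iforest.score_samples(val_high_normalized[cols])
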
@ -490,18 +378,6 @@
"display_name": "Python 3 (ipykernel)", "display_name": "Python 3 (ipykernel)",
"language": "python", "language": "python",
"name": "python3" "name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.10"
} }
}, },
"nbformat": 4, "nbformat": 4,