import os
import pickle

import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler

def fit_normalizer(train_data, au_columns, method='standard', scope='global'):
    """
    Fit normalization scalers on training data.

    Parameters:
    -----------
    train_data : pd.DataFrame
        Training dataframe with AU columns and a 'subjectID' column
    au_columns : list
        List of AU column names to normalize
    method : str, default='standard'
        Normalization method: 'standard' for StandardScaler or 'minmax' for MinMaxScaler
    scope : str, default='global'
        Normalization scope: 'subject' for per-subject or 'global' for across all subjects

    Returns:
    --------
    dict
        Dictionary containing the fitted scalers, plus a fallback scaler
        (averaged over training subjects) for subjects unseen at fit time
    """
    if method == 'standard':
        Scaler = StandardScaler
    elif method == 'minmax':
        Scaler = MinMaxScaler
    else:
        raise ValueError("method must be 'standard' or 'minmax'")

    scalers = {}
    if scope == 'subject':
        # Fit one scaler per subject
        subject_stats = []
        for subject in train_data['subjectID'].unique():
            subject_mask = train_data['subjectID'] == subject
            scaler = Scaler()
            scaler.fit(train_data.loc[subject_mask, au_columns].values)
            scalers[subject] = scaler
            # Store per-subject statistics so they can be averaged below
            if method == 'standard':
                subject_stats.append({'mean': scaler.mean_, 'std': scaler.scale_})
            elif method == 'minmax':
                subject_stats.append({'min': scaler.data_min_, 'max': scaler.data_max_})

        # Build a fallback scaler from the averaged statistics, to be used
        # for subjects that do not appear in the training data
        if method == 'standard':
            avg_mean = np.mean([s['mean'] for s in subject_stats], axis=0)
            avg_std = np.mean([s['std'] for s in subject_stats], axis=0)
            # Guard against zero scale (constant features), as sklearn does internally
            avg_std = np.where(avg_std == 0, 1.0, avg_std)
            fallback_scaler = StandardScaler()
            fallback_scaler.mean_ = avg_mean
            fallback_scaler.scale_ = avg_std
            fallback_scaler.var_ = avg_std ** 2
            fallback_scaler.n_features_in_ = len(au_columns)
        elif method == 'minmax':
            avg_min = np.mean([s['min'] for s in subject_stats], axis=0)
            avg_max = np.mean([s['max'] for s in subject_stats], axis=0)
            data_range = avg_max - avg_min
            # Guard against zero range (constant features) to avoid division by zero
            data_range = np.where(data_range == 0, 1.0, data_range)
            fallback_scaler = MinMaxScaler()
            fallback_scaler.data_min_ = avg_min
            fallback_scaler.data_max_ = avg_max
            fallback_scaler.data_range_ = data_range
            fallback_scaler.scale_ = 1.0 / data_range
            fallback_scaler.min_ = -avg_min * fallback_scaler.scale_
            fallback_scaler.n_features_in_ = len(au_columns)
        scalers['_fallback'] = fallback_scaler
    elif scope == 'global':
        # Fit one scaler across all subjects pooled together
        scaler = Scaler()
        scaler.fit(train_data[au_columns].values)
        scalers['global'] = scaler
    else:
        raise ValueError("scope must be 'subject' or 'global'")

    return {'scalers': scalers, 'method': method, 'scope': scope}
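
# A minimal usage sketch (hypothetical: 'AU01_r'/'AU12_r' are illustrative,
# OpenFace-style AU intensity column names; train_df is assumed to hold one
# row per frame with a 'subjectID' column):
#
#   normalizer = fit_normalizer(train_df, ['AU01_r', 'AU12_r'],
#                               method='standard', scope='subject')
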
def apply_normalizer(data, columns, normalizer_dict):
    """
    Apply fitted normalization scalers to data.

    Parameters:
    -----------
    data : pd.DataFrame
        Dataframe with AU columns and a 'subjectID' column
    columns : list
        List of AU column names to normalize
    normalizer_dict : dict
        Dictionary containing fitted scalers from fit_normalizer()

    Returns:
    --------
    pd.DataFrame
        Copy of the input dataframe with normalized AU columns
    """
    normalized_data = data.copy()
    scalers = normalizer_dict['scalers']
    scope = normalizer_dict['scope']
    # Cast up front so scaled floats are not written into integer columns
    normalized_data[columns] = normalized_data[columns].astype(np.float64)

    if scope == 'subject':
        # Apply per-subject normalization
        for subject in data['subjectID'].unique():
            subject_mask = data['subjectID'] == subject
            # Use the subject's own scaler if available, otherwise the
            # fallback scaler averaged over the training subjects
            if subject in scalers:
                scaler = scalers[subject]
            else:
                scaler = scalers['_fallback']
                print(f"Info: Subject {subject} not in training data. "
                      f"Using averaged scaler from training subjects.")
            normalized_data.loc[subject_mask, columns] = scaler.transform(
                data.loc[subject_mask, columns].values
            )
    elif scope == 'global':
        # Apply global normalization
        scaler = scalers['global']
        normalized_data[columns] = scaler.transform(data[columns].values)
    else:
        raise ValueError("scope must be 'subject' or 'global'")

    return normalized_data
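
# A minimal usage sketch (hypothetical names, continuing the example above;
# test_df may contain subjects never seen during fitting, which are handled
# by the '_fallback' scaler):
#
#   test_norm = apply_normalizer(test_df, ['AU01_r', 'AU12_r'], normalizer)
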
def save_normalizer(normalizer_dict, filepath):
    """
    Save fitted normalizer to disk.

    Parameters:
    -----------
    normalizer_dict : dict
        Dictionary containing fitted scalers from fit_normalizer()
    filepath : str
        Path to save the normalizer (e.g., 'normalizer.pkl')
    """
    # Create the target directory if it does not exist
    dirpath = os.path.dirname(filepath)
    if dirpath:
        os.makedirs(dirpath, exist_ok=True)
    with open(filepath, 'wb') as f:
        pickle.dump(normalizer_dict, f)
    print(f"Normalizer saved to {filepath}")

def load_normalizer(filepath):
    """
    Load fitted normalizer from disk.

    Parameters:
    -----------
    filepath : str
        Path to the saved normalizer file

    Returns:
    --------
    dict
        Dictionary containing fitted scalers
    """
    with open(filepath, 'rb') as f:
        normalizer_dict = pickle.load(f)
    print(f"Normalizer loaded from {filepath}")
    return normalizer_dict
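
# A runnable smoke test under assumptions: the AU column names below are
# illustrative (OpenFace-style intensity columns) and the synthetic data only
# exercises the fit/apply/save/load round trip; it does not model real facial
# action unit distributions.
if __name__ == "__main__":
    import pandas as pd

    rng = np.random.default_rng(0)
    au_cols = ['AU01_r', 'AU12_r']
    train_df = pd.DataFrame({
        'subjectID': np.repeat(['s1', 's2'], 50),
        'AU01_r': rng.normal(1.0, 0.5, 100),
        'AU12_r': rng.normal(2.0, 1.0, 100),
    })

    # Per-subject fit, then apply back to the training frame
    normalizer = fit_normalizer(train_df, au_cols, method='standard', scope='subject')
    train_norm = apply_normalizer(train_df, au_cols, normalizer)
    print(train_norm.groupby('subjectID')[au_cols].mean())  # ~0 per subject

    # Round-trip through disk (the path is an arbitrary example)
    save_normalizer(normalizer, 'output/normalizer.pkl')
    reloaded = load_normalizer('output/normalizer.pkl')
    assert reloaded['scope'] == 'subject'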