import os
import pickle

import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler


def fit_normalizer(train_data, au_columns, method='standard', scope='global'):
    """
    Fit normalization scalers on training data.

    Parameters:
    -----------
    train_data : pd.DataFrame
        Training dataframe with AU columns and subjectID
    au_columns : list
        List of AU column names to normalize
    method : str, default='standard'
        Normalization method: 'standard' for StandardScaler or 'minmax' for MinMaxScaler
    scope : str, default='global'
        Normalization scope: 'subject' for per-subject or 'global' for across all subjects

    Returns:
    --------
    dict
        Dictionary containing fitted scalers and statistics for new subjects
    """
    if method == 'standard':
        Scaler = StandardScaler
    elif method == 'minmax':
        Scaler = MinMaxScaler
    else:
        raise ValueError("method must be 'standard' or 'minmax'")

    scalers = {}

    if scope == 'subject':
        # Fit one scaler per subject
        subject_stats = []
        for subject in train_data['subjectID'].unique():
            subject_mask = train_data['subjectID'] == subject
            scaler = Scaler()
            scaler.fit(train_data.loc[subject_mask, au_columns].values)
            scalers[subject] = scaler

            # Store per-subject statistics for averaging
            if method == 'standard':
                subject_stats.append({
                    'mean': scaler.mean_,
                    'std': scaler.scale_
                })
            elif method == 'minmax':
                subject_stats.append({
                    'min': scaler.data_min_,
                    'max': scaler.data_max_
                })

        # Build a fallback scaler from averaged statistics, used for
        # subjects that do not appear in the training data
        if method == 'standard':
            avg_mean = np.mean([s['mean'] for s in subject_stats], axis=0)
            avg_std = np.mean([s['std'] for s in subject_stats], axis=0)
            fallback_scaler = StandardScaler()
            fallback_scaler.mean_ = avg_mean
            fallback_scaler.scale_ = avg_std
            fallback_scaler.var_ = avg_std ** 2
            fallback_scaler.n_features_in_ = len(au_columns)
        elif method == 'minmax':
            avg_min = np.mean([s['min'] for s in subject_stats], axis=0)
            avg_max = np.mean([s['max'] for s in subject_stats], axis=0)
            fallback_scaler = MinMaxScaler()
            fallback_scaler.data_min_ = avg_min
            fallback_scaler.data_max_ = avg_max
            data_range = avg_max - avg_min
            fallback_scaler.data_range_ = data_range
            # Guard against zero-width ranges to avoid division by zero,
            # mirroring sklearn's own handling of constant features
            safe_range = np.where(data_range == 0, 1.0, data_range)
            fallback_scaler.scale_ = 1.0 / safe_range
            fallback_scaler.min_ = -avg_min * fallback_scaler.scale_
            fallback_scaler.n_features_in_ = len(au_columns)
        scalers['_fallback'] = fallback_scaler

    elif scope == 'global':
        # Fit one scaler across all subjects
        scaler = Scaler()
        scaler.fit(train_data[au_columns].values)
        scalers['global'] = scaler
    else:
        raise ValueError("scope must be 'subject' or 'global'")

    return {'scalers': scalers, 'method': method, 'scope': scope}


def apply_normalizer(data, columns, normalizer_dict):
    """
    Apply fitted normalization scalers to data.

    Parameters:
    -----------
    data : pd.DataFrame
        Dataframe with AU columns and subjectID
    columns : list
        List of AU column names to normalize
    normalizer_dict : dict
        Dictionary containing fitted scalers from fit_normalizer()

    Returns:
    --------
    pd.DataFrame
        DataFrame with normalized AU columns
    """
    normalized_data = data.copy()
    scalers = normalizer_dict['scalers']
    scope = normalizer_dict['scope']

    # Cast to float64 so scaled values are not truncated on assignment
    normalized_data[columns] = normalized_data[columns].astype(np.float64)

    if scope == 'subject':
        # Apply per-subject normalization
        for subject in data['subjectID'].unique():
            subject_mask = data['subjectID'] == subject
            # Use the subject's scaler if available, otherwise the
            # averaged fallback scaler for unseen subjects
            if subject in scalers:
                scaler = scalers[subject]
            else:
                scaler = scalers['_fallback']
                print(f"Info: Subject {subject} not in training data. "
                      f"Using averaged scaler from training subjects.")
            normalized_data.loc[subject_mask, columns] = scaler.transform(
                data.loc[subject_mask, columns].values
            )
    elif scope == 'global':
        # Apply global normalization
        scaler = scalers['global']
        normalized_data[columns] = scaler.transform(data[columns].values)

    return normalized_data


def save_normalizer(normalizer_dict, filepath):
    """
    Save fitted normalizer to disk.

    Parameters:
    -----------
    normalizer_dict : dict
        Dictionary containing fitted scalers from fit_normalizer()
    filepath : str
        Path to save the normalizer (e.g., 'normalizer.pkl')
    """
    # Create the target directory if it does not exist
    dirpath = os.path.dirname(filepath)
    if dirpath:
        os.makedirs(dirpath, exist_ok=True)

    with open(filepath, 'wb') as f:
        pickle.dump(normalizer_dict, f)
    print(f"Normalizer saved to {filepath}")


def load_normalizer(filepath):
    """
    Load fitted normalizer from disk.

    Parameters:
    -----------
    filepath : str
        Path to the saved normalizer file

    Returns:
    --------
    dict
        Dictionary containing fitted scalers
    """
    with open(filepath, 'rb') as f:
        normalizer_dict = pickle.load(f)
    print(f"Normalizer loaded from {filepath}")
    return normalizer_dict
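

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): a minimal fit -> save -> load -> apply
# round trip on synthetic data. The AU column names ('AU01_r', 'AU04_r') and
# subject IDs ('s1', 's2', 's3') are hypothetical placeholders, not values
# assumed anywhere in the module above.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    import pandas as pd

    rng = np.random.default_rng(0)
    au_cols = ['AU01_r', 'AU04_r']
    train_df = pd.DataFrame({
        'subjectID': ['s1'] * 50 + ['s2'] * 50,
        'AU01_r': rng.random(100),
        'AU04_r': rng.random(100),
    })
    # The test set includes a subject ('s3') unseen during fitting; it is
    # normalized with the averaged '_fallback' scaler.
    test_df = pd.DataFrame({
        'subjectID': ['s1'] * 10 + ['s3'] * 10,
        'AU01_r': rng.random(20),
        'AU04_r': rng.random(20),
    })

    normalizer = fit_normalizer(train_df, au_cols, method='standard', scope='subject')
    save_normalizer(normalizer, 'normalizer.pkl')
    normalizer = load_normalizer('normalizer.pkl')
    test_norm = apply_normalizer(test_df, au_cols, normalizer)
    print(test_norm[au_cols].describe())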