def calculate_mad_params(df, columns):
    """Compute the median and MAD (median absolute deviation) of each column.

    Fit these parameters on the training split only, then pass them to
    ``apply_mad_filter`` for every split so that no statistics leak from
    validation or test data.

    Missing values are ignored for both statistics. A column that is empty or
    entirely NaN yields ``(nan, nan)``.

    Args:
        df: DataFrame holding the training data.
        columns: Iterable of column names to compute parameters for.

    Returns:
        Dictionary ``{col: (median, mad)}``.

    Raises:
        KeyError: If a requested column is not present in ``df``.
    """
    params = {}
    for col in columns:
        values = df[col]
        median = values.median()
        # Use the pandas median (NaN-skipping) for the deviations as well.
        # np.median would propagate a single NaN into the MAD and thereby
        # silently disable filtering for the whole column.
        mad = (values - median).abs().median()
        params[col] = (median, mad)
    return params


def apply_mad_filter(df, params, threshold=3.5):
    """Replace MAD-based outliers with the precomputed training median.

    For every column in ``params`` the robust z-score
    ``0.6745 * (x - median) / mad`` is computed, and values whose absolute
    score exceeds ``threshold`` are overwritten with ``median``. No rows are
    dropped, so the result has the same shape and index as ``df``. The input
    frame is not modified.

    Columns whose MAD is zero or NaN are left untouched because the robust
    z-score is undefined for them. NaN cells are never flagged as outliers.

    Args:
        df: DataFrame to filter (training, validation, or test data).
        params: Dictionary ``{col: (median, mad)}`` as returned by
            ``calculate_mad_params`` on the training data.
        threshold: Cutoff for the absolute robust z-score.

    Returns:
        A copy of ``df`` with outlying values replaced column by column.

    Raises:
        KeyError: If a column named in ``params`` is not present in ``df``.
    """
    # 0.6745 is approximately 1 / 1.4826, the factor that scales the MAD to
    # the standard deviation of a normal distribution.
    mad_to_z = 0.6745

    df_clean = df.copy()

    for col, (median, mad) in params.items():
        if mad == 0 or np.isnan(mad):
            continue  # no usable spread; the z-score is undefined

        robust_z = mad_to_z * (df_clean[col] - median) / mad
        outlier_mask = robust_z.abs() > threshold

        # Series.mask upcasts when needed (e.g. an integer column receiving a
        # fractional median) instead of performing an incompatible-dtype
        # in-place assignment.
        df_clean[col] = df_clean[col].mask(outlier_mask, median)

    return df_clean