outsourcing of functions

2026-03-05 13:38:04 +01:00 · 2026-03-05 13:38:04 +01:00 · a064f6cc90
commit a064f6cc90
parent a4b7190756
1 changed files with 39 additions and 0 deletions
--- a/model_training/tools/mad_outlier_removal.py
+++ b/model_training/tools/mad_outlier_removal.py
@@ -21,3 +21,42 @@ def mad_outlier_removal(df, columns, threshold=3.5, c=1.4826):
    final_mask = np.logical_and.reduce(masks)
    return df_clean[final_mask]
 def calculate_mad_params(df, columns):
    """
    Calculate the median and MAD (median absolute deviation) per column.

    This should be run ONLY on the training data; the returned parameters
    are then reused unchanged to filter other data splits.

    Missing values (NaN) are ignored for both statistics, so a column
    containing some NaN entries still yields a usable MAD. A column with
    no valid values yields (NaN, NaN).

    df: DataFrame holding the training data
    columns: iterable of column names to compute parameters for
    Returns a dictionary: {col: (median, mad)}
    """
    params = {}
    for col in columns:
        values = df[col]
        median = values.median()
        # Use the pandas median here as well: np.median propagates NaN, so a
        # single missing value would turn the MAD into NaN even though the
        # median above was computed with NaN skipped.
        mad = (values - median).abs().median()
        params[col] = (median, mad)
    return params
 def apply_mad_filter(df, params, threshold=3.5):
    """
    Replace MAD-based outliers using precomputed parameters.

    For every column in ``params`` the robust Z-score
    ``0.6745 * (x - median) / mad`` is computed; values whose absolute
    score exceeds ``threshold`` are overwritten with that column's median.
    No rows are dropped, so the result has the same shape as ``df``, and
    the input DataFrame itself is left unmodified.

    Works on training, validation, and test data.

    df: DataFrame to filter
    params: dictionary {col: (median, mad)} from training data
    threshold: cutoff for the absolute robust Z-score
    Returns a filtered copy of ``df``.
    Raises KeyError if a column named in ``params`` is missing from ``df``.
    """
    df_clean = df.copy()
    for col, (median, mad) in params.items():
        # mad == 0: no spread, so no meaningful score can be computed.
        # mad is NaN: the parameters are unusable; every score would be NaN
        # and nothing would be flagged, so skip the column explicitly.
        if mad == 0 or np.isnan(mad):
            continue
        # 0.6745 is the usual modified Z-score scaling constant
        # (approximately 1 / 1.4826).
        robust_z = 0.6745 * (df_clean[col] - median) / mad
        outlier_mask = robust_z.abs() > threshold
        # Overwrite only the flagged cells of this column with the median.
        # Series.mask upcasts the column when needed (e.g. integer column
        # with a fractional median) instead of attempting an
        # incompatible-dtype assignment through .loc.
        df_clean[col] = df_clean[col].mask(outlier_mask, median)
    # Shape report kept so call-time output stays as before.
    print(df_clean.shape)
    return df_clean