outsourcing of functions

2026-03-05 13:38:04 +01:00 · 2026-03-05 13:38:04 +01:00 · a064f6cc90
commit a064f6cc90
parent a4b7190756
1 changed files with 39 additions and 0 deletions
--- a/model_training/tools/mad_outlier_removal.py
+++ b/model_training/tools/mad_outlier_removal.py
@ -21,3 +21,42 @@ def mad_outlier_removal(df, columns, threshold=3.5, c=1.4826):

    final_mask = np.logical_and.reduce(masks)
    return df_clean[final_mask]
+
def calculate_mad_params(df, columns):
    """
    Calculate median and MAD (median absolute deviation) for each column.
    This should be run ONLY on the training data, so that the same
    parameters can then be applied unchanged to validation and test data.

    Missing values (NaN) are ignored for both statistics.

    df: DataFrame holding the columns to describe
    columns: iterable of column names present in df

    Returns a dictionary: {col: (median, mad)}
    """
    params = {}
    for col in columns:
        series = df[col]
        median = series.median()
        # Use the pandas median here as well: it skips NaN like the median
        # above, whereas np.median would turn a single missing value into a
        # NaN MAD and thereby disable filtering for the whole column.
        mad = (series - median).abs().median()
        params[col] = (median, mad)
    return params
+
def apply_mad_filter(df, params, threshold=3.5):
    """
    Replace MAD-based outliers using precomputed parameters.
    Works on training, validation, and test data.

    No rows are dropped: a value whose absolute robust Z-score exceeds the
    threshold is overwritten, in its own column only, with that column's
    median from params. The input DataFrame is not modified; a copy with
    the same shape is returned.

    df: DataFrame to filter; must contain every column named in params
        (a missing column raises KeyError)
    params: dictionary {col: (median, mad)} from training data
    threshold: cutoff for the absolute robust Z-score
    """
    df_clean = df.copy()

    for col, (median, mad) in params.items():
        # Zero MAD means no spread; NaN MAD means it could not be estimated.
        # Either way no meaningful robust Z-score exists for this column.
        if mad == 0 or np.isnan(mad):
            continue

        # 0.6745 ~= 1 / 1.4826, the usual normal-consistency constant for the
        # MAD (1.4826 is the default `c` of mad_outlier_removal in this file).
        robust_z = 0.6745 * (df_clean[col] - median) / mad
        outlier_mask = np.abs(robust_z) > threshold
        if not outlier_mask.any():
            continue

        # Series.mask upcasts the column when needed (e.g. integer data with
        # a fractional median) instead of forcing an incompatible in-place
        # assignment through .loc.
        df_clean[col] = df_clean[col].mask(outlier_mask, median)

    return df_clean