"""Row filtering based on the median absolute deviation (MAD)."""

import numpy as np
import pandas as pd


def mad_outlier_removal(df, columns, threshold=3.5, c=1.4826):
    """Drop rows that are MAD outliers in any of the given columns.

    For every column a robust z-score ``|x - median| / (c * MAD)`` is
    computed, where ``MAD = median(|x - median|)``. A row is kept only if
    its score is ``<= threshold`` in every column that was actually
    filtered.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is never modified; a new DataFrame is returned.
    columns : iterable of column labels
        Numeric columns to screen for outliers.
    threshold : float, default 3.5
        Largest robust z-score a value may have and still be kept.
    c : float, default 1.4826
        Scale factor applied to the MAD. 1.4826 is the usual consistency
        constant that makes the MAD comparable to a standard deviation
        for normally distributed data.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that passed the filter in all screened columns,
        with the original index labels preserved.

    Raises
    ------
    KeyError
        If a label in ``columns`` is not a column of ``df``.

    Notes
    -----
    * Columns whose MAD is zero (at least half of the values equal the
      median) are skipped, because the score would be undefined.
    * Missing values are ignored when computing the median and the MAD,
      and a row is never dropped merely because a value is missing. A
      column without any observed value is skipped entirely. Without this
      handling a single NaN would turn every score into NaN and the whole
      frame would be discarded.
    """
    keep = np.ones(len(df), dtype=bool)
    filtered_any = False

    for col in columns:
        values = df[col].to_numpy(dtype=float, na_value=np.nan)
        observed = ~np.isnan(values)

        # Nothing to estimate a median from (empty or all-missing column).
        if not observed.any():
            continue

        present = values[observed]
        median = np.median(present)
        mad = np.median(np.abs(present - median))

        # A zero MAD would make every score a division by zero.
        if mad == 0:
            continue

        z = np.abs((values - median) / (c * mad))
        # NaN scores compare as False, so missing entries are re-admitted
        # explicitly: a missing value is not evidence of an outlier.
        keep &= (z <= threshold) | ~observed
        filtered_any = True

    if not filtered_any:
        return df.copy()

    return df.loc[keep].copy()