outsourcing of functions

This commit is contained in:
Michael Weig 2026-03-05 13:38:04 +01:00
parent a4b7190756
commit a064f6cc90

View File

@ -21,3 +21,42 @@ def mad_outlier_removal(df, columns, threshold=3.5, c=1.4826):
final_mask = np.logical_and.reduce(masks)
return df_clean[final_mask]
def calculate_mad_params(df, columns):
    """
    Calculate the median and the MAD (median absolute deviation) per column.

    This should be run ONLY on the training data; the returned parameters
    can then be reused to filter any other split.

    df: DataFrame holding the data to fit on
    columns: iterable of column labels of ``df`` to compute parameters for

    Returns a dictionary: {col: (median, mad)}

    Missing values (NaN) are ignored for both statistics. A column with no
    non-missing values yields (NaN, NaN).
    """
    params = {}
    for col in columns:
        values = df[col]
        median = values.median()
        # Use the pandas reduction here as well: it skips NaN just like the
        # median above, whereas np.median would return NaN as soon as a
        # single value is missing and silently disable filtering downstream.
        mad = (values - median).abs().median()
        params[col] = (median, mad)
    return params
def apply_mad_filter(df, params, threshold=3.5):
    """
    Replace MAD-based outliers using precomputed parameters.

    Works on training, validation, and test data. No rows are dropped:
    every value whose robust Z-score exceeds ``threshold`` in absolute
    value is overwritten with the precomputed median of its column, so the
    result always has the same shape as ``df``.

    df: DataFrame to filter (left unmodified; a copy is returned)
    params: dictionary {col: (median, mad)} from training data
    threshold: cutoff for the absolute robust Z-score

    Returns the filtered copy of ``df``. Columns whose MAD is 0 or NaN are
    left untouched, as are columns of ``df`` that are not keys of ``params``.
    """
    df_clean = df.copy()
    for col, (median, mad) in params.items():
        if mad == 0 or np.isnan(mad):
            # No spread (or no usable estimate): the robust Z-score is
            # undefined, so nothing is replaced for this column.
            continue
        robust_z = 0.6745 * (df_clean[col] - median) / mad
        outlier_mask = np.abs(robust_z) > threshold
        # Overwrite only the flagged cells of this specific column; the
        # other columns of the same rows are kept as they are.
        df_clean.loc[outlier_mask, col] = median
    return df_clean