outsourcing of functions
This commit is contained in:
parent
a4b7190756
commit
a064f6cc90
@ -21,3 +21,42 @@ def mad_outlier_removal(df, columns, threshold=3.5, c=1.4826):
|
||||
|
||||
final_mask = np.logical_and.reduce(masks)
|
||||
return df_clean[final_mask]
|
||||
|
||||
def calculate_mad_params(df, columns):
    """
    Calculate median and MAD parameters for each column.

    This should be run ONLY on the training data, so that the same
    parameters can afterwards be applied to validation and test data.

    Missing values (NaN) are ignored for both the median and the MAD.

    df: DataFrame holding the columns to analyse
    columns: iterable of column names to compute parameters for

    Returns a dictionary: {col: (median, mad)}
    """
    params = {}
    for col in columns:
        values = df[col]
        median = values.median()
        # Use the pandas reduction for the MAD as well: np.median would
        # return NaN as soon as the column contains a single missing
        # value, whereas Series.median() skips NaN like the median above.
        mad = (values - median).abs().median()
        params[col] = (median, mad)
    return params
|
||||
|
||||
def apply_mad_filter(df, params, threshold=3.5):
    """
    Replace MAD-based outliers using precomputed parameters.

    Works on training, validation, and test data. No rows are dropped:
    every value whose robust Z-score exceeds the threshold is overwritten
    with the precomputed median of its column, so the returned frame has
    the same shape as the input. The input frame itself is not modified.

    df: DataFrame to filter
    params: dictionary {col: (median, mad)} from training data
    threshold: cutoff for the absolute robust Z-score

    Returns a filtered copy of df.
    """
    df_clean = df.copy()

    for col, (median, mad) in params.items():
        # mad == 0 means no spread; a NaN mad means it could not be
        # computed. In both cases no meaningful score exists, so the
        # column is left untouched.
        if mad == 0 or np.isnan(mad):
            continue

        # 0.6745 rescales the MAD so the score is comparable to a
        # standard Z-score for normally distributed data.
        robust_z = 0.6745 * (df_clean[col] - median) / mad
        outlier_mask = np.abs(robust_z) > threshold

        # Replace outliers only in this specific column. Series.mask
        # upcasts the column when needed (e.g. integer data with a
        # fractional median) instead of writing an incompatible value
        # into the existing dtype.
        df_clean[col] = df_clean[col].mask(outlier_mask, median)

    return df_clean
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user