outsourcing of functions
This commit is contained in:
parent
a4b7190756
commit
a064f6cc90
@ -21,3 +21,42 @@ def mad_outlier_removal(df, columns, threshold=3.5, c=1.4826):
|
|||||||
|
|
||||||
final_mask = np.logical_and.reduce(masks)
|
final_mask = np.logical_and.reduce(masks)
|
||||||
return df_clean[final_mask]
|
return df_clean[final_mask]
|
||||||
|
|
||||||
|
def calculate_mad_params(df, columns):
    """
    Calculate median and MAD parameters for each column.

    This should be run ONLY on the training data, so that validation and
    test data are later filtered with statistics they did not contribute to.

    df: DataFrame holding the training data
    columns: iterable of column names to compute parameters for

    Returns a dictionary: {col: (median, mad)}, where ``mad`` is the raw
    (unscaled) median absolute deviation from the column median. Missing
    values are ignored for both statistics; a column with no valid values
    yields ``(nan, nan)``.
    """
    params = {}
    for col in columns:
        values = df[col]
        median = values.median()
        # Use the pandas median for the deviations as well: it skips NaN just
        # like the median above, whereas np.median would return NaN as soon
        # as the column contains a single missing value.
        mad = (values - median).abs().median()
        params[col] = (median, mad)
    return params
|
||||||
|
|
||||||
|
def apply_mad_filter(df, params, threshold=3.5):
    """
    Apply MAD-based outlier handling using precomputed parameters.

    Works on training, validation, and test data.

    Outlying values are not dropped: each value whose robust Z-score exceeds
    the threshold is replaced by the precomputed median of its column, so the
    returned frame has the same shape and index as the input. The input frame
    itself is left unmodified.

    df: DataFrame to filter
    params: dictionary {col: (median, mad)} from training data
    threshold: cutoff for the absolute robust Z-score

    Returns a filtered copy of ``df``.
    """
    df_clean = df.copy()

    for col, (median, mad) in params.items():
        # No spread (MAD == 0) or no usable MAD (NaN): the robust Z-score is
        # undefined, so leave this column untouched.
        if mad == 0 or np.isnan(mad):
            continue

        # 0.6745 is approximately 1 / 1.4826, the usual factor that puts the
        # MAD-based score on the scale of a standard Z-score.
        robust_z = 0.6745 * (df_clean[col] - median) / mad
        outlier_mask = np.abs(robust_z) > threshold

        # Replace outliers only in this specific column. Series.mask upcasts
        # the column when needed (e.g. an integer column with a fractional
        # median) instead of forcing an incompatible in-place assignment.
        df_clean[col] = df_clean[col].mask(outlier_mask, median)

    return df_clean
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user