- changed MAD Outlier Removal - Function

- added regularization to xgboost
This commit is contained in:
TimoKurz 2025-12-13 13:45:31 +01:00
parent 15b32a9792
commit 87c5e21daf
3 changed files with 668 additions and 38 deletions

View File

@ -8,6 +8,64 @@
"Im folgenden wird auf die Daten das MAD Outlier removal angewendet." "Im folgenden wird auf die Daten das MAD Outlier removal angewendet."
] ]
}, },
{
"cell_type": "code",
"execution_count": null,
"id": "46bd036d",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"def calculate_mad_params(df, columns):\n",
"    \"\"\"\n",
"    Compute the median and the median absolute deviation (MAD) per column.\n",
"\n",
"    Run this ONLY on the training data and reuse the result for the other\n",
"    splits so that no statistics are taken from validation or test data.\n",
"\n",
"    df: DataFrame holding the columns to describe\n",
"    columns: iterable of column names\n",
"\n",
"    Returns a dictionary: {col: (median, mad)}\n",
"    \"\"\"\n",
"    params = {}\n",
"    for col in columns:\n",
"        median = df[col].median()\n",
"        # Series.median() skips NaN, whereas np.median() returns NaN for the whole\n",
"        # column as soon as a single value is missing.\n",
"        mad = (df[col] - median).abs().median()\n",
"        params[col] = (median, mad)\n",
"    return params"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e0691732",
"metadata": {},
"outputs": [],
"source": [
"def apply_mad_filter(df, params, threshold=3.5):\n",
"    \"\"\"\n",
"    Mask MAD-based outliers using precomputed parameters.\n",
"    Works on training, validation, and test data.\n",
"\n",
"    df: DataFrame to filter (left untouched; a copy is returned)\n",
"    params: dictionary {col: (median, mad)} from training data\n",
"    threshold: cutoff for the absolute robust Z-score\n",
"\n",
"    Returns a copy of df with the same shape in which every outlying cell is NaN.\n",
"    \"\"\"\n",
"    df_clean = df.copy()\n",
"\n",
"    for col, (median, mad) in params.items():\n",
"        if mad == 0 or pd.isna(mad):\n",
"            continue  # no usable spread; nothing to remove for this column\n",
"\n",
"        robust_z = 0.6745 * (df_clean[col] - median) / mad\n",
"        outlier_mask = np.abs(robust_z) > threshold\n",
"\n",
"        # Blank out only the offending cell; the row itself is kept.\n",
"        df_clean.loc[outlier_mask, col] = np.nan\n",
"\n",
"    return df_clean"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
@ -15,6 +73,9 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# old removal - with this we were able to get 85% accuracy\n",
"# the values of the old validation & test data sets are stored privately on the cluster\n",
"\n",
"import numpy as np\n", "import numpy as np\n",
"import pandas as pd\n", "import pandas as pd\n",
"from sklearn.preprocessing import StandardScaler, MinMaxScaler\n", "from sklearn.preprocessing import StandardScaler, MinMaxScaler\n",
@ -32,8 +93,9 @@
" continue # keine Streuung, keine Ausreißer\n", " continue # keine Streuung, keine Ausreißer\n",
" robust_z = 0.6745 * (df_clean[col] - median) / mad\n", " robust_z = 0.6745 * (df_clean[col] - median) / mad\n",
" mask = np.abs(robust_z) <= threshold\n", " mask = np.abs(robust_z) <= threshold\n",
" df_clean = df_clean[mask]\n", " output = df_clean[mask]\n",
" return df_clean" "\n",
" return output"
] ]
} }
], ],

View File

@ -0,0 +1,538 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "e3be057e-8d2a-4d05-bd42-6b1dc75df5ed",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from pathlib import Path\n",
"from sklearn.preprocessing import StandardScaler, MinMaxScaler"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "13ad96f5",
"metadata": {},
"outputs": [],
"source": [
"# Parquet file to load; the commented-out line below is an alternative location.\n",
"# data_path = Path(r\"~/Fahrsimulator_MSY2526_AI/model_training/xgboost/output_windowed.parquet\")\n",
"data_path = Path(r\"~/data-paulusjafahrsimulator-gpu/first_AU_dataset/output_windowed.parquet\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "95e1a351",
"metadata": {},
"outputs": [],
"source": [
"# Read the whole parquet file at data_path into a DataFrame.\n",
"df = pd.read_parquet(path=data_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "68afd83e",
"metadata": {},
"outputs": [],
"source": [
"# List the distinct subject IDs and print how many subjects a 66 % and a 33 % share amount to.\n",
"subjects = df['subjectID'].unique()\n",
"print(subjects)\n",
"print(len(subjects))\n",
"print(len(subjects)*0.66)\n",
"print(len(subjects)*0.33)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "52dfd885",
"metadata": {},
"outputs": [],
"source": [
"# Low class: every baseline phase, plus n-back rows outside the baseline with LEVEL 1 or 4.\n",
"low_all = df[\n",
" ((df[\"PHASE\"] == \"baseline\") |\n",
" ((df[\"STUDY\"] == \"n-back\") & (df[\"PHASE\"] != \"baseline\") & (df[\"LEVEL\"].isin([1, 4]))))\n",
"]\n",
"print(f\"low all: {low_all.shape}\")\n",
"\n",
"# High class, part 1: n-back rows with LEVEL 2, 3, 5 or 6 in the \"train\" or \"test\" phase.\n",
"high_nback = df[\n",
" (df[\"STUDY\"]==\"n-back\") &\n",
" (df[\"LEVEL\"].isin([2, 3, 5, 6])) &\n",
" (df[\"PHASE\"].isin([\"train\", \"test\"]))\n",
"]\n",
"print(f\"high n-back: {high_nback.shape}\")\n",
"\n",
"# High class, part 2: every k-drive row outside the baseline phase.\n",
"high_kdrive = df[\n",
" (df[\"STUDY\"] == \"k-drive\") & (df[\"PHASE\"] != \"baseline\")\n",
"]\n",
"print(f\"high k-drive: {high_kdrive.shape}\")\n",
"\n",
"# Both high subsets stacked into one frame.\n",
"high_all = pd.concat([high_nback, high_kdrive])\n",
"print(f\"high all: {high_all.shape}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8fba6edf",
"metadata": {},
"outputs": [],
"source": [
"def fit_normalizer(train_data, au_columns, method='standard', scope='global'):\n",
"    \"\"\"\n",
"    Fit feature scalers on the training data.\n",
"\n",
"    train_data: DataFrame with the feature columns (and 'subjectID' for scope='subject')\n",
"    au_columns: list of feature column names to scale\n",
"    method: 'standard' (StandardScaler) or 'minmax' (MinMaxScaler)\n",
"    scope: 'global' fits one scaler on all rows; 'subject' fits one scaler per\n",
"        subjectID plus a 'global' fallback for subjects that were not seen here\n",
"\n",
"    Returns {'scalers': {key: fitted scaler}, 'method': method, 'scope': scope}.\n",
"    Raises ValueError for an unknown method or scope.\n",
"    \"\"\"\n",
"    if method == 'standard':\n",
"        Scaler = StandardScaler\n",
"    elif method == 'minmax':\n",
"        Scaler = MinMaxScaler\n",
"    else:\n",
"        raise ValueError(\"method must be 'standard' or 'minmax'\")\n",
"\n",
"    if scope not in ('subject', 'global'):\n",
"        raise ValueError(\"scope must be 'subject' or 'global'\")\n",
"\n",
"    # A global scaler is always fitted: it is the only scaler for scope='global' and the\n",
"    # fallback that apply_normalizer looks up for subjects without a scaler of their own.\n",
"    scalers = {'global': Scaler().fit(train_data[au_columns])}\n",
"\n",
"    if scope == 'subject':\n",
"        for subject in train_data['subjectID'].unique():\n",
"            subject_mask = train_data['subjectID'] == subject\n",
"            scalers[subject] = Scaler().fit(train_data.loc[subject_mask, au_columns])\n",
"\n",
"    return {'scalers': scalers, 'method': method, 'scope': scope}\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "24e3a77b",
"metadata": {},
"outputs": [],
"source": [
"%pip install xgboost"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8e7fa0fa",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from sklearn.model_selection import train_test_split,StratifiedKFold, GridSearchCV\n",
"from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report, confusion_matrix\n",
"import xgboost as xgb\n",
"import joblib\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "325ef71c",
"metadata": {},
"outputs": [],
"source": [
"# Tag each subset with its class label (0 for low, 1 for high) on a copy, then merge.\n",
"low = low_all.assign(label=0)\n",
"high = high_all.assign(label=1)\n",
"\n",
"data = pd.concat([low, high], ignore_index=True).drop_duplicates()\n",
"\n",
"print(\"Label distribution:\")\n",
"print(data[\"label\"].value_counts())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "67d70e84",
"metadata": {},
"outputs": [],
"source": [
"# Feature columns: every column whose name starts with \"au\" (case-insensitive).\n",
"au_columns = [col for col in data.columns if col.lower().startswith(\"au\")]\n",
"print(\"Gefundene AU-Spalten:\", au_columns)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "960bb8c7",
"metadata": {},
"outputs": [],
"source": [
"# Split by subject so that no subject contributes rows to more than one split.\n",
"# A seeded generator keeps the split reproducible across kernel restarts\n",
"# (the same seed is used for train_test_split below).\n",
"rng = np.random.default_rng(42)\n",
"subjects = rng.permutation(data[\"subjectID\"].unique())\n",
"\n",
"n = len(subjects)\n",
"n_train = int(n * 0.66)\n",
"\n",
"# First 66 % of the shuffled subjects: train + validation; remainder: test.\n",
"train_subjects = subjects[:n_train]\n",
"test_subjects = subjects[n_train:]\n",
"train_subs, val_subs = train_test_split(train_subjects, test_size=0.2, random_state=42)\n",
"\n",
"train_df = data[data.subjectID.isin(train_subs)]\n",
"val_df = data[data.subjectID.isin(val_subs)]\n",
"test_df = data[data.subjectID.isin(test_subjects)]\n",
"\n",
"print(train_df.shape, val_df.shape, test_df.shape)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "802a45c9",
"metadata": {},
"outputs": [],
"source": [
"def apply_normalizer(df_to_transform, normalizer_dict, au_columns):\n",
"    \"\"\"\n",
"    Scale the feature columns of a DataFrame with previously fitted scalers.\n",
"\n",
"    df_to_transform: DataFrame with the feature columns (and 'subjectID' for\n",
"        scope='subject'); it is not modified\n",
"    normalizer_dict: dict with the keys 'scalers' and 'scope', as returned by fit_normalizer\n",
"    au_columns: list of feature column names to scale\n",
"\n",
"    Returns a scaled copy. With scope='subject', a subject without its own scaler\n",
"    is scaled with the 'global' scaler if one exists; otherwise its rows stay\n",
"    unscaled and a warning is issued.\n",
"    Raises ValueError for an unknown scope.\n",
"    \"\"\"\n",
"    import warnings\n",
"\n",
"    scalers = normalizer_dict[\"scalers\"]\n",
"    scope = normalizer_dict[\"scope\"]\n",
"    df_out = df_to_transform.copy()\n",
"\n",
"    if scope == \"global\":\n",
"        df_out[au_columns] = scalers[\"global\"].transform(df_out[au_columns])\n",
"\n",
"    elif scope == \"subject\":\n",
"        unscaled = []\n",
"        for subj, subdf in df_out.groupby(\"subjectID\"):\n",
"            scaler = scalers.get(subj, scalers.get(\"global\"))\n",
"            if scaler is None:\n",
"                unscaled.append(subj)\n",
"                continue\n",
"            df_out.loc[subdf.index, au_columns] = scaler.transform(subdf[au_columns])\n",
"        if unscaled:\n",
"            # Make the best-effort fallback visible instead of skipping silently.\n",
"            warnings.warn(f\"No scaler available for subjects {unscaled}; their features were left unscaled\")\n",
"\n",
"    else:\n",
"        # Same contract as fit_normalizer: an unknown scope is an error.\n",
"        raise ValueError(\"scope must be 'subject' or 'global'\")\n",
"\n",
"    return df_out"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dbb58abd",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"def calculate_mad_params(df, columns):\n",
"    \"\"\"\n",
"    Compute the median and the median absolute deviation (MAD) per column.\n",
"\n",
"    Run this ONLY on the training data and reuse the result for the other\n",
"    splits so that no statistics are taken from validation or test data.\n",
"\n",
"    df: DataFrame holding the columns to describe\n",
"    columns: iterable of column names\n",
"\n",
"    Returns a dictionary: {col: (median, mad)}\n",
"    \"\"\"\n",
"    params = {}\n",
"    for col in columns:\n",
"        median = df[col].median()\n",
"        # Series.median() skips NaN, whereas np.median() returns NaN for the whole\n",
"        # column as soon as a single value is missing.\n",
"        mad = (df[col] - median).abs().median()\n",
"        params[col] = (median, mad)\n",
"    return params\n",
"\n",
"def apply_mad_filter(df, params, threshold=3.5):\n",
"    \"\"\"\n",
"    Replace MAD outliers with NaN, one column at a time.\n",
"    Usable for training, validation, and test data alike.\n",
"\n",
"    df: DataFrame to filter (a copy is returned, the input stays untouched)\n",
"    params: dictionary {col: (median, mad)} computed on the training data\n",
"    threshold: cutoff for the absolute robust Z-score\n",
"    \"\"\"\n",
"    filtered = df.copy()\n",
"\n",
"    for name, (center, spread) in params.items():\n",
"        if spread != 0:  # a zero MAD means no spread, so the column is left alone\n",
"            scores = 0.6745 * (filtered[name] - center) / spread\n",
"            is_outlier = np.abs(scores) > threshold\n",
"            # Only the affected cell of this column is blanked; the row stays.\n",
"            filtered.loc[is_outlier, name] = np.nan\n",
"\n",
"    return filtered"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0f03f1b4",
"metadata": {},
"outputs": [],
"source": [
"# Step 1: Fit parameters on training data\n",
"params = calculate_mad_params(train_df, au_columns)\n",
"\n",
"# Step 2: Apply filter consistently\n",
"# NOTE(review): all splits share the training-derived params, but training uses\n",
"# threshold=3.5 while validation/test use threshold=50, so the held-out splits are\n",
"# filtered far less strictly - confirm this asymmetry is intended.\n",
"train_outlier_removed = apply_mad_filter(train_df, params, threshold=3.5)\n",
"val_outlier_removed = apply_mad_filter(val_df, params, threshold=50)\n",
"test_outlier_removed = apply_mad_filter(test_df, params, threshold=50)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "289f6b89",
"metadata": {},
"outputs": [],
"source": [
"# Fit the scaler on the filtered training data only.\n",
"normalizer = fit_normalizer(train_outlier_removed, au_columns, method=\"standard\", scope=\"global\")\n",
"\n",
"# Scale the outlier-filtered version of every split; passing the raw val_df/test_df\n",
"# here would bypass the MAD filtering that was just applied to them.\n",
"train_scaled = apply_normalizer(train_outlier_removed, normalizer, au_columns)\n",
"val_scaled = apply_normalizer(val_outlier_removed, normalizer, au_columns)\n",
"test_scaled = apply_normalizer(test_outlier_removed, normalizer, au_columns)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5df30e8d",
"metadata": {},
"outputs": [],
"source": [
"# Per split: feature values (AU columns) and the \"label\" column, taken via .values.\n",
"X_train, y_train = train_scaled[au_columns].values, train_scaled[\"label\"].values\n",
"X_val, y_val = val_scaled[au_columns].values, val_scaled[\"label\"].values\n",
"X_test, y_test = test_scaled[au_columns].values, test_scaled[\"label\"].values"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6fb7c86a",
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"from sklearn.metrics import RocCurveDisplay, log_loss, accuracy_score\n",
"\n",
"# Stop once the AUC on eval set 'validation_0' has not improved for 30 rounds.\n",
"# eval_set (see fit below) lists the validation split first, so 'validation_0' is\n",
"# held-out data; with the training data first, early stopping would watch training AUC.\n",
"early_stop = xgb.callback.EarlyStopping(\n",
"    rounds=30, metric_name='auc', data_name='validation_0', save_best=True\n",
")\n",
"\n",
"# Base model\n",
"# NOTE(review): scale_pos_weight is a hard-coded ratio; presumably the class counts\n",
"# of an earlier split - confirm it still matches the current label distribution.\n",
"xgb_clf = xgb.XGBClassifier(\n",
"    objective=\"binary:logistic\",\n",
"    scale_pos_weight=1100/1550,\n",
"    eval_metric=[\"logloss\",\"auc\",\"error\"],\n",
"    use_label_encoder=False,\n",
"    random_state=42,\n",
"    callbacks=[early_stop],\n",
"    verbosity=0,\n",
")\n",
"\n",
"# Parameter grid\n",
"param_grid = {\n",
"    \"learning_rate\": [0.01, 0.02, 0.05],\n",
"    \"max_depth\": [2, 3, 4],\n",
"    # \"n_estimators\": [200, 500, 800],\n",
"    \"subsample\": [0.4, 0.5],\n",
"    \"colsample_bytree\": [0.7, 0.8],\n",
"    \"reg_alpha\": [0, 0.1, 1, 10],  # L1 regularization\n",
"    \"reg_lambda\": [0.5, 1, 5, 10]  # L2 regularization\n",
"}\n",
"\n",
"# old values - acc 100%\n",
"    # \"learning_rate\": [0.01, 0.05, 0.1],\n",
"    # \"max_depth\": [4, 6, 8],\n",
"    # \"n_estimators\": [200, 500, 800],\n",
"    # \"subsample\": [0.8, 1.0],\n",
"    # \"colsample_bytree\": [0.8, 1.0]\n",
"\n",
"# K-Fold Cross Validation\n",
"cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\n",
"\n",
"# Grid Search Setup\n",
"grid_search = GridSearchCV(\n",
"    estimator=xgb_clf,\n",
"    param_grid=param_grid,\n",
"    scoring=\"roc_auc\",\n",
"    n_jobs=-1,\n",
"    cv=cv,\n",
"    verbose=0\n",
")\n",
"\n",
"# Training with cross validation; eval_set order defines the names\n",
"# 'validation_0' (validation split) and 'validation_1' (training split).\n",
"grid_search.fit(\n",
"    X_train, y_train,\n",
"    eval_set=[(X_val, y_val), (X_train, y_train)],\n",
"    verbose=False,\n",
")\n",
"\n",
"print(\"Beste Parameter:\", grid_search.best_params_)\n",
"print(\"Bestes AUC:\", grid_search.best_score_)\n",
"\n",
"# Extract the best model\n",
"model = grid_search.best_estimator_"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d2681022",
"metadata": {},
"outputs": [],
"source": [
"# Plots\n",
"# NOTE(review): 'validation_0' / 'validation_1' follow the order of the eval_set that was\n",
"# passed to fit - confirm that the Validation/Training legend labels below match it.\n",
"\n",
"results = model.evals_result()\n",
"epochs = len(results['validation_0']['auc'])\n",
"x_axis = range(0, epochs)\n",
"\n",
"# --- Plot Loss ---\n",
"plt.figure(figsize=(8,6))\n",
"plt.plot(x_axis, results['validation_0']['logloss'], label='Validation Loss')\n",
"plt.plot(x_axis, results['validation_1']['logloss'], label='Training Loss')\n",
"plt.legend()\n",
"plt.xlabel('Epochs')\n",
"plt.ylabel('Logloss')\n",
"plt.title('XGBoost Loss during Training')\n",
"plt.grid(True)\n",
"plt.show()\n",
"\n",
"# --- Plot Accuracy ---\n",
"plt.figure(figsize=(8,6))\n",
"plt.plot(x_axis, [1-e for e in results['validation_0']['error']], label='Validation Accuracy')\n",
"plt.plot(x_axis, [1-e for e in results['validation_1']['error']], label='Training Accuracy')\n",
"plt.legend()\n",
"plt.xlabel('Epochs')\n",
"plt.ylabel('Accuracy')\n",
"plt.title('XGBoost Accuracy during Training')\n",
"plt.grid(True)\n",
"plt.show()\n",
"\n",
"# --- Plot AUC ---\n",
"# Second curve is the other eval set; previously 'validation_0' was drawn twice.\n",
"plt.figure(figsize=(8,6))\n",
"plt.plot(x_axis, results['validation_0']['auc'], label='Validation AUC')\n",
"plt.plot(x_axis, results['validation_1']['auc'], label='Training AUC')\n",
"plt.legend()\n",
"plt.xlabel('Epochs')\n",
"plt.ylabel('AUC')\n",
"plt.title('XGBoost AUC during Training')\n",
"plt.grid(True)\n",
"plt.show()\n",
"\n",
"# Plot the ROC curve\n",
"y_pred_proba = model.predict_proba(X_val)[:, 1]\n",
"RocCurveDisplay.from_predictions(y_val, y_pred_proba)\n",
"plt.title(\"ROC Curve (Validation Set)\")\n",
"plt.grid(True)\n",
"plt.show()\n",
"\n",
"# Test: loss and accuracy\n",
"y_test_proba = model.predict_proba(X_test)[:,1]\n",
"y_test_pred = (y_test_proba > 0.5).astype(int)\n",
"\n",
"print(\"Test Loss:\", log_loss(y_test, y_test_proba))\n",
"print(\"Test Accuracy:\", accuracy_score(y_test, y_test_pred))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "09a8cd21",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, roc_auc_score, classification_report, ConfusionMatrixDisplay\n",
"\n",
"def evaluate(model, X, y, title=\"Evaluation\"):\n",
"    \"\"\"\n",
"    Print classification metrics for one data split and plot its confusion matrices.\n",
"\n",
"    model: fitted classifier exposing predict_proba\n",
"    X: feature matrix of the split\n",
"    y: true binary labels of the split\n",
"    title: name of the split, used in the confusion-matrix plot titles\n",
"    \"\"\"\n",
"    # Probability of the positive class, thresholded at 0.5 for the hard labels\n",
"    preds_proba = model.predict_proba(X)[:, 1]\n",
"    preds = (preds_proba > 0.5).astype(int)\n",
"\n",
"    print(\"Accuracy:\", accuracy_score(y, preds))\n",
"    print(\"F1:\", f1_score(y, preds))\n",
"    # AUC is a ranking metric: score the probabilities, not the thresholded labels.\n",
"    print(\"AUC:\", roc_auc_score(y, preds_proba))\n",
"    print(\"Confusion:\\n\", confusion_matrix(y, preds))\n",
"    print(classification_report(y, preds))\n",
"\n",
"    # One plot with raw counts and one normalized over the true labels\n",
"    for normalize in [None, 'true']:\n",
"        cm = confusion_matrix(y, preds, normalize=normalize)\n",
"        cm_disp = ConfusionMatrixDisplay(cm, display_labels=['Low', 'High'])\n",
"        cm_disp.plot(cmap=\"Blues\")\n",
"        cm_disp.ax_.set_title(f\"Confusion Matrix - {title}\")\n",
"\n",
"# Run for train/val/test\n",
"print(\"TRAIN:\")\n",
"evaluate(model, X_train, y_train, title=\"Train\")\n",
"\n",
"print(\"VAL:\")\n",
"evaluate(model, X_val, y_val, title=\"Validation\")\n",
"\n",
"print(\"TEST:\")\n",
"evaluate(model, X_test, y_test, title=\"Test\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c43b0c80",
"metadata": {},
"outputs": [],
"source": [
"# Persist the selected classifier and the fitted normalizer with joblib.\n",
"joblib.dump(model, \"xgb_model_with_MAD.joblib\")\n",
"joblib.dump(normalizer, \"normalizer_with_MAD.joblib\")\n",
"print(\"Model gespeichert.\")\n",
"\n",
"# Additionally export the model through its own save_model method.\n",
"# NOTE(review): the legacy binary format is presumably deprecated in recent XGBoost\n",
"# releases - confirm the \".bin\" export is still needed and supported.\n",
"model.save_model(\"xgb_model_with_MAD.json\") # as JSON (readable, portable)\n",
"model.save_model(\"xgb_model_with_MAD.bin\") # as binary file (compact)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3195cc84",
"metadata": {},
"outputs": [],
"source": [
"# Show the kernel's current working directory (relative file names resolve against it).\n",
"import os\n",
"os.getcwd()\n",
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -76,34 +76,6 @@
"print(f\"high all: {high_all.shape}\")" "print(f\"high all: {high_all.shape}\")"
] ]
}, },
{
"cell_type": "code",
"execution_count": null,
"id": "dbb58abd",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.preprocessing import StandardScaler, MinMaxScaler\n",
"\n",
"def mad_outlier_removal(df, columns, threshold=3.5):\n",
"    \"\"\"\n",
"    Drop rows containing outliers according to the median absolute deviation (MAD).\n",
"\n",
"    Columns are handled in the given order; the median and MAD of each column are\n",
"    computed on the rows that survived the previous columns. Columns whose MAD is 0\n",
"    are skipped.\n",
"    threshold: cutoff for the robust Z-score; 3.5 is the typical value.\n",
"    \"\"\"\n",
"    remaining = df.copy()\n",
"    for name in columns:\n",
"        center = remaining[name].median()\n",
"        spread = np.median(np.abs(remaining[name] - center))\n",
"        if spread != 0:  # no spread means no outliers in this column\n",
"            scores = 0.6745 * (remaining[name] - center) / spread\n",
"            remaining = remaining[np.abs(scores) <= threshold]\n",
"    return remaining"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
@ -244,6 +216,58 @@
" return df_out" " return df_out"
] ]
}, },
{
"cell_type": "code",
"execution_count": null,
"id": "dbb58abd",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"def calculate_mad_params(df, columns):\n",
"    \"\"\"\n",
"    Compute the median and the median absolute deviation (MAD) per column.\n",
"\n",
"    Run this ONLY on the training data and reuse the result for the other\n",
"    splits so that no statistics are taken from validation or test data.\n",
"\n",
"    df: DataFrame holding the columns to describe\n",
"    columns: iterable of column names\n",
"\n",
"    Returns a dictionary: {col: (median, mad)}\n",
"    \"\"\"\n",
"    params = {}\n",
"    for col in columns:\n",
"        median = df[col].median()\n",
"        # Series.median() skips NaN, whereas np.median() returns NaN for the whole\n",
"        # column as soon as a single value is missing.\n",
"        mad = (df[col] - median).abs().median()\n",
"        params[col] = (median, mad)\n",
"    return params\n",
"\n",
"\n",
"def apply_mad_filter(df, params, threshold=3.5):\n",
"    \"\"\"\n",
"    Mask MAD-based outliers using precomputed parameters.\n",
"    Works on training, validation, and test data.\n",
"\n",
"    df: DataFrame to filter (left untouched; a copy is returned)\n",
"    params: dictionary {col: (median, mad)} from training data\n",
"    threshold: cutoff for the absolute robust Z-score\n",
"\n",
"    Returns a copy of df with the same shape in which every outlying cell is NaN.\n",
"    \"\"\"\n",
"    df_clean = df.copy()\n",
"\n",
"    for col, (median, mad) in params.items():\n",
"        if mad == 0 or pd.isna(mad):\n",
"            continue  # no usable spread; nothing to remove for this column\n",
"\n",
"        robust_z = 0.6745 * (df_clean[col] - median) / mad\n",
"        outlier_mask = np.abs(robust_z) > threshold\n",
"\n",
"        # Blank out only the offending cell; the row itself is kept.\n",
"        df_clean.loc[outlier_mask, col] = np.nan\n",
"\n",
"    return df_clean\n",
"\n"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
@ -251,11 +275,13 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"train_outlier_removed = mad_outlier_removal(train_df, au_columns, 50)\n", "# Step 1: Fit parameters on training data\n",
"val_outlier_removed = mad_outlier_removal(val_df, au_columns, 50)\n", "params = calculate_mad_params(train_df, au_columns)\n",
"test_outlier_removed = mad_outlier_removal(test_df, au_columns, 50)\n", "\n",
"print(train_df.shape)\n", "# Step 2: Apply filter consistently\n",
"print(train_outlier_removed.shape)" "train_outlier_removed = apply_mad_filter(train_df, params, threshold=3.5)\n",
"val_outlier_removed = apply_mad_filter(val_df, params, threshold=50)\n",
"test_outlier_removed = apply_mad_filter(test_df, params, threshold=50)"
] ]
}, },
{ {
@ -296,7 +322,8 @@
" objective=\"binary:logistic\",\n", " objective=\"binary:logistic\",\n",
" eval_metric=\"auc\",\n", " eval_metric=\"auc\",\n",
" use_label_encoder=False,\n", " use_label_encoder=False,\n",
" random_state=42\n", " random_state=42,\n",
" verbosity=0,\n",
")\n", ")\n",
"\n", "\n",
"# Parameter-Raster\n", "# Parameter-Raster\n",
@ -318,11 +345,14 @@
" scoring=\"roc_auc\",\n", " scoring=\"roc_auc\",\n",
" n_jobs=-1,\n", " n_jobs=-1,\n",
" cv=cv,\n", " cv=cv,\n",
" verbose=2\n", " verbose=0\n",
")\n", ")\n",
"\n", "\n",
"# Training mit Cross Validation\n", "# Training mit Cross Validation\n",
"grid_search.fit(X_train, y_train)\n", "grid_search.fit(\n",
" X_train, y_train, \n",
" verbose=False,\n",
" )\n",
"\n", "\n",
"print(\"Beste Parameter:\", grid_search.best_params_)\n", "print(\"Beste Parameter:\", grid_search.best_params_)\n",
"print(\"Bestes AUC:\", grid_search.best_score_)\n", "print(\"Bestes AUC:\", grid_search.best_score_)\n",