- changed MAD outlier-removal function
- added regularization to XGBoost
parent 15b32a9792
commit 87c5e21daf
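The key change: the MAD median and scale are now computed once on the training split (calculate_mad_params) and then reused for every split (apply_mad_filter), so validation and test statistics no longer leak into the outlier cutoffs. A minimal, self-contained sketch of that pattern (synthetic data and a hypothetical column name, not code from this repo):

import numpy as np
import pandas as pd

# Synthetic stand-ins for the notebook's train/validation splits.
rng = np.random.default_rng(0)
train = pd.DataFrame({"au01": rng.normal(size=200)})
val = pd.DataFrame({"au01": rng.normal(size=50)})

# Fit: median and MAD per column, from the training split only.
params = {}
for col in ["au01"]:
    median = train[col].median()
    mad = np.median(np.abs(train[col] - median))
    params[col] = (median, mad)

# Apply: the same training-set parameters mask outliers in every split,
# so no validation/test statistics leak into the cutoff.
for df in (train, val):
    for col, (median, mad) in params.items():
        if mad == 0:
            continue  # constant column, nothing to mask
        robust_z = 0.6745 * (df[col] - median) / mad
        df.loc[np.abs(robust_z) > 3.5, col] = np.nan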
@@ -8,6 +8,63 @@
    "In the following, MAD outlier removal is applied to the data."
   ]
  },
+ {
+  "cell_type": "code",
+  "execution_count": null,
+  "id": "46bd036d",
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "import numpy as np\n",
+   "import pandas as pd\n",
+   "\n",
+   "def calculate_mad_params(df, columns):\n",
+   "    \"\"\"\n",
+   "    Calculate median and MAD parameters for each column.\n",
+   "    This should be run ONLY on the training data.\n",
+   "\n",
+   "    Returns a dictionary: {col: (median, mad)}\n",
+   "    \"\"\"\n",
+   "    params = {}\n",
+   "    for col in columns:\n",
+   "        median = df[col].median()\n",
+   "        mad = np.median(np.abs(df[col] - median))\n",
+   "        params[col] = (median, mad)\n",
+   "    return params"
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": null,
+  "id": "e0691732",
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "def apply_mad_filter(df, params, threshold=3.5):\n",
+   "    \"\"\"\n",
+   "    Apply MAD-based outlier removal using precomputed parameters.\n",
+   "    Works on training, validation, and test data.\n",
+   "\n",
+   "    df: DataFrame to filter\n",
+   "    params: dictionary {col: (median, mad)} from training data\n",
+   "    threshold: cutoff for robust Z-score\n",
+   "    \"\"\"\n",
+   "    df_clean = df.copy()\n",
+   "\n",
+   "    for col, (median, mad) in params.items():\n",
+   "        if mad == 0:\n",
+   "            continue  # no spread; nothing to remove for this column\n",
+   "\n",
+   "        robust_z = 0.6745 * (df_clean[col] - median) / mad\n",
+   "        outlier_mask = np.abs(robust_z) > threshold\n",
+   "\n",
+   "        # Remove values only in this specific column\n",
+   "        df_clean.loc[outlier_mask, col] = np.nan\n",
+   "        print(df_clean.shape)\n",
+   "\n",
+   "    return df_clean"
+  ]
+ },
  {
   "cell_type": "code",
   "execution_count": null,
@@ -15,6 +73,9 @@
   "metadata": {},
   "outputs": [],
   "source": [
+   "# old removal - with this we were able to get 85% accuracy\n",
+   "# the values of the old validation & test data sets are stored privately on the cluster\n",
+   "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.preprocessing import StandardScaler, MinMaxScaler\n",
@@ -32,8 +93,9 @@
    "            continue  # no spread, no outliers\n",
    "        robust_z = 0.6745 * (df_clean[col] - median) / mad\n",
    "        mask = np.abs(robust_z) <= threshold\n",
-   "        df_clean = df_clean[mask]\n",
-   "    return df_clean"
+   "        output = df_clean[mask]\n",
+   "\n",
+   "    return output"
    ]
   }
  ],
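Note the behavioral difference between the two removal styles in this notebook: the old mad_outlier_removal drops entire rows (df_clean = df_clean[mask]), while the new apply_mad_filter only sets the offending cells of one column to NaN, so row count and cross-column alignment are preserved. A toy illustration with made-up values:

import numpy as np
import pandas as pd

df = pd.DataFrame({"au01": [0.1, 0.2, 9.9], "au02": [1.0, 1.1, 1.2]})

# Old style: the outlier in au01 would remove the whole third row.
# New style: only au01's cell becomes NaN; au02 keeps all three values.
median = df["au01"].median()
mad = np.median(np.abs(df["au01"] - median))
robust_z = 0.6745 * (df["au01"] - median) / mad
df.loc[np.abs(robust_z) > 3.5, "au01"] = np.nan
print(df)  # row 2: au01 is NaN, au02 is still 1.2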
model_training/xgboost/xgboost_regulated.ipynb (new file, 538 lines)
@@ -0,0 +1,538 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e3be057e-8d2a-4d05-bd42-6b1dc75df5ed",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from pathlib import Path\n",
+    "from sklearn.preprocessing import StandardScaler, MinMaxScaler"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "13ad96f5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# data_path = Path(r\"~/Fahrsimulator_MSY2526_AI/model_training/xgboost/output_windowed.parquet\")\n",
+    "data_path = Path(r\"~/data-paulusjafahrsimulator-gpu/first_AU_dataset/output_windowed.parquet\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "95e1a351",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.read_parquet(path=data_path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "68afd83e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "subjects = df['subjectID'].unique()\n",
+    "print(subjects)\n",
+    "print(len(subjects))\n",
+    "print(len(subjects)*0.66)\n",
+    "print(len(subjects)*0.33)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "52dfd885",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "low_all = df[\n",
+    "    ((df[\"PHASE\"] == \"baseline\") |\n",
+    "     ((df[\"STUDY\"] == \"n-back\") & (df[\"PHASE\"] != \"baseline\") & (df[\"LEVEL\"].isin([1, 4]))))\n",
+    "]\n",
+    "print(f\"low all: {low_all.shape}\")\n",
+    "\n",
+    "high_nback = df[\n",
+    "    (df[\"STUDY\"] == \"n-back\") &\n",
+    "    (df[\"LEVEL\"].isin([2, 3, 5, 6])) &\n",
+    "    (df[\"PHASE\"].isin([\"train\", \"test\"]))\n",
+    "]\n",
+    "print(f\"high n-back: {high_nback.shape}\")\n",
+    "\n",
+    "high_kdrive = df[\n",
+    "    (df[\"STUDY\"] == \"k-drive\") & (df[\"PHASE\"] != \"baseline\")\n",
+    "]\n",
+    "print(f\"high k-drive: {high_kdrive.shape}\")\n",
+    "\n",
+    "high_all = pd.concat([high_nback, high_kdrive])\n",
+    "print(f\"high all: {high_all.shape}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8fba6edf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def fit_normalizer(train_data, au_columns, method='standard', scope='global'):\n",
+    "    if method == 'standard':\n",
+    "        Scaler = StandardScaler\n",
+    "    elif method == 'minmax':\n",
+    "        Scaler = MinMaxScaler\n",
+    "    else:\n",
+    "        raise ValueError(\"method must be 'standard' or 'minmax'\")\n",
+    "\n",
+    "    scalers = {}\n",
+    "\n",
+    "    if scope == 'subject':\n",
+    "        for subject in train_data['subjectID'].unique():\n",
+    "            subject_mask = train_data['subjectID'] == subject\n",
+    "            scaler = Scaler()\n",
+    "            scaler.fit(train_data.loc[subject_mask, au_columns])\n",
+    "            scalers[subject] = scaler\n",
+    "\n",
+    "    elif scope == 'global':\n",
+    "        scaler = Scaler()\n",
+    "        scaler.fit(train_data[au_columns])\n",
+    "        scalers['global'] = scaler\n",
+    "\n",
+    "    else:\n",
+    "        raise ValueError(\"scope must be 'subject' or 'global'\")\n",
+    "\n",
+    "    return {'scalers': scalers, 'method': method, 'scope': scope}\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "24e3a77b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%pip install xgboost"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8e7fa0fa",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV\n",
+    "from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report, confusion_matrix\n",
+    "import xgboost as xgb\n",
+    "import joblib\n",
+    "import matplotlib.pyplot as plt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "325ef71c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "low = low_all.copy()\n",
+    "high = high_all.copy()\n",
+    "\n",
+    "low[\"label\"] = 0\n",
+    "high[\"label\"] = 1\n",
+    "\n",
+    "data = pd.concat([low, high], ignore_index=True)\n",
+    "data = data.drop_duplicates()\n",
+    "\n",
+    "print(\"Label distribution:\")\n",
+    "print(data[\"label\"].value_counts())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "67d70e84",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "au_columns = [col for col in data.columns if col.lower().startswith(\"au\")]\n",
+    "print(\"Found AU columns:\", au_columns)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "960bb8c7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "subjects = np.random.permutation(data[\"subjectID\"].unique())\n",
+    "\n",
+    "n = len(subjects)\n",
+    "n_train = int(n * 0.66)\n",
+    "\n",
+    "train_subjects = subjects[:n_train]\n",
+    "test_subjects = subjects[n_train:]\n",
+    "train_subs, val_subs = train_test_split(train_subjects, test_size=0.2, random_state=42)\n",
+    "\n",
+    "train_df = data[data.subjectID.isin(train_subs)]\n",
+    "val_df = data[data.subjectID.isin(val_subs)]\n",
+    "test_df = data[data.subjectID.isin(test_subjects)]\n",
+    "\n",
+    "print(train_df.shape, val_df.shape, test_df.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "802a45c9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def apply_normalizer(df_to_transform, normalizer_dict, au_columns):\n",
+    "    scalers = normalizer_dict[\"scalers\"]\n",
+    "    scope = normalizer_dict[\"scope\"]\n",
+    "    df_out = df_to_transform.copy()\n",
+    "\n",
+    "    if scope == \"global\":\n",
+    "        scaler = scalers[\"global\"]\n",
+    "        df_out[au_columns] = scaler.transform(df_out[au_columns])\n",
+    "\n",
+    "    elif scope == \"subject\":\n",
+    "        for subj, subdf in df_out.groupby(\"subjectID\"):\n",
+    "            if subj in scalers:\n",
+    "                df_out.loc[subdf.index, au_columns] = scalers[subj].transform(subdf[au_columns])\n",
+    "            elif \"global\" in scalers:\n",
+    "                df_out.loc[subdf.index, au_columns] = scalers[\"global\"].transform(subdf[au_columns])\n",
+    "\n",
+    "    return df_out"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dbb58abd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "\n",
+    "def calculate_mad_params(df, columns):\n",
+    "    \"\"\"\n",
+    "    Calculate median and MAD parameters for each column.\n",
+    "    This should be run ONLY on the training data.\n",
+    "\n",
+    "    Returns a dictionary: {col: (median, mad)}\n",
+    "    \"\"\"\n",
+    "    params = {}\n",
+    "    for col in columns:\n",
+    "        median = df[col].median()\n",
+    "        mad = np.median(np.abs(df[col] - median))\n",
+    "        params[col] = (median, mad)\n",
+    "    return params\n",
+    "\n",
+    "def apply_mad_filter(df, params, threshold=3.5):\n",
+    "    \"\"\"\n",
+    "    Apply MAD-based outlier removal using precomputed parameters.\n",
+    "    Works on training, validation, and test data.\n",
+    "\n",
+    "    df: DataFrame to filter\n",
+    "    params: dictionary {col: (median, mad)} from training data\n",
+    "    threshold: cutoff for robust Z-score\n",
+    "    \"\"\"\n",
+    "    df_clean = df.copy()\n",
+    "\n",
+    "    for col, (median, mad) in params.items():\n",
+    "        if mad == 0:\n",
+    "            continue  # no spread; nothing to remove for this column\n",
+    "\n",
+    "        robust_z = 0.6745 * (df_clean[col] - median) / mad\n",
+    "        outlier_mask = np.abs(robust_z) > threshold\n",
+    "\n",
+    "        # Remove values only in this specific column\n",
+    "        df_clean.loc[outlier_mask, col] = np.nan\n",
+    "\n",
+    "    return df_clean"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0f03f1b4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Step 1: Fit parameters on training data\n",
+    "params = calculate_mad_params(train_df, au_columns)\n",
+    "\n",
+    "# Step 2: Apply filter consistently\n",
+    "train_outlier_removed = apply_mad_filter(train_df, params, threshold=3.5)\n",
+    "val_outlier_removed = apply_mad_filter(val_df, params, threshold=50)\n",
+    "test_outlier_removed = apply_mad_filter(test_df, params, threshold=50)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "289f6b89",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "normalizer = fit_normalizer(train_outlier_removed, au_columns, method=\"standard\", scope=\"global\")\n",
+    "\n",
+    "train_scaled = apply_normalizer(train_outlier_removed, normalizer, au_columns)\n",
+    "val_scaled = apply_normalizer(val_outlier_removed, normalizer, au_columns)\n",
+    "test_scaled = apply_normalizer(test_outlier_removed, normalizer, au_columns)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5df30e8d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X_train, y_train = train_scaled[au_columns].values, train_scaled[\"label\"].values\n",
+    "X_val, y_val = val_scaled[au_columns].values, val_scaled[\"label\"].values\n",
+    "X_test, y_test = test_scaled[au_columns].values, test_scaled[\"label\"].values"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6fb7c86a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import matplotlib.pyplot as plt\n",
+    "from sklearn.metrics import RocCurveDisplay, log_loss, accuracy_score\n",
+    "\n",
+    "# Early stopping watches the validation entry of eval_set below\n",
+    "# (validation_0 = train split, validation_1 = validation split)\n",
+    "early_stop = xgb.callback.EarlyStopping(\n",
+    "    rounds=30, metric_name='auc', data_name='validation_1', save_best=True\n",
+    ")\n",
+    "\n",
+    "# Base model\n",
+    "xgb_clf = xgb.XGBClassifier(\n",
+    "    objective=\"binary:logistic\",\n",
+    "    scale_pos_weight=1100/1550,\n",
+    "    eval_metric=[\"logloss\", \"auc\", \"error\"],\n",
+    "    use_label_encoder=False,\n",
+    "    random_state=42,\n",
+    "    callbacks=[early_stop],\n",
+    "    verbosity=0,\n",
+    ")\n",
+    "\n",
+    "# Parameter grid\n",
+    "param_grid = {\n",
+    "    \"learning_rate\": [0.01, 0.02, 0.05],\n",
+    "    \"max_depth\": [2, 3, 4],\n",
+    "    # \"n_estimators\": [200, 500, 800],\n",
+    "    \"subsample\": [0.4, 0.5],\n",
+    "    \"colsample_bytree\": [0.7, 0.8],\n",
+    "    \"reg_alpha\": [0, 0.1, 1, 10],  # L1 regularization\n",
+    "    \"reg_lambda\": [0.5, 1, 5, 10]  # L2 regularization\n",
+    "}\n",
+    "\n",
+    "# old values - acc 100%\n",
+    "    # \"learning_rate\": [0.01, 0.05, 0.1],\n",
+    "    # \"max_depth\": [4, 6, 8],\n",
+    "    # \"n_estimators\": [200, 500, 800],\n",
+    "    # \"subsample\": [0.8, 1.0],\n",
+    "    # \"colsample_bytree\": [0.8, 1.0]\n",
+    "\n",
+    "# K-fold cross-validation\n",
+    "cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\n",
+    "\n",
+    "# Grid search setup\n",
+    "grid_search = GridSearchCV(\n",
+    "    estimator=xgb_clf,\n",
+    "    param_grid=param_grid,\n",
+    "    scoring=\"roc_auc\",\n",
+    "    n_jobs=-1,\n",
+    "    cv=cv,\n",
+    "    verbose=0\n",
+    ")\n",
+    "\n",
+    "# Training with cross-validation\n",
+    "grid_search.fit(\n",
+    "    X_train, y_train,\n",
+    "    eval_set=[(X_train, y_train), (X_val, y_val)],\n",
+    "    verbose=False,\n",
+    ")\n",
+    "\n",
+    "print(\"Best parameters:\", grid_search.best_params_)\n",
+    "print(\"Best AUC:\", grid_search.best_score_)\n",
+    "\n",
+    "# Extract the best model\n",
+    "model = grid_search.best_estimator_"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d2681022",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Plots\n",
+    "\n",
+    "results = model.evals_result()\n",
+    "epochs = len(results['validation_0']['auc'])\n",
+    "x_axis = range(0, epochs)\n",
+    "\n",
+    "# --- Plot loss (validation_0 = train, validation_1 = validation) ---\n",
+    "plt.figure(figsize=(8, 6))\n",
+    "plt.plot(x_axis, results['validation_0']['logloss'], label='Training Loss')\n",
+    "plt.plot(x_axis, results['validation_1']['logloss'], label='Validation Loss')\n",
+    "plt.legend()\n",
+    "plt.xlabel('Epochs')\n",
+    "plt.ylabel('Logloss')\n",
+    "plt.title('XGBoost Loss during Training')\n",
+    "plt.grid(True)\n",
+    "plt.show()\n",
+    "\n",
+    "# --- Plot accuracy ---\n",
+    "plt.figure(figsize=(8, 6))\n",
+    "plt.plot(x_axis, [1 - e for e in results['validation_0']['error']], label='Training Accuracy')\n",
+    "plt.plot(x_axis, [1 - e for e in results['validation_1']['error']], label='Validation Accuracy')\n",
+    "plt.legend()\n",
+    "plt.xlabel('Epochs')\n",
+    "plt.ylabel('Accuracy')\n",
+    "plt.title('XGBoost Accuracy during Training')\n",
+    "plt.grid(True)\n",
+    "plt.show()\n",
+    "\n",
+    "# --- Plot AUC ---\n",
+    "plt.figure(figsize=(8, 6))\n",
+    "plt.plot(x_axis, results['validation_0']['auc'], label='Training AUC')\n",
+    "plt.plot(x_axis, results['validation_1']['auc'], label='Validation AUC')\n",
+    "plt.legend()\n",
+    "plt.xlabel('Epochs')\n",
+    "plt.ylabel('AUC')\n",
+    "plt.title('XGBoost AUC during Training')\n",
+    "plt.grid(True)\n",
+    "plt.show()\n",
+    "\n",
+    "# Plot ROC curve\n",
+    "y_pred_proba = model.predict_proba(X_val)[:, 1]\n",
+    "RocCurveDisplay.from_predictions(y_val, y_pred_proba)\n",
+    "plt.title(\"ROC Curve (Validation Set)\")\n",
+    "plt.grid(True)\n",
+    "plt.show()\n",
+    "\n",
+    "# Test: loss and accuracy\n",
+    "y_test_proba = model.predict_proba(X_test)[:, 1]\n",
+    "y_test_pred = (y_test_proba > 0.5).astype(int)\n",
+    "\n",
+    "print(\"Test Loss:\", log_loss(y_test, y_test_proba))\n",
+    "print(\"Test Accuracy:\", accuracy_score(y_test, y_test_pred))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "09a8cd21",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, roc_auc_score, classification_report, ConfusionMatrixDisplay\n",
+    "\n",
+    "def evaluate(model, X, y, title=\"Evaluation\"):\n",
+    "    # Predictions\n",
+    "    preds_proba = model.predict_proba(X)[:, 1]\n",
+    "    preds = (preds_proba > 0.5).astype(int)\n",
+    "\n",
+    "    # Print metrics (AUC from probabilities, not hard labels)\n",
+    "    print(\"Accuracy:\", accuracy_score(y, preds))\n",
+    "    print(\"F1:\", f1_score(y, preds))\n",
+    "    print(\"AUC:\", roc_auc_score(y, preds_proba))\n",
+    "    print(\"Confusion:\\n\", confusion_matrix(y, preds))\n",
+    "    print(classification_report(y, preds))\n",
+    "\n",
+    "    # Plot confusion matrix (raw counts and row-normalized)\n",
+    "    def plot_confusion_matrix(true_labels, predictions, label_names):\n",
+    "        for normalize in [None, 'true']:\n",
+    "            cm = confusion_matrix(true_labels, predictions, normalize=normalize)\n",
+    "            cm_disp = ConfusionMatrixDisplay(cm, display_labels=label_names)\n",
+    "            cm_disp.plot(cmap=\"Blues\")\n",
+    "\n",
+    "    plot_confusion_matrix(y, preds, label_names=['Low', 'High'])\n",
+    "    # plt.figure(figsize=(5,4))\n",
+    "    # sns.heatmap(cm, annot=True, fmt=\"d\", cmap=\"Blues\", cbar=False,\n",
+    "    #             xticklabels=[\"Predicted low\", \"Predicted high\"],\n",
+    "    #             yticklabels=[\"Actual low\", \"Actual high\"])\n",
+    "    # plt.title(f\"Confusion Matrix - {title}\")\n",
+    "    # plt.ylabel(\"True label\")\n",
+    "    # plt.xlabel(\"Predicted label\")\n",
+    "    # plt.show()\n",
+    "\n",
+    "# Run for train/val/test\n",
+    "print(\"TRAIN:\")\n",
+    "evaluate(model, X_train, y_train, title=\"Train\")\n",
+    "\n",
+    "print(\"VAL:\")\n",
+    "evaluate(model, X_val, y_val, title=\"Validation\")\n",
+    "\n",
+    "print(\"TEST:\")\n",
+    "evaluate(model, X_test, y_test, title=\"Test\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c43b0c80",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "joblib.dump(model, \"xgb_model_with_MAD.joblib\")\n",
+    "joblib.dump(normalizer, \"normalizer_with_MAD.joblib\")\n",
+    "print(\"Model saved.\")\n",
+    "\n",
+    "model.save_model(\"xgb_model_with_MAD.json\")  # as JSON (readable, portable)\n",
+    "model.save_model(\"xgb_model_with_MAD.bin\")   # as a binary file (compact)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3195cc84",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "os.getcwd()\n",
+    "\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
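What makes this notebook the "regulated" (regularized) variant: the parameter grid adds reg_alpha (L1) and reg_lambda (L2) penalties and narrows the search to shallower trees with stronger row/column subsampling. A hedged, self-contained sketch of how those knobs enter XGBoost; the data is synthetic and the values mirror the grid above:

import numpy as np
import xgboost as xgb

rng = np.random.default_rng(42)
X = rng.normal(size=(300, 5))
y = (X[:, 0] + 0.5 * rng.normal(size=300) > 0).astype(int)

clf = xgb.XGBClassifier(
    objective="binary:logistic",
    max_depth=3,           # shallow trees, less capacity to memorize
    learning_rate=0.02,
    subsample=0.5,         # row subsampling per tree
    colsample_bytree=0.7,  # feature subsampling per tree
    reg_alpha=1.0,         # L1 penalty on leaf weights (drives them to zero)
    reg_lambda=5.0,        # L2 penalty on leaf weights (shrinks them)
)
clf.fit(X, y)
print(clf.score(X, y))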
@@ -76,34 +76,6 @@
    "print(f\"high all: {high_all.shape}\")"
   ]
  },
- {
-  "cell_type": "code",
-  "execution_count": null,
-  "id": "dbb58abd",
-  "metadata": {},
-  "outputs": [],
-  "source": [
-   "import numpy as np\n",
-   "import pandas as pd\n",
-   "from sklearn.preprocessing import StandardScaler, MinMaxScaler\n",
-   "\n",
-   "def mad_outlier_removal(df, columns, threshold=3.5):\n",
-   "    \"\"\"\n",
-   "    Removes outliers based on the median absolute deviation (MAD).\n",
-   "    threshold: a typical value is 3.5 (corresponds to a robust Z-score cutoff).\n",
-   "    \"\"\"\n",
-   "    df_clean = df.copy()\n",
-   "    for col in columns:\n",
-   "        median = df_clean[col].median()\n",
-   "        mad = np.median(np.abs(df_clean[col] - median))\n",
-   "        if mad == 0:\n",
-   "            continue  # no spread, no outliers\n",
-   "        robust_z = 0.6745 * (df_clean[col] - median) / mad\n",
-   "        mask = np.abs(robust_z) <= threshold\n",
-   "        df_clean = df_clean[mask]\n",
-   "    return df_clean"
-  ]
- },
  {
   "cell_type": "code",
   "execution_count": null,
@@ -244,6 +216,56 @@
    "    return df_out"
   ]
  },
+ {
+  "cell_type": "code",
+  "execution_count": null,
+  "id": "dbb58abd",
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "import numpy as np\n",
+   "import pandas as pd\n",
+   "\n",
+   "def calculate_mad_params(df, columns):\n",
+   "    \"\"\"\n",
+   "    Calculate median and MAD parameters for each column.\n",
+   "    This should be run ONLY on the training data.\n",
+   "\n",
+   "    Returns a dictionary: {col: (median, mad)}\n",
+   "    \"\"\"\n",
+   "    params = {}\n",
+   "    for col in columns:\n",
+   "        median = df[col].median()\n",
+   "        mad = np.median(np.abs(df[col] - median))\n",
+   "        params[col] = (median, mad)\n",
+   "    return params\n",
+   "\n",
+   "\n",
+   "def apply_mad_filter(df, params, threshold=3.5):\n",
+   "    \"\"\"\n",
+   "    Apply MAD-based outlier removal using precomputed parameters.\n",
+   "    Works on training, validation, and test data.\n",
+   "\n",
+   "    df: DataFrame to filter\n",
+   "    params: dictionary {col: (median, mad)} from training data\n",
+   "    threshold: cutoff for robust Z-score\n",
+   "    \"\"\"\n",
+   "    df_clean = df.copy()\n",
+   "\n",
+   "    for col, (median, mad) in params.items():\n",
+   "        if mad == 0:\n",
+   "            continue  # no spread; nothing to remove for this column\n",
+   "\n",
+   "        robust_z = 0.6745 * (df_clean[col] - median) / mad\n",
+   "        outlier_mask = np.abs(robust_z) > threshold\n",
+   "\n",
+   "        # Remove values only in this specific column\n",
+   "        df_clean.loc[outlier_mask, col] = np.nan\n",
+   "        print(df_clean.shape)\n",
+   "\n",
+   "    return df_clean"
+  ]
+ },
  {
   "cell_type": "code",
   "execution_count": null,
@@ -251,11 +275,13 @@
   "metadata": {},
   "outputs": [],
   "source": [
-   "train_outlier_removed = mad_outlier_removal(train_df, au_columns, 50)\n",
-   "val_outlier_removed = mad_outlier_removal(val_df, au_columns, 50)\n",
-   "test_outlier_removed = mad_outlier_removal(test_df, au_columns, 50)\n",
-   "print(train_df.shape)\n",
-   "print(train_outlier_removed.shape)"
+   "# Step 1: Fit parameters on training data\n",
+   "params = calculate_mad_params(train_df, au_columns)\n",
+   "\n",
+   "# Step 2: Apply filter consistently\n",
+   "train_outlier_removed = apply_mad_filter(train_df, params, threshold=3.5)\n",
+   "val_outlier_removed = apply_mad_filter(val_df, params, threshold=50)\n",
+   "test_outlier_removed = apply_mad_filter(test_df, params, threshold=50)"
   ]
  },
  {
@@ -296,7 +322,8 @@
    "    objective=\"binary:logistic\",\n",
    "    eval_metric=\"auc\",\n",
    "    use_label_encoder=False,\n",
-   "    random_state=42\n",
+   "    random_state=42,\n",
+   "    verbosity=0,\n",
    ")\n",
    "\n",
    "# Parameter grid\n",
@@ -318,11 +345,14 @@
    "    scoring=\"roc_auc\",\n",
    "    n_jobs=-1,\n",
    "    cv=cv,\n",
-   "    verbose=2\n",
+   "    verbose=0\n",
    ")\n",
    "\n",
    "# Training with cross-validation\n",
-   "grid_search.fit(X_train, y_train)\n",
+   "grid_search.fit(\n",
+   "    X_train, y_train,\n",
+   "    verbose=False,\n",
+   ")\n",
    "\n",
    "print(\"Best parameters:\", grid_search.best_params_)\n",
    "print(\"Best AUC:\", grid_search.best_score_)\n",
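On the grid_search.fit change above: keyword arguments given to GridSearchCV.fit are forwarded to each per-fold estimator fit, which is how verbose=False silences XGBoost here and how the regularized notebook passes eval_set through the search. A small sketch of that mechanism with synthetic data:

import numpy as np
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, StratifiedKFold

rng = np.random.default_rng(0)
X = rng.normal(size=(300, 4))
y = (X.sum(axis=1) > 0).astype(int)
X_val = rng.normal(size=(60, 4))
y_val = (X_val.sum(axis=1) > 0).astype(int)

clf = xgb.XGBClassifier(objective="binary:logistic", eval_metric="auc")
search = GridSearchCV(
    estimator=clf,
    param_grid={"max_depth": [2, 3]},
    scoring="roc_auc",
    cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
)
# eval_set and verbose are not GridSearchCV options; they are forwarded
# verbatim to XGBClassifier.fit for every parameter combination and fold.
search.fit(X, y, eval_set=[(X_val, y_val)], verbose=False)
print(search.best_params_)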