In [None]:
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
# Location of the windowed feature table.
# Alternative location used on another machine:
# data_path = Path(r"~/Fahrsimulator_MSY2526_AI/model_training/xgboost/output_windowed.parquet")
#
# `Path` does not expand "~" on its own, so expand it explicitly; otherwise the
# path only works with consumers that happen to expand the home directory.
data_path = Path(r"~/data-paulusjafahrsimulator-gpu/first_AU_dataset/output_windowed.parquet").expanduser()

In [None]:
# Load the parquet file referenced by `data_path` into a DataFrame.
df = pd.read_parquet(data_path)

In [None]:
# List the distinct subject IDs, their count, and 66 % / 33 % of that count.
subjects = df['subjectID'].unique()
n_subjects = len(subjects)
for shown in (subjects, n_subjects, n_subjects * 0.66, n_subjects * 0.33):
    print(shown)

In [None]:
# Build the row selections that are later labelled as the "low" and "high" class.
phase_col = df["PHASE"]
study_col = df["STUDY"]
level_col = df["LEVEL"]

in_baseline = phase_col == "baseline"
in_nback = study_col == "n-back"

# "low": every baseline row, plus n-back rows outside baseline with LEVEL 1 or 4.
low_mask = in_baseline | (in_nback & (phase_col != "baseline") & level_col.isin([1, 4]))
low_all = df[low_mask]
print(f"low all: {low_all.shape}")

# "high" (n-back): LEVEL 2, 3, 5 or 6 during the train/test phases.
high_nback_mask = in_nback & level_col.isin([2, 3, 5, 6]) & phase_col.isin(["train", "test"])
high_nback = df[high_nback_mask]
print(f"high n-back: {high_nback.shape}")

# "high" (k-drive): every k-drive row that is not baseline.
high_kdrive_mask = (study_col == "k-drive") & (phase_col != "baseline")
high_kdrive = df[high_kdrive_mask]
print(f"high k-drive: {high_kdrive.shape}")

high_all = pd.concat([high_nback, high_kdrive])
print(f"high all: {high_all.shape}")

In [None]:
def fit_normalizer(train_data, au_columns, method='standard', scope='global'):
    """Fit one or more feature scalers on ``train_data``.

    Parameters
    ----------
    train_data : DataFrame
        Data the scalers are fitted on; must contain ``au_columns`` and,
        for ``scope='subject'``, a ``subjectID`` column.
    au_columns : list
        Names of the columns to be scaled.
    method : {'standard', 'minmax'}
        Selects ``StandardScaler`` or ``MinMaxScaler``.
    scope : {'global', 'subject'}
        ``'global'`` fits a single scaler stored under the key ``'global'``;
        ``'subject'`` fits one scaler per distinct ``subjectID`` value,
        keyed by that value.

    Returns
    -------
    dict
        ``{'scalers': <dict of fitted scalers>, 'method': method, 'scope': scope}``.

    Raises
    ------
    ValueError
        If ``method`` or ``scope`` is not one of the supported values.
    """
    # Validate the method first so a bad value fails before any fitting.
    if method not in ('standard', 'minmax'):
        raise ValueError("method must be 'standard' or 'minmax'")
    scaler_cls = StandardScaler if method == 'standard' else MinMaxScaler

    if scope == 'global':
        fitted = {'global': scaler_cls().fit(train_data[au_columns])}
    elif scope == 'subject':
        subject_ids = train_data['subjectID']
        # One independent scaler per subject, fitted on that subject's rows only.
        fitted = {
            sid: scaler_cls().fit(train_data.loc[subject_ids == sid, au_columns])
            for sid in subject_ids.unique()
        }
    else:
        raise ValueError("scope must be 'subject' or 'global'")

    return {'scalers': fitted, 'method': method, 'scope': scope}


In [None]:
%pip install xgboost

In [None]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold, StratifiedGroupKFold, GridSearchCV

In [None]:
# Attach the binary target (0 = low selection, 1 = high selection) on copies,
# leaving `low_all` / `high_all` untouched.
low = low_all.assign(label=0)
high = high_all.assign(label=1)

# Stack both classes with a fresh index and drop exact duplicate rows.
data = pd.concat([low, high], ignore_index=True).drop_duplicates()

print("Label distribution:")
print(data["label"].value_counts())

In [None]:
# Feature columns are all columns whose name starts with "au" (case-insensitive).
au_columns = list(filter(lambda name: name.lower().startswith("au"), data.columns))
print("Gefundene AU-Spalten:", au_columns)

In [None]:
# Subject-wise split: every subject's rows end up in exactly one of
# train / validation / test.
#
# The shuffle uses a seeded generator. The previous unseeded
# `np.random.permutation` produced a different train/test partition on every
# run, so reported metrics were not reproducible.
SPLIT_SEED = 42
split_rng = np.random.default_rng(SPLIT_SEED)
subjects = split_rng.permutation(data["subjectID"].unique())

n = len(subjects)
n_train = int(n * 0.66)  # first 66 % of the shuffled subjects -> train + validation

train_subjects = subjects[:n_train]
test_subjects  = subjects[n_train:]
# Hold out 20 % of the training subjects as a validation set.
train_subs, val_subs = train_test_split(train_subjects, test_size=0.2, random_state=SPLIT_SEED)

train_df = data[data.subjectID.isin(train_subs)]
val_df   = data[data.subjectID.isin(val_subs)]
test_df  = data[data.subjectID.isin(test_subjects)]

print(train_df.shape, val_df.shape, test_df.shape)

In [None]:
def apply_normalizer(df_to_transform, normalizer_dict, au_columns):
    """Scale ``au_columns`` of a DataFrame with previously fitted scalers.

    Parameters
    ----------
    df_to_transform : DataFrame
        Data to scale; it is copied, the input is not modified.
    normalizer_dict : dict
        Dictionary with the keys ``'scalers'`` (mapping of fitted scalers) and
        ``'scope'`` (``'global'`` or ``'subject'``), as returned by
        ``fit_normalizer``.
    au_columns : list
        Names of the columns to scale.

    Returns
    -------
    DataFrame
        Copy of ``df_to_transform`` with ``au_columns`` replaced by the
        scaled values.

    Raises
    ------
    ValueError
        If the scope is neither ``'global'`` nor ``'subject'``. Previously such
        a scope silently returned the data unscaled.

    Warns
    -----
    UserWarning
        In ``'subject'`` scope, when some subjects have neither their own
        scaler nor a ``'global'`` fallback; their rows are returned unscaled.
    """
    import warnings  # local import keeps this definition self-contained

    scalers = normalizer_dict["scalers"]
    scope = normalizer_dict["scope"]
    df_out = df_to_transform.copy()

    if scope == "global":
        df_out[au_columns] = scalers["global"].transform(df_out[au_columns])

    elif scope == "subject":
        unscaled_subjects = []
        for subj, subdf in df_out.groupby("subjectID"):
            # Prefer the subject's own scaler, fall back to a global one if present.
            scaler = scalers[subj] if subj in scalers else scalers.get("global")
            if scaler is None:
                unscaled_subjects.append(subj)
                continue
            df_out.loc[subdf.index, au_columns] = scaler.transform(subdf[au_columns])

        if unscaled_subjects:
            # Keep the best-effort pass-through, but make it visible: mixing
            # scaled and unscaled rows is easy to miss downstream.
            warnings.warn(
                "No scaler available for subjects "
                f"{unscaled_subjects}; their rows were left unscaled.",
                UserWarning,
                stacklevel=2,
            )

    else:
        raise ValueError("scope must be 'subject' or 'global'")

    return df_out

In [None]:
# Fit a single standard scaler on the training split only, then apply the
# same fitted scaler to all three splits.
normalizer = fit_normalizer(train_df, au_columns, method="standard", scope="global")

train_scaled, val_scaled, test_scaled = (
    apply_normalizer(split_df, normalizer, au_columns)
    for split_df in (train_df, val_df, test_df)
)

In [None]:
def _features_and_labels(frame):
    """Return the AU feature matrix and the label vector of ``frame`` as arrays."""
    return frame[au_columns].values, frame["label"].values


X_train, y_train = _features_and_labels(train_scaled)
X_val,   y_val   = _features_and_labels(val_scaled)
X_test,  y_test  = _features_and_labels(test_scaled)

In [None]:
# Base model.
# `use_label_encoder` is no longer passed: it is deprecated in recent xgboost
# releases and only triggers an "unused parameter" warning there.
xgb_clf = xgb.XGBClassifier(
    objective="binary:logistic",
    eval_metric="auc",
    random_state=42
)

# Hyper-parameter grid (3 * 3 * 3 * 2 * 2 = 108 combinations).
param_grid = {
    "learning_rate": [0.01, 0.05, 0.1],
    "max_depth": [4, 6, 8],
    "n_estimators": [200, 500, 800],
    "subsample": [0.8, 1.0],
    "colsample_bytree": [0.8, 1.0]
}

# Subject-grouped, stratified cross validation.
# The outer train/val/test split is subject-wise, but a plain StratifiedKFold
# over rows would put rows of the same subject into both the fitting and the
# scoring part of a fold, so the CV score would be optimistically biased and
# hyper-parameters would be selected on leaked information.
# `X_train` is built from `train_scaled` row by row, so the group vector is aligned.
cv_groups = train_scaled["subjectID"].values
n_cv_splits = min(5, train_scaled["subjectID"].nunique())  # cannot exceed the number of subjects
cv = StratifiedGroupKFold(n_splits=n_cv_splits, shuffle=True, random_state=42)

# Grid search setup
grid_search = GridSearchCV(
    estimator=xgb_clf,
    param_grid=param_grid,
    scoring="roc_auc",
    n_jobs=-1,
    cv=cv,
    verbose=2
)

# Training with cross validation; `groups` is forwarded to the CV splitter.
grid_search.fit(X_train, y_train, groups=cv_groups)

print("Beste Parameter:", grid_search.best_params_)
print("Bestes AUC:", grid_search.best_score_)

# Extract the best model (refitted on the full training split by GridSearchCV).
model = grid_search.best_estimator_

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, roc_auc_score, classification_report, ConfusionMatrixDisplay

def evaluate(model, X, y, title="Evaluation"):
    """Print classification metrics for ``model`` on ``(X, y)`` and plot confusion matrices.

    Parameters
    ----------
    model : estimator
        Fitted binary classifier exposing ``predict_proba``.
    X : array-like
        Feature matrix.
    y : array-like
        True binary labels (0 = low, 1 = high).
    title : str
        Name of the evaluated split; used in the plot titles.

    Notes
    -----
    Hard predictions use a 0.5 threshold on the positive-class probability.
    The AUC is computed from the probabilities, not from the thresholded
    labels: with hard labels the ROC curve collapses to a single operating
    point and the value no longer measures ranking quality.
    """
    # Positive-class probabilities and the resulting hard predictions
    preds_proba = model.predict_proba(X)[:, 1]
    preds = (preds_proba > 0.5).astype(int)

    # Print metrics
    print("Accuracy:", accuracy_score(y, preds))
    print("F1:", f1_score(y, preds))
    print("AUC:", roc_auc_score(y, preds_proba))
    print("Confusion:\n", confusion_matrix(y, preds))
    print(classification_report(y, preds))

    # Plot the confusion matrix twice: raw counts and normalized per true class.
    for normalize in [None, 'true']:
        cm = confusion_matrix(y, preds, normalize=normalize)
        cm_disp = ConfusionMatrixDisplay(cm, display_labels=['Low', 'High'])
        cm_disp.plot(cmap="Blues")
        cm_disp.ax_.set_title(f"Confusion Matrix - {title}")

# Run the evaluation for the train / validation / test splits
for split_header, split_title, X_split, y_split in (
    ("TRAIN:", "Train", X_train, y_train),
    ("VAL:", "Validation", X_val, y_val),
    ("TEST:", "Test", X_test, y_test),
):
    print(split_header)
    evaluate(model, X_split, y_split, title=split_title)


In [None]:
# joblib.dump(model, "xgb_model.joblib")
# joblib.dump(normalizer, "normalizer.joblib")
# print("Model gespeichert.")

# model.save_model("xgb_model.json")   # as JSON (human-readable, portable)
# model.save_model("xgb_model.bin")    # as a binary file (compact)

In [None]:
# Display the kernel's current working directory (the cell's last expression
# is rendered as its output). Relative file names are resolved against it.
import os
os.getcwd()

