### Imports

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import sys
import os

# Append the parent of the current working directory to the module search
# path (presumably so the project package imported further down resolves
# when the notebook is started from its own sub-folder), and echo the
# directory for a quick visual check.
base_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.append(base_dir)
print(base_dir)

from Fahrsimulator_MSY2526_AI.model_training.tools import evaluation_tools, scaler, mad_outlier_removal
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import OneClassSVM
from sklearn.model_selection import GridSearchCV, KFold, ParameterGrid, train_test_split
import matplotlib.pyplot as plt
import tensorflow as tf
import pickle
from sklearn.metrics import (roc_auc_score, accuracy_score, precision_score, 
 recall_score, f1_score, confusion_matrix, classification_report) 

### Load Dataset

In [None]:
# Absolute path of the Parquet file that holds the main dataset
# (`output_windowed.parquet`); adjust when running outside this environment.
dataset_path = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/first_AU_dataset/output_windowed.parquet")

In [None]:
# Read the whole Parquet dataset into a DataFrame.
df = pd.read_parquet(path=dataset_path)

### Load Performance data and Subject Split

In [None]:
# CSV with the performance data; the cells below read its "subjectID" and
# "overall_score" columns to build the subject split.
performance_path = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/subject_performance/3new_au_performance.csv")
performance_df = pd.read_csv(performance_path)

In [None]:
# Unique subject IDs that occur in the main dataset.
subjects_from_df = df["subjectID"].unique()

# Restrict the performance table to those subjects and to the two columns
# needed for the split.
is_known_subject = performance_df["subjectID"].isin(subjects_from_df)
perf_filtered = performance_df.loc[is_known_subject, ["subjectID", "overall_score"]]

# Inner join: only subjects present in BOTH the dataset and the performance
# CSV survive.
# NOTE(review): a subject without any performance row is dropped silently
# here and will end up in neither the training nor the test group.
subject_frame = pd.DataFrame({"subjectID": subjects_from_df})
merged = subject_frame.merge(perf_filtered, on="subjectID", how="inner")

# Abort when a joined subject has an empty score value.
if merged["overall_score"].isna().any():
    raise ValueError("Es fehlen Score-Werte für manche Subjects.")


In [None]:
# Build a subject split into a small group (about one third) and a large
# group whose mean performance scores are as close as possible: start from a
# random partition, then try random pairwise swaps and keep only those that
# shrink the gap between the two group means.

# Fixed seed and a local generator: the resulting split (and everything
# trained or evaluated on it) is reproducible across kernel restarts, and the
# global NumPy random state is left untouched. Change the seed to draw a
# different split.
split_seed = 42
rng = np.random.default_rng(split_seed)

merged_sorted = merged.sort_values("overall_score", ascending=False).reset_index(drop=True)

scores = merged_sorted["overall_score"].values
n_total = len(merged_sorted)
n_small = n_total // 3
n_large = n_total - n_small

# Step 1: random initial partition of the row positions.
idx = rng.permutation(n_total)

small_idx = idx[:n_small]
large_idx = idx[n_small:]


def score_diff(small_idx, large_idx):
    """Return the absolute difference between the mean scores of two groups.

    Both arguments are integer position arrays into ``scores``.
    """
    return abs(scores[small_idx].mean() - scores[large_idx].mean())


diff = score_diff(small_idx, large_idx)
threshold = 0.01  # stop as soon as the mean scores differ by at most this
max_iter = 100    # upper bound on attempted swaps
count = 0

# Step 2: random swaps until the difference is small enough or the swap
# budget is used up. With an empty group the difference is NaN, the
# comparison below is False and the loop is skipped.
while diff > threshold and count < max_iter:
    # Pick one random member of each group (by position within the group).
    small_pos = rng.integers(len(small_idx))
    large_pos = rng.integers(len(large_idx))

    # Exchange the two members on copies so a rejected swap leaves the
    # current partition untouched.
    new_small_idx = small_idx.copy()
    new_large_idx = large_idx.copy()
    new_small_idx[small_pos] = large_idx[large_pos]
    new_large_idx[large_pos] = small_idx[small_pos]

    new_diff = score_diff(new_small_idx, new_large_idx)

    # Accept the swap only when it improves the balance.
    if new_diff < diff:
        small_idx = new_small_idx
        large_idx = new_large_idx
        diff = new_diff

    count += 1

# Final groups; the index arrays hold row positions, hence positional lookup.
group_small = merged_sorted.iloc[small_idx].reset_index(drop=True)
group_large = merged_sorted.iloc[large_idx].reset_index(drop=True)

print("Finale Score-Differenz:", diff)
print("Größe Gruppe 1:", len(group_small))
print("Größe Gruppe 2:", len(group_large))


In [None]:
# Mean overall score of the larger group (shown as the cell output).
group_large['overall_score'].mean()

In [None]:
# Mean overall score of the smaller group (shown as the cell output).
group_small['overall_score'].mean()

In [None]:
# Subject-level split: the larger group supplies the training subjects, the
# smaller group the test subjects.
test_subjects = group_small["subjectID"].values
training_subjects = group_large["subjectID"].values

# Print both ID arrays (training first, then test) for a quick sanity check.
for subject_ids in (training_subjects, test_subjects):
    print(subject_ids)

### Data cleaning with MAD

In [None]:
# Settings for the outlier removal.
threshold_mad = 100
column_praefix = "AU"

# Every column whose name starts with the configured prefix is cleaned.
au_columns = [name for name in df.columns if name.startswith(column_praefix)]

# Apply the project's MAD-based outlier removal to those columns.
# NOTE(review): this runs on the full dataset, before rows are separated into
# training and test subjects — confirm that `mad_outlier_removal` does not
# derive statistics that leak test data into training.
cleaned_df = mad_outlier_removal(df, columns=au_columns, threshold=threshold_mad)

# Shape after cleaning, then the original shape, for comparison.
for frame in (cleaned_df, df):
    print(frame.shape)