2025-12-10 19:35:20 +01:00

255 lines
6.8 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"id": "708c9745",
"metadata": {},
"source": [
"### Imports"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "53b10294",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from pathlib import Path\n",
"import sys\n",
"import os\n",
"\n",
"base_dir = os.path.abspath(os.path.join(os.getcwd(), \"..\"))\n",
"sys.path.append(base_dir)\n",
"print(base_dir)\n",
"\n",
"from Fahrsimulator_MSY2526_AI.model_training.tools import evaluation_tools, scaler, mad_outlier_removal\n",
"from sklearn.preprocessing import StandardScaler, MinMaxScaler\n",
"from sklearn.svm import OneClassSVM\n",
"from sklearn.model_selection import GridSearchCV, KFold, ParameterGrid, train_test_split\n",
"import matplotlib.pyplot as plt\n",
"import tensorflow as tf\n",
"import pickle\n",
"from sklearn.metrics import (roc_auc_score, accuracy_score, precision_score, \n",
" recall_score, f1_score, confusion_matrix, classification_report) "
]
},
{
"cell_type": "markdown",
"id": "68101229",
"metadata": {},
"source": [
"### Load dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "24a765e8",
"metadata": {},
"outputs": [],
"source": [
"# Location of the windowed AU dataset (parquet) on the JupyterHub volume.\n",
"dataset_path = Path(\n",
"    r\"/home/jovyan/data-paulusjafahrsimulator-gpu/first_AU_dataset/output_windowed.parquet\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "471001b0",
"metadata": {},
"outputs": [],
"source": [
"# Read the full windowed dataset into memory; later cells refer to it as `df`.\n",
"df = pd.read_parquet(dataset_path)"
]
},
{
"cell_type": "markdown",
"id": "0fdecdaa",
"metadata": {},
"source": [
"### Load Performance data and Subject Split"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "692d1b47",
"metadata": {},
"outputs": [],
"source": [
"# Per-subject performance table (CSV); the split below reads its\n",
"# `subjectID` and `overall_score` columns.\n",
"performance_path = Path(\n",
"    r\"/home/jovyan/data-paulusjafahrsimulator-gpu/subject_performance/3new_au_performance.csv\"\n",
")\n",
"performance_df = pd.read_csv(filepath_or_buffer=performance_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ea617e3f",
"metadata": {},
"outputs": [],
"source": [
"# Unique subject IDs that actually occur in the main dataset.\n",
"subjects_from_df = df[\"subjectID\"].unique()\n",
"\n",
"# Performance rows restricted to those subjects, reduced to the two columns\n",
"# needed for the split.\n",
"perf_filtered = performance_df.loc[\n",
"    performance_df[\"subjectID\"].isin(subjects_from_df),\n",
"    [\"subjectID\", \"overall_score\"],\n",
"]\n",
"\n",
"# Inner join: keeps only subjects present in both the dataset and the\n",
"# performance CSV. Subjects without a CSV row are dropped here, so the NaN\n",
"# check below only catches rows whose score cell is empty.\n",
"merged = pd.merge(\n",
"    pd.DataFrame({\"subjectID\": subjects_from_df}),\n",
"    perf_filtered,\n",
"    on=\"subjectID\",\n",
"    how=\"inner\",\n",
")\n",
"\n",
"# Abort if any remaining subject has no score value.\n",
"if merged[\"overall_score\"].isna().any():\n",
"    raise ValueError(\"Es fehlen Score-Werte für manche Subjects.\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ae43df8d",
"metadata": {},
"outputs": [],
"source": [
"# Fixed seed: without it every kernel restart produces a different train/test\n",
"# subject split, so downstream results cannot be reproduced.\n",
"SPLIT_SEED = 42\n",
"rng = np.random.default_rng(SPLIT_SEED)\n",
"\n",
"merged_sorted = merged.sort_values(\"overall_score\", ascending=False).reset_index(drop=True)\n",
"\n",
"scores = merged_sorted[\"overall_score\"].values\n",
"n_total = len(merged_sorted)\n",
"n_small = n_total // 3  # one third (rounded down) of the subjects form the small group\n",
"n_large = n_total - n_small\n",
"\n",
"# With fewer than 3 subjects the small group would be empty: its mean is NaN,\n",
"# the swap loop is skipped and the test group silently ends up empty.\n",
"if n_small == 0:\n",
"    raise ValueError(\"Zu wenige Subjects für eine Aufteilung in zwei Gruppen.\")\n",
"\n",
"# Step 1: random initial assignment of row positions to the two groups\n",
"idx = rng.permutation(n_total)\n",
"\n",
"small_idx = idx[:n_small]\n",
"large_idx = idx[n_small:]\n",
"\n",
"def score_diff(small_idx, large_idx):\n",
"    \"\"\"Return the absolute difference between the mean scores of two index groups.\n",
"\n",
"    Both arguments are integer position arrays into the module-level `scores` array.\n",
"    \"\"\"\n",
"    return abs(scores[small_idx].mean() - scores[large_idx].mean())\n",
"\n",
"diff = score_diff(small_idx, large_idx)\n",
"threshold = 0.01\n",
"max_iter = 100\n",
"count = 0\n",
"\n",
"# Step 2: random pairwise swaps until the mean-score difference is small\n",
"# enough or the iteration budget is used up\n",
"while diff > threshold and count < max_iter:\n",
"    # Pick one random member of each group\n",
"    si = rng.choice(small_idx)\n",
"    li = rng.choice(large_idx)\n",
"\n",
"    # Build the candidate split with the two members exchanged\n",
"    new_small_idx = small_idx.copy()\n",
"    new_large_idx = large_idx.copy()\n",
"\n",
"    new_small_idx[new_small_idx == si] = li\n",
"    new_large_idx[new_large_idx == li] = si\n",
"\n",
"    # Difference of the candidate split\n",
"    new_diff = score_diff(new_small_idx, new_large_idx)\n",
"\n",
"    # Greedy acceptance: keep the swap only if it reduces the difference\n",
"    if new_diff < diff:\n",
"        small_idx = new_small_idx\n",
"        large_idx = new_large_idx\n",
"        diff = new_diff\n",
"\n",
"    count += 1\n",
"\n",
"# Final groups (positions are valid labels because the index was reset above)\n",
"group_small = merged_sorted.loc[small_idx].reset_index(drop=True)\n",
"group_large = merged_sorted.loc[large_idx].reset_index(drop=True)\n",
"\n",
"print(\"Finale Score-Differenz:\", diff)\n",
"print(\"Größe Gruppe 1:\", len(group_small))\n",
"print(\"Größe Gruppe 2:\", len(group_large))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9d1b414e",
"metadata": {},
"outputs": [],
"source": [
"# Mean score of the larger group, shown as the cell output for a visual check.\n",
"group_large[\"overall_score\"].mean()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fa71f9a5",
"metadata": {},
"outputs": [],
"source": [
"# Mean score of the smaller group; compare it with the larger group's mean.\n",
"group_small[\"overall_score\"].mean()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "79ecb4a2",
"metadata": {},
"outputs": [],
"source": [
"# The larger group supplies the training subjects, the smaller group the test subjects.\n",
"training_subjects = group_large[\"subjectID\"].values\n",
"test_subjects = group_small[\"subjectID\"].values\n",
"\n",
"# One ID array per line, training subjects first.\n",
"print(training_subjects, test_subjects, sep=\"\\n\")"
]
},
{
"cell_type": "markdown",
"id": "4353f87c",
"metadata": {},
"source": [
"### Data cleaning with MAD"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "76610052",
"metadata": {},
"outputs": [],
"source": [
"# Settings for the outlier removal\n",
"threshold_mad = 100\n",
"column_praefix = 'AU'\n",
"\n",
"# Every column whose name starts with the prefix is passed to the cleaning helper.\n",
"au_columns = [name for name in df.columns if name.startswith(column_praefix)]\n",
"\n",
"# NOTE(review): `mad_outlier_removal` is imported from the project's `tools`\n",
"# package next to `evaluation_tools` and `scaler`; if it is a submodule rather\n",
"# than a function, this call fails - TODO confirm.\n",
"# NOTE(review): cleaning runs on the full `df`, before the train/test subject\n",
"# split is applied - confirm this is intended.\n",
"cleaned_df = mad_outlier_removal(df, columns=au_columns, threshold=threshold_mad)\n",
"\n",
"# Shape after cleaning, then shape before cleaning.\n",
"print(cleaned_df.shape, df.shape, sep=\"\\n\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}