{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "708c9745",
   "metadata": {},
   "source": [
    "### Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "53b10294",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from pathlib import Path\n",
    "import sys\n",
    "import os\n",
    "\n",
    "# Put the parent of the working directory on sys.path before the project import below.\n",
    "base_dir = os.path.abspath(os.path.join(os.getcwd(), \"..\"))\n",
    "sys.path.append(base_dir)\n",
    "print(base_dir)\n",
    "\n",
    "from Fahrsimulator_MSY2526_AI.model_training.tools import evaluation_tools, scaler, mad_outlier_removal\n",
    "from sklearn.preprocessing import StandardScaler, MinMaxScaler\n",
    "from sklearn.svm import OneClassSVM\n",
    "from sklearn.model_selection import GridSearchCV, KFold, ParameterGrid, train_test_split\n",
    "import matplotlib.pyplot as plt\n",
    "import tensorflow as tf\n",
    "import pickle\n",
    "from sklearn.metrics import (roc_auc_score, accuracy_score, precision_score,\n",
    "                             recall_score, f1_score, confusion_matrix, classification_report)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "68101229",
   "metadata": {},
   "source": [
    "### Load dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "24a765e8",
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset_path = Path(r\"/home/jovyan/data-paulusjafahrsimulator-gpu/first_AU_dataset/output_windowed.parquet\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "471001b0",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_parquet(path=dataset_path)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0fdecdaa",
   "metadata": {},
   "source": [
    "### Load Performance data and Subject Split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "692d1b47",
   "metadata": {},
   "outputs": [],
   "source": [
    "performance_path = Path(r\"/home/jovyan/data-paulusjafahrsimulator-gpu/subject_performance/3new_au_performance.csv\")\n",
    "performance_df = pd.read_csv(performance_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ea617e3f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Take the subject IDs from the main dataset\n",
    "subjects_from_df = df[\"subjectID\"].unique()\n",
    "\n",
    "# Restrict the performance table to subjects that are present in the dataset\n",
    "perf_filtered = performance_df[\n",
    "    performance_df[\"subjectID\"].isin(subjects_from_df)\n",
    "][[\"subjectID\", \"overall_score\"]]\n",
    "\n",
    "# Inner merge: keep only subjects that occur in both df and the performance CSV.\n",
    "# NOTE: subjects without a performance row are dropped here without any message.\n",
    "merged = (\n",
    "    pd.DataFrame({\"subjectID\": subjects_from_df})\n",
    "    .merge(perf_filtered, on=\"subjectID\", how=\"inner\")\n",
    ")\n",
    "\n",
    "# Fail early if one of the remaining subjects has a NaN score\n",
    "if merged[\"overall_score\"].isna().any():\n",
    "    raise ValueError(\"Es fehlen Score-Werte für manche Subjects.\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ae43df8d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split the subjects into a small (~1/3) and a large (~2/3) group whose mean\n",
    "# overall_score differs by at most `threshold`, using random pairwise swaps.\n",
    "merged_sorted = merged.sort_values(\"overall_score\", ascending=False).reset_index(drop=True)\n",
    "\n",
    "scores = merged_sorted[\"overall_score\"].values\n",
    "n_total = len(merged_sorted)\n",
    "n_small = n_total // 3\n",
    "\n",
    "# With fewer than 3 subjects the small group would be empty and its mean NaN.\n",
    "if n_small == 0:\n",
    "    raise ValueError(\"Zu wenige Subjects für einen Train/Test-Split (mindestens 3 benötigt).\")\n",
    "\n",
    "# Cell-local seeded generator: the split is identical on every (re-)run of this cell.\n",
    "SEED = 42\n",
    "rng = np.random.default_rng(SEED)\n",
    "\n",
    "# Step 1: random initial assignment\n",
    "idx = np.arange(n_total)\n",
    "rng.shuffle(idx)\n",
    "\n",
    "small_idx = idx[:n_small]\n",
    "large_idx = idx[n_small:]\n",
    "\n",
    "def score_diff(small_idx, large_idx):\n",
    "    \"\"\"Return the absolute difference between the mean scores of the two index groups.\"\"\"\n",
    "    return abs(scores[small_idx].mean() - scores[large_idx].mean())\n",
    "\n",
    "diff = score_diff(small_idx, large_idx)\n",
    "threshold = 0.01\n",
    "max_iter = 100\n",
    "count = 0\n",
    "\n",
    "# Step 2: random swaps until the difference is small enough or max_iter is reached\n",
    "while diff > threshold and count < max_iter:\n",
    "    # Pick one random member of each group\n",
    "    si = rng.choice(small_idx)\n",
    "    li = rng.choice(large_idx)\n",
    "\n",
    "    # Build the candidate assignment with the two members exchanged\n",
    "    new_small_idx = small_idx.copy()\n",
    "    new_large_idx = large_idx.copy()\n",
    "\n",
    "    new_small_idx[new_small_idx == si] = li\n",
    "    new_large_idx[new_large_idx == li] = si\n",
    "\n",
    "    # Compute the difference of the candidate assignment\n",
    "    new_diff = score_diff(new_small_idx, new_large_idx)\n",
    "\n",
    "    # Accept the swap only if it improves the balance\n",
    "    if new_diff < diff:\n",
    "        small_idx = new_small_idx\n",
    "        large_idx = new_large_idx\n",
    "        diff = new_diff\n",
    "\n",
    "    count += 1\n",
    "\n",
    "# Final groups; the indices are positions in merged_sorted, hence iloc\n",
    "group_small = merged_sorted.iloc[small_idx].reset_index(drop=True)\n",
    "group_large = merged_sorted.iloc[large_idx].reset_index(drop=True)\n",
    "\n",
    "print(\"Finale Score-Differenz:\", diff)\n",
    "print(\"Größe Gruppe 1:\", len(group_small))\n",
    "print(\"Größe Gruppe 2:\", len(group_large))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9d1b414e",
   "metadata": {},
   "outputs": [],
   "source": [
    "group_large['overall_score'].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fa71f9a5",
   "metadata": {},
   "outputs": [],
   "source": [
    "group_small['overall_score'].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "79ecb4a2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The large group is used for training, the small group for testing\n",
    "training_subjects = group_large['subjectID'].values\n",
    "test_subjects = group_small['subjectID'].values\n",
    "print(training_subjects)\n",
    "print(test_subjects)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4353f87c",
   "metadata": {},
   "source": [
    "### Data cleaning with MAD"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "76610052",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Settings\n",
    "threshold_mad = 100\n",
    "column_praefix = 'AU'\n",
    "\n",
    "# Select every column whose name starts with the prefix\n",
    "au_columns = [col for col in df.columns if col.startswith(column_praefix)]\n",
    "# NOTE(review): mad_outlier_removal comes from the project tools package; its exact behaviour is not visible here.\n",
    "cleaned_df = mad_outlier_removal(df, columns=au_columns, threshold=threshold_mad)\n",
    "print(cleaned_df.shape)\n",
    "print(df.shape)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}