Compare commits

...

6 Commits

12 changed files with 510 additions and 1420 deletions

.gitignore vendored

@@ -3,4 +3,5 @@
 !*.py
 !*.ipynb
 !*.md
+!*.parquet
 !.gitignore


@@ -1,259 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "7440a5b3",
"metadata": {},
"outputs": [],
"source": [
"import h5py\n",
"import os\n",
"import warnings\n",
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from pathlib import Path"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2401aaef",
"metadata": {},
"outputs": [],
"source": [
"file_path = \"adabase-public-0020-v_0_0_2.h5py\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "46280999",
"metadata": {},
"outputs": [],
"source": [
"SKT_SR = 100\n",
"ECG_SR = 500\n",
"RSP_SR = 250\n",
"EMG_SR = 1000\n",
"EDA_SR = 500\n",
"EYE_SR = 250"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e23eb552",
"metadata": {},
"outputs": [],
"source": [
"df_signals = pd.read_hdf(file_path, \"SIGNALS\", mode=\"r\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b7f494d1",
"metadata": {},
"outputs": [],
"source": [
"pd.set_option('display.max_columns', None)\n",
"pd.set_option('display.max_rows', None)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dd2f4d84",
"metadata": {},
"outputs": [],
"source": [
"settings = df_signals[['STUDY','PHASE','LEVEL']]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1699ddc2",
"metadata": {},
"outputs": [],
"source": [
"settings.value_counts()"
]
},
{
"cell_type": "markdown",
"id": "a4731c56",
"metadata": {},
"source": [
"Actions units"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9db0b4b2",
"metadata": {},
"outputs": [],
"source": [
"df_signals.columns"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3ceccc89",
"metadata": {},
"outputs": [],
"source": [
"au_data = df_signals.iloc[:,-20:]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3d4ee088",
"metadata": {},
"outputs": [],
"source": [
"au_data.tail()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5d85a8cb",
"metadata": {},
"outputs": [],
"source": [
"print(au_data.shape)\n",
"print(au_data.isna().sum())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "efff356f",
"metadata": {},
"outputs": [],
"source": [
"clean_au_data = au_data.dropna()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "42ed1bcd",
"metadata": {},
"outputs": [],
"source": [
"clean_au_data.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2c7c3f14",
"metadata": {},
"outputs": [],
"source": [
"for i in range(len(clean_au_data.columns)):\n",
" print(clean_au_data.iloc[:,i].unique())"
]
},
{
"cell_type": "markdown",
"id": "332740a8",
"metadata": {},
"source": [
"Plots"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f30b8814",
"metadata": {},
"outputs": [],
"source": [
"# df_signals_ecg = pd.read_hdf(file_path, \"SIGNALS\", mode=\"r\", columns=[\"STUDY\",\"LEVEL\", \"PHASE\", 'RAW_ECG_I'])\n",
"df_signals_ecg = df_signals[[\"STUDY\",\"LEVEL\", \"PHASE\", 'RAW_ECG_I']]\n",
"df_signals_ecg.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ee80fd79",
"metadata": {},
"outputs": [],
"source": [
"study_filter = df_signals[\"STUDY\"] == \"n-back\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3ef29446",
"metadata": {},
"outputs": [],
"source": [
"fig, ax = plt.subplots(figsize=(16, 2))\n",
"# Set the number of seconds to plot\n",
"seconds = 20\n",
"# Get the ECG signal data\n",
"ecg_signal = df_signals.loc[study_filter, \"RAW_ECG_I\"].dropna()\n",
"# Set the x-axis limits to the number of samples in the specified time range\n",
"num_samples = ECG_SR * seconds\n",
"# Plot the ECG signal\n",
"ax.plot(ecg_signal.index[:num_samples]/1000, ecg_signal[:num_samples]);\n",
"ax.set_title(\"ECG I\");\n",
"ax.set_xlabel('Seconds');\n",
"# Set figure size with a 16:6 aspect ratio\n",
"fig, ax = plt.subplots(figsize=(16, 2))\n",
"# Set the number of seconds to plot\n",
"start_second = 0\n",
"end_second = 60*30\n",
"# Get the EYE signal data - we replace inf with nan to get the original signal.␣\n",
"\n",
"eye_left_signal = df_signals.loc[study_filter, \"LEFT_PUPIL_DIAMETER\"].dropna()\n",
"eye_right_signal = df_signals.loc[study_filter, \"RIGHT_PUPIL_DIAMETER\"].dropna()\n",
"#eye_left_signal = df_signals.loc[:, \"LEFT_PUPIL_DIAMETER\"].replace([np.inf],␣\n",
"\n",
"#eye_right_signal = df_signals.loc[:, \"RIGHT_PUPIL_DIAMETER\"].replace([np.inf],␣\n",
"\n",
"# Set the x-axis limits to the number of samples in the specified time range\n",
"num_samples_start = EYE_SR * start_second\n",
"num_samples_end = EYE_SR * end_second\n",
"ax.plot(eye_left_signal.index[num_samples_start:num_samples_end]/1000,eye_left_signal[num_samples_start:num_samples_end], label=\"Left\")\n",
"ax.plot(eye_right_signal.index[num_samples_start:num_samples_end]/1000,eye_right_signal[num_samples_start:num_samples_end], label=\"Right\")\n",
"ax.set_title(\"Pupil Dilation\")\n",
"ax.set_xlabel('Seconds')\n",
"ax.legend()\n",
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
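Note: the commented-out pd.read_hdf call in the plotting cell above hints at a cheaper load path. For reference, a minimal sketch of reading only the needed columns; this assumes the SIGNALS key is stored in PyTables "table" format, which pd.read_hdf requires for column selection:

import pandas as pd

# Load only the columns used for the ECG plot instead of the full SIGNALS table.
df_signals_ecg = pd.read_hdf(
    "adabase-public-0020-v_0_0_2.h5py",
    "SIGNALS",
    mode="r",
    columns=["STUDY", "LEVEL", "PHASE", "RAW_ECG_I"],
)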


@@ -1,625 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "89d81009",
"metadata": {},
"source": [
"### Imports"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7440a5b3",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from pathlib import Path\n",
"from sklearn.preprocessing import StandardScaler, MinMaxScaler"
]
},
{
"cell_type": "markdown",
"id": "09b7d707",
"metadata": {},
"source": [
"### Config"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2401aaef",
"metadata": {},
"outputs": [],
"source": [
"dataset_path = Path(r\"/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/combined_dataset_25hz.parquet\")\n",
"# dataset_path = Path(r\"/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/60s_combined_dataset_25hz.parquet\")\n",
"# dataset_path = Path(r\"/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/120s_combined_dataset_25hz.parquet\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0282b0b1",
"metadata": {},
"outputs": [],
"source": [
"FILTER_MAD = True\n",
"THRESHOLD = 3.5\n",
"METHOD = 'minmax'\n",
"SCOPE = 'subject'\n",
"FILTER_SUBSETS = True"
]
},
{
"cell_type": "markdown",
"id": "a8f1716b",
"metadata": {},
"source": [
"### Calculations"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ac32444a",
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_parquet(dataset_path)\n",
"df.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3ba4401c",
"metadata": {},
"outputs": [],
"source": [
"if(FILTER_SUBSETS):\n",
" # Special filter: Keep only specific subsets\n",
"# - k-drive L1 baseline\n",
"# - n-back L1 baseline \n",
"# - k-drive test with levels 1, 2, 3\n",
"\n",
" df = df[\n",
" (\n",
" # k-drive L1 baseline\n",
" ((df['STUDY'] == 'k-drive') & \n",
" (df['LEVEL'] == 1) & \n",
" (df['PHASE'] == 'baseline'))\n",
" ) | \n",
" (\n",
" # n-back L1 baseline\n",
" ((df['STUDY'] == 'n-back') & \n",
" (df['LEVEL'] == 1) & \n",
" (df['PHASE'] == 'baseline'))\n",
" ) | \n",
" (\n",
" # k-drive test with levels 1, 2, 3\n",
" ((df['STUDY'] == 'k-drive') & \n",
" (df['LEVEL'].isin([1, 2, 3])) & \n",
" (df['PHASE'] == 'test'))\n",
" )].copy()\n",
"\n",
"print(f\"Filtered dataframe shape: {df.shape}\")\n",
"print(f\"Remaining subsets: {df.groupby(['STUDY', 'LEVEL', 'PHASE']).size()}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "77dbd6df",
"metadata": {},
"outputs": [],
"source": [
"face_au_cols = [c for c in df.columns if c.startswith(\"FACE_AU\")]\n",
"eye_cols = ['Fix_count_short_66_150', 'Fix_count_medium_300_500',\n",
" 'Fix_count_long_gt_1000', 'Fix_count_100', 'Fix_mean_duration',\n",
" 'Fix_median_duration', 'Sac_count', 'Sac_mean_amp', 'Sac_mean_dur',\n",
" 'Sac_median_dur', 'Blink_count', 'Blink_mean_dur', 'Blink_median_dur',\n",
" 'Pupil_mean', 'Pupil_IPA']\n",
"eye_cols_without_blink = ['Fix_count_short_66_150', 'Fix_count_medium_300_500',\n",
" 'Fix_count_long_gt_1000', 'Fix_count_100', 'Fix_mean_duration',\n",
" 'Fix_median_duration', 'Sac_count', 'Sac_mean_amp', 'Sac_mean_dur',\n",
" 'Sac_median_dur', 'Pupil_mean', 'Pupil_IPA']\n",
"print(len(eye_cols))\n",
"all_signal_columns = eye_cols+face_au_cols\n",
"print(len(all_signal_columns))"
]
},
{
"cell_type": "markdown",
"id": "d5e9c67a",
"metadata": {},
"source": [
"MAD"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "592291ef",
"metadata": {},
"outputs": [],
"source": [
"def calculate_mad_params(df, columns):\n",
" \"\"\"\n",
" Calculate median and MAD parameters for each column.\n",
" This should be run ONLY on the training data.\n",
" \n",
" Returns a dictionary: {col: (median, mad)}\n",
" \"\"\"\n",
" params = {}\n",
" for col in columns:\n",
" median = df[col].median()\n",
" mad = np.median(np.abs(df[col] - median))\n",
" params[col] = (median, mad)\n",
" return params\n",
"def apply_mad_filter(df, params, threshold=3.5):\n",
" \"\"\"\n",
" Apply MAD-based outlier removal using precomputed parameters.\n",
" Works on training, validation, and test data.\n",
" \n",
" df: DataFrame to filter\n",
" params: dictionary {col: (median, mad)} from training data\n",
" threshold: cutoff for robust Z-score\n",
" \"\"\"\n",
" df_clean = df.copy()\n",
"\n",
" for col, (median, mad) in params.items():\n",
" if mad == 0:\n",
" continue # no spread; nothing to remove for this column\n",
"\n",
" robust_z = 0.6745 * (df_clean[col] - median) / mad\n",
" outlier_mask = np.abs(robust_z) > threshold\n",
"\n",
" # Remove values only in this specific column\n",
" df_clean.loc[outlier_mask, col] = median\n",
" \n",
" \n",
" print(df_clean.shape)\n",
" return df_clean"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4ddad4a8",
"metadata": {},
"outputs": [],
"source": [
"if(FILTER_MAD):\n",
" mad_params = calculate_mad_params(df, all_signal_columns)\n",
" df = apply_mad_filter(df, mad_params, THRESHOLD)"
]
},
{
"cell_type": "markdown",
"id": "89387879",
"metadata": {},
"source": [
"Normalizer"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9c129cdd",
"metadata": {},
"outputs": [],
"source": [
"def fit_normalizer(train_data, au_columns, method='standard', scope='global'):\n",
" \"\"\"\n",
" Fit normalization scalers on training data.\n",
" \n",
" Parameters:\n",
" -----------\n",
" train_data : pd.DataFrame\n",
" Training dataframe with AU columns and subjectID\n",
" au_columns : list\n",
" List of AU column names to normalize\n",
" method : str, default='standard'\n",
" Normalization method: 'standard' for StandardScaler or 'minmax' for MinMaxScaler\n",
" scope : str, default='global'\n",
" Normalization scope: 'subject' for per-subject or 'global' for across all subjects\n",
" \n",
" Returns:\n",
" --------\n",
" dict\n",
" Dictionary containing fitted scalers and statistics for new subjects\n",
" \"\"\"\n",
" if method == 'standard':\n",
" Scaler = StandardScaler\n",
" elif method == 'minmax':\n",
" Scaler = MinMaxScaler\n",
" else:\n",
" raise ValueError(\"method must be 'standard' or 'minmax'\")\n",
" \n",
" scalers = {}\n",
" if scope == 'subject':\n",
" # Fit one scaler per subject\n",
" subject_stats = []\n",
" \n",
" for subject in train_data['subjectID'].unique():\n",
" subject_mask = train_data['subjectID'] == subject\n",
" scaler = Scaler()\n",
" scaler.fit(train_data.loc[subject_mask, au_columns].values)\n",
" scalers[subject] = scaler\n",
" \n",
" # Store statistics for averaging\n",
" if method == 'standard':\n",
" subject_stats.append({\n",
" 'mean': scaler.mean_,\n",
" 'std': scaler.scale_\n",
" })\n",
" elif method == 'minmax':\n",
" subject_stats.append({\n",
" 'min': scaler.data_min_,\n",
" 'max': scaler.data_max_\n",
" })\n",
" \n",
" # Calculate average statistics for new subjects\n",
" if method == 'standard':\n",
" avg_mean = np.mean([s['mean'] for s in subject_stats], axis=0)\n",
" avg_std = np.mean([s['std'] for s in subject_stats], axis=0)\n",
" fallback_scaler = StandardScaler()\n",
" fallback_scaler.mean_ = avg_mean\n",
" fallback_scaler.scale_ = avg_std\n",
" fallback_scaler.var_ = avg_std ** 2\n",
" fallback_scaler.n_features_in_ = len(au_columns)\n",
" elif method == 'minmax':\n",
" avg_min = np.mean([s['min'] for s in subject_stats], axis=0)\n",
" avg_max = np.mean([s['max'] for s in subject_stats], axis=0)\n",
" fallback_scaler = MinMaxScaler()\n",
" fallback_scaler.data_min_ = avg_min\n",
" fallback_scaler.data_max_ = avg_max\n",
" fallback_scaler.data_range_ = avg_max - avg_min\n",
" fallback_scaler.scale_ = 1.0 / fallback_scaler.data_range_\n",
" fallback_scaler.min_ = -avg_min * fallback_scaler.scale_\n",
" fallback_scaler.n_features_in_ = len(au_columns)\n",
" \n",
" scalers['_fallback'] = fallback_scaler\n",
" \n",
" elif scope == 'global':\n",
" # Fit one scaler for all subjects\n",
" scaler = Scaler()\n",
" scaler.fit(train_data[au_columns].values)\n",
" scalers['global'] = scaler\n",
" \n",
" else:\n",
" raise ValueError(\"scope must be 'subject' or 'global'\")\n",
" \n",
" return {'scalers': scalers, 'method': method, 'scope': scope}"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9cfabd37",
"metadata": {},
"outputs": [],
"source": [
"def apply_normalizer(data, columns, normalizer_dict):\n",
" \"\"\"\n",
" Apply fitted normalization scalers to data.\n",
" \n",
" Parameters:\n",
" -----------\n",
" data : pd.DataFrame\n",
" Dataframe with AU columns and subjectID\n",
" au_columns : list\n",
" List of AU column names to normalize\n",
" normalizer_dict : dict\n",
" Dictionary containing fitted scalers from fit_normalizer()\n",
" \n",
" Returns:\n",
" --------\n",
" pd.DataFrame\n",
" DataFrame with normalized AU columns\n",
" \"\"\"\n",
" normalized_data = data.copy()\n",
" scalers = normalizer_dict['scalers']\n",
" scope = normalizer_dict['scope']\n",
" normalized_data[columns] = normalized_data[columns].astype(np.float64)\n",
"\n",
" if scope == 'subject':\n",
" # Apply per-subject normalization\n",
" for subject in data['subjectID'].unique():\n",
" subject_mask = data['subjectID'] == subject\n",
" \n",
" # Use the subject's scaler if available, otherwise use fallback\n",
" if subject in scalers:\n",
" scaler = scalers[subject]\n",
" else:\n",
" # Use averaged scaler for new subjects\n",
" scaler = scalers['_fallback']\n",
" print(f\"Info: Subject {subject} not in training data. Using averaged scaler from training subjects.\")\n",
" \n",
" normalized_data.loc[subject_mask, columns] = scaler.transform(\n",
" data.loc[subject_mask, columns].values\n",
" )\n",
" \n",
" elif scope == 'global':\n",
" # Apply global normalization\n",
" scaler = scalers['global']\n",
" normalized_data[columns] = scaler.transform(data[columns].values)\n",
" \n",
" return normalized_data"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4dbbebf7",
"metadata": {},
"outputs": [],
"source": [
"scaler = fit_normalizer(df, all_signal_columns, method=METHOD, scope=SCOPE)\n",
"df_min_max_normalised = apply_normalizer(df, all_signal_columns, scaler)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6b9b2ae8",
"metadata": {},
"outputs": [],
"source": [
"a= df_min_max_normalised[['STUDY','LEVEL','PHASE']]\n",
"print(a.dtypes)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e3e1bc34",
"metadata": {},
"outputs": [],
"source": [
"# Define signal columns (adjust only once)\n",
"signal_columns = all_signal_columns\n",
"\n",
"# Get all unique combinations of STUDY, LEVEL and PHASE\n",
"unique_combinations = df_min_max_normalised[['STUDY', 'LEVEL', 'PHASE']].drop_duplicates().reset_index(drop=True)\n",
"\n",
"# Dictionary to store subsets\n",
"subsets = {}\n",
"subset_sizes = {}\n",
"\n",
"for idx, row in unique_combinations.iterrows():\n",
" study = row['STUDY']\n",
" level = row['LEVEL']\n",
" phase = row['PHASE']\n",
" key = f\"{study}_L{level}_P{phase}\"\n",
" subset = df_min_max_normalised[\n",
" (df_min_max_normalised['STUDY'] == study) & \n",
" (df_min_max_normalised['LEVEL'] == level) & \n",
" (df_min_max_normalised['PHASE'] == phase)\n",
" ]\n",
" subsets[key] = subset\n",
" subset_sizes[key] = len(subset)\n",
"\n",
"# Output subset sizes\n",
"print(\"Number of samples per subset:\")\n",
"print(\"=\" * 40)\n",
"for key, size in subset_sizes.items():\n",
" print(f\"{key}: {size} samples\")\n",
"print(\"=\" * 40)\n",
"print(f\"Total number of subsets: {len(subsets)}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c7fdeb5c",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"# Function to categorize subsets\n",
"def categorize_subset(key):\n",
" \"\"\"Categorizes a subset as 'low' or 'high' based on the given logic\"\"\"\n",
" parts = key.split('_')\n",
" study = parts[0]\n",
" level = int(parts[1][1:]) # 'L1' -> 1\n",
" phase = parts[2][1:] # 'Pbaseline' -> 'baseline'\n",
" \n",
" # LOW: baseline OR (n-back with level 1 or 4)\n",
" if phase == \"baseline\":\n",
" return 'low'\n",
" elif study == \"n-back\" and level in [1, 4]:\n",
" return 'low'\n",
" \n",
" # HIGH: (n-back with level 2,3,5,6 and phase train/test) OR (k-drive not baseline)\n",
" elif study == \"n-back\" and level in [2, 3, 5, 6] and phase in [\"train\", \"test\"]:\n",
" return 'high'\n",
" elif study == \"k-drive\" and phase != \"baseline\":\n",
" return 'high'\n",
" \n",
" return None\n",
"\n",
"# Categorize subsets\n",
"low_subsets = {}\n",
"high_subsets = {}\n",
"\n",
"for key, subset in subsets.items():\n",
" category = categorize_subset(key)\n",
" if category == 'low':\n",
" low_subsets[key] = subset\n",
" elif category == 'high':\n",
" high_subsets[key] = subset\n",
"\n",
"# Output statistics\n",
"print(\"\\n\" + \"=\" * 50)\n",
"print(\"SUBSET CATEGORIZATION\")\n",
"print(\"=\" * 50)\n",
"\n",
"print(\"\\nLOW subsets (Blue):\")\n",
"print(\"-\" * 50)\n",
"low_total = 0\n",
"for key in sorted(low_subsets.keys()):\n",
" size = subset_sizes[key]\n",
" low_total += size\n",
" print(f\" {key}: {size} samples\")\n",
"print(f\"{'TOTAL LOW:':<30} {low_total} samples\")\n",
"print(f\"{'NUMBER OF LOW SUBSETS:':<30} {len(low_subsets)}\")\n",
"\n",
"print(\"\\nHIGH subsets (Red):\")\n",
"print(\"-\" * 50)\n",
"high_total = 0\n",
"for key in sorted(high_subsets.keys()):\n",
" size = subset_sizes[key]\n",
" high_total += size\n",
" print(f\" {key}: {size} samples\")\n",
"print(f\"{'TOTAL HIGH:':<30} {high_total} samples\")\n",
"print(f\"{'NUMBER OF HIGH SUBSETS:':<30} {len(high_subsets)}\")\n",
"\n",
"print(\"\\n\" + \"=\" * 50)\n",
"print(f\"TOTAL SAMPLES: {low_total + high_total}\")\n",
"print(f\"TOTAL SUBSETS: {len(low_subsets) + len(high_subsets)}\")\n",
"print(\"=\" * 50)\n",
"\n",
"# Find minimum subset size\n",
"min_subset_size = min(subset_sizes.values())\n",
"print(f\"\\nMinimum subset size: {min_subset_size}\")\n",
"\n",
"# Number of points to plot per subset (50% of minimum size)\n",
"sampling_factor = 1\n",
"n_samples_per_subset = int(sampling_factor * min_subset_size)\n",
"print(f\"Number of randomly drawn points per subset: {n_samples_per_subset}\")"
]
},
{
"cell_type": "markdown",
"id": "ff363fc5",
"metadata": {},
"source": [
"### Plot"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3a9d9163",
"metadata": {},
"outputs": [],
"source": [
"# Create comparison plots\n",
"fig, axes = plt.subplots(len(signal_columns), 1, figsize=(14, 4 * len(signal_columns)))\n",
"\n",
"# If only one signal column exists, convert axes to list\n",
"if len(signal_columns) == 1:\n",
" axes = [axes]\n",
"\n",
"# Create a plot for each signal column\n",
"for i, signal_col in enumerate(signal_columns):\n",
" ax = axes[i]\n",
" \n",
" y_pos = 0\n",
" labels = []\n",
" \n",
" # First plot all LOW subsets (sorted, blue)\n",
" for label in sorted(low_subsets.keys()):\n",
" subset = low_subsets[label]\n",
" if len(subset) > 0 and signal_col in subset.columns:\n",
" # Draw random sample\n",
" n_samples = min(n_samples_per_subset, len(subset))\n",
" sampled_data = subset[signal_col].sample(n=n_samples, random_state=42)\n",
" \n",
" # Calculate mean and median\n",
" mean_val = subset[signal_col].mean()\n",
" median_val = subset[signal_col].median()\n",
" \n",
" # Plot points in blue\n",
" ax.scatter(sampled_data, [y_pos] * len(sampled_data), \n",
" alpha=0.5, s=30, color='blue')\n",
" \n",
" # Mean as black cross\n",
" ax.plot(mean_val, y_pos, 'x', markersize=12, markeredgewidth=3, \n",
" color='black', zorder=5)\n",
" \n",
" # Median as brown cross\n",
" ax.plot(median_val, y_pos, 'x', markersize=12, markeredgewidth=3, \n",
" color='brown', zorder=5)\n",
" \n",
" labels.append(f\"{label} (n={subset_sizes[label]})\")\n",
" y_pos += 1\n",
" \n",
" # Separation line between LOW and HIGH\n",
" if len(low_subsets) > 0 and len(high_subsets) > 0:\n",
" ax.axhline(y=y_pos - 0.5, color='gray', linestyle='--', linewidth=2, alpha=0.7)\n",
" \n",
" # Then plot all HIGH subsets (sorted, red)\n",
" for label in sorted(high_subsets.keys()):\n",
" subset = high_subsets[label]\n",
" if len(subset) > 0 and signal_col in subset.columns:\n",
" # Draw random sample\n",
" n_samples = min(n_samples_per_subset, len(subset))\n",
" sampled_data = subset[signal_col].sample(n=n_samples, random_state=42)\n",
" \n",
" # Calculate mean and median\n",
" mean_val = subset[signal_col].mean()\n",
" median_val = subset[signal_col].median()\n",
" \n",
" # Plot points in red\n",
" ax.scatter(sampled_data, [y_pos] * len(sampled_data), \n",
" alpha=0.5, s=30, color='red')\n",
" \n",
" # Mean as black cross\n",
" ax.plot(mean_val, y_pos, 'x', markersize=12, markeredgewidth=3, \n",
" color='black', zorder=5)\n",
" \n",
" # Median as brown cross\n",
" ax.plot(median_val, y_pos, 'x', markersize=12, markeredgewidth=3, \n",
" color='brown', zorder=5)\n",
" \n",
" labels.append(f\"{label} (n={subset_sizes[label]})\")\n",
" y_pos += 1\n",
" \n",
" ax.set_yticks(range(len(labels)))\n",
" ax.set_yticklabels(labels)\n",
" ax.set_xlabel(f'{signal_col} value')\n",
" ax.set_title(f'{signal_col}: LOW (Blue) vs HIGH (Red) | {n_samples_per_subset} points/subset | Black X = Mean, Brown X = Median')\n",
" ax.grid(True, alpha=0.3, axis='x')\n",
" ax.axvline(0, color='gray', linestyle='--', alpha=0.5)\n",
"\n",
"plt.tight_layout()\n",
"plt.show()\n",
"\n",
"print(f\"\\nNote: {n_samples_per_subset} random points were plotted per subset.\")\n",
"print(\"Blue points = LOW subsets | Red points = HIGH subsets\")\n",
"print(\"Black 'X' = Mean of entire subset | Brown 'X' = Median of entire subset\")\n",
"print(f\"Total subsets plotted: {len(low_subsets)} LOW + {len(high_subsets)} HIGH = {len(low_subsets) + len(high_subsets)} subsets\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
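Note: fit_normalizer's docstring says the scalers should be fitted on training data only, while this notebook fits and applies them on the same df. For reference, a leakage-free usage sketch with a hypothetical subject split:

# Hypothetical subject-level split; fit scalers on training subjects only.
train_subjects = df['subjectID'].unique()[:20]
train_df = df[df['subjectID'].isin(train_subjects)]
test_df = df[~df['subjectID'].isin(train_subjects)]

normalizer = fit_normalizer(train_df, all_signal_columns, method=METHOD, scope=SCOPE)
train_norm = apply_normalizer(train_df, all_signal_columns, normalizer)
# Subjects absent from training fall back to the averaged '_fallback' scaler.
test_norm = apply_normalizer(test_df, all_signal_columns, normalizer)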


@@ -1,166 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "1014c5e0",
"metadata": {},
"source": [
"Imports"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e42f3011",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0a834496",
"metadata": {},
"outputs": [],
"source": [
"path =r\"C:\\Users\\micha\\FAUbox\\WS2526_Fahrsimulator_MSY (Celina Korzer)\\AU_dataset\\output_windowed.parquet\"\n",
"df = pd.read_parquet(path=path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aa4759fa",
"metadata": {},
"outputs": [],
"source": [
"high_nback = df[\n",
" (df[\"STUDY\"]==\"n-back\") &\n",
" (df[\"LEVEL\"].isin([2, 3, 5, 6])) &\n",
" (df[\"PHASE\"].isin([\"train\", \"test\"]))\n",
"]\n",
"high_nback.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a2aa0596",
"metadata": {},
"outputs": [],
"source": [
"low_all = df[\n",
" ((df[\"PHASE\"] == \"baseline\") |\n",
" ((df[\"STUDY\"] == \"n-back\") & (df[\"PHASE\"] != \"baseline\") & (df[\"LEVEL\"].isin([1,4]))))\n",
"]\n",
"print(low_all.shape)\n",
"high_kdrive = df[\n",
" (df[\"STUDY\"] == \"k-drive\") & (df[\"PHASE\"] != \"baseline\")\n",
"]\n",
"print(high_kdrive.shape)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f7d446a1",
"metadata": {},
"outputs": [],
"source": [
"print((df.shape[0]==(high_kdrive.shape[0]+high_nback.shape[0]+low_all.shape[0])))\n",
"print(df.shape[0])\n",
"print((high_kdrive.shape[0]+high_nback.shape[0]+low_all.shape[0]))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "474e144a",
"metadata": {},
"outputs": [],
"source": [
"high_all = pd.concat([high_nback, high_kdrive])\n",
"high_all.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5dd585c2",
"metadata": {},
"outputs": [],
"source": [
"df.dtypes"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0bd39d9f",
"metadata": {},
"outputs": [],
"source": [
"# Get all columns that start with 'AU'\n",
"au_columns = [col for col in low_all.columns if col.startswith('AU')]\n",
"\n",
"# Calculate number of rows and columns for subplots\n",
"n_cols = len(au_columns)\n",
"n_rows = 4\n",
"n_cols_subplot = 5\n",
"\n",
"# Create figure with subplots\n",
"fig, axes = plt.subplots(n_rows, n_cols_subplot, figsize=(20, 16))\n",
"axes = axes.flatten()\n",
"fig.suptitle('Action Unit (AU) Distributions: Low vs High', fontsize=20, fontweight='bold', y=0.995)\n",
"\n",
"# Create histogram for each AU column\n",
"for idx, col in enumerate(au_columns):\n",
" ax = axes[idx]\n",
" \n",
" # Plot overlapping histograms\n",
" ax.hist(low_all[col].dropna(), bins=30, alpha=0.6, color='blue', label='low_all', edgecolor='black')\n",
" ax.hist(high_all[col].dropna(), bins=30, alpha=0.6, color='red', label='high_all', edgecolor='black')\n",
" \n",
" # Set title and labels\n",
" ax.set_title(col, fontsize=10, fontweight='bold')\n",
" ax.set_xlabel('Value', fontsize=8)\n",
" ax.set_ylabel('Frequency', fontsize=8)\n",
" ax.legend(fontsize=8)\n",
" ax.grid(True, alpha=0.3)\n",
"\n",
"# Hide any unused subplots\n",
"for idx in range(len(au_columns), len(axes)):\n",
" axes[idx].set_visible(False)\n",
"\n",
"# Adjust layout\n",
"plt.tight_layout()\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}


@@ -1,157 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "aab6b326-a583-47ad-8bb7-723c2fddcc63",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# %pip install pyocclient\n",
"import yaml\n",
"import owncloud\n",
"import pandas as pd\n",
"import time"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4f42846c-27c3-4394-a40a-e22d73c2902e",
"metadata": {},
"outputs": [],
"source": [
"start = time.time()\n",
"\n",
"with open(\"../login.yaml\") as f:\n",
" cfg = yaml.safe_load(f)\n",
"url, password = cfg[0][\"url\"], cfg[1][\"password\"]\n",
"file = \"adabase-public-0022-v_0_0_2.h5py\"\n",
"oc = owncloud.Client.from_public_link(url, folder_password=password)\n",
"\n",
"\n",
"oc.get_file(file, \"tmp22.h5\")\n",
"\n",
"end = time.time()\n",
"print(end - start)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3714dec2-85d0-4f76-af46-ea45ebec2fa3",
"metadata": {},
"outputs": [],
"source": [
"start = time.time()\n",
"df_performance = pd.read_hdf(\"tmp22.h5\", \"PERFORMANCE\")\n",
"end = time.time()\n",
"print(end - start)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f50e97d0",
"metadata": {},
"outputs": [],
"source": [
"print(22)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c131c816",
"metadata": {},
"outputs": [],
"source": [
"df_performance"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6ae47e52-ad86-4f8d-b929-0080dc99f646",
"metadata": {},
"outputs": [],
"source": [
"start = time.time()\n",
"df_4_col = pd.read_hdf(\"tmp.h5\", \"SIGNALS\", mode=\"r\", columns=[\"STUDY\"], start=0, stop=1)\n",
"end = time.time()\n",
"print(end - start)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7c139f3a-ede8-4530-957d-d1bb939f6cb5",
"metadata": {},
"outputs": [],
"source": [
"df_4_col.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a68d58ea-65f2-46c4-a2b2-8c3447c715d7",
"metadata": {},
"outputs": [],
"source": [
"df_4_col.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "95aa4523-3784-4ab6-bf92-0227ce60e863",
"metadata": {},
"outputs": [],
"source": [
"df_4_col.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "defbcaf4-ad1b-453f-9b48-ab0ecfc4b5d5",
"metadata": {},
"outputs": [],
"source": [
"df_4_col.isna().sum()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "72313895-c478-44a5-9108-00b0bec01bb8",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
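Note: the cfg[0]["url"] / cfg[1]["password"] indexing above implies login.yaml is a two-element list of single-key mappings. A hypothetical equivalent built in Python, with placeholder values (the real file is deliberately not in the repo):

import yaml

cfg = [
    {"url": "https://example.com/index.php/s/PUBLIC_SHARE_TOKEN"},  # placeholder public link
    {"password": "FOLDER_PASSWORD"},                                # placeholder folder password
]
print(yaml.safe_dump(cfg))
url, password = cfg[0]["url"], cfg[1]["password"]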


@@ -1,213 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "8fb02733",
"metadata": {},
"source": [
"Imports"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "96f3b128",
"metadata": {},
"outputs": [],
"source": [
"import yaml\n",
"import owncloud\n",
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "markdown",
"id": "c20cee7c",
"metadata": {},
"source": [
"Connection to Owncloud"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c4c94558",
"metadata": {},
"outputs": [],
"source": [
"# Load credentials\n",
"with open(\"../login.yaml\") as f:\n",
" cfg = yaml.safe_load(f)\n",
" \n",
"url, password = cfg[0][\"url\"], cfg[1][\"password\"]\n",
"\n",
"# Connect once\n",
"oc = owncloud.Client.from_public_link(url, folder_password=password)\n",
"# File pattern\n",
"# base = \"adabase-public-{num:04d}-v_0_0_2.h5py\"\n",
"base = \"{num:04d}-*.h5py\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "07c03d07",
"metadata": {},
"outputs": [],
"source": [
"num_files = 2 # number of files to process (min: 1, max: 30)\n",
"performance_data = []\n",
"\n",
"for i in range(num_files):\n",
" file_pattern = f\"{i:04d}-*\"\n",
" \n",
" # Get list of files matching the pattern\n",
" files = oc.list('.')\n",
" matching_files = [f.get_name() for f in files if f.get_name().startswith(f\"{i:04d}-\")]\n",
" \n",
" if matching_files:\n",
" file_name = matching_files[0] # Take the first matching file\n",
" local_tmp = f\"tmp_{i:04d}.h5\"\n",
" \n",
" oc.get_file(file_name, local_tmp)\n",
" print(f\"{file_name} geöffnet\")\n",
" else:\n",
" print(f\"Keine Datei gefunden für Muster: {file_pattern}\")\n",
" # file_name = base.format(num=i)\n",
" # local_tmp = f\"tmp_{i:04d}.h5\"\n",
"\n",
" # oc.get_file(file_name, local_tmp)\n",
" # print(f\"{file_name} geöffnet\")\n",
"\n",
" # check SIGNALS table for AUs\n",
" with pd.HDFStore(local_tmp, mode=\"r\") as store:\n",
" cols = store.select(\"SIGNALS\", start=0, stop=1).columns\n",
" au_cols = [c for c in cols if c.startswith(\"AU\")]\n",
" if not au_cols:\n",
" print(f\"Subject {i} enthält keine AUs\")\n",
" continue\n",
"\n",
" # load performance table\n",
" with pd.HDFStore(local_tmp, mode=\"r\") as store:\n",
" perf_df = store.select(\"PERFORMANCE\")\n",
"\n",
" f1_cols = [c for c in [\"AUDITIVE F1\", \"VISUAL F1\", \"F1\"] if c in perf_df.columns]\n",
" if not f1_cols:\n",
" print(f\"Subject {i}: keine F1-Spalten gefunden\")\n",
" continue\n",
"\n",
" subject_entry = {\"subjectID\": i}\n",
" valid_scores = []\n",
"\n",
" # iterate rows: each (study, level, phase)\n",
" for _, row in perf_df.iterrows():\n",
" study, level, phase = row[\"STUDY\"], row[\"LEVEL\"], row[\"PHASE\"]\n",
" col_name = f\"STUDY_{study}_LEVEL_{level}_PHASE_{phase}\"\n",
"\n",
" # collect valid F1 values among the three columns\n",
" scores = [row[c] for c in f1_cols if pd.notna(row[c])]\n",
" if scores:\n",
" mean_score = float(np.mean(scores))\n",
" subject_entry[col_name] = mean_score\n",
" valid_scores.extend(scores)\n",
"\n",
" # compute overall average across all valid combinations\n",
" if valid_scores:\n",
" subject_entry[\"overall_score\"] = float(np.mean(valid_scores))\n",
" performance_data.append(subject_entry)\n",
" print(f\"Subject {i}: {len(valid_scores)} gültige Scores, Overall = {subject_entry['overall_score']:.3f}\")\n",
" else:\n",
" print(f\"Subject {i}: keine gültigen F1-Scores\")\n",
"\n",
"# build dataframe\n",
"if performance_data:\n",
" performance_df = pd.DataFrame(performance_data)\n",
" combination_cols = sorted([c for c in performance_df.columns if c.startswith(\"STUDY_\")])\n",
" final_cols = [\"subjectID\", \"overall_score\"] + combination_cols\n",
" performance_df = performance_df[final_cols]\n",
" performance_df.to_csv(\"n_au_performance.csv\", index=False)\n",
"\n",
" print(f\"\\nGesamt Subjects mit Action Units: {len(performance_df)}\")\n",
"else:\n",
" print(\"Keine gültigen Daten gefunden.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0bcaf065",
"metadata": {},
"outputs": [],
"source": [
"performance_df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "db95eea7",
"metadata": {},
"outputs": [],
"source": [
"with pd.HDFStore(\"tmp_0000.h5\", mode=\"r\") as store:\n",
" md = store.select(\"META\")\n",
"print(\"File 0:\")\n",
"print(md)\n",
"with pd.HDFStore(\"tmp_0001.h5\", mode=\"r\") as store:\n",
" md = store.select(\"META\")\n",
"print(\"File 1\")\n",
"print(md)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8067036b",
"metadata": {},
"outputs": [],
"source": [
"pd.set_option('display.max_columns', None)\n",
"pd.set_option('display.max_rows', None)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f18e7385",
"metadata": {},
"outputs": [],
"source": [
"with pd.HDFStore(\"tmp_0000.h5\", mode=\"r\") as store:\n",
" md = store.select(\"SIGNALS\", start=0, stop=1)\n",
"print(\"File 0:\")\n",
"md.head()\n",
"# with pd.HDFStore(\"tmp_0001.h5\", mode=\"r\",start=0, stop=1) as store:\n",
"# md = store.select(\"SIGNALS\")\n",
"# print(\"File 1\")\n",
"# print(md.columns)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
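Note: for reference, a short sketch of reloading the exported summary; column names follow the STUDY_{study}_LEVEL_{level}_PHASE_{phase} pattern built above:

import pandas as pd

# Reload the per-subject F1 summary written by the notebook above.
performance_df = pd.read_csv("n_au_performance.csv")
print(performance_df[["subjectID", "overall_score"]].describe())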

Binary file not shown.


@@ -0,0 +1,11 @@
# from tools import db_helpers
import sys


def main():
    print(sys.version)
    # db_helpers.add_columns_to_table()


if __name__ == "__main__":
    main()


@@ -0,0 +1,211 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "0d70a13f",
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"sys.path.append('/home/edgekit/MSY_FS/fahrsimulator_msy2526_ai/tools')\n",
"import pandas as pd\n",
"from pathlib import Path\n",
"import db_helpers"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ce696366",
"metadata": {},
"outputs": [],
"source": [
"database_path = Path(r\"/home/edgekit/MSY_FS/databases/rawdata.sqlite\")\n",
"parquet_path = Path(r\"/home/edgekit/MSY_FS/fahrsimulator_msy2526_ai/files_for_testing/both_mod_0000.parquet\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b1aa9398",
"metadata": {},
"outputs": [],
"source": [
"dataset = pd.read_parquet(parquet_path)\n",
"dataset.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b183746e",
"metadata": {},
"outputs": [],
"source": [
"dataset.dtypes"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "24ed769d",
"metadata": {},
"outputs": [],
"source": [
"con, cursor = db_helpers.connect_db(database_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e604ed30",
"metadata": {},
"outputs": [],
"source": [
"df_clean = dataset.drop(columns=['subjectID','rowID', 'STUDY', 'LEVEL', 'PHASE'])\n",
"df_first_100 = df_clean.head(200)\n",
"df_first_100 = df_first_100.reset_index(drop=True)\n",
"df_first_100.insert(0, '_Id', df_first_100.index + 1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e77a812e",
"metadata": {},
"outputs": [],
"source": [
"def pandas_to_sqlite_dtype(dtype):\n",
" if pd.api.types.is_integer_dtype(dtype):\n",
" return \"INTEGER\"\n",
" if pd.api.types.is_float_dtype(dtype):\n",
" return \"REAL\"\n",
" if pd.api.types.is_bool_dtype(dtype):\n",
" return \"INTEGER\"\n",
" if pd.api.types.is_datetime64_any_dtype(dtype):\n",
" return \"TEXT\"\n",
" return \"TEXT\"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0e8897b2",
"metadata": {},
"outputs": [],
"source": [
"columns = {\n",
" col: pandas_to_sqlite_dtype(dtype)\n",
" for col, dtype in df_first_100.dtypes.items()\n",
"}\n",
"\n",
"constraints = {\n",
" \"_Id\": [\"NOT NULL\"]\n",
"}\n",
"\n",
"primary_key = {\n",
" \"pk_df_first_100\": [\"_Id\"]\n",
"}\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4ab57624",
"metadata": {},
"outputs": [],
"source": [
"sql = db_helpers.create_table(\n",
" conn=con,\n",
" cursor=cursor,\n",
" table_name=\"rawdata\",\n",
" columns=columns,\n",
" constraints=constraints,\n",
" primary_key=primary_key,\n",
" commit=True\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "25096a7f",
"metadata": {},
"outputs": [],
"source": [
"columns_to_insert = {\n",
" col: df_first_100[col].tolist()\n",
" for col in df_first_100.columns\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7a5a3aa8",
"metadata": {},
"outputs": [],
"source": [
"db_helpers.insert_rows_into_table(\n",
" conn=con,\n",
" cursor=cursor,\n",
" table_name=\"rawdata\",\n",
" columns=columns_to_insert,\n",
" commit=True\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b56beae2",
"metadata": {},
"outputs": [],
"source": [
"a = db_helpers.get_data_from_table(conn=con, table_name='rawdata',columns_list=['*'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a4a74a9d",
"metadata": {},
"outputs": [],
"source": [
"a.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "da0f8737",
"metadata": {},
"outputs": [],
"source": [
"db_helpers.disconnect_db(con, cursor)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "MSY_FS_env",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
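Note: a quick sanity check after the insert could use the aggregation support in db_helpers.get_data_from_table; per its docstring, a single aggregate value is read back with .iloc[0, 0]. A sketch, assuming the connection from above is still open:

# Count the rows that landed in the table; expected: 200.
n_rows = db_helpers.get_data_from_table(
    conn=con,
    table_name="rawdata",
    aggregations={"*": "COUNT"},
).iloc[0, 0]
print(n_rows)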


@@ -0,0 +1,43 @@
# Imports
import pandas as pd
import json
from pathlib import Path
import numpy as np


def getLastEntryFromSQLite():
    return


def callModel(sample):
    prediction: np.int32 = sample  # still unclear whether to load the model each time or e.g. serve it via a Flask API
    return prediction


def getMessageConfig(config_file_path):
    return dict()


def buildMessage(result: np.int32, config: dict):
    # message = json...
    message = 5
    return message


def sendMessage(destination, message):
    return 2


def main():
    config_file_path = Path("")
    config = getMessageConfig(config_file_path=config_file_path)
    sample = getLastEntryFromSQLite()
    prediction = callModel(sample=sample)
    message = buildMessage(result=prediction, config=config)
    sendMessage(config, message)


if __name__ == "__main__":
    main()
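Note: sendMessage is still a stub. A hedged sketch of what it could look like with paho-mqtt (pinned in smallerenv.yaml below); broker host, port, and topic here are placeholders, not project decisions:

import json
import paho.mqtt.client as mqtt

def sendMessage(destination: dict, message: dict) -> int:
    # destination is assumed to carry broker settings, e.g.
    # {"host": "localhost", "port": 1883, "topic": "fahrsimulator/prediction"}
    client = mqtt.Client()
    client.connect(destination["host"], destination["port"])
    info = client.publish(destination["topic"], json.dumps(message))
    client.disconnect()
    return info.rc  # 0 (MQTT_ERR_SUCCESS) on success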

smallerenv.yaml Normal file

@@ -0,0 +1,78 @@
# ============================================================
# SMALLER ENVIRONMENT - corrected & extended
# For the Fahrsimulator project with ML & IoT
# ============================================================
name: smaller_env
channels:
  - conda-forge
  - defaults
dependencies:
  # ====== PYTHON ======
  - python=3.8              # compatible with Jetson Nano
  # ====== CORE DATA SCIENCE ======
  - numpy=1.19.5
  - pandas=1.3.5
  - scipy=1.7.3
  - scikit-learn=1.0.2      # sklearn is an alias
  # ====== VISUALIZATION ======
  # ====== ML/DL SUPPORT ======
  - h5py=3.6.0
  - joblib=1.1.0
  # ====== VIDEO PROCESSING ======
  - moviepy=1.0.3
  # ====== MACHINE LEARNING ======
  - xgboost=1.5.2
  # ====== FILE FORMATS ======
  - pyyaml                  # yaml module
  # ====== IoT & COMMUNICATION (NEW) ======
  - paho-mqtt=1.6.1         # MQTT client
  # ====== DATABASE (NEW) ======
  # sqlite3 is already built into Python!
  # ====== UTILITIES ======
  - tqdm=4.64.1             # progress bars
  - requests=2.28.1         # HTTP requests
  # ====== PIP PACKAGES ======
  - pip
  - pip:
      # TensorFlow (installed separately for Jetson)
      # - tensorflow==2.7.0  # Jetson: install via the NVIDIA repo
      # Eye-tracking analysis
      - pygazeanalyser==0.2.0
      # ML detection (if available on PyPI)
      # - detectors  # check availability
      # - feat       # check availability
      # MQTT additionally via pip if the conda version causes problems
      # - paho-mqtt==1.6.1

# ============================================================
# NOTES:
# ============================================================
#
# 3. TENSORFLOW FOR JETSON:
#    Install separately after creating the environment:
#    pip3 install --extra-index-url https://developer.download.nvidia.com/compute/redist/jp/v46 tensorflow==2.7.0+nv22.1
#
# 4. SQLITE3:
#    Already built into Python, no installation needed!
#    Import: import sqlite3
#
# 5. MQTT:
#    paho-mqtt is the standard MQTT client for Python
#    Recommended brokers: Mosquitto, HiveMQ, EMQX
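Note: a minimal import smoke test for the environment (assuming "conda env create -f smallerenv.yaml" followed by "conda activate smaller_env"):

# Verify the packages the project relies on are importable.
import sqlite3  # built into Python, as noted above
import yaml
import h5py
import xgboost
import paho.mqtt.client as mqtt

print(sqlite3.sqlite_version, xgboost.__version__)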

tools/db_helpers.py Normal file

@@ -0,0 +1,166 @@
import os
import sqlite3
from typing import Optional

import pandas as pd


def connect_db(path_to_file: os.PathLike) -> tuple[sqlite3.Connection, sqlite3.Cursor]:
    ''' Establishes a connection with a sqlite3 database. '''
    conn = sqlite3.connect(path_to_file)
    cursor = conn.cursor()
    return conn, cursor


def disconnect_db(conn: sqlite3.Connection, cursor: sqlite3.Cursor, commit: bool = True) -> None:
    ''' Commits all remaining changes and closes the connection with a sqlite3 database. '''
    cursor.close()
    if commit: conn.commit()  # commit all pending changes made to the sqlite3 database before closing
    conn.close()


def create_table(
    conn: sqlite3.Connection,
    cursor: sqlite3.Cursor,
    table_name: str,
    columns: dict,
    constraints: dict,
    primary_key: dict,
    commit: bool = True
) -> str:
    '''
    Creates a new empty table with the given columns, constraints and primary key.
    :param columns: dict with column names (=keys) and dtypes (=values) (e.g. BIGINT, INT, ...)
    :param constraints: dict with column names (=keys) and list of constraints (=values) (like ['NOT NULL'(,...)])
    :param primary_key: dict with primary key name (=key) and list of attributes which combined define the table's primary key (=values, like ['att1'(,...)])
    '''
    assert len(primary_key.keys()) == 1
    sql = f'CREATE TABLE {table_name} (\n    '
    for column, dtype in columns.items():
        sql += f'{column} {dtype}{" " + " ".join(constraints[column]) if column in constraints.keys() else ""},\n    '
    if list(primary_key.keys())[0]: sql += f'CONSTRAINT {list(primary_key.keys())[0]} '
    sql += f'PRIMARY KEY ({", ".join(list(primary_key.values())[0])})\n)'
    cursor.execute(sql)
    if commit: conn.commit()
    return sql


def add_columns_to_table(
    conn: sqlite3.Connection,
    cursor: sqlite3.Cursor,
    table_name: str,
    columns: dict,
    constraints: dict = dict(),
    commit: bool = True
) -> str:
    ''' Adds one/multiple columns (each with a list of constraints) to the given table. '''
    sql_total = ''
    for column, dtype in columns.items():  # sqlite can only add one column per query
        sql = f'ALTER TABLE {table_name}\n    '
        sql += f'ADD "{column}" {dtype}{" " + " ".join(constraints[column]) if column in constraints.keys() else ""}'
        sql_total += sql + '\n'
        cursor.execute(sql)
    if commit: conn.commit()
    return sql_total


def insert_rows_into_table(
    conn: sqlite3.Connection,
    cursor: sqlite3.Cursor,
    table_name: str,
    columns: dict,
    commit: bool = True
) -> str:
    '''
    Inserts values as multiple rows into the given table.
    :param columns: dict with column names (=keys) and values to insert as lists with at least one element (=values)
    Note: The number of given values per attribute must match the number of rows to insert!
    Note: The values for the rows must be of normal python types (e.g. list, str, int, ...) instead of e.g. numpy arrays!
    '''
    assert len(set(map(len, columns.values()))) == 1, 'ERROR: Provide equal number of values for each column!'
    assert len(set(list(map(type, columns.values())))) == 1 and isinstance(list(columns.values())[0], list), 'ERROR: Provide values as Python lists!'
    assert set([type(a) for b in list(columns.values()) for a in b]).issubset({str, int, float, bool}), 'ERROR: Provide values as basic Python data types!'
    values = list(zip(*columns.values()))
    sql = f'INSERT INTO {table_name} ({", ".join(columns.keys())})\n    VALUES ({("?," * len(values[0]))[:-1]})'
    cursor.executemany(sql, values)
    if commit: conn.commit()
    return sql


def update_multiple_rows_in_table(
    conn: sqlite3.Connection,
    cursor: sqlite3.Cursor,
    table_name: str,
    new_vals: dict,
    conditions: str,
    commit: bool = True
) -> str:
    '''
    Updates attribute values of some rows in the given table.
    :param new_vals: dict with column names (=keys) and the new values to set (=values)
    :param conditions: string which defines all concatenated conditions (e.g. 'cond1 AND (cond2 OR cond3)' with cond1: att1=5, ...)
    '''
    assignments = ', '.join([f'{k}={v}' for k, v in zip(new_vals.keys(), new_vals.values())])
    sql = f'UPDATE {table_name}\n    SET {assignments}\n    WHERE {conditions}'
    cursor.execute(sql)
    if commit: conn.commit()
    return sql


def delete_rows_from_table(
    conn: sqlite3.Connection,
    cursor: sqlite3.Cursor,
    table_name: str,
    conditions: str,
    commit: bool = True
) -> str:
    '''
    Deletes rows from the given table.
    :param conditions: string which defines all concatenated conditions (e.g. 'cond1 AND (cond2 OR cond3)' with cond1: att1=5, ...)
    '''
    sql = f'DELETE FROM {table_name} WHERE {conditions}'
    cursor.execute(sql)
    if commit: conn.commit()
    return sql


def get_data_from_table(
    conn: sqlite3.Connection,
    table_name: str,
    columns_list: list = ['*'],
    aggregations: Optional[dict] = None,
    where_conditions: Optional[str] = None,
    order_by: Optional[dict] = None,
    limit: Optional[int] = None,
    offset: Optional[int] = None
) -> pd.DataFrame:
    '''
    Helper function which returns (if desired: aggregated) contents from the given table as a pandas DataFrame. The rows can be filtered by providing the condition as a string.
    :param columns_list: use if no aggregation is needed to select which columns to get from the table
    :param (optional) aggregations: use to apply aggregations on the data from the table; dictionary with column(s) as key(s) and aggregation(s) as corresponding value(s) (e.g. {'col1': 'MIN', 'col2': 'AVG', ...} or {'*': 'COUNT'})
    :param (optional) where_conditions: string which defines all concatenated conditions (e.g. 'cond1 AND (cond2 OR cond3)' with cond1: att1=5, ...) applied on table.
    :param (optional) order_by: dict defining the ordering of the outputs with column(s) as key(s) and ordering as corresponding value(s) (e.g. {'col1': 'ASC'})
    :param (optional) limit: use to limit the number of returned rows
    :param (optional) offset: use to skip the first n rows before displaying
    Note: If aggregations is set, the columns_list is ignored.
    Note: Get all data as a DataFrame with get_data_from_table(conn, table_name).
    Note: If one output is wanted (e.g. count(*) or similar), get it with get_data_from_table(...).iloc[0,0] from the DataFrame.
    '''
    assert columns_list or aggregations
    if aggregations:
        selection = [f'{agg}({col})' for col, agg in aggregations.items()]
    else:
        selection = columns_list
    selection = ", ".join(selection)
    where_conditions = 'WHERE ' + where_conditions if where_conditions else ''
    order_by = 'ORDER BY ' + ', '.join([f'{k} {v}' for k, v in order_by.items()]) if order_by else ''
    limit = f'LIMIT {limit}' if limit else ''
    offset = f'OFFSET {offset}' if offset else ''
    sql = f'SELECT {selection} FROM {table_name} {where_conditions} {order_by} {limit} {offset}'
    return pd.read_sql_query(sql, conn)
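Note: for reference, a hypothetical usage sketch of the filtering, ordering, and update helpers above (database path, table, and column names are placeholders):

import db_helpers

con, cursor = db_helpers.connect_db("example.sqlite")

# Filtered, ordered read: where_conditions and order_by are passed through
# verbatim into the generated SELECT statement.
df = db_helpers.get_data_from_table(
    conn=con,
    table_name="rawdata",
    columns_list=["_Id", "FACE_AU01"],
    where_conditions="_Id <= 50",
    order_by={"_Id": "DESC"},
    limit=10,
)

# Update one row; note that new_vals values are interpolated directly into
# the SQL string, so string values would need their own quoting.
db_helpers.update_multiple_rows_in_table(
    conn=con,
    cursor=cursor,
    table_name="rawdata",
    new_vals={"FACE_AU01": 0.0},
    conditions="_Id = 1",
)

db_helpers.disconnect_db(con, cursor)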