added 2-class separation in "open parquet-test"

This commit is contained in:
Michael Weig 2025-11-02 20:55:01 +01:00
parent 25aa03398a
commit a6a0dd3ac5
4 changed files with 1045 additions and 55 deletions

View File

@@ -2,14 +2,14 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"id": "aab6b326-a583-47ad-8bb7-723c2fddcc63",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"%pip install pyocclient\n",
"# %pip install pyocclient\n",
"import yaml\n",
"import owncloud\n",
"import pandas as pd\n",
@@ -18,21 +18,29 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"id": "4f42846c-27c3-4394-a40a-e22d73c2902e",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"143.946026802063\n"
]
}
],
"source": [
"start = time.time()\n",
"\n",
"with open(\"login.yaml\") as f:\n",
"with open(\"../login.yaml\") as f:\n",
" cfg = yaml.safe_load(f)\n",
"url, password = cfg[0][\"url\"], cfg[1][\"password\"]\n",
"file = \"adabase-public-0003-v_0_0_2.h5py\"\n",
"file = \"adabase-public-0022-v_0_0_2.h5py\"\n",
"oc = owncloud.Client.from_public_link(url, folder_password=password)\n",
"\n",
"\n",
"oc.get_file(file, \"tmp.h5\")\n",
"oc.get_file(file, \"tmp22.h5\")\n",
"\n",
"end = time.time()\n",
"print(end - start)\n"
@@ -40,66 +48,457 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"id": "3714dec2-85d0-4f76-af46-ea45ebec2fa3",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.5121121406555176\n"
]
}
],
"source": [
"start = time.time()\n",
"df_performance = pd.read_hdf(\"tmp.h5\", \"PERFORMANCE\")\n",
"df_performance = pd.read_hdf(\"tmp22.h5\", \"PERFORMANCE\")\n",
"end = time.time()\n",
"print(end - start)"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"id": "f50e97d0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"22\n"
]
}
],
"source": [
"print(22)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "c131c816",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>STUDY</th>\n",
" <th>PHASE</th>\n",
" <th>LEVEL</th>\n",
" <th>AUDITIVE F1</th>\n",
" <th>AUDITIVE MEAN REACTION TIME</th>\n",
" <th>AUDITIVE PRECISION</th>\n",
" <th>AUDITIVE RECALL</th>\n",
" <th>VISUAL F1</th>\n",
" <th>VISUAL MEAN REACTION TIME</th>\n",
" <th>VISUAL PRECISION</th>\n",
" <th>VISUAL RECALL</th>\n",
" <th>F1</th>\n",
" <th>PRECISION</th>\n",
" <th>REACTION TIME</th>\n",
" <th>RECALL</th>\n",
" <th>SONGS RECALL</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>n-back</td>\n",
" <td>test</td>\n",
" <td>01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.000000</td>\n",
" <td>0.428068</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>n-back</td>\n",
" <td>test</td>\n",
" <td>02</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.928571</td>\n",
" <td>0.626869</td>\n",
" <td>1.000000</td>\n",
" <td>0.866667</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>n-back</td>\n",
" <td>test</td>\n",
" <td>03</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.640000</td>\n",
" <td>0.828912</td>\n",
" <td>0.727273</td>\n",
" <td>0.571429</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>n-back</td>\n",
" <td>test</td>\n",
" <td>04</td>\n",
" <td>1.000000</td>\n",
" <td>1.309286</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>0.942916</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>n-back</td>\n",
" <td>test</td>\n",
" <td>05</td>\n",
" <td>0.782609</td>\n",
" <td>1.316484</td>\n",
" <td>0.818182</td>\n",
" <td>0.750000</td>\n",
" <td>0.814815</td>\n",
" <td>1.151405</td>\n",
" <td>0.916667</td>\n",
" <td>0.733333</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>n-back</td>\n",
" <td>test</td>\n",
" <td>06</td>\n",
" <td>0.363636</td>\n",
" <td>1.703583</td>\n",
" <td>0.500000</td>\n",
" <td>0.285714</td>\n",
" <td>0.476190</td>\n",
" <td>1.530054</td>\n",
" <td>0.714286</td>\n",
" <td>0.357143</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>k-drive</td>\n",
" <td>test</td>\n",
" <td>01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>0.446914</td>\n",
" <td>1.000000</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>k-drive</td>\n",
" <td>test</td>\n",
" <td>02</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.914286</td>\n",
" <td>0.914286</td>\n",
" <td>0.702571</td>\n",
" <td>0.914286</td>\n",
" <td>0.454545</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>k-drive</td>\n",
" <td>test</td>\n",
" <td>03</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.786325</td>\n",
" <td>0.938776</td>\n",
" <td>1.175797</td>\n",
" <td>0.676471</td>\n",
" <td>0.347826</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" STUDY PHASE LEVEL AUDITIVE F1 AUDITIVE MEAN REACTION TIME \\\n",
"6 n-back test 01 NaN NaN \n",
"7 n-back test 02 NaN NaN \n",
"8 n-back test 03 NaN NaN \n",
"9 n-back test 04 1.000000 1.309286 \n",
"10 n-back test 05 0.782609 1.316484 \n",
"11 n-back test 06 0.363636 1.703583 \n",
"3 k-drive test 01 NaN NaN \n",
"4 k-drive test 02 NaN NaN \n",
"5 k-drive test 03 NaN NaN \n",
"\n",
" AUDITIVE PRECISION AUDITIVE RECALL VISUAL F1 VISUAL MEAN REACTION TIME \\\n",
"6 NaN NaN 1.000000 0.428068 \n",
"7 NaN NaN 0.928571 0.626869 \n",
"8 NaN NaN 0.640000 0.828912 \n",
"9 1.000000 1.000000 1.000000 0.942916 \n",
"10 0.818182 0.750000 0.814815 1.151405 \n",
"11 0.500000 0.285714 0.476190 1.530054 \n",
"3 NaN NaN NaN NaN \n",
"4 NaN NaN NaN NaN \n",
"5 NaN NaN NaN NaN \n",
"\n",
" VISUAL PRECISION VISUAL RECALL F1 PRECISION REACTION TIME \\\n",
"6 1.000000 1.000000 NaN NaN NaN \n",
"7 1.000000 0.866667 NaN NaN NaN \n",
"8 0.727273 0.571429 NaN NaN NaN \n",
"9 1.000000 1.000000 NaN NaN NaN \n",
"10 0.916667 0.733333 NaN NaN NaN \n",
"11 0.714286 0.357143 NaN NaN NaN \n",
"3 NaN NaN 1.000000 1.000000 0.446914 \n",
"4 NaN NaN 0.914286 0.914286 0.702571 \n",
"5 NaN NaN 0.786325 0.938776 1.175797 \n",
"\n",
" RECALL SONGS RECALL \n",
"6 NaN NaN \n",
"7 NaN NaN \n",
"8 NaN NaN \n",
"9 NaN NaN \n",
"10 NaN NaN \n",
"11 NaN NaN \n",
"3 1.000000 NaN \n",
"4 0.914286 0.454545 \n",
"5 0.676471 0.347826 "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_performance"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "6ae47e52-ad86-4f8d-b929-0080dc99f646",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.05357074737548828\n"
]
}
],
"source": [
"start = time.time()\n",
"df_4_col = pd.read_hdf(\"tmp.h5\", \"SIGNALS\", mode=\"r\", columns=[\"STUDY\",\"LEVEL\", \"PHASE\", ''])\n",
"df_4_col = pd.read_hdf(\"tmp.h5\", \"SIGNALS\", mode=\"r\", columns=[\"STUDY\"], start=0, stop=1)\n",
"end = time.time()\n",
"print(end - start)"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 7,
"id": "7c139f3a-ede8-4530-957d-d1bb939f6cb5",
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>STUDY</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>n/a</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" STUDY\n",
"0 n/a"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_4_col.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"id": "a68d58ea-65f2-46c4-a2b2-8c3447c715d7",
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"(1, 1)"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_4_col.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 9,
"id": "95aa4523-3784-4ab6-bf92-0227ce60e863",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 1 entries, 0 to 0\n",
"Data columns (total 1 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 STUDY 1 non-null object\n",
"dtypes: object(1)\n",
"memory usage: 16.0+ bytes\n"
]
}
],
"source": [
"df_4_col.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 10,
"id": "defbcaf4-ad1b-453f-9b48-ab0ecfc4b5d5",
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"STUDY 0\n",
"dtype: int64"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_4_col.isna().sum()"
]
@@ -115,7 +514,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "base",
"language": "python",
"name": "python3"
},
@@ -129,7 +528,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.10"
"version": "3.11.5"
}
},
"nbformat": 4,

View File

@@ -94,8 +94,9 @@ def process_parquet_files(input_dir, output_file, window_size=1250, step_size=12
# Beispiel-Verwendung
if __name__ == "__main__":
# Anpassen an deine Pfade
input_directory = ""
output_file = "./output/output_windowed.parquet"
input_directory = r"C:\Users\x\FAUbox\WS2526_Fahrsimulator_MSY (Celina Korzer)\ParquetFiles_AU"
output_file = r"C:\Users\x\FAUbox\WS2526_Fahrsimulator_MSY (Celina Korzer)\AU_dataset\xxoutput_windowed.parquet"
result = process_parquet_files(

View File

@@ -1,14 +1,16 @@
# %pip install pyocclient
# pip install pyocclient
import yaml
import owncloud
import pandas as pd
import h5py
import os
print(os.getcwd())
num_files = 30 # number of files to process (min: 1, max: 30)
# Load credentials
with open("login.yaml") as f:
cfg = yaml.safe_load(f)
print("ahahahah")
print("yaml geladen")
url, password = cfg[0]["url"], cfg[1]["password"]
# Connect once
@@ -18,6 +20,7 @@ print("connection aufgebaut")
base = "adabase-public-{num:04d}-v_0_0_2.h5py"
for i in range(num_files):
print(f"Subject {i} gestartet")
file_name = base.format(num=i)
local_tmp = f"tmp_{i:04d}.h5"
@@ -35,7 +38,9 @@
# Step 2: Filter columns that start with "AU"
au_cols = [c for c in cols if c.startswith("AU")]
print(au_cols)
if len(au_cols)==0:
print(f"keine AU Signale in Subject {i}")
continue
# Step 3: Read only those columns (plus any others you want)
df = pd.read_hdf(local_tmp, key="SIGNALS", columns=["STUDY", "LEVEL", "PHASE"] + au_cols)
@@ -58,7 +63,8 @@
# Save to parquet
out_name = f"cleaned_{i:04d}.parquet"
os.makedirs("ParquetFiles", exist_ok=True)
out_name = f"ParquetFiles/cleaned_{i:04d}.parquet"
df.to_parquet(out_name, index=False)
print(f"Processed {file_name} -> {out_name}")

View File

@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"id": "2b3fface",
"metadata": {},
"outputs": [],
@@ -12,66 +12,650 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"id": "74f1f5ec",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(7320, 25)\n"
]
}
],
"source": [
"df= pd.read_parquet(\"cleaned_0000.parquet\")\n",
"df= pd.read_parquet(r\"C:\\Users\\micha\\FAUbox\\WS2526_Fahrsimulator_MSY (Celina Korzer)\\AU_dataset\\output_windowed.parquet\")\n",
"print(df.shape)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"id": "05775454",
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>subjectID</th>\n",
" <th>start_time</th>\n",
" <th>STUDY</th>\n",
" <th>LEVEL</th>\n",
" <th>PHASE</th>\n",
" <th>AU01_sum</th>\n",
" <th>AU02_sum</th>\n",
" <th>AU04_sum</th>\n",
" <th>AU05_sum</th>\n",
" <th>AU06_sum</th>\n",
" <th>...</th>\n",
" <th>AU14_sum</th>\n",
" <th>AU15_sum</th>\n",
" <th>AU17_sum</th>\n",
" <th>AU20_sum</th>\n",
" <th>AU23_sum</th>\n",
" <th>AU24_sum</th>\n",
" <th>AU25_sum</th>\n",
" <th>AU26_sum</th>\n",
" <th>AU28_sum</th>\n",
" <th>AU43_sum</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>192000</td>\n",
" <td>k-drive</td>\n",
" <td>1</td>\n",
" <td>baseline</td>\n",
" <td>441.0</td>\n",
" <td>354.0</td>\n",
" <td>3.0</td>\n",
" <td>81.0</td>\n",
" <td>29.0</td>\n",
" <td>...</td>\n",
" <td>302.0</td>\n",
" <td>511.0</td>\n",
" <td>653.0</td>\n",
" <td>65.0</td>\n",
" <td>798.0</td>\n",
" <td>1096.0</td>\n",
" <td>84.0</td>\n",
" <td>230.0</td>\n",
" <td>114.0</td>\n",
" <td>5.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>197120</td>\n",
" <td>k-drive</td>\n",
" <td>1</td>\n",
" <td>baseline</td>\n",
" <td>459.0</td>\n",
" <td>357.0</td>\n",
" <td>4.0</td>\n",
" <td>71.0</td>\n",
" <td>22.0</td>\n",
" <td>...</td>\n",
" <td>222.0</td>\n",
" <td>549.0</td>\n",
" <td>683.0</td>\n",
" <td>54.0</td>\n",
" <td>810.0</td>\n",
" <td>1093.0</td>\n",
" <td>86.0</td>\n",
" <td>247.0</td>\n",
" <td>108.0</td>\n",
" <td>5.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>202120</td>\n",
" <td>k-drive</td>\n",
" <td>1</td>\n",
" <td>baseline</td>\n",
" <td>487.0</td>\n",
" <td>342.0</td>\n",
" <td>5.0</td>\n",
" <td>70.0</td>\n",
" <td>18.0</td>\n",
" <td>...</td>\n",
" <td>141.0</td>\n",
" <td>558.0</td>\n",
" <td>710.0</td>\n",
" <td>27.0</td>\n",
" <td>828.0</td>\n",
" <td>1092.0</td>\n",
" <td>86.0</td>\n",
" <td>257.0</td>\n",
" <td>95.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>207120</td>\n",
" <td>k-drive</td>\n",
" <td>1</td>\n",
" <td>baseline</td>\n",
" <td>545.0</td>\n",
" <td>374.0</td>\n",
" <td>4.0</td>\n",
" <td>70.0</td>\n",
" <td>13.0</td>\n",
" <td>...</td>\n",
" <td>84.0</td>\n",
" <td>594.0</td>\n",
" <td>742.0</td>\n",
" <td>13.0</td>\n",
" <td>858.0</td>\n",
" <td>1091.0</td>\n",
" <td>97.0</td>\n",
" <td>279.0</td>\n",
" <td>99.0</td>\n",
" <td>2.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>212120</td>\n",
" <td>k-drive</td>\n",
" <td>1</td>\n",
" <td>baseline</td>\n",
" <td>571.0</td>\n",
" <td>375.0</td>\n",
" <td>7.0</td>\n",
" <td>68.0</td>\n",
" <td>10.0</td>\n",
" <td>...</td>\n",
" <td>80.0</td>\n",
" <td>547.0</td>\n",
" <td>735.0</td>\n",
" <td>12.0</td>\n",
" <td>894.0</td>\n",
" <td>1138.0</td>\n",
" <td>69.0</td>\n",
" <td>245.0</td>\n",
" <td>98.0</td>\n",
" <td>8.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 25 columns</p>\n",
"</div>"
],
"text/plain": [
" subjectID start_time STUDY LEVEL PHASE AU01_sum AU02_sum \\\n",
"0 0 192000 k-drive 1 baseline 441.0 354.0 \n",
"1 0 197120 k-drive 1 baseline 459.0 357.0 \n",
"2 0 202120 k-drive 1 baseline 487.0 342.0 \n",
"3 0 207120 k-drive 1 baseline 545.0 374.0 \n",
"4 0 212120 k-drive 1 baseline 571.0 375.0 \n",
"\n",
" AU04_sum AU05_sum AU06_sum ... AU14_sum AU15_sum AU17_sum AU20_sum \\\n",
"0 3.0 81.0 29.0 ... 302.0 511.0 653.0 65.0 \n",
"1 4.0 71.0 22.0 ... 222.0 549.0 683.0 54.0 \n",
"2 5.0 70.0 18.0 ... 141.0 558.0 710.0 27.0 \n",
"3 4.0 70.0 13.0 ... 84.0 594.0 742.0 13.0 \n",
"4 7.0 68.0 10.0 ... 80.0 547.0 735.0 12.0 \n",
"\n",
" AU23_sum AU24_sum AU25_sum AU26_sum AU28_sum AU43_sum \n",
"0 798.0 1096.0 84.0 230.0 114.0 5.0 \n",
"1 810.0 1093.0 86.0 247.0 108.0 5.0 \n",
"2 828.0 1092.0 86.0 257.0 95.0 3.0 \n",
"3 858.0 1091.0 97.0 279.0 99.0 2.0 \n",
"4 894.0 1138.0 69.0 245.0 98.0 8.0 \n",
"\n",
"[5 rows x 25 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"id": "99e17328",
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>subjectID</th>\n",
" <th>start_time</th>\n",
" <th>STUDY</th>\n",
" <th>LEVEL</th>\n",
" <th>PHASE</th>\n",
" <th>AU01_sum</th>\n",
" <th>AU02_sum</th>\n",
" <th>AU04_sum</th>\n",
" <th>AU05_sum</th>\n",
" <th>AU06_sum</th>\n",
" <th>...</th>\n",
" <th>AU14_sum</th>\n",
" <th>AU15_sum</th>\n",
" <th>AU17_sum</th>\n",
" <th>AU20_sum</th>\n",
" <th>AU23_sum</th>\n",
" <th>AU24_sum</th>\n",
" <th>AU25_sum</th>\n",
" <th>AU26_sum</th>\n",
" <th>AU28_sum</th>\n",
" <th>AU43_sum</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>7315</th>\n",
" <td>29</td>\n",
" <td>7142440</td>\n",
" <td>n-back</td>\n",
" <td>6</td>\n",
" <td>test</td>\n",
" <td>14.0</td>\n",
" <td>15.0</td>\n",
" <td>388.0</td>\n",
" <td>0.0</td>\n",
" <td>83.0</td>\n",
" <td>...</td>\n",
" <td>191.0</td>\n",
" <td>697.0</td>\n",
" <td>584.0</td>\n",
" <td>15.0</td>\n",
" <td>81.0</td>\n",
" <td>319.0</td>\n",
" <td>421.0</td>\n",
" <td>247.0</td>\n",
" <td>88.0</td>\n",
" <td>35.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7316</th>\n",
" <td>29</td>\n",
" <td>7147440</td>\n",
" <td>n-back</td>\n",
" <td>6</td>\n",
" <td>test</td>\n",
" <td>17.0</td>\n",
" <td>11.0</td>\n",
" <td>407.0</td>\n",
" <td>0.0</td>\n",
" <td>86.0</td>\n",
" <td>...</td>\n",
" <td>191.0</td>\n",
" <td>693.0</td>\n",
" <td>594.0</td>\n",
" <td>14.0</td>\n",
" <td>73.0</td>\n",
" <td>312.0</td>\n",
" <td>414.0</td>\n",
" <td>242.0</td>\n",
" <td>83.0</td>\n",
" <td>40.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7317</th>\n",
" <td>29</td>\n",
" <td>7152440</td>\n",
" <td>n-back</td>\n",
" <td>6</td>\n",
" <td>test</td>\n",
" <td>14.0</td>\n",
" <td>9.0</td>\n",
" <td>409.0</td>\n",
" <td>0.0</td>\n",
" <td>87.0</td>\n",
" <td>...</td>\n",
" <td>187.0</td>\n",
" <td>703.0</td>\n",
" <td>597.0</td>\n",
" <td>14.0</td>\n",
" <td>64.0</td>\n",
" <td>314.0</td>\n",
" <td>411.0</td>\n",
" <td>248.0</td>\n",
" <td>98.0</td>\n",
" <td>38.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7318</th>\n",
" <td>29</td>\n",
" <td>7157440</td>\n",
" <td>n-back</td>\n",
" <td>6</td>\n",
" <td>test</td>\n",
" <td>14.0</td>\n",
" <td>9.0</td>\n",
" <td>417.0</td>\n",
" <td>0.0</td>\n",
" <td>94.0</td>\n",
" <td>...</td>\n",
" <td>169.0</td>\n",
" <td>711.0</td>\n",
" <td>603.0</td>\n",
" <td>15.0</td>\n",
" <td>63.0</td>\n",
" <td>327.0</td>\n",
" <td>398.0</td>\n",
" <td>245.0</td>\n",
" <td>100.0</td>\n",
" <td>35.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7319</th>\n",
" <td>29</td>\n",
" <td>7162440</td>\n",
" <td>n-back</td>\n",
" <td>6</td>\n",
" <td>test</td>\n",
" <td>13.0</td>\n",
" <td>9.0</td>\n",
" <td>436.0</td>\n",
" <td>0.0</td>\n",
" <td>100.0</td>\n",
" <td>...</td>\n",
" <td>178.0</td>\n",
" <td>720.0</td>\n",
" <td>621.0</td>\n",
" <td>17.0</td>\n",
" <td>65.0</td>\n",
" <td>337.0</td>\n",
" <td>377.0</td>\n",
" <td>246.0</td>\n",
" <td>101.0</td>\n",
" <td>31.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 25 columns</p>\n",
"</div>"
],
"text/plain": [
" subjectID start_time STUDY LEVEL PHASE AU01_sum AU02_sum \\\n",
"7315 29 7142440 n-back 6 test 14.0 15.0 \n",
"7316 29 7147440 n-back 6 test 17.0 11.0 \n",
"7317 29 7152440 n-back 6 test 14.0 9.0 \n",
"7318 29 7157440 n-back 6 test 14.0 9.0 \n",
"7319 29 7162440 n-back 6 test 13.0 9.0 \n",
"\n",
" AU04_sum AU05_sum AU06_sum ... AU14_sum AU15_sum AU17_sum \\\n",
"7315 388.0 0.0 83.0 ... 191.0 697.0 584.0 \n",
"7316 407.0 0.0 86.0 ... 191.0 693.0 594.0 \n",
"7317 409.0 0.0 87.0 ... 187.0 703.0 597.0 \n",
"7318 417.0 0.0 94.0 ... 169.0 711.0 603.0 \n",
"7319 436.0 0.0 100.0 ... 178.0 720.0 621.0 \n",
"\n",
" AU20_sum AU23_sum AU24_sum AU25_sum AU26_sum AU28_sum AU43_sum \n",
"7315 15.0 81.0 319.0 421.0 247.0 88.0 35.0 \n",
"7316 14.0 73.0 312.0 414.0 242.0 83.0 40.0 \n",
"7317 14.0 64.0 314.0 411.0 248.0 98.0 38.0 \n",
"7318 15.0 63.0 327.0 398.0 245.0 100.0 35.0 \n",
"7319 17.0 65.0 337.0 377.0 246.0 101.0 31.0 \n",
"\n",
"[5 rows x 25 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.tail()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0238d802",
"execution_count": 5,
"id": "69e53731",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 7320 entries, 0 to 7319\n",
"Data columns (total 25 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 subjectID 7320 non-null int64 \n",
" 1 start_time 7320 non-null int64 \n",
" 2 STUDY 7320 non-null object \n",
" 3 LEVEL 7320 non-null int8 \n",
" 4 PHASE 7320 non-null object \n",
" 5 AU01_sum 7320 non-null float64\n",
" 6 AU02_sum 7320 non-null float64\n",
" 7 AU04_sum 7320 non-null float64\n",
" 8 AU05_sum 7320 non-null float64\n",
" 9 AU06_sum 7320 non-null float64\n",
" 10 AU07_sum 7320 non-null float64\n",
" 11 AU09_sum 7320 non-null float64\n",
" 12 AU10_sum 7320 non-null float64\n",
" 13 AU11_sum 7320 non-null float64\n",
" 14 AU12_sum 7320 non-null float64\n",
" 15 AU14_sum 7320 non-null float64\n",
" 16 AU15_sum 7320 non-null float64\n",
" 17 AU17_sum 7320 non-null float64\n",
" 18 AU20_sum 7320 non-null float64\n",
" 19 AU23_sum 7320 non-null float64\n",
" 20 AU24_sum 7320 non-null float64\n",
" 21 AU25_sum 7320 non-null float64\n",
" 22 AU26_sum 7320 non-null float64\n",
" 23 AU28_sum 7320 non-null float64\n",
" 24 AU43_sum 7320 non-null float64\n",
"dtypes: float64(20), int64(2), int8(1), object(2)\n",
"memory usage: 1.3+ MB\n"
]
}
],
"source": [
"step2 = pd.read_parquet(\"output_windowed.parquet\")\n",
"step2.head()"
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1257c535",
"metadata": {},
"outputs": [],
"source": [
"step2.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 11,
"id": "3754c664",
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"STUDY PHASE LEVEL\n",
"k-drive train 1 155\n",
" 3 156\n",
" 2 162\n",
" baseline 3 248\n",
"n-back baseline 2 252\n",
" test 5 255\n",
" 6 256\n",
" 1 258\n",
" 4 258\n",
" 2 260\n",
" 3 260\n",
"k-drive baseline 2 267\n",
" 1 896\n",
"n-back baseline 1 901\n",
"k-drive test 1 911\n",
" 2 912\n",
" 3 913\n",
"Name: count, dtype: int64"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Zeigt alle Kombinationen mit Häufigkeit\n",
"step2[['STUDY', 'LEVEL', 'PHASE']].value_counts()"
"df[['STUDY', 'PHASE', 'LEVEL']].value_counts(ascending=True)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "f83b595c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(1031, 25)"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"high_nback = df[\n",
" (df[\"STUDY\"]==\"n-back\") &\n",
" (df[\"LEVEL\"].isin([2, 3, 5, 6])) &\n",
" (df[\"PHASE\"].isin([\"train\", \"test\"]))\n",
"]\n",
"high_nback.shape"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "c0940343",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(3080, 25)\n",
"(3209, 25)\n"
]
}
],
"source": [
"low_all = df[\n",
" ((df[\"PHASE\"] == \"baseline\") |\n",
" ((df[\"STUDY\"] == \"n-back\") & (df[\"PHASE\"] != \"baseline\") & (df[\"LEVEL\"].isin([1,4]))))\n",
"]\n",
"print(low_all.shape)\n",
"high_kdrive = df[\n",
" (df[\"STUDY\"] == \"k-drive\") & (df[\"PHASE\"] != \"baseline\")\n",
"]\n",
"print(high_kdrive.shape)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "f7ce38d3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"True\n",
"7320\n",
"7320\n"
]
}
],
"source": [
"print((df.shape[0]==(high_kdrive.shape[0]+high_nback.shape[0]+low_all.shape[0])))\n",
"print(df.shape[0])\n",
"print((high_kdrive.shape[0]+high_nback.shape[0]+low_all.shape[0]))"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "48ba0379",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(4240, 25)"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"high_all = pd.concat([high_nback, high_kdrive])\n",
"high_all.shape"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "77dda26c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Gesamt: 7320==7320\n",
"Anzahl an low load Samples: 3080\n",
"Anzahl an high load Samples: 4240\n"
]
}
],
"source": [
"print(f\"Gesamt: {df.shape[0]}=={low_all.shape[0]+high_all.shape[0]}\")\n",
"print(f\"Anzahl an low load Samples: {low_all.shape[0]}\")\n",
"print(f\"Anzahl an high load Samples: {high_all.shape[0]}\")\n"
]
}
],