diff --git a/EDA/owncloud.ipynb b/EDA/owncloud.ipynb
index 8f6772a..32aa864 100644
--- a/EDA/owncloud.ipynb
+++ b/EDA/owncloud.ipynb
@@ -2,14 +2,14 @@
"cells": [
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
"id": "aab6b326-a583-47ad-8bb7-723c2fddcc63",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
- "%pip install pyocclient\n",
+ "# %pip install pyocclient\n",
"import yaml\n",
"import owncloud\n",
"import pandas as pd\n",
@@ -18,21 +18,29 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 2,
"id": "4f42846c-27c3-4394-a40a-e22d73c2902e",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "143.946026802063\n"
+ ]
+ }
+ ],
"source": [
"start = time.time()\n",
"\n",
- "with open(\"login.yaml\") as f:\n",
+ "with open(\"../login.yaml\") as f:\n",
" cfg = yaml.safe_load(f)\n",
"url, password = cfg[0][\"url\"], cfg[1][\"password\"]\n",
- "file = \"adabase-public-0003-v_0_0_2.h5py\"\n",
+ "file = \"adabase-public-0022-v_0_0_2.h5py\"\n",
"oc = owncloud.Client.from_public_link(url, folder_password=password)\n",
"\n",
"\n",
- "oc.get_file(file, \"tmp.h5\")\n",
+ "oc.get_file(file, \"tmp22.h5\")\n",
"\n",
"end = time.time()\n",
"print(end - start)\n"
@@ -40,66 +48,457 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 3,
"id": "3714dec2-85d0-4f76-af46-ea45ebec2fa3",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.5121121406555176\n"
+ ]
+ }
+ ],
"source": [
"start = time.time()\n",
- "df_performance = pd.read_hdf(\"tmp.h5\", \"PERFORMANCE\")\n",
+ "df_performance = pd.read_hdf(\"tmp22.h5\", \"PERFORMANCE\")\n",
"end = time.time()\n",
"print(end - start)"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 4,
+ "id": "f50e97d0",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "22\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(22)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "c131c816",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " STUDY | \n",
+ " PHASE | \n",
+ " LEVEL | \n",
+ " AUDITIVE F1 | \n",
+ " AUDITIVE MEAN REACTION TIME | \n",
+ " AUDITIVE PRECISION | \n",
+ " AUDITIVE RECALL | \n",
+ " VISUAL F1 | \n",
+ " VISUAL MEAN REACTION TIME | \n",
+ " VISUAL PRECISION | \n",
+ " VISUAL RECALL | \n",
+ " F1 | \n",
+ " PRECISION | \n",
+ " REACTION TIME | \n",
+ " RECALL | \n",
+ " SONGS RECALL | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 6 | \n",
+ " n-back | \n",
+ " test | \n",
+ " 01 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1.000000 | \n",
+ " 0.428068 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " n-back | \n",
+ " test | \n",
+ " 02 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.928571 | \n",
+ " 0.626869 | \n",
+ " 1.000000 | \n",
+ " 0.866667 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " n-back | \n",
+ " test | \n",
+ " 03 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.640000 | \n",
+ " 0.828912 | \n",
+ " 0.727273 | \n",
+ " 0.571429 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " n-back | \n",
+ " test | \n",
+ " 04 | \n",
+ " 1.000000 | \n",
+ " 1.309286 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 0.942916 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " n-back | \n",
+ " test | \n",
+ " 05 | \n",
+ " 0.782609 | \n",
+ " 1.316484 | \n",
+ " 0.818182 | \n",
+ " 0.750000 | \n",
+ " 0.814815 | \n",
+ " 1.151405 | \n",
+ " 0.916667 | \n",
+ " 0.733333 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 11 | \n",
+ " n-back | \n",
+ " test | \n",
+ " 06 | \n",
+ " 0.363636 | \n",
+ " 1.703583 | \n",
+ " 0.500000 | \n",
+ " 0.285714 | \n",
+ " 0.476190 | \n",
+ " 1.530054 | \n",
+ " 0.714286 | \n",
+ " 0.357143 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " k-drive | \n",
+ " test | \n",
+ " 01 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 0.446914 | \n",
+ " 1.000000 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " k-drive | \n",
+ " test | \n",
+ " 02 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.914286 | \n",
+ " 0.914286 | \n",
+ " 0.702571 | \n",
+ " 0.914286 | \n",
+ " 0.454545 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " k-drive | \n",
+ " test | \n",
+ " 03 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.786325 | \n",
+ " 0.938776 | \n",
+ " 1.175797 | \n",
+ " 0.676471 | \n",
+ " 0.347826 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " STUDY PHASE LEVEL AUDITIVE F1 AUDITIVE MEAN REACTION TIME \\\n",
+ "6 n-back test 01 NaN NaN \n",
+ "7 n-back test 02 NaN NaN \n",
+ "8 n-back test 03 NaN NaN \n",
+ "9 n-back test 04 1.000000 1.309286 \n",
+ "10 n-back test 05 0.782609 1.316484 \n",
+ "11 n-back test 06 0.363636 1.703583 \n",
+ "3 k-drive test 01 NaN NaN \n",
+ "4 k-drive test 02 NaN NaN \n",
+ "5 k-drive test 03 NaN NaN \n",
+ "\n",
+ " AUDITIVE PRECISION AUDITIVE RECALL VISUAL F1 VISUAL MEAN REACTION TIME \\\n",
+ "6 NaN NaN 1.000000 0.428068 \n",
+ "7 NaN NaN 0.928571 0.626869 \n",
+ "8 NaN NaN 0.640000 0.828912 \n",
+ "9 1.000000 1.000000 1.000000 0.942916 \n",
+ "10 0.818182 0.750000 0.814815 1.151405 \n",
+ "11 0.500000 0.285714 0.476190 1.530054 \n",
+ "3 NaN NaN NaN NaN \n",
+ "4 NaN NaN NaN NaN \n",
+ "5 NaN NaN NaN NaN \n",
+ "\n",
+ " VISUAL PRECISION VISUAL RECALL F1 PRECISION REACTION TIME \\\n",
+ "6 1.000000 1.000000 NaN NaN NaN \n",
+ "7 1.000000 0.866667 NaN NaN NaN \n",
+ "8 0.727273 0.571429 NaN NaN NaN \n",
+ "9 1.000000 1.000000 NaN NaN NaN \n",
+ "10 0.916667 0.733333 NaN NaN NaN \n",
+ "11 0.714286 0.357143 NaN NaN NaN \n",
+ "3 NaN NaN 1.000000 1.000000 0.446914 \n",
+ "4 NaN NaN 0.914286 0.914286 0.702571 \n",
+ "5 NaN NaN 0.786325 0.938776 1.175797 \n",
+ "\n",
+ " RECALL SONGS RECALL \n",
+ "6 NaN NaN \n",
+ "7 NaN NaN \n",
+ "8 NaN NaN \n",
+ "9 NaN NaN \n",
+ "10 NaN NaN \n",
+ "11 NaN NaN \n",
+ "3 1.000000 NaN \n",
+ "4 0.914286 0.454545 \n",
+ "5 0.676471 0.347826 "
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_performance"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
"id": "6ae47e52-ad86-4f8d-b929-0080dc99f646",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.05357074737548828\n"
+ ]
+ }
+ ],
"source": [
"start = time.time()\n",
- "df_4_col = pd.read_hdf(\"tmp.h5\", \"SIGNALS\", mode=\"r\", columns=[\"STUDY\",\"LEVEL\", \"PHASE\", ''])\n",
+ "df_4_col = pd.read_hdf(\"tmp22.h5\", \"SIGNALS\", mode=\"r\", columns=[\"STUDY\"], start=0, stop=1)  # read the file the download cell saved (tmp22.h5), not a stale tmp.h5\n",
"end = time.time()\n",
"print(end - start)"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 7,
"id": "7c139f3a-ede8-4530-957d-d1bb939f6cb5",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " STUDY | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " n/a | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " STUDY\n",
+ "0 n/a"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"df_4_col.head()"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 8,
"id": "a68d58ea-65f2-46c4-a2b2-8c3447c715d7",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1, 1)"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"df_4_col.shape"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 9,
"id": "95aa4523-3784-4ab6-bf92-0227ce60e863",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Index: 1 entries, 0 to 0\n",
+ "Data columns (total 1 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 STUDY 1 non-null object\n",
+ "dtypes: object(1)\n",
+ "memory usage: 16.0+ bytes\n"
+ ]
+ }
+ ],
"source": [
"df_4_col.info()"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 10,
"id": "defbcaf4-ad1b-453f-9b48-ab0ecfc4b5d5",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "STUDY 0\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"df_4_col.isna().sum()"
]
@@ -115,7 +514,7 @@
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3 (ipykernel)",
+ "display_name": "base",
"language": "python",
"name": "python3"
},
@@ -129,7 +528,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.12.10"
+ "version": "3.11.5"
}
},
"nbformat": 4,
diff --git a/dataset_creation/create_feature_table.py b/dataset_creation/create_feature_table.py
index 53baa33..7fac133 100644
--- a/dataset_creation/create_feature_table.py
+++ b/dataset_creation/create_feature_table.py
@@ -94,8 +94,9 @@ def process_parquet_files(input_dir, output_file, window_size=1250, step_size=12
# Beispiel-Verwendung
if __name__ == "__main__":
# Anpassen an deine Pfade
- input_directory = ""
- output_file = "./output/output_windowed.parquet"
+ import os  # local import: only the script entry point needs it, to allow per-machine path overrides
+ input_directory = os.environ.get("AU_PARQUET_DIR", "ParquetFiles")  # default: folder written by create_parquet_files.py
+ output_file = os.environ.get("AU_FEATURE_TABLE", "./output/output_windowed.parquet")  # default: previous repo-relative location
result = process_parquet_files(
diff --git a/dataset_creation/create_parquet_files.py b/dataset_creation/create_parquet_files.py
index 5fb5b6a..1a2fb7f 100644
--- a/dataset_creation/create_parquet_files.py
+++ b/dataset_creation/create_parquet_files.py
@@ -1,14 +1,16 @@
-# %pip install pyocclient
+# pip install pyocclient
import yaml
import owncloud
import pandas as pd
import h5py
+import os
+print(os.getcwd())
num_files = 30 # number of files to process (min: 1, max: 30)
# Load credentials
with open("login.yaml") as f:
cfg = yaml.safe_load(f)
- print("ahahahah")
+ print("yaml geladen")
url, password = cfg[0]["url"], cfg[1]["password"]
# Connect once
@@ -18,6 +20,7 @@ print("connection aufgebaut")
base = "adabase-public-{num:04d}-v_0_0_2.h5py"
for i in range(num_files):
+ print(f"Subject {i} gestartet")
file_name = base.format(num=i)
local_tmp = f"tmp_{i:04d}.h5"
@@ -35,7 +38,9 @@ for i in range(num_files):
# Step 2: Filter columns that start with "AU"
au_cols = [c for c in cols if c.startswith("AU")]
print(au_cols)
-
+ if len(au_cols)==0:
+ print(f"keine AU Signale in Subject {i}")
+ continue
# Step 3: Read only those columns (plus any others you want)
df = pd.read_hdf(local_tmp, key="SIGNALS", columns=["STUDY", "LEVEL", "PHASE"] + au_cols)
@@ -58,7 +63,8 @@ for i in range(num_files):
# Save to parquet
- out_name = f"cleaned_{i:04d}.parquet"
+ os.makedirs("ParquetFiles", exist_ok=True)
+ out_name = f"ParquetFiles/cleaned_{i:04d}.parquet"
df.to_parquet(out_name, index=False)
print(f"Processed {file_name} -> {out_name}")
diff --git a/dataset_creation/open_parquet_test.ipynb b/dataset_creation/open_parquet_test.ipynb
index 2159704..309686f 100644
--- a/dataset_creation/open_parquet_test.ipynb
+++ b/dataset_creation/open_parquet_test.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
"id": "2b3fface",
"metadata": {},
"outputs": [],
@@ -12,66 +12,650 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 2,
"id": "74f1f5ec",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(7320, 25)\n"
+ ]
+ }
+ ],
"source": [
- "df= pd.read_parquet(\"cleaned_0000.parquet\")\n",
+ "df= pd.read_parquet(r\"C:\\Users\\micha\\FAUbox\\WS2526_Fahrsimulator_MSY (Celina Korzer)\\AU_dataset\\output_windowed.parquet\")\n",
"print(df.shape)\n",
"\n"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 3,
"id": "05775454",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " subjectID | \n",
+ " start_time | \n",
+ " STUDY | \n",
+ " LEVEL | \n",
+ " PHASE | \n",
+ " AU01_sum | \n",
+ " AU02_sum | \n",
+ " AU04_sum | \n",
+ " AU05_sum | \n",
+ " AU06_sum | \n",
+ " ... | \n",
+ " AU14_sum | \n",
+ " AU15_sum | \n",
+ " AU17_sum | \n",
+ " AU20_sum | \n",
+ " AU23_sum | \n",
+ " AU24_sum | \n",
+ " AU25_sum | \n",
+ " AU26_sum | \n",
+ " AU28_sum | \n",
+ " AU43_sum | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0 | \n",
+ " 192000 | \n",
+ " k-drive | \n",
+ " 1 | \n",
+ " baseline | \n",
+ " 441.0 | \n",
+ " 354.0 | \n",
+ " 3.0 | \n",
+ " 81.0 | \n",
+ " 29.0 | \n",
+ " ... | \n",
+ " 302.0 | \n",
+ " 511.0 | \n",
+ " 653.0 | \n",
+ " 65.0 | \n",
+ " 798.0 | \n",
+ " 1096.0 | \n",
+ " 84.0 | \n",
+ " 230.0 | \n",
+ " 114.0 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 0 | \n",
+ " 197120 | \n",
+ " k-drive | \n",
+ " 1 | \n",
+ " baseline | \n",
+ " 459.0 | \n",
+ " 357.0 | \n",
+ " 4.0 | \n",
+ " 71.0 | \n",
+ " 22.0 | \n",
+ " ... | \n",
+ " 222.0 | \n",
+ " 549.0 | \n",
+ " 683.0 | \n",
+ " 54.0 | \n",
+ " 810.0 | \n",
+ " 1093.0 | \n",
+ " 86.0 | \n",
+ " 247.0 | \n",
+ " 108.0 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 0 | \n",
+ " 202120 | \n",
+ " k-drive | \n",
+ " 1 | \n",
+ " baseline | \n",
+ " 487.0 | \n",
+ " 342.0 | \n",
+ " 5.0 | \n",
+ " 70.0 | \n",
+ " 18.0 | \n",
+ " ... | \n",
+ " 141.0 | \n",
+ " 558.0 | \n",
+ " 710.0 | \n",
+ " 27.0 | \n",
+ " 828.0 | \n",
+ " 1092.0 | \n",
+ " 86.0 | \n",
+ " 257.0 | \n",
+ " 95.0 | \n",
+ " 3.0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 0 | \n",
+ " 207120 | \n",
+ " k-drive | \n",
+ " 1 | \n",
+ " baseline | \n",
+ " 545.0 | \n",
+ " 374.0 | \n",
+ " 4.0 | \n",
+ " 70.0 | \n",
+ " 13.0 | \n",
+ " ... | \n",
+ " 84.0 | \n",
+ " 594.0 | \n",
+ " 742.0 | \n",
+ " 13.0 | \n",
+ " 858.0 | \n",
+ " 1091.0 | \n",
+ " 97.0 | \n",
+ " 279.0 | \n",
+ " 99.0 | \n",
+ " 2.0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 0 | \n",
+ " 212120 | \n",
+ " k-drive | \n",
+ " 1 | \n",
+ " baseline | \n",
+ " 571.0 | \n",
+ " 375.0 | \n",
+ " 7.0 | \n",
+ " 68.0 | \n",
+ " 10.0 | \n",
+ " ... | \n",
+ " 80.0 | \n",
+ " 547.0 | \n",
+ " 735.0 | \n",
+ " 12.0 | \n",
+ " 894.0 | \n",
+ " 1138.0 | \n",
+ " 69.0 | \n",
+ " 245.0 | \n",
+ " 98.0 | \n",
+ " 8.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 25 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " subjectID start_time STUDY LEVEL PHASE AU01_sum AU02_sum \\\n",
+ "0 0 192000 k-drive 1 baseline 441.0 354.0 \n",
+ "1 0 197120 k-drive 1 baseline 459.0 357.0 \n",
+ "2 0 202120 k-drive 1 baseline 487.0 342.0 \n",
+ "3 0 207120 k-drive 1 baseline 545.0 374.0 \n",
+ "4 0 212120 k-drive 1 baseline 571.0 375.0 \n",
+ "\n",
+ " AU04_sum AU05_sum AU06_sum ... AU14_sum AU15_sum AU17_sum AU20_sum \\\n",
+ "0 3.0 81.0 29.0 ... 302.0 511.0 653.0 65.0 \n",
+ "1 4.0 71.0 22.0 ... 222.0 549.0 683.0 54.0 \n",
+ "2 5.0 70.0 18.0 ... 141.0 558.0 710.0 27.0 \n",
+ "3 4.0 70.0 13.0 ... 84.0 594.0 742.0 13.0 \n",
+ "4 7.0 68.0 10.0 ... 80.0 547.0 735.0 12.0 \n",
+ "\n",
+ " AU23_sum AU24_sum AU25_sum AU26_sum AU28_sum AU43_sum \n",
+ "0 798.0 1096.0 84.0 230.0 114.0 5.0 \n",
+ "1 810.0 1093.0 86.0 247.0 108.0 5.0 \n",
+ "2 828.0 1092.0 86.0 257.0 95.0 3.0 \n",
+ "3 858.0 1091.0 97.0 279.0 99.0 2.0 \n",
+ "4 894.0 1138.0 69.0 245.0 98.0 8.0 \n",
+ "\n",
+ "[5 rows x 25 columns]"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 4,
"id": "99e17328",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " subjectID | \n",
+ " start_time | \n",
+ " STUDY | \n",
+ " LEVEL | \n",
+ " PHASE | \n",
+ " AU01_sum | \n",
+ " AU02_sum | \n",
+ " AU04_sum | \n",
+ " AU05_sum | \n",
+ " AU06_sum | \n",
+ " ... | \n",
+ " AU14_sum | \n",
+ " AU15_sum | \n",
+ " AU17_sum | \n",
+ " AU20_sum | \n",
+ " AU23_sum | \n",
+ " AU24_sum | \n",
+ " AU25_sum | \n",
+ " AU26_sum | \n",
+ " AU28_sum | \n",
+ " AU43_sum | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 7315 | \n",
+ " 29 | \n",
+ " 7142440 | \n",
+ " n-back | \n",
+ " 6 | \n",
+ " test | \n",
+ " 14.0 | \n",
+ " 15.0 | \n",
+ " 388.0 | \n",
+ " 0.0 | \n",
+ " 83.0 | \n",
+ " ... | \n",
+ " 191.0 | \n",
+ " 697.0 | \n",
+ " 584.0 | \n",
+ " 15.0 | \n",
+ " 81.0 | \n",
+ " 319.0 | \n",
+ " 421.0 | \n",
+ " 247.0 | \n",
+ " 88.0 | \n",
+ " 35.0 | \n",
+ "
\n",
+ " \n",
+ " | 7316 | \n",
+ " 29 | \n",
+ " 7147440 | \n",
+ " n-back | \n",
+ " 6 | \n",
+ " test | \n",
+ " 17.0 | \n",
+ " 11.0 | \n",
+ " 407.0 | \n",
+ " 0.0 | \n",
+ " 86.0 | \n",
+ " ... | \n",
+ " 191.0 | \n",
+ " 693.0 | \n",
+ " 594.0 | \n",
+ " 14.0 | \n",
+ " 73.0 | \n",
+ " 312.0 | \n",
+ " 414.0 | \n",
+ " 242.0 | \n",
+ " 83.0 | \n",
+ " 40.0 | \n",
+ "
\n",
+ " \n",
+ " | 7317 | \n",
+ " 29 | \n",
+ " 7152440 | \n",
+ " n-back | \n",
+ " 6 | \n",
+ " test | \n",
+ " 14.0 | \n",
+ " 9.0 | \n",
+ " 409.0 | \n",
+ " 0.0 | \n",
+ " 87.0 | \n",
+ " ... | \n",
+ " 187.0 | \n",
+ " 703.0 | \n",
+ " 597.0 | \n",
+ " 14.0 | \n",
+ " 64.0 | \n",
+ " 314.0 | \n",
+ " 411.0 | \n",
+ " 248.0 | \n",
+ " 98.0 | \n",
+ " 38.0 | \n",
+ "
\n",
+ " \n",
+ " | 7318 | \n",
+ " 29 | \n",
+ " 7157440 | \n",
+ " n-back | \n",
+ " 6 | \n",
+ " test | \n",
+ " 14.0 | \n",
+ " 9.0 | \n",
+ " 417.0 | \n",
+ " 0.0 | \n",
+ " 94.0 | \n",
+ " ... | \n",
+ " 169.0 | \n",
+ " 711.0 | \n",
+ " 603.0 | \n",
+ " 15.0 | \n",
+ " 63.0 | \n",
+ " 327.0 | \n",
+ " 398.0 | \n",
+ " 245.0 | \n",
+ " 100.0 | \n",
+ " 35.0 | \n",
+ "
\n",
+ " \n",
+ " | 7319 | \n",
+ " 29 | \n",
+ " 7162440 | \n",
+ " n-back | \n",
+ " 6 | \n",
+ " test | \n",
+ " 13.0 | \n",
+ " 9.0 | \n",
+ " 436.0 | \n",
+ " 0.0 | \n",
+ " 100.0 | \n",
+ " ... | \n",
+ " 178.0 | \n",
+ " 720.0 | \n",
+ " 621.0 | \n",
+ " 17.0 | \n",
+ " 65.0 | \n",
+ " 337.0 | \n",
+ " 377.0 | \n",
+ " 246.0 | \n",
+ " 101.0 | \n",
+ " 31.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 25 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " subjectID start_time STUDY LEVEL PHASE AU01_sum AU02_sum \\\n",
+ "7315 29 7142440 n-back 6 test 14.0 15.0 \n",
+ "7316 29 7147440 n-back 6 test 17.0 11.0 \n",
+ "7317 29 7152440 n-back 6 test 14.0 9.0 \n",
+ "7318 29 7157440 n-back 6 test 14.0 9.0 \n",
+ "7319 29 7162440 n-back 6 test 13.0 9.0 \n",
+ "\n",
+ " AU04_sum AU05_sum AU06_sum ... AU14_sum AU15_sum AU17_sum \\\n",
+ "7315 388.0 0.0 83.0 ... 191.0 697.0 584.0 \n",
+ "7316 407.0 0.0 86.0 ... 191.0 693.0 594.0 \n",
+ "7317 409.0 0.0 87.0 ... 187.0 703.0 597.0 \n",
+ "7318 417.0 0.0 94.0 ... 169.0 711.0 603.0 \n",
+ "7319 436.0 0.0 100.0 ... 178.0 720.0 621.0 \n",
+ "\n",
+ " AU20_sum AU23_sum AU24_sum AU25_sum AU26_sum AU28_sum AU43_sum \n",
+ "7315 15.0 81.0 319.0 421.0 247.0 88.0 35.0 \n",
+ "7316 14.0 73.0 312.0 414.0 242.0 83.0 40.0 \n",
+ "7317 14.0 64.0 314.0 411.0 248.0 98.0 38.0 \n",
+ "7318 15.0 63.0 327.0 398.0 245.0 100.0 35.0 \n",
+ "7319 17.0 65.0 337.0 377.0 246.0 101.0 31.0 \n",
+ "\n",
+ "[5 rows x 25 columns]"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"df.tail()"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "0238d802",
+ "execution_count": 5,
+ "id": "69e53731",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 7320 entries, 0 to 7319\n",
+ "Data columns (total 25 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 subjectID 7320 non-null int64 \n",
+ " 1 start_time 7320 non-null int64 \n",
+ " 2 STUDY 7320 non-null object \n",
+ " 3 LEVEL 7320 non-null int8 \n",
+ " 4 PHASE 7320 non-null object \n",
+ " 5 AU01_sum 7320 non-null float64\n",
+ " 6 AU02_sum 7320 non-null float64\n",
+ " 7 AU04_sum 7320 non-null float64\n",
+ " 8 AU05_sum 7320 non-null float64\n",
+ " 9 AU06_sum 7320 non-null float64\n",
+ " 10 AU07_sum 7320 non-null float64\n",
+ " 11 AU09_sum 7320 non-null float64\n",
+ " 12 AU10_sum 7320 non-null float64\n",
+ " 13 AU11_sum 7320 non-null float64\n",
+ " 14 AU12_sum 7320 non-null float64\n",
+ " 15 AU14_sum 7320 non-null float64\n",
+ " 16 AU15_sum 7320 non-null float64\n",
+ " 17 AU17_sum 7320 non-null float64\n",
+ " 18 AU20_sum 7320 non-null float64\n",
+ " 19 AU23_sum 7320 non-null float64\n",
+ " 20 AU24_sum 7320 non-null float64\n",
+ " 21 AU25_sum 7320 non-null float64\n",
+ " 22 AU26_sum 7320 non-null float64\n",
+ " 23 AU28_sum 7320 non-null float64\n",
+ " 24 AU43_sum 7320 non-null float64\n",
+ "dtypes: float64(20), int64(2), int8(1), object(2)\n",
+ "memory usage: 1.3+ MB\n"
+ ]
+ }
+ ],
"source": [
- "step2 = pd.read_parquet(\"output_windowed.parquet\")\n",
- "step2.head()"
+ "df.info()"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "1257c535",
- "metadata": {},
- "outputs": [],
- "source": [
- "step2.shape"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
+ "execution_count": 11,
"id": "3754c664",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "STUDY PHASE LEVEL\n",
+ "k-drive train 1 155\n",
+ " 3 156\n",
+ " 2 162\n",
+ " baseline 3 248\n",
+ "n-back baseline 2 252\n",
+ " test 5 255\n",
+ " 6 256\n",
+ " 1 258\n",
+ " 4 258\n",
+ " 2 260\n",
+ " 3 260\n",
+ "k-drive baseline 2 267\n",
+ " 1 896\n",
+ "n-back baseline 1 901\n",
+ "k-drive test 1 911\n",
+ " 2 912\n",
+ " 3 913\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Zeigt alle Kombinationen mit Häufigkeit\n",
- "step2[['STUDY', 'LEVEL', 'PHASE']].value_counts()"
+ "df[['STUDY', 'PHASE', 'LEVEL']].value_counts(ascending=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "id": "f83b595c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1031, 25)"
+ ]
+ },
+ "execution_count": 39,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "high_nback = df[\n",
+ " (df[\"STUDY\"]==\"n-back\") &\n",
+ " (df[\"LEVEL\"].isin([2, 3, 5, 6])) &\n",
+ " (df[\"PHASE\"].isin([\"train\", \"test\"]))\n",
+ "]\n",
+ "high_nback.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "id": "c0940343",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(3080, 25)\n",
+ "(3209, 25)\n"
+ ]
+ }
+ ],
+ "source": [
+ "low_all = df[\n",
+ " ((df[\"PHASE\"] == \"baseline\") |\n",
+ " ((df[\"STUDY\"] == \"n-back\") & (df[\"PHASE\"] != \"baseline\") & (df[\"LEVEL\"].isin([1,4]))))\n",
+ "]\n",
+ "print(low_all.shape)\n",
+ "high_kdrive = df[\n",
+ " (df[\"STUDY\"] == \"k-drive\") & (df[\"PHASE\"] != \"baseline\")\n",
+ "]\n",
+ "print(high_kdrive.shape)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "id": "f7ce38d3",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "True\n",
+ "7320\n",
+ "7320\n"
+ ]
+ }
+ ],
+ "source": [
+ "print((df.shape[0]==(high_kdrive.shape[0]+high_nback.shape[0]+low_all.shape[0])))\n",
+ "print(df.shape[0])\n",
+ "print((high_kdrive.shape[0]+high_nback.shape[0]+low_all.shape[0]))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "id": "48ba0379",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(4240, 25)"
+ ]
+ },
+ "execution_count": 45,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "high_all = pd.concat([high_nback, high_kdrive])\n",
+ "high_all.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "id": "77dda26c",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Gesamt: 7320==7320\n",
+ "Anzahl an low load Samples: 3080\n",
+ "Anzahl an high load Samples: 4240\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(f\"Gesamt: {df.shape[0]}=={low_all.shape[0]+high_all.shape[0]}\")\n",
+ "print(f\"Anzahl an low load Samples: {low_all.shape[0]}\")\n",
+ "print(f\"Anzahl an high load Samples: {high_all.shape[0]}\")\n"
]
}
],