added 2-class separation in "open parquet-test"

This commit is contained in:
Michael Weig 2025-11-02 20:55:01 +01:00
parent 25aa03398a
commit a6a0dd3ac5
4 changed files with 1045 additions and 55 deletions

View File

@@ -2,14 +2,14 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"id": "aab6b326-a583-47ad-8bb7-723c2fddcc63",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"%pip install pyocclient\n",
"# %pip install pyocclient\n",
"import yaml\n",
"import owncloud\n",
"import pandas as pd\n",
@@ -18,21 +18,29 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"id": "4f42846c-27c3-4394-a40a-e22d73c2902e",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"143.946026802063\n"
]
}
],
"source": [
"start = time.time()\n",
"\n",
"with open(\"login.yaml\") as f:\n",
"with open(\"../login.yaml\") as f:\n",
" cfg = yaml.safe_load(f)\n",
"url, password = cfg[0][\"url\"], cfg[1][\"password\"]\n",
"file = \"adabase-public-0003-v_0_0_2.h5py\"\n",
"file = \"adabase-public-0022-v_0_0_2.h5py\"\n",
"oc = owncloud.Client.from_public_link(url, folder_password=password)\n",
"\n",
"\n",
"oc.get_file(file, \"tmp.h5\")\n",
"oc.get_file(file, \"tmp22.h5\")\n",
"\n",
"end = time.time()\n",
"print(end - start)\n"
@@ -40,66 +48,457 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"id": "3714dec2-85d0-4f76-af46-ea45ebec2fa3",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.5121121406555176\n"
]
}
],
"source": [
"start = time.time()\n",
"df_performance = pd.read_hdf(\"tmp.h5\", \"PERFORMANCE\")\n",
"df_performance = pd.read_hdf(\"tmp22.h5\", \"PERFORMANCE\")\n",
"end = time.time()\n",
"print(end - start)"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"id": "f50e97d0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"22\n"
]
}
],
"source": [
"print(22)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "c131c816",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>STUDY</th>\n",
" <th>PHASE</th>\n",
" <th>LEVEL</th>\n",
" <th>AUDITIVE F1</th>\n",
" <th>AUDITIVE MEAN REACTION TIME</th>\n",
" <th>AUDITIVE PRECISION</th>\n",
" <th>AUDITIVE RECALL</th>\n",
" <th>VISUAL F1</th>\n",
" <th>VISUAL MEAN REACTION TIME</th>\n",
" <th>VISUAL PRECISION</th>\n",
" <th>VISUAL RECALL</th>\n",
" <th>F1</th>\n",
" <th>PRECISION</th>\n",
" <th>REACTION TIME</th>\n",
" <th>RECALL</th>\n",
" <th>SONGS RECALL</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>n-back</td>\n",
" <td>test</td>\n",
" <td>01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.000000</td>\n",
" <td>0.428068</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>n-back</td>\n",
" <td>test</td>\n",
" <td>02</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.928571</td>\n",
" <td>0.626869</td>\n",
" <td>1.000000</td>\n",
" <td>0.866667</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>n-back</td>\n",
" <td>test</td>\n",
" <td>03</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.640000</td>\n",
" <td>0.828912</td>\n",
" <td>0.727273</td>\n",
" <td>0.571429</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>n-back</td>\n",
" <td>test</td>\n",
" <td>04</td>\n",
" <td>1.000000</td>\n",
" <td>1.309286</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>0.942916</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>n-back</td>\n",
" <td>test</td>\n",
" <td>05</td>\n",
" <td>0.782609</td>\n",
" <td>1.316484</td>\n",
" <td>0.818182</td>\n",
" <td>0.750000</td>\n",
" <td>0.814815</td>\n",
" <td>1.151405</td>\n",
" <td>0.916667</td>\n",
" <td>0.733333</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>n-back</td>\n",
" <td>test</td>\n",
" <td>06</td>\n",
" <td>0.363636</td>\n",
" <td>1.703583</td>\n",
" <td>0.500000</td>\n",
" <td>0.285714</td>\n",
" <td>0.476190</td>\n",
" <td>1.530054</td>\n",
" <td>0.714286</td>\n",
" <td>0.357143</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>k-drive</td>\n",
" <td>test</td>\n",
" <td>01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>0.446914</td>\n",
" <td>1.000000</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>k-drive</td>\n",
" <td>test</td>\n",
" <td>02</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.914286</td>\n",
" <td>0.914286</td>\n",
" <td>0.702571</td>\n",
" <td>0.914286</td>\n",
" <td>0.454545</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>k-drive</td>\n",
" <td>test</td>\n",
" <td>03</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.786325</td>\n",
" <td>0.938776</td>\n",
" <td>1.175797</td>\n",
" <td>0.676471</td>\n",
" <td>0.347826</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" STUDY PHASE LEVEL AUDITIVE F1 AUDITIVE MEAN REACTION TIME \\\n",
"6 n-back test 01 NaN NaN \n",
"7 n-back test 02 NaN NaN \n",
"8 n-back test 03 NaN NaN \n",
"9 n-back test 04 1.000000 1.309286 \n",
"10 n-back test 05 0.782609 1.316484 \n",
"11 n-back test 06 0.363636 1.703583 \n",
"3 k-drive test 01 NaN NaN \n",
"4 k-drive test 02 NaN NaN \n",
"5 k-drive test 03 NaN NaN \n",
"\n",
" AUDITIVE PRECISION AUDITIVE RECALL VISUAL F1 VISUAL MEAN REACTION TIME \\\n",
"6 NaN NaN 1.000000 0.428068 \n",
"7 NaN NaN 0.928571 0.626869 \n",
"8 NaN NaN 0.640000 0.828912 \n",
"9 1.000000 1.000000 1.000000 0.942916 \n",
"10 0.818182 0.750000 0.814815 1.151405 \n",
"11 0.500000 0.285714 0.476190 1.530054 \n",
"3 NaN NaN NaN NaN \n",
"4 NaN NaN NaN NaN \n",
"5 NaN NaN NaN NaN \n",
"\n",
" VISUAL PRECISION VISUAL RECALL F1 PRECISION REACTION TIME \\\n",
"6 1.000000 1.000000 NaN NaN NaN \n",
"7 1.000000 0.866667 NaN NaN NaN \n",
"8 0.727273 0.571429 NaN NaN NaN \n",
"9 1.000000 1.000000 NaN NaN NaN \n",
"10 0.916667 0.733333 NaN NaN NaN \n",
"11 0.714286 0.357143 NaN NaN NaN \n",
"3 NaN NaN 1.000000 1.000000 0.446914 \n",
"4 NaN NaN 0.914286 0.914286 0.702571 \n",
"5 NaN NaN 0.786325 0.938776 1.175797 \n",
"\n",
" RECALL SONGS RECALL \n",
"6 NaN NaN \n",
"7 NaN NaN \n",
"8 NaN NaN \n",
"9 NaN NaN \n",
"10 NaN NaN \n",
"11 NaN NaN \n",
"3 1.000000 NaN \n",
"4 0.914286 0.454545 \n",
"5 0.676471 0.347826 "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_performance"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "6ae47e52-ad86-4f8d-b929-0080dc99f646",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.05357074737548828\n"
]
}
],
"source": [
"start = time.time()\n",
"df_4_col = pd.read_hdf(\"tmp.h5\", \"SIGNALS\", mode=\"r\", columns=[\"STUDY\",\"LEVEL\", \"PHASE\", ''])\n",
"df_4_col = pd.read_hdf(\"tmp.h5\", \"SIGNALS\", mode=\"r\", columns=[\"STUDY\"], start=0, stop=1)\n",
"end = time.time()\n",
"print(end - start)"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 7,
"id": "7c139f3a-ede8-4530-957d-d1bb939f6cb5",
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>STUDY</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>n/a</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" STUDY\n",
"0 n/a"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_4_col.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"id": "a68d58ea-65f2-46c4-a2b2-8c3447c715d7",
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"(1, 1)"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_4_col.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 9,
"id": "95aa4523-3784-4ab6-bf92-0227ce60e863",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 1 entries, 0 to 0\n",
"Data columns (total 1 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 STUDY 1 non-null object\n",
"dtypes: object(1)\n",
"memory usage: 16.0+ bytes\n"
]
}
],
"source": [
"df_4_col.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 10,
"id": "defbcaf4-ad1b-453f-9b48-ab0ecfc4b5d5",
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"STUDY 0\n",
"dtype: int64"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_4_col.isna().sum()"
]
@@ -115,7 +514,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "base",
"language": "python",
"name": "python3"
},
@@ -129,7 +528,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.10"
"version": "3.11.5"
}
},
"nbformat": 4,

View File

@@ -94,8 +94,9 @@ def process_parquet_files(input_dir, output_file, window_size=1250, step_size=12
# Beispiel-Verwendung
if __name__ == "__main__":
# Anpassen an deine Pfade
input_directory = ""
output_file = "./output/output_windowed.parquet"
input_directory = r"C:\Users\x\FAUbox\WS2526_Fahrsimulator_MSY (Celina Korzer)\ParquetFiles_AU"
output_file = r"C:\Users\x\FAUbox\WS2526_Fahrsimulator_MSY (Celina Korzer)\AU_dataset\xxoutput_windowed.parquet"
result = process_parquet_files(

View File

@@ -1,14 +1,16 @@
# %pip install pyocclient
# pip install pyocclient
import yaml
import owncloud
import pandas as pd
import h5py
import os
print(os.getcwd())
num_files = 30 # number of files to process (min: 1, max: 30)
# Load credentials
with open("login.yaml") as f:
cfg = yaml.safe_load(f)
print("ahahahah")
print("yaml geladen")
url, password = cfg[0]["url"], cfg[1]["password"]
# Connect once
@@ -18,6 +20,7 @@ print("connection aufgebaut")
base = "adabase-public-{num:04d}-v_0_0_2.h5py"
for i in range(num_files):
print(f"Subject {i} gestartet")
file_name = base.format(num=i)
local_tmp = f"tmp_{i:04d}.h5"
@@ -35,7 +38,9 @@
# Step 2: Filter columns that start with "AU"
au_cols = [c for c in cols if c.startswith("AU")]
print(au_cols)
if len(au_cols)==0:
print(f"keine AU Signale in Subject {i}")
continue
# Step 3: Read only those columns (plus any others you want)
df = pd.read_hdf(local_tmp, key="SIGNALS", columns=["STUDY", "LEVEL", "PHASE"] + au_cols)
@@ -58,7 +63,8 @@
# Save to parquet
out_name = f"cleaned_{i:04d}.parquet"
os.makedirs("ParquetFiles", exist_ok=True)
out_name = f"ParquetFiles/cleaned_{i:04d}.parquet"
df.to_parquet(out_name, index=False)
print(f"Processed {file_name} -> {out_name}")

View File

@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"id": "2b3fface",
"metadata": {},
"outputs": [],
@@ -12,66 +12,650 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"id": "74f1f5ec",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(7320, 25)\n"
]
}
],
"source": [
"df= pd.read_parquet(\"cleaned_0000.parquet\")\n",
"df= pd.read_parquet(r\"C:\\Users\\micha\\FAUbox\\WS2526_Fahrsimulator_MSY (Celina Korzer)\\AU_dataset\\output_windowed.parquet\")\n",
"print(df.shape)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"id": "05775454",
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>subjectID</th>\n",
" <th>start_time</th>\n",
" <th>STUDY</th>\n",
" <th>LEVEL</th>\n",
" <th>PHASE</th>\n",
" <th>AU01_sum</th>\n",
" <th>AU02_sum</th>\n",
" <th>AU04_sum</th>\n",
" <th>AU05_sum</th>\n",
" <th>AU06_sum</th>\n",
" <th>...</th>\n",
" <th>AU14_sum</th>\n",
" <th>AU15_sum</th>\n",
" <th>AU17_sum</th>\n",
" <th>AU20_sum</th>\n",
" <th>AU23_sum</th>\n",
" <th>AU24_sum</th>\n",
" <th>AU25_sum</th>\n",
" <th>AU26_sum</th>\n",
" <th>AU28_sum</th>\n",
" <th>AU43_sum</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>192000</td>\n",
" <td>k-drive</td>\n",
" <td>1</td>\n",
" <td>baseline</td>\n",
" <td>441.0</td>\n",
" <td>354.0</td>\n",
" <td>3.0</td>\n",
" <td>81.0</td>\n",
" <td>29.0</td>\n",
" <td>...</td>\n",
" <td>302.0</td>\n",
" <td>511.0</td>\n",
" <td>653.0</td>\n",
" <td>65.0</td>\n",
" <td>798.0</td>\n",
" <td>1096.0</td>\n",
" <td>84.0</td>\n",
" <td>230.0</td>\n",
" <td>114.0</td>\n",
" <td>5.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>197120</td>\n",
" <td>k-drive</td>\n",
" <td>1</td>\n",
" <td>baseline</td>\n",
" <td>459.0</td>\n",
" <td>357.0</td>\n",
" <td>4.0</td>\n",
" <td>71.0</td>\n",
" <td>22.0</td>\n",
" <td>...</td>\n",
" <td>222.0</td>\n",
" <td>549.0</td>\n",
" <td>683.0</td>\n",
" <td>54.0</td>\n",
" <td>810.0</td>\n",
" <td>1093.0</td>\n",
" <td>86.0</td>\n",
" <td>247.0</td>\n",
" <td>108.0</td>\n",
" <td>5.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>202120</td>\n",
" <td>k-drive</td>\n",
" <td>1</td>\n",
" <td>baseline</td>\n",
" <td>487.0</td>\n",
" <td>342.0</td>\n",
" <td>5.0</td>\n",
" <td>70.0</td>\n",
" <td>18.0</td>\n",
" <td>...</td>\n",
" <td>141.0</td>\n",
" <td>558.0</td>\n",
" <td>710.0</td>\n",
" <td>27.0</td>\n",
" <td>828.0</td>\n",
" <td>1092.0</td>\n",
" <td>86.0</td>\n",
" <td>257.0</td>\n",
" <td>95.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>207120</td>\n",
" <td>k-drive</td>\n",
" <td>1</td>\n",
" <td>baseline</td>\n",
" <td>545.0</td>\n",
" <td>374.0</td>\n",
" <td>4.0</td>\n",
" <td>70.0</td>\n",
" <td>13.0</td>\n",
" <td>...</td>\n",
" <td>84.0</td>\n",
" <td>594.0</td>\n",
" <td>742.0</td>\n",
" <td>13.0</td>\n",
" <td>858.0</td>\n",
" <td>1091.0</td>\n",
" <td>97.0</td>\n",
" <td>279.0</td>\n",
" <td>99.0</td>\n",
" <td>2.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>212120</td>\n",
" <td>k-drive</td>\n",
" <td>1</td>\n",
" <td>baseline</td>\n",
" <td>571.0</td>\n",
" <td>375.0</td>\n",
" <td>7.0</td>\n",
" <td>68.0</td>\n",
" <td>10.0</td>\n",
" <td>...</td>\n",
" <td>80.0</td>\n",
" <td>547.0</td>\n",
" <td>735.0</td>\n",
" <td>12.0</td>\n",
" <td>894.0</td>\n",
" <td>1138.0</td>\n",
" <td>69.0</td>\n",
" <td>245.0</td>\n",
" <td>98.0</td>\n",
" <td>8.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 25 columns</p>\n",
"</div>"
],
"text/plain": [
" subjectID start_time STUDY LEVEL PHASE AU01_sum AU02_sum \\\n",
"0 0 192000 k-drive 1 baseline 441.0 354.0 \n",
"1 0 197120 k-drive 1 baseline 459.0 357.0 \n",
"2 0 202120 k-drive 1 baseline 487.0 342.0 \n",
"3 0 207120 k-drive 1 baseline 545.0 374.0 \n",
"4 0 212120 k-drive 1 baseline 571.0 375.0 \n",
"\n",
" AU04_sum AU05_sum AU06_sum ... AU14_sum AU15_sum AU17_sum AU20_sum \\\n",
"0 3.0 81.0 29.0 ... 302.0 511.0 653.0 65.0 \n",
"1 4.0 71.0 22.0 ... 222.0 549.0 683.0 54.0 \n",
"2 5.0 70.0 18.0 ... 141.0 558.0 710.0 27.0 \n",
"3 4.0 70.0 13.0 ... 84.0 594.0 742.0 13.0 \n",
"4 7.0 68.0 10.0 ... 80.0 547.0 735.0 12.0 \n",
"\n",
" AU23_sum AU24_sum AU25_sum AU26_sum AU28_sum AU43_sum \n",
"0 798.0 1096.0 84.0 230.0 114.0 5.0 \n",
"1 810.0 1093.0 86.0 247.0 108.0 5.0 \n",
"2 828.0 1092.0 86.0 257.0 95.0 3.0 \n",
"3 858.0 1091.0 97.0 279.0 99.0 2.0 \n",
"4 894.0 1138.0 69.0 245.0 98.0 8.0 \n",
"\n",
"[5 rows x 25 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"id": "99e17328",
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>subjectID</th>\n",
" <th>start_time</th>\n",
" <th>STUDY</th>\n",
" <th>LEVEL</th>\n",
" <th>PHASE</th>\n",
" <th>AU01_sum</th>\n",
" <th>AU02_sum</th>\n",
" <th>AU04_sum</th>\n",
" <th>AU05_sum</th>\n",
" <th>AU06_sum</th>\n",
" <th>...</th>\n",
" <th>AU14_sum</th>\n",
" <th>AU15_sum</th>\n",
" <th>AU17_sum</th>\n",
" <th>AU20_sum</th>\n",
" <th>AU23_sum</th>\n",
" <th>AU24_sum</th>\n",
" <th>AU25_sum</th>\n",
" <th>AU26_sum</th>\n",
" <th>AU28_sum</th>\n",
" <th>AU43_sum</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>7315</th>\n",
" <td>29</td>\n",
" <td>7142440</td>\n",
" <td>n-back</td>\n",
" <td>6</td>\n",
" <td>test</td>\n",
" <td>14.0</td>\n",
" <td>15.0</td>\n",
" <td>388.0</td>\n",
" <td>0.0</td>\n",
" <td>83.0</td>\n",
" <td>...</td>\n",
" <td>191.0</td>\n",
" <td>697.0</td>\n",
" <td>584.0</td>\n",
" <td>15.0</td>\n",
" <td>81.0</td>\n",
" <td>319.0</td>\n",
" <td>421.0</td>\n",
" <td>247.0</td>\n",
" <td>88.0</td>\n",
" <td>35.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7316</th>\n",
" <td>29</td>\n",
" <td>7147440</td>\n",
" <td>n-back</td>\n",
" <td>6</td>\n",
" <td>test</td>\n",
" <td>17.0</td>\n",
" <td>11.0</td>\n",
" <td>407.0</td>\n",
" <td>0.0</td>\n",
" <td>86.0</td>\n",
" <td>...</td>\n",
" <td>191.0</td>\n",
" <td>693.0</td>\n",
" <td>594.0</td>\n",
" <td>14.0</td>\n",
" <td>73.0</td>\n",
" <td>312.0</td>\n",
" <td>414.0</td>\n",
" <td>242.0</td>\n",
" <td>83.0</td>\n",
" <td>40.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7317</th>\n",
" <td>29</td>\n",
" <td>7152440</td>\n",
" <td>n-back</td>\n",
" <td>6</td>\n",
" <td>test</td>\n",
" <td>14.0</td>\n",
" <td>9.0</td>\n",
" <td>409.0</td>\n",
" <td>0.0</td>\n",
" <td>87.0</td>\n",
" <td>...</td>\n",
" <td>187.0</td>\n",
" <td>703.0</td>\n",
" <td>597.0</td>\n",
" <td>14.0</td>\n",
" <td>64.0</td>\n",
" <td>314.0</td>\n",
" <td>411.0</td>\n",
" <td>248.0</td>\n",
" <td>98.0</td>\n",
" <td>38.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7318</th>\n",
" <td>29</td>\n",
" <td>7157440</td>\n",
" <td>n-back</td>\n",
" <td>6</td>\n",
" <td>test</td>\n",
" <td>14.0</td>\n",
" <td>9.0</td>\n",
" <td>417.0</td>\n",
" <td>0.0</td>\n",
" <td>94.0</td>\n",
" <td>...</td>\n",
" <td>169.0</td>\n",
" <td>711.0</td>\n",
" <td>603.0</td>\n",
" <td>15.0</td>\n",
" <td>63.0</td>\n",
" <td>327.0</td>\n",
" <td>398.0</td>\n",
" <td>245.0</td>\n",
" <td>100.0</td>\n",
" <td>35.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7319</th>\n",
" <td>29</td>\n",
" <td>7162440</td>\n",
" <td>n-back</td>\n",
" <td>6</td>\n",
" <td>test</td>\n",
" <td>13.0</td>\n",
" <td>9.0</td>\n",
" <td>436.0</td>\n",
" <td>0.0</td>\n",
" <td>100.0</td>\n",
" <td>...</td>\n",
" <td>178.0</td>\n",
" <td>720.0</td>\n",
" <td>621.0</td>\n",
" <td>17.0</td>\n",
" <td>65.0</td>\n",
" <td>337.0</td>\n",
" <td>377.0</td>\n",
" <td>246.0</td>\n",
" <td>101.0</td>\n",
" <td>31.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 25 columns</p>\n",
"</div>"
],
"text/plain": [
" subjectID start_time STUDY LEVEL PHASE AU01_sum AU02_sum \\\n",
"7315 29 7142440 n-back 6 test 14.0 15.0 \n",
"7316 29 7147440 n-back 6 test 17.0 11.0 \n",
"7317 29 7152440 n-back 6 test 14.0 9.0 \n",
"7318 29 7157440 n-back 6 test 14.0 9.0 \n",
"7319 29 7162440 n-back 6 test 13.0 9.0 \n",
"\n",
" AU04_sum AU05_sum AU06_sum ... AU14_sum AU15_sum AU17_sum \\\n",
"7315 388.0 0.0 83.0 ... 191.0 697.0 584.0 \n",
"7316 407.0 0.0 86.0 ... 191.0 693.0 594.0 \n",
"7317 409.0 0.0 87.0 ... 187.0 703.0 597.0 \n",
"7318 417.0 0.0 94.0 ... 169.0 711.0 603.0 \n",
"7319 436.0 0.0 100.0 ... 178.0 720.0 621.0 \n",
"\n",
" AU20_sum AU23_sum AU24_sum AU25_sum AU26_sum AU28_sum AU43_sum \n",
"7315 15.0 81.0 319.0 421.0 247.0 88.0 35.0 \n",
"7316 14.0 73.0 312.0 414.0 242.0 83.0 40.0 \n",
"7317 14.0 64.0 314.0 411.0 248.0 98.0 38.0 \n",
"7318 15.0 63.0 327.0 398.0 245.0 100.0 35.0 \n",
"7319 17.0 65.0 337.0 377.0 246.0 101.0 31.0 \n",
"\n",
"[5 rows x 25 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.tail()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0238d802",
"execution_count": 5,
"id": "69e53731",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 7320 entries, 0 to 7319\n",
"Data columns (total 25 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 subjectID 7320 non-null int64 \n",
" 1 start_time 7320 non-null int64 \n",
" 2 STUDY 7320 non-null object \n",
" 3 LEVEL 7320 non-null int8 \n",
" 4 PHASE 7320 non-null object \n",
" 5 AU01_sum 7320 non-null float64\n",
" 6 AU02_sum 7320 non-null float64\n",
" 7 AU04_sum 7320 non-null float64\n",
" 8 AU05_sum 7320 non-null float64\n",
" 9 AU06_sum 7320 non-null float64\n",
" 10 AU07_sum 7320 non-null float64\n",
" 11 AU09_sum 7320 non-null float64\n",
" 12 AU10_sum 7320 non-null float64\n",
" 13 AU11_sum 7320 non-null float64\n",
" 14 AU12_sum 7320 non-null float64\n",
" 15 AU14_sum 7320 non-null float64\n",
" 16 AU15_sum 7320 non-null float64\n",
" 17 AU17_sum 7320 non-null float64\n",
" 18 AU20_sum 7320 non-null float64\n",
" 19 AU23_sum 7320 non-null float64\n",
" 20 AU24_sum 7320 non-null float64\n",
" 21 AU25_sum 7320 non-null float64\n",
" 22 AU26_sum 7320 non-null float64\n",
" 23 AU28_sum 7320 non-null float64\n",
" 24 AU43_sum 7320 non-null float64\n",
"dtypes: float64(20), int64(2), int8(1), object(2)\n",
"memory usage: 1.3+ MB\n"
]
}
],
"source": [
"step2 = pd.read_parquet(\"output_windowed.parquet\")\n",
"step2.head()"
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1257c535",
"metadata": {},
"outputs": [],
"source": [
"step2.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 11,
"id": "3754c664",
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"STUDY PHASE LEVEL\n",
"k-drive train 1 155\n",
" 3 156\n",
" 2 162\n",
" baseline 3 248\n",
"n-back baseline 2 252\n",
" test 5 255\n",
" 6 256\n",
" 1 258\n",
" 4 258\n",
" 2 260\n",
" 3 260\n",
"k-drive baseline 2 267\n",
" 1 896\n",
"n-back baseline 1 901\n",
"k-drive test 1 911\n",
" 2 912\n",
" 3 913\n",
"Name: count, dtype: int64"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Zeigt alle Kombinationen mit Häufigkeit\n",
"step2[['STUDY', 'LEVEL', 'PHASE']].value_counts()"
"df[['STUDY', 'PHASE', 'LEVEL']].value_counts(ascending=True)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "f83b595c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(1031, 25)"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"high_nback = df[\n",
" (df[\"STUDY\"]==\"n-back\") &\n",
" (df[\"LEVEL\"].isin([2, 3, 5, 6])) &\n",
" (df[\"PHASE\"].isin([\"train\", \"test\"]))\n",
"]\n",
"high_nback.shape"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "c0940343",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(3080, 25)\n",
"(3209, 25)\n"
]
}
],
"source": [
"low_all = df[\n",
" ((df[\"PHASE\"] == \"baseline\") |\n",
" ((df[\"STUDY\"] == \"n-back\") & (df[\"PHASE\"] != \"baseline\") & (df[\"LEVEL\"].isin([1,4]))))\n",
"]\n",
"print(low_all.shape)\n",
"high_kdrive = df[\n",
" (df[\"STUDY\"] == \"k-drive\") & (df[\"PHASE\"] != \"baseline\")\n",
"]\n",
"print(high_kdrive.shape)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "f7ce38d3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"True\n",
"7320\n",
"7320\n"
]
}
],
"source": [
"print((df.shape[0]==(high_kdrive.shape[0]+high_nback.shape[0]+low_all.shape[0])))\n",
"print(df.shape[0])\n",
"print((high_kdrive.shape[0]+high_nback.shape[0]+low_all.shape[0]))"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "48ba0379",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(4240, 25)"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"high_all = pd.concat([high_nback, high_kdrive])\n",
"high_all.shape"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "77dda26c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Gesamt: 7320==7320\n",
"Anzahl an low load Samples: 3080\n",
"Anzahl an high load Samples: 4240\n"
]
}
],
"source": [
"print(f\"Gesamt: {df.shape[0]}=={low_all.shape[0]+high_all.shape[0]}\")\n",
"print(f\"Anzahl an low load Samples: {low_all.shape[0]}\")\n",
"print(f\"Anzahl an high load Samples: {high_all.shape[0]}\")\n"
]
}
],