diff --git a/EDA/owncloud.ipynb b/EDA/owncloud.ipynb index 8f6772a..32aa864 100644 --- a/EDA/owncloud.ipynb +++ b/EDA/owncloud.ipynb @@ -2,14 +2,14 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "aab6b326-a583-47ad-8bb7-723c2fddcc63", "metadata": { "scrolled": true }, "outputs": [], "source": [ - "%pip install pyocclient\n", + "# %pip install pyocclient\n", "import yaml\n", "import owncloud\n", "import pandas as pd\n", @@ -18,21 +18,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "4f42846c-27c3-4394-a40a-e22d73c2902e", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "143.946026802063\n" + ] + } + ], "source": [ "start = time.time()\n", "\n", - "with open(\"login.yaml\") as f:\n", + "with open(\"../login.yaml\") as f:\n", " cfg = yaml.safe_load(f)\n", "url, password = cfg[0][\"url\"], cfg[1][\"password\"]\n", - "file = \"adabase-public-0003-v_0_0_2.h5py\"\n", + "file = \"adabase-public-0022-v_0_0_2.h5py\"\n", "oc = owncloud.Client.from_public_link(url, folder_password=password)\n", "\n", "\n", - "oc.get_file(file, \"tmp.h5\")\n", + "oc.get_file(file, \"tmp22.h5\")\n", "\n", "end = time.time()\n", "print(end - start)\n" @@ -40,66 +48,457 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "3714dec2-85d0-4f76-af46-ea45ebec2fa3", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.5121121406555176\n" + ] + } + ], "source": [ "start = time.time()\n", - "df_performance = pd.read_hdf(\"tmp.h5\", \"PERFORMANCE\")\n", + "df_performance = pd.read_hdf(\"tmp22.h5\", \"PERFORMANCE\")\n", "end = time.time()\n", "print(end - start)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, + "id": "f50e97d0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "22\n" + ] + } + ], + "source": [ + "print(22)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "c131c816", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
STUDYPHASELEVELAUDITIVE F1AUDITIVE MEAN REACTION TIMEAUDITIVE PRECISIONAUDITIVE RECALLVISUAL F1VISUAL MEAN REACTION TIMEVISUAL PRECISIONVISUAL RECALLF1PRECISIONREACTION TIMERECALLSONGS RECALL
6n-backtest01NaNNaNNaNNaN1.0000000.4280681.0000001.000000NaNNaNNaNNaNNaN
7n-backtest02NaNNaNNaNNaN0.9285710.6268691.0000000.866667NaNNaNNaNNaNNaN
8n-backtest03NaNNaNNaNNaN0.6400000.8289120.7272730.571429NaNNaNNaNNaNNaN
9n-backtest041.0000001.3092861.0000001.0000001.0000000.9429161.0000001.000000NaNNaNNaNNaNNaN
10n-backtest050.7826091.3164840.8181820.7500000.8148151.1514050.9166670.733333NaNNaNNaNNaNNaN
11n-backtest060.3636361.7035830.5000000.2857140.4761901.5300540.7142860.357143NaNNaNNaNNaNNaN
3k-drivetest01NaNNaNNaNNaNNaNNaNNaNNaN1.0000001.0000000.4469141.000000NaN
4k-drivetest02NaNNaNNaNNaNNaNNaNNaNNaN0.9142860.9142860.7025710.9142860.454545
5k-drivetest03NaNNaNNaNNaNNaNNaNNaNNaN0.7863250.9387761.1757970.6764710.347826
\n", + "
" + ], + "text/plain": [ + " STUDY PHASE LEVEL AUDITIVE F1 AUDITIVE MEAN REACTION TIME \\\n", + "6 n-back test 01 NaN NaN \n", + "7 n-back test 02 NaN NaN \n", + "8 n-back test 03 NaN NaN \n", + "9 n-back test 04 1.000000 1.309286 \n", + "10 n-back test 05 0.782609 1.316484 \n", + "11 n-back test 06 0.363636 1.703583 \n", + "3 k-drive test 01 NaN NaN \n", + "4 k-drive test 02 NaN NaN \n", + "5 k-drive test 03 NaN NaN \n", + "\n", + " AUDITIVE PRECISION AUDITIVE RECALL VISUAL F1 VISUAL MEAN REACTION TIME \\\n", + "6 NaN NaN 1.000000 0.428068 \n", + "7 NaN NaN 0.928571 0.626869 \n", + "8 NaN NaN 0.640000 0.828912 \n", + "9 1.000000 1.000000 1.000000 0.942916 \n", + "10 0.818182 0.750000 0.814815 1.151405 \n", + "11 0.500000 0.285714 0.476190 1.530054 \n", + "3 NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN \n", + "5 NaN NaN NaN NaN \n", + "\n", + " VISUAL PRECISION VISUAL RECALL F1 PRECISION REACTION TIME \\\n", + "6 1.000000 1.000000 NaN NaN NaN \n", + "7 1.000000 0.866667 NaN NaN NaN \n", + "8 0.727273 0.571429 NaN NaN NaN \n", + "9 1.000000 1.000000 NaN NaN NaN \n", + "10 0.916667 0.733333 NaN NaN NaN \n", + "11 0.714286 0.357143 NaN NaN NaN \n", + "3 NaN NaN 1.000000 1.000000 0.446914 \n", + "4 NaN NaN 0.914286 0.914286 0.702571 \n", + "5 NaN NaN 0.786325 0.938776 1.175797 \n", + "\n", + " RECALL SONGS RECALL \n", + "6 NaN NaN \n", + "7 NaN NaN \n", + "8 NaN NaN \n", + "9 NaN NaN \n", + "10 NaN NaN \n", + "11 NaN NaN \n", + "3 1.000000 NaN \n", + "4 0.914286 0.454545 \n", + "5 0.676471 0.347826 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_performance" + ] + }, + { + "cell_type": "code", + "execution_count": 6, "id": "6ae47e52-ad86-4f8d-b929-0080dc99f646", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.05357074737548828\n" + ] + } + ], "source": [ "start = time.time()\n", - "df_4_col = pd.read_hdf(\"tmp.h5\", \"SIGNALS\", mode=\"r\", columns=[\"STUDY\",\"LEVEL\", \"PHASE\", ''])\n", + "df_4_col = pd.read_hdf(\"tmp.h5\", \"SIGNALS\", mode=\"r\", columns=[\"STUDY\"], start=0, stop=1)\n", "end = time.time()\n", "print(end - start)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "7c139f3a-ede8-4530-957d-d1bb939f6cb5", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
STUDY
0n/a
\n", + "
" + ], + "text/plain": [ + " STUDY\n", + "0 n/a" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df_4_col.head()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "a68d58ea-65f2-46c4-a2b2-8c3447c715d7", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(1, 1)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df_4_col.shape" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "95aa4523-3784-4ab6-bf92-0227ce60e863", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Index: 1 entries, 0 to 0\n", + "Data columns (total 1 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 STUDY 1 non-null object\n", + "dtypes: object(1)\n", + "memory usage: 16.0+ bytes\n" + ] + } + ], "source": [ "df_4_col.info()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "defbcaf4-ad1b-453f-9b48-ab0ecfc4b5d5", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "STUDY 0\n", + "dtype: int64" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df_4_col.isna().sum()" ] @@ -115,7 +514,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "base", "language": "python", "name": "python3" }, @@ -129,7 +528,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.10" + "version": "3.11.5" } }, "nbformat": 4, diff --git a/dataset_creation/create_feature_table.py b/dataset_creation/create_feature_table.py index 53baa33..7fac133 100644 --- a/dataset_creation/create_feature_table.py +++ b/dataset_creation/create_feature_table.py @@ -94,8 +94,9 @@ def process_parquet_files(input_dir, output_file, window_size=1250, step_size=12 # Beispiel-Verwendung if __name__ == "__main__": # Anpassen an deine Pfade - input_directory = "" - output_file = "./output/output_windowed.parquet" + input_directory = r"C:\Users\x\FAUbox\WS2526_Fahrsimulator_MSY (Celina Korzer)\ParquetFiles_AU" + output_file = r"C:\Users\x\FAUbox\WS2526_Fahrsimulator_MSY (Celina Korzer)\AU_dataset\xxoutput_windowed.parquet" + result = process_parquet_files( diff --git a/dataset_creation/create_parquet_files.py b/dataset_creation/create_parquet_files.py index 5fb5b6a..1a2fb7f 100644 --- a/dataset_creation/create_parquet_files.py +++ b/dataset_creation/create_parquet_files.py @@ -1,14 +1,16 @@ -# %pip install pyocclient +# pip install pyocclient import yaml import owncloud import pandas as pd import h5py +import os +print(os.getcwd()) num_files = 30 # number of files to process (min: 1, max: 30) # Load credentials with open("login.yaml") as f: cfg = yaml.safe_load(f) - print("ahahahah") + print("yaml geladen") url, password = cfg[0]["url"], cfg[1]["password"] # Connect once @@ -18,6 +20,7 @@ print("connection aufgebaut") base = "adabase-public-{num:04d}-v_0_0_2.h5py" for i in range(num_files): + print(f"Subject {i} gestartet") file_name = base.format(num=i) local_tmp = f"tmp_{i:04d}.h5" @@ -35,7 +38,9 @@ for i in range(num_files): # Step 2: Filter columns that start with "AU" au_cols = [c for c in cols if c.startswith("AU")] print(au_cols) - + if len(au_cols)==0: + print(f"keine AU Signale in Subject {i}") + continue # Step 3: Read only those columns (plus any others you want) df = pd.read_hdf(local_tmp, key="SIGNALS", columns=["STUDY", "LEVEL", "PHASE"] + au_cols) @@ -58,7 +63,8 @@ for i in range(num_files): # Save to parquet - out_name = f"cleaned_{i:04d}.parquet" + os.makedirs("ParquetFiles", exist_ok=True) + out_name = f"ParquetFiles/cleaned_{i:04d}.parquet" df.to_parquet(out_name, index=False) print(f"Processed {file_name} -> {out_name}") diff --git a/dataset_creation/open_parquet_test.ipynb b/dataset_creation/open_parquet_test.ipynb index 2159704..309686f 100644 --- a/dataset_creation/open_parquet_test.ipynb +++ b/dataset_creation/open_parquet_test.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "2b3fface", "metadata": {}, "outputs": [], @@ -12,66 +12,650 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "74f1f5ec", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(7320, 25)\n" + ] + } + ], "source": [ - "df= pd.read_parquet(\"cleaned_0000.parquet\")\n", + "df= pd.read_parquet(r\"C:\\Users\\micha\\FAUbox\\WS2526_Fahrsimulator_MSY (Celina Korzer)\\AU_dataset\\output_windowed.parquet\")\n", "print(df.shape)\n", "\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "05775454", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
subjectIDstart_timeSTUDYLEVELPHASEAU01_sumAU02_sumAU04_sumAU05_sumAU06_sum...AU14_sumAU15_sumAU17_sumAU20_sumAU23_sumAU24_sumAU25_sumAU26_sumAU28_sumAU43_sum
00192000k-drive1baseline441.0354.03.081.029.0...302.0511.0653.065.0798.01096.084.0230.0114.05.0
10197120k-drive1baseline459.0357.04.071.022.0...222.0549.0683.054.0810.01093.086.0247.0108.05.0
20202120k-drive1baseline487.0342.05.070.018.0...141.0558.0710.027.0828.01092.086.0257.095.03.0
30207120k-drive1baseline545.0374.04.070.013.0...84.0594.0742.013.0858.01091.097.0279.099.02.0
40212120k-drive1baseline571.0375.07.068.010.0...80.0547.0735.012.0894.01138.069.0245.098.08.0
\n", + "

5 rows × 25 columns

\n", + "
" + ], + "text/plain": [ + " subjectID start_time STUDY LEVEL PHASE AU01_sum AU02_sum \\\n", + "0 0 192000 k-drive 1 baseline 441.0 354.0 \n", + "1 0 197120 k-drive 1 baseline 459.0 357.0 \n", + "2 0 202120 k-drive 1 baseline 487.0 342.0 \n", + "3 0 207120 k-drive 1 baseline 545.0 374.0 \n", + "4 0 212120 k-drive 1 baseline 571.0 375.0 \n", + "\n", + " AU04_sum AU05_sum AU06_sum ... AU14_sum AU15_sum AU17_sum AU20_sum \\\n", + "0 3.0 81.0 29.0 ... 302.0 511.0 653.0 65.0 \n", + "1 4.0 71.0 22.0 ... 222.0 549.0 683.0 54.0 \n", + "2 5.0 70.0 18.0 ... 141.0 558.0 710.0 27.0 \n", + "3 4.0 70.0 13.0 ... 84.0 594.0 742.0 13.0 \n", + "4 7.0 68.0 10.0 ... 80.0 547.0 735.0 12.0 \n", + "\n", + " AU23_sum AU24_sum AU25_sum AU26_sum AU28_sum AU43_sum \n", + "0 798.0 1096.0 84.0 230.0 114.0 5.0 \n", + "1 810.0 1093.0 86.0 247.0 108.0 5.0 \n", + "2 828.0 1092.0 86.0 257.0 95.0 3.0 \n", + "3 858.0 1091.0 97.0 279.0 99.0 2.0 \n", + "4 894.0 1138.0 69.0 245.0 98.0 8.0 \n", + "\n", + "[5 rows x 25 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df.head()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "99e17328", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
subjectIDstart_timeSTUDYLEVELPHASEAU01_sumAU02_sumAU04_sumAU05_sumAU06_sum...AU14_sumAU15_sumAU17_sumAU20_sumAU23_sumAU24_sumAU25_sumAU26_sumAU28_sumAU43_sum
7315297142440n-back6test14.015.0388.00.083.0...191.0697.0584.015.081.0319.0421.0247.088.035.0
7316297147440n-back6test17.011.0407.00.086.0...191.0693.0594.014.073.0312.0414.0242.083.040.0
7317297152440n-back6test14.09.0409.00.087.0...187.0703.0597.014.064.0314.0411.0248.098.038.0
7318297157440n-back6test14.09.0417.00.094.0...169.0711.0603.015.063.0327.0398.0245.0100.035.0
7319297162440n-back6test13.09.0436.00.0100.0...178.0720.0621.017.065.0337.0377.0246.0101.031.0
\n", + "

5 rows × 25 columns

\n", + "
" + ], + "text/plain": [ + " subjectID start_time STUDY LEVEL PHASE AU01_sum AU02_sum \\\n", + "7315 29 7142440 n-back 6 test 14.0 15.0 \n", + "7316 29 7147440 n-back 6 test 17.0 11.0 \n", + "7317 29 7152440 n-back 6 test 14.0 9.0 \n", + "7318 29 7157440 n-back 6 test 14.0 9.0 \n", + "7319 29 7162440 n-back 6 test 13.0 9.0 \n", + "\n", + " AU04_sum AU05_sum AU06_sum ... AU14_sum AU15_sum AU17_sum \\\n", + "7315 388.0 0.0 83.0 ... 191.0 697.0 584.0 \n", + "7316 407.0 0.0 86.0 ... 191.0 693.0 594.0 \n", + "7317 409.0 0.0 87.0 ... 187.0 703.0 597.0 \n", + "7318 417.0 0.0 94.0 ... 169.0 711.0 603.0 \n", + "7319 436.0 0.0 100.0 ... 178.0 720.0 621.0 \n", + "\n", + " AU20_sum AU23_sum AU24_sum AU25_sum AU26_sum AU28_sum AU43_sum \n", + "7315 15.0 81.0 319.0 421.0 247.0 88.0 35.0 \n", + "7316 14.0 73.0 312.0 414.0 242.0 83.0 40.0 \n", + "7317 14.0 64.0 314.0 411.0 248.0 98.0 38.0 \n", + "7318 15.0 63.0 327.0 398.0 245.0 100.0 35.0 \n", + "7319 17.0 65.0 337.0 377.0 246.0 101.0 31.0 \n", + "\n", + "[5 rows x 25 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df.tail()" ] }, { "cell_type": "code", - "execution_count": null, - "id": "0238d802", + "execution_count": 5, + "id": "69e53731", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 7320 entries, 0 to 7319\n", + "Data columns (total 25 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 subjectID 7320 non-null int64 \n", + " 1 start_time 7320 non-null int64 \n", + " 2 STUDY 7320 non-null object \n", + " 3 LEVEL 7320 non-null int8 \n", + " 4 PHASE 7320 non-null object \n", + " 5 AU01_sum 7320 non-null float64\n", + " 6 AU02_sum 7320 non-null float64\n", + " 7 AU04_sum 7320 non-null float64\n", + " 8 AU05_sum 7320 non-null float64\n", + " 9 AU06_sum 7320 non-null float64\n", + " 10 AU07_sum 7320 non-null float64\n", + " 11 AU09_sum 7320 non-null float64\n", + " 12 AU10_sum 7320 non-null float64\n", + " 13 AU11_sum 7320 non-null float64\n", + " 14 AU12_sum 7320 non-null float64\n", + " 15 AU14_sum 7320 non-null float64\n", + " 16 AU15_sum 7320 non-null float64\n", + " 17 AU17_sum 7320 non-null float64\n", + " 18 AU20_sum 7320 non-null float64\n", + " 19 AU23_sum 7320 non-null float64\n", + " 20 AU24_sum 7320 non-null float64\n", + " 21 AU25_sum 7320 non-null float64\n", + " 22 AU26_sum 7320 non-null float64\n", + " 23 AU28_sum 7320 non-null float64\n", + " 24 AU43_sum 7320 non-null float64\n", + "dtypes: float64(20), int64(2), int8(1), object(2)\n", + "memory usage: 1.3+ MB\n" + ] + } + ], "source": [ - "step2 = pd.read_parquet(\"output_windowed.parquet\")\n", - "step2.head()" + "df.info()" ] }, { "cell_type": "code", - "execution_count": null, - "id": "1257c535", - "metadata": {}, - "outputs": [], - "source": [ - "step2.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "3754c664", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "STUDY PHASE LEVEL\n", + "k-drive train 1 155\n", + " 3 156\n", + " 2 162\n", + " baseline 3 248\n", + "n-back baseline 2 252\n", + " test 5 255\n", + " 6 256\n", + " 1 258\n", + " 4 258\n", + " 2 260\n", + " 3 260\n", + "k-drive baseline 2 267\n", + " 1 896\n", + "n-back baseline 1 901\n", + "k-drive test 1 911\n", + " 2 912\n", + " 3 913\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Zeigt alle Kombinationen mit Häufigkeit\n", - "step2[['STUDY', 'LEVEL', 'PHASE']].value_counts()" + "df[['STUDY', 'PHASE', 'LEVEL']].value_counts(ascending=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "f83b595c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1031, 25)" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "high_nback = df[\n", + " (df[\"STUDY\"]==\"n-back\") &\n", + " (df[\"LEVEL\"].isin([2, 3, 5, 6])) &\n", + " (df[\"PHASE\"].isin([\"train\", \"test\"]))\n", + "]\n", + "high_nback.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "c0940343", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(3080, 25)\n", + "(3209, 25)\n" + ] + } + ], + "source": [ + "low_all = df[\n", + " ((df[\"PHASE\"] == \"baseline\") |\n", + " ((df[\"STUDY\"] == \"n-back\") & (df[\"PHASE\"] != \"baseline\") & (df[\"LEVEL\"].isin([1,4]))))\n", + "]\n", + "print(low_all.shape)\n", + "high_kdrive = df[\n", + " (df[\"STUDY\"] == \"k-drive\") & (df[\"PHASE\"] != \"baseline\")\n", + "]\n", + "print(high_kdrive.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "f7ce38d3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "True\n", + "7320\n", + "7320\n" + ] + } + ], + "source": [ + "print((df.shape[0]==(high_kdrive.shape[0]+high_nback.shape[0]+low_all.shape[0])))\n", + "print(df.shape[0])\n", + "print((high_kdrive.shape[0]+high_nback.shape[0]+low_all.shape[0]))" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "48ba0379", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(4240, 25)" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "high_all = pd.concat([high_nback, high_kdrive])\n", + "high_all.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "77dda26c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Gesamt: 7320==7320\n", + "Anzahl an low load Samples: 3080\n", + "Anzahl an high load Samples: 4240\n" + ] + } + ], + "source": [ + "print(f\"Gesamt: {df.shape[0]}=={low_all.shape[0]+high_all.shape[0]}\")\n", + "print(f\"Anzahl an low load Samples: {low_all.shape[0]}\")\n", + "print(f\"Anzahl an high load Samples: {high_all.shape[0]}\")\n" ] } ],