Fahrsimulator_MSY2526_AI/dataset_creation/open_parquet_test.ipynb

684 lines
21 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "2b3fface",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "74f1f5ec",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(7320, 25)\n"
]
}
],
"source": [
"# Location of the windowed AU dataset. Set the AU_DATASET_PATH environment\n",
"# variable to run this notebook on another machine; the fallback below is the\n",
"# path that was hard-coded here originally.\n",
"DEFAULT_PARQUET_PATH = r\"C:\\Users\\micha\\FAUbox\\WS2526_Fahrsimulator_MSY (Celina Korzer)\\AU_dataset\\output_windowed.parquet\"\n",
"parquet_path = os.environ.get(\"AU_DATASET_PATH\", DEFAULT_PARQUET_PATH)\n",
"\n",
"df = pd.read_parquet(parquet_path)\n",
"print(df.shape)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "05775454",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>subjectID</th>\n",
" <th>start_time</th>\n",
" <th>STUDY</th>\n",
" <th>LEVEL</th>\n",
" <th>PHASE</th>\n",
" <th>AU01_sum</th>\n",
" <th>AU02_sum</th>\n",
" <th>AU04_sum</th>\n",
" <th>AU05_sum</th>\n",
" <th>AU06_sum</th>\n",
" <th>...</th>\n",
" <th>AU14_sum</th>\n",
" <th>AU15_sum</th>\n",
" <th>AU17_sum</th>\n",
" <th>AU20_sum</th>\n",
" <th>AU23_sum</th>\n",
" <th>AU24_sum</th>\n",
" <th>AU25_sum</th>\n",
" <th>AU26_sum</th>\n",
" <th>AU28_sum</th>\n",
" <th>AU43_sum</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>192000</td>\n",
" <td>k-drive</td>\n",
" <td>1</td>\n",
" <td>baseline</td>\n",
" <td>441.0</td>\n",
" <td>354.0</td>\n",
" <td>3.0</td>\n",
" <td>81.0</td>\n",
" <td>29.0</td>\n",
" <td>...</td>\n",
" <td>302.0</td>\n",
" <td>511.0</td>\n",
" <td>653.0</td>\n",
" <td>65.0</td>\n",
" <td>798.0</td>\n",
" <td>1096.0</td>\n",
" <td>84.0</td>\n",
" <td>230.0</td>\n",
" <td>114.0</td>\n",
" <td>5.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>197120</td>\n",
" <td>k-drive</td>\n",
" <td>1</td>\n",
" <td>baseline</td>\n",
" <td>459.0</td>\n",
" <td>357.0</td>\n",
" <td>4.0</td>\n",
" <td>71.0</td>\n",
" <td>22.0</td>\n",
" <td>...</td>\n",
" <td>222.0</td>\n",
" <td>549.0</td>\n",
" <td>683.0</td>\n",
" <td>54.0</td>\n",
" <td>810.0</td>\n",
" <td>1093.0</td>\n",
" <td>86.0</td>\n",
" <td>247.0</td>\n",
" <td>108.0</td>\n",
" <td>5.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>202120</td>\n",
" <td>k-drive</td>\n",
" <td>1</td>\n",
" <td>baseline</td>\n",
" <td>487.0</td>\n",
" <td>342.0</td>\n",
" <td>5.0</td>\n",
" <td>70.0</td>\n",
" <td>18.0</td>\n",
" <td>...</td>\n",
" <td>141.0</td>\n",
" <td>558.0</td>\n",
" <td>710.0</td>\n",
" <td>27.0</td>\n",
" <td>828.0</td>\n",
" <td>1092.0</td>\n",
" <td>86.0</td>\n",
" <td>257.0</td>\n",
" <td>95.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>207120</td>\n",
" <td>k-drive</td>\n",
" <td>1</td>\n",
" <td>baseline</td>\n",
" <td>545.0</td>\n",
" <td>374.0</td>\n",
" <td>4.0</td>\n",
" <td>70.0</td>\n",
" <td>13.0</td>\n",
" <td>...</td>\n",
" <td>84.0</td>\n",
" <td>594.0</td>\n",
" <td>742.0</td>\n",
" <td>13.0</td>\n",
" <td>858.0</td>\n",
" <td>1091.0</td>\n",
" <td>97.0</td>\n",
" <td>279.0</td>\n",
" <td>99.0</td>\n",
" <td>2.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>212120</td>\n",
" <td>k-drive</td>\n",
" <td>1</td>\n",
" <td>baseline</td>\n",
" <td>571.0</td>\n",
" <td>375.0</td>\n",
" <td>7.0</td>\n",
" <td>68.0</td>\n",
" <td>10.0</td>\n",
" <td>...</td>\n",
" <td>80.0</td>\n",
" <td>547.0</td>\n",
" <td>735.0</td>\n",
" <td>12.0</td>\n",
" <td>894.0</td>\n",
" <td>1138.0</td>\n",
" <td>69.0</td>\n",
" <td>245.0</td>\n",
" <td>98.0</td>\n",
" <td>8.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 25 columns</p>\n",
"</div>"
],
"text/plain": [
" subjectID start_time STUDY LEVEL PHASE AU01_sum AU02_sum \\\n",
"0 0 192000 k-drive 1 baseline 441.0 354.0 \n",
"1 0 197120 k-drive 1 baseline 459.0 357.0 \n",
"2 0 202120 k-drive 1 baseline 487.0 342.0 \n",
"3 0 207120 k-drive 1 baseline 545.0 374.0 \n",
"4 0 212120 k-drive 1 baseline 571.0 375.0 \n",
"\n",
" AU04_sum AU05_sum AU06_sum ... AU14_sum AU15_sum AU17_sum AU20_sum \\\n",
"0 3.0 81.0 29.0 ... 302.0 511.0 653.0 65.0 \n",
"1 4.0 71.0 22.0 ... 222.0 549.0 683.0 54.0 \n",
"2 5.0 70.0 18.0 ... 141.0 558.0 710.0 27.0 \n",
"3 4.0 70.0 13.0 ... 84.0 594.0 742.0 13.0 \n",
"4 7.0 68.0 10.0 ... 80.0 547.0 735.0 12.0 \n",
"\n",
" AU23_sum AU24_sum AU25_sum AU26_sum AU28_sum AU43_sum \n",
"0 798.0 1096.0 84.0 230.0 114.0 5.0 \n",
"1 810.0 1093.0 86.0 247.0 108.0 5.0 \n",
"2 828.0 1092.0 86.0 257.0 95.0 3.0 \n",
"3 858.0 1091.0 97.0 279.0 99.0 2.0 \n",
"4 894.0 1138.0 69.0 245.0 98.0 8.0 \n",
"\n",
"[5 rows x 25 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first five rows.\n",
"df.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "99e17328",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>subjectID</th>\n",
" <th>start_time</th>\n",
" <th>STUDY</th>\n",
" <th>LEVEL</th>\n",
" <th>PHASE</th>\n",
" <th>AU01_sum</th>\n",
" <th>AU02_sum</th>\n",
" <th>AU04_sum</th>\n",
" <th>AU05_sum</th>\n",
" <th>AU06_sum</th>\n",
" <th>...</th>\n",
" <th>AU14_sum</th>\n",
" <th>AU15_sum</th>\n",
" <th>AU17_sum</th>\n",
" <th>AU20_sum</th>\n",
" <th>AU23_sum</th>\n",
" <th>AU24_sum</th>\n",
" <th>AU25_sum</th>\n",
" <th>AU26_sum</th>\n",
" <th>AU28_sum</th>\n",
" <th>AU43_sum</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>7315</th>\n",
" <td>29</td>\n",
" <td>7142440</td>\n",
" <td>n-back</td>\n",
" <td>6</td>\n",
" <td>test</td>\n",
" <td>14.0</td>\n",
" <td>15.0</td>\n",
" <td>388.0</td>\n",
" <td>0.0</td>\n",
" <td>83.0</td>\n",
" <td>...</td>\n",
" <td>191.0</td>\n",
" <td>697.0</td>\n",
" <td>584.0</td>\n",
" <td>15.0</td>\n",
" <td>81.0</td>\n",
" <td>319.0</td>\n",
" <td>421.0</td>\n",
" <td>247.0</td>\n",
" <td>88.0</td>\n",
" <td>35.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7316</th>\n",
" <td>29</td>\n",
" <td>7147440</td>\n",
" <td>n-back</td>\n",
" <td>6</td>\n",
" <td>test</td>\n",
" <td>17.0</td>\n",
" <td>11.0</td>\n",
" <td>407.0</td>\n",
" <td>0.0</td>\n",
" <td>86.0</td>\n",
" <td>...</td>\n",
" <td>191.0</td>\n",
" <td>693.0</td>\n",
" <td>594.0</td>\n",
" <td>14.0</td>\n",
" <td>73.0</td>\n",
" <td>312.0</td>\n",
" <td>414.0</td>\n",
" <td>242.0</td>\n",
" <td>83.0</td>\n",
" <td>40.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7317</th>\n",
" <td>29</td>\n",
" <td>7152440</td>\n",
" <td>n-back</td>\n",
" <td>6</td>\n",
" <td>test</td>\n",
" <td>14.0</td>\n",
" <td>9.0</td>\n",
" <td>409.0</td>\n",
" <td>0.0</td>\n",
" <td>87.0</td>\n",
" <td>...</td>\n",
" <td>187.0</td>\n",
" <td>703.0</td>\n",
" <td>597.0</td>\n",
" <td>14.0</td>\n",
" <td>64.0</td>\n",
" <td>314.0</td>\n",
" <td>411.0</td>\n",
" <td>248.0</td>\n",
" <td>98.0</td>\n",
" <td>38.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7318</th>\n",
" <td>29</td>\n",
" <td>7157440</td>\n",
" <td>n-back</td>\n",
" <td>6</td>\n",
" <td>test</td>\n",
" <td>14.0</td>\n",
" <td>9.0</td>\n",
" <td>417.0</td>\n",
" <td>0.0</td>\n",
" <td>94.0</td>\n",
" <td>...</td>\n",
" <td>169.0</td>\n",
" <td>711.0</td>\n",
" <td>603.0</td>\n",
" <td>15.0</td>\n",
" <td>63.0</td>\n",
" <td>327.0</td>\n",
" <td>398.0</td>\n",
" <td>245.0</td>\n",
" <td>100.0</td>\n",
" <td>35.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7319</th>\n",
" <td>29</td>\n",
" <td>7162440</td>\n",
" <td>n-back</td>\n",
" <td>6</td>\n",
" <td>test</td>\n",
" <td>13.0</td>\n",
" <td>9.0</td>\n",
" <td>436.0</td>\n",
" <td>0.0</td>\n",
" <td>100.0</td>\n",
" <td>...</td>\n",
" <td>178.0</td>\n",
" <td>720.0</td>\n",
" <td>621.0</td>\n",
" <td>17.0</td>\n",
" <td>65.0</td>\n",
" <td>337.0</td>\n",
" <td>377.0</td>\n",
" <td>246.0</td>\n",
" <td>101.0</td>\n",
" <td>31.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 25 columns</p>\n",
"</div>"
],
"text/plain": [
" subjectID start_time STUDY LEVEL PHASE AU01_sum AU02_sum \\\n",
"7315 29 7142440 n-back 6 test 14.0 15.0 \n",
"7316 29 7147440 n-back 6 test 17.0 11.0 \n",
"7317 29 7152440 n-back 6 test 14.0 9.0 \n",
"7318 29 7157440 n-back 6 test 14.0 9.0 \n",
"7319 29 7162440 n-back 6 test 13.0 9.0 \n",
"\n",
" AU04_sum AU05_sum AU06_sum ... AU14_sum AU15_sum AU17_sum \\\n",
"7315 388.0 0.0 83.0 ... 191.0 697.0 584.0 \n",
"7316 407.0 0.0 86.0 ... 191.0 693.0 594.0 \n",
"7317 409.0 0.0 87.0 ... 187.0 703.0 597.0 \n",
"7318 417.0 0.0 94.0 ... 169.0 711.0 603.0 \n",
"7319 436.0 0.0 100.0 ... 178.0 720.0 621.0 \n",
"\n",
" AU20_sum AU23_sum AU24_sum AU25_sum AU26_sum AU28_sum AU43_sum \n",
"7315 15.0 81.0 319.0 421.0 247.0 88.0 35.0 \n",
"7316 14.0 73.0 312.0 414.0 242.0 83.0 40.0 \n",
"7317 14.0 64.0 314.0 411.0 248.0 98.0 38.0 \n",
"7318 15.0 63.0 327.0 398.0 245.0 100.0 35.0 \n",
"7319 17.0 65.0 337.0 377.0 246.0 101.0 31.0 \n",
"\n",
"[5 rows x 25 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the last five rows.\n",
"df.tail(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "69e53731",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 7320 entries, 0 to 7319\n",
"Data columns (total 25 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 subjectID 7320 non-null int64 \n",
" 1 start_time 7320 non-null int64 \n",
" 2 STUDY 7320 non-null object \n",
" 3 LEVEL 7320 non-null int8 \n",
" 4 PHASE 7320 non-null object \n",
" 5 AU01_sum 7320 non-null float64\n",
" 6 AU02_sum 7320 non-null float64\n",
" 7 AU04_sum 7320 non-null float64\n",
" 8 AU05_sum 7320 non-null float64\n",
" 9 AU06_sum 7320 non-null float64\n",
" 10 AU07_sum 7320 non-null float64\n",
" 11 AU09_sum 7320 non-null float64\n",
" 12 AU10_sum 7320 non-null float64\n",
" 13 AU11_sum 7320 non-null float64\n",
" 14 AU12_sum 7320 non-null float64\n",
" 15 AU14_sum 7320 non-null float64\n",
" 16 AU15_sum 7320 non-null float64\n",
" 17 AU17_sum 7320 non-null float64\n",
" 18 AU20_sum 7320 non-null float64\n",
" 19 AU23_sum 7320 non-null float64\n",
" 20 AU24_sum 7320 non-null float64\n",
" 21 AU25_sum 7320 non-null float64\n",
" 22 AU26_sum 7320 non-null float64\n",
" 23 AU28_sum 7320 non-null float64\n",
" 24 AU43_sum 7320 non-null float64\n",
"dtypes: float64(20), int64(2), int8(1), object(2)\n",
"memory usage: 1.3+ MB\n"
]
}
],
"source": [
"# Print column names, non-null counts, dtypes and memory usage.\n",
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "3754c664",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"STUDY PHASE LEVEL\n",
"k-drive train 1 155\n",
" 3 156\n",
" 2 162\n",
" baseline 3 248\n",
"n-back baseline 2 252\n",
" test 5 255\n",
" 6 256\n",
" 1 258\n",
" 4 258\n",
" 2 260\n",
" 3 260\n",
"k-drive baseline 2 267\n",
" 1 896\n",
"n-back baseline 1 901\n",
"k-drive test 1 911\n",
" 2 912\n",
" 3 913\n",
"Name: count, dtype: int64"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show every STUDY / PHASE / LEVEL combination with its frequency, rarest first.\n",
"group_cols = ['STUDY', 'PHASE', 'LEVEL']\n",
"df.value_counts(subset=group_cols, ascending=True)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "f83b595c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(1031, 25)"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# n-back rows from the train/test phases at levels 2, 3, 5 and 6.\n",
"is_nback = df[\"STUDY\"].eq(\"n-back\")\n",
"is_selected_level = df[\"LEVEL\"].isin([2, 3, 5, 6])\n",
"is_task_phase = df[\"PHASE\"].isin([\"train\", \"test\"])\n",
"\n",
"high_nback = df.loc[is_nback & is_selected_level & is_task_phase]\n",
"high_nback.shape"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "c0940343",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(3080, 25)\n",
"(3209, 25)\n"
]
}
],
"source": [
"# low_all: every baseline row, plus non-baseline n-back rows at levels 1 and 4.\n",
"is_baseline = df[\"PHASE\"] == \"baseline\"\n",
"is_easy_nback = (df[\"STUDY\"] == \"n-back\") & ~is_baseline & df[\"LEVEL\"].isin([1, 4])\n",
"low_all = df[is_baseline | is_easy_nback]\n",
"print(low_all.shape)\n",
"\n",
"# high_kdrive: k-drive rows outside the baseline phase.\n",
"is_kdrive = df[\"STUDY\"] == \"k-drive\"\n",
"high_kdrive = df[is_kdrive & ~is_baseline]\n",
"print(high_kdrive.shape)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "f7ce38d3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"True\n",
"7320\n",
"7320\n"
]
}
],
"source": [
"# Sanity check: high_kdrive, high_nback and low_all must split df completely,\n",
"# with no row dropped and no row assigned to two subsets.\n",
"n_total = df.shape[0]\n",
"n_in_subsets = high_kdrive.shape[0] + high_nback.shape[0] + low_all.shape[0]\n",
"print(n_total == n_in_subsets)\n",
"print(n_total)\n",
"print(n_in_subsets)\n",
"\n",
"# Matching sizes alone would not reveal a duplicated row that is offset by a\n",
"# missing one, so also require that no row label occurs in more than one subset.\n",
"subset_labels = high_kdrive.index.append([high_nback.index, low_all.index])\n",
"assert n_total == n_in_subsets, \"subset sizes do not add up to the full dataset\"\n",
"assert subset_labels.is_unique, \"at least one row is assigned to more than one subset\""
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "48ba0379",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(4240, 25)"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Stack the n-back and k-drive high subsets into a single frame.\n",
"high_parts = [high_nback, high_kdrive]\n",
"high_all = pd.concat(high_parts)\n",
"high_all.shape"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "77dda26c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Gesamt: 7320==7320\n",
"Anzahl an low load Samples: 3080\n",
"Anzahl an high load Samples: 4240\n"
]
}
],
"source": [
"# Report the total row count next to the sizes of the low and high subsets.\n",
"n_low, n_high = len(low_all), len(high_all)\n",
"print(f\"Gesamt: {len(df)}=={n_low + n_high}\")\n",
"print(f\"Anzahl an low load Samples: {n_low}\")\n",
"print(f\"Anzahl an high load Samples: {n_high}\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}