init commit on deployment, removed EDA files

This commit is contained in:
Michael Weig 2026-01-27 18:42:40 +01:00
parent eee173dc0b
commit 5f2db4d0c9
5 changed files with 0 additions and 1420 deletions

@ -1,259 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "7440a5b3",
"metadata": {},
"outputs": [],
"source": [
"import h5py\n",
"import os\n",
"import warnings\n",
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from pathlib import Path"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2401aaef",
"metadata": {},
"outputs": [],
"source": [
"file_path = \"adabase-public-0020-v_0_0_2.h5py\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "46280999",
"metadata": {},
"outputs": [],
"source": [
"SKT_SR = 100\n",
"ECG_SR = 500\n",
"RSP_SR = 250\n",
"EMG_SR = 1000\n",
"EDA_SR = 500\n",
"EYE_SR = 250"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e23eb552",
"metadata": {},
"outputs": [],
"source": [
"df_signals = pd.read_hdf(file_path, \"SIGNALS\", mode=\"r\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b7f494d1",
"metadata": {},
"outputs": [],
"source": [
"pd.set_option('display.max_columns', None)\n",
"pd.set_option('display.max_rows', None)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dd2f4d84",
"metadata": {},
"outputs": [],
"source": [
"settings = df_signals[['STUDY','PHASE','LEVEL']]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1699ddc2",
"metadata": {},
"outputs": [],
"source": [
"settings.value_counts()"
]
},
{
"cell_type": "markdown",
"id": "a4731c56",
"metadata": {},
"source": [
"Actions units"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9db0b4b2",
"metadata": {},
"outputs": [],
"source": [
"df_signals.columns"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3ceccc89",
"metadata": {},
"outputs": [],
"source": [
"au_data = df_signals.iloc[:,-20:]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3d4ee088",
"metadata": {},
"outputs": [],
"source": [
"au_data.tail()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5d85a8cb",
"metadata": {},
"outputs": [],
"source": [
"print(au_data.shape)\n",
"print(au_data.isna().sum())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "efff356f",
"metadata": {},
"outputs": [],
"source": [
"clean_au_data = au_data.dropna()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "42ed1bcd",
"metadata": {},
"outputs": [],
"source": [
"clean_au_data.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2c7c3f14",
"metadata": {},
"outputs": [],
"source": [
"for i in range(len(clean_au_data.columns)):\n",
" print(clean_au_data.iloc[:,i].unique())"
]
},
{
"cell_type": "markdown",
"id": "332740a8",
"metadata": {},
"source": [
"Plots"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f30b8814",
"metadata": {},
"outputs": [],
"source": [
"# df_signals_ecg = pd.read_hdf(file_path, \"SIGNALS\", mode=\"r\", columns=[\"STUDY\",\"LEVEL\", \"PHASE\", 'RAW_ECG_I'])\n",
"df_signals_ecg = df_signals[[\"STUDY\",\"LEVEL\", \"PHASE\", 'RAW_ECG_I']]\n",
"df_signals_ecg.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ee80fd79",
"metadata": {},
"outputs": [],
"source": [
"study_filter = df_signals[\"STUDY\"] == \"n-back\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3ef29446",
"metadata": {},
"outputs": [],
"source": [
"fig, ax = plt.subplots(figsize=(16, 2))\n",
"# Set the number of seconds to plot\n",
"seconds = 20\n",
"# Get the ECG signal data\n",
"ecg_signal = df_signals.loc[study_filter, \"RAW_ECG_I\"].dropna()\n",
"# Set the x-axis limits to the number of samples in the specified time range\n",
"num_samples = ECG_SR * seconds\n",
"# Plot the ECG signal\n",
"ax.plot(ecg_signal.index[:num_samples]/1000, ecg_signal[:num_samples]);\n",
"ax.set_title(\"ECG I\");\n",
"ax.set_xlabel('Seconds');\n",
"# Set figure size with a 16:6 aspect ratio\n",
"fig, ax = plt.subplots(figsize=(16, 2))\n",
"# Set the number of seconds to plot\n",
"start_second = 0\n",
"end_second = 60*30\n",
"# Get the EYE signal data - we replace inf with nan to get the original signal.␣\n",
"\n",
"eye_left_signal = df_signals.loc[study_filter, \"LEFT_PUPIL_DIAMETER\"].dropna()\n",
"eye_right_signal = df_signals.loc[study_filter, \"RIGHT_PUPIL_DIAMETER\"].dropna()\n",
"#eye_left_signal = df_signals.loc[:, \"LEFT_PUPIL_DIAMETER\"].replace([np.inf],␣\n",
"\n",
"#eye_right_signal = df_signals.loc[:, \"RIGHT_PUPIL_DIAMETER\"].replace([np.inf],␣\n",
"\n",
"# Set the x-axis limits to the number of samples in the specified time range\n",
"num_samples_start = EYE_SR * start_second\n",
"num_samples_end = EYE_SR * end_second\n",
"ax.plot(eye_left_signal.index[num_samples_start:num_samples_end]/1000,eye_left_signal[num_samples_start:num_samples_end], label=\"Left\")\n",
"ax.plot(eye_right_signal.index[num_samples_start:num_samples_end]/1000,eye_right_signal[num_samples_start:num_samples_end], label=\"Right\")\n",
"ax.set_title(\"Pupil Dilation\")\n",
"ax.set_xlabel('Seconds')\n",
"ax.legend()\n",
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

@ -1,625 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "89d81009",
"metadata": {},
"source": [
"### Imports"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7440a5b3",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from pathlib import Path\n",
"from sklearn.preprocessing import StandardScaler, MinMaxScaler"
]
},
{
"cell_type": "markdown",
"id": "09b7d707",
"metadata": {},
"source": [
"### Config"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2401aaef",
"metadata": {},
"outputs": [],
"source": [
"dataset_path = Path(r\"/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/combined_dataset_25hz.parquet\")\n",
"# dataset_path = Path(r\"/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/60s_combined_dataset_25hz.parquet\")\n",
"# dataset_path = Path(r\"/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/120s_combined_dataset_25hz.parquet\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0282b0b1",
"metadata": {},
"outputs": [],
"source": [
"FILTER_MAD = True\n",
"THRESHOLD = 3.5\n",
"METHOD = 'minmax'\n",
"SCOPE = 'subject'\n",
"FILTER_SUBSETS = True"
]
},
{
"cell_type": "markdown",
"id": "a8f1716b",
"metadata": {},
"source": [
"### Calculations"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ac32444a",
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_parquet(dataset_path)\n",
"df.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3ba4401c",
"metadata": {},
"outputs": [],
"source": [
"if(FILTER_SUBSETS):\n",
" # Special filter: Keep only specific subsets\n",
"# - k-drive L1 baseline\n",
"# - n-back L1 baseline \n",
"# - k-drive test with levels 1, 2, 3\n",
"\n",
" df = df[\n",
" (\n",
" # k-drive L1 baseline\n",
" ((df['STUDY'] == 'k-drive') & \n",
" (df['LEVEL'] == 1) & \n",
" (df['PHASE'] == 'baseline'))\n",
" ) | \n",
" (\n",
" # n-back L1 baseline\n",
" ((df['STUDY'] == 'n-back') & \n",
" (df['LEVEL'] == 1) & \n",
" (df['PHASE'] == 'baseline'))\n",
" ) | \n",
" (\n",
" # k-drive test with levels 1, 2, 3\n",
" ((df['STUDY'] == 'k-drive') & \n",
" (df['LEVEL'].isin([1, 2, 3])) & \n",
" (df['PHASE'] == 'test'))\n",
" )].copy()\n",
"\n",
"print(f\"Filtered dataframe shape: {df.shape}\")\n",
"print(f\"Remaining subsets: {df.groupby(['STUDY', 'LEVEL', 'PHASE']).size()}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "77dbd6df",
"metadata": {},
"outputs": [],
"source": [
"face_au_cols = [c for c in df.columns if c.startswith(\"FACE_AU\")]\n",
"eye_cols = ['Fix_count_short_66_150', 'Fix_count_medium_300_500',\n",
" 'Fix_count_long_gt_1000', 'Fix_count_100', 'Fix_mean_duration',\n",
" 'Fix_median_duration', 'Sac_count', 'Sac_mean_amp', 'Sac_mean_dur',\n",
" 'Sac_median_dur', 'Blink_count', 'Blink_mean_dur', 'Blink_median_dur',\n",
" 'Pupil_mean', 'Pupil_IPA']\n",
"eye_cols_without_blink = ['Fix_count_short_66_150', 'Fix_count_medium_300_500',\n",
" 'Fix_count_long_gt_1000', 'Fix_count_100', 'Fix_mean_duration',\n",
" 'Fix_median_duration', 'Sac_count', 'Sac_mean_amp', 'Sac_mean_dur',\n",
" 'Sac_median_dur', 'Pupil_mean', 'Pupil_IPA']\n",
"print(len(eye_cols))\n",
"all_signal_columns = eye_cols+face_au_cols\n",
"print(len(all_signal_columns))"
]
},
{
"cell_type": "markdown",
"id": "d5e9c67a",
"metadata": {},
"source": [
"MAD"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "592291ef",
"metadata": {},
"outputs": [],
"source": [
"def calculate_mad_params(df, columns):\n",
" \"\"\"\n",
" Calculate median and MAD parameters for each column.\n",
" This should be run ONLY on the training data.\n",
" \n",
" Returns a dictionary: {col: (median, mad)}\n",
" \"\"\"\n",
" params = {}\n",
" for col in columns:\n",
" median = df[col].median()\n",
" mad = np.median(np.abs(df[col] - median))\n",
" params[col] = (median, mad)\n",
" return params\n",
"def apply_mad_filter(df, params, threshold=3.5):\n",
" \"\"\"\n",
" Apply MAD-based outlier removal using precomputed parameters.\n",
" Works on training, validation, and test data.\n",
" \n",
" df: DataFrame to filter\n",
" params: dictionary {col: (median, mad)} from training data\n",
" threshold: cutoff for robust Z-score\n",
" \"\"\"\n",
" df_clean = df.copy()\n",
"\n",
" for col, (median, mad) in params.items():\n",
" if mad == 0:\n",
" continue # no spread; nothing to remove for this column\n",
"\n",
" robust_z = 0.6745 * (df_clean[col] - median) / mad\n",
" outlier_mask = np.abs(robust_z) > threshold\n",
"\n",
" # Remove values only in this specific column\n",
" df_clean.loc[outlier_mask, col] = median\n",
" \n",
" \n",
" print(df_clean.shape)\n",
" return df_clean"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4ddad4a8",
"metadata": {},
"outputs": [],
"source": [
"if(FILTER_MAD):\n",
" mad_params = calculate_mad_params(df, all_signal_columns)\n",
" df = apply_mad_filter(df, mad_params, THRESHOLD)"
]
},
{
"cell_type": "markdown",
"id": "89387879",
"metadata": {},
"source": [
"Normalizer"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9c129cdd",
"metadata": {},
"outputs": [],
"source": [
"def fit_normalizer(train_data, au_columns, method='standard', scope='global'):\n",
" \"\"\"\n",
" Fit normalization scalers on training data.\n",
" \n",
" Parameters:\n",
" -----------\n",
" train_data : pd.DataFrame\n",
" Training dataframe with AU columns and subjectID\n",
" au_columns : list\n",
" List of AU column names to normalize\n",
" method : str, default='standard'\n",
" Normalization method: 'standard' for StandardScaler or 'minmax' for MinMaxScaler\n",
" scope : str, default='global'\n",
" Normalization scope: 'subject' for per-subject or 'global' for across all subjects\n",
" \n",
" Returns:\n",
" --------\n",
" dict\n",
" Dictionary containing fitted scalers and statistics for new subjects\n",
" \"\"\"\n",
" if method == 'standard':\n",
" Scaler = StandardScaler\n",
" elif method == 'minmax':\n",
" Scaler = MinMaxScaler\n",
" else:\n",
" raise ValueError(\"method must be 'standard' or 'minmax'\")\n",
" \n",
" scalers = {}\n",
" if scope == 'subject':\n",
" # Fit one scaler per subject\n",
" subject_stats = []\n",
" \n",
" for subject in train_data['subjectID'].unique():\n",
" subject_mask = train_data['subjectID'] == subject\n",
" scaler = Scaler()\n",
" scaler.fit(train_data.loc[subject_mask, au_columns].values)\n",
" scalers[subject] = scaler\n",
" \n",
" # Store statistics for averaging\n",
" if method == 'standard':\n",
" subject_stats.append({\n",
" 'mean': scaler.mean_,\n",
" 'std': scaler.scale_\n",
" })\n",
" elif method == 'minmax':\n",
" subject_stats.append({\n",
" 'min': scaler.data_min_,\n",
" 'max': scaler.data_max_\n",
" })\n",
" \n",
" # Calculate average statistics for new subjects\n",
" if method == 'standard':\n",
" avg_mean = np.mean([s['mean'] for s in subject_stats], axis=0)\n",
" avg_std = np.mean([s['std'] for s in subject_stats], axis=0)\n",
" fallback_scaler = StandardScaler()\n",
" fallback_scaler.mean_ = avg_mean\n",
" fallback_scaler.scale_ = avg_std\n",
" fallback_scaler.var_ = avg_std ** 2\n",
" fallback_scaler.n_features_in_ = len(au_columns)\n",
" elif method == 'minmax':\n",
" avg_min = np.mean([s['min'] for s in subject_stats], axis=0)\n",
" avg_max = np.mean([s['max'] for s in subject_stats], axis=0)\n",
" fallback_scaler = MinMaxScaler()\n",
" fallback_scaler.data_min_ = avg_min\n",
" fallback_scaler.data_max_ = avg_max\n",
" fallback_scaler.data_range_ = avg_max - avg_min\n",
" fallback_scaler.scale_ = 1.0 / fallback_scaler.data_range_\n",
" fallback_scaler.min_ = -avg_min * fallback_scaler.scale_\n",
" fallback_scaler.n_features_in_ = len(au_columns)\n",
" \n",
" scalers['_fallback'] = fallback_scaler\n",
" \n",
" elif scope == 'global':\n",
" # Fit one scaler for all subjects\n",
" scaler = Scaler()\n",
" scaler.fit(train_data[au_columns].values)\n",
" scalers['global'] = scaler\n",
" \n",
" else:\n",
" raise ValueError(\"scope must be 'subject' or 'global'\")\n",
" \n",
" return {'scalers': scalers, 'method': method, 'scope': scope}"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9cfabd37",
"metadata": {},
"outputs": [],
"source": [
"def apply_normalizer(data, columns, normalizer_dict):\n",
" \"\"\"\n",
" Apply fitted normalization scalers to data.\n",
" \n",
" Parameters:\n",
" -----------\n",
" data : pd.DataFrame\n",
" Dataframe with AU columns and subjectID\n",
" au_columns : list\n",
" List of AU column names to normalize\n",
" normalizer_dict : dict\n",
" Dictionary containing fitted scalers from fit_normalizer()\n",
" \n",
" Returns:\n",
" --------\n",
" pd.DataFrame\n",
" DataFrame with normalized AU columns\n",
" \"\"\"\n",
" normalized_data = data.copy()\n",
" scalers = normalizer_dict['scalers']\n",
" scope = normalizer_dict['scope']\n",
" normalized_data[columns] = normalized_data[columns].astype(np.float64)\n",
"\n",
" if scope == 'subject':\n",
" # Apply per-subject normalization\n",
" for subject in data['subjectID'].unique():\n",
" subject_mask = data['subjectID'] == subject\n",
" \n",
" # Use the subject's scaler if available, otherwise use fallback\n",
" if subject in scalers:\n",
" scaler = scalers[subject]\n",
" else:\n",
" # Use averaged scaler for new subjects\n",
" scaler = scalers['_fallback']\n",
" print(f\"Info: Subject {subject} not in training data. Using averaged scaler from training subjects.\")\n",
" \n",
" normalized_data.loc[subject_mask, columns] = scaler.transform(\n",
" data.loc[subject_mask, columns].values\n",
" )\n",
" \n",
" elif scope == 'global':\n",
" # Apply global normalization\n",
" scaler = scalers['global']\n",
" normalized_data[columns] = scaler.transform(data[columns].values)\n",
" \n",
" return normalized_data"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4dbbebf7",
"metadata": {},
"outputs": [],
"source": [
"scaler = fit_normalizer(df, all_signal_columns, method=METHOD, scope=SCOPE)\n",
"df_min_max_normalised = apply_normalizer(df, all_signal_columns, scaler)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6b9b2ae8",
"metadata": {},
"outputs": [],
"source": [
"a= df_min_max_normalised[['STUDY','LEVEL','PHASE']]\n",
"print(a.dtypes)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e3e1bc34",
"metadata": {},
"outputs": [],
"source": [
"# Define signal columns (adjust only once)\n",
"signal_columns = all_signal_columns\n",
"\n",
"# Get all unique combinations of STUDY, LEVEL and PHASE\n",
"unique_combinations = df_min_max_normalised[['STUDY', 'LEVEL', 'PHASE']].drop_duplicates().reset_index(drop=True)\n",
"\n",
"# Dictionary to store subsets\n",
"subsets = {}\n",
"subset_sizes = {}\n",
"\n",
"for idx, row in unique_combinations.iterrows():\n",
" study = row['STUDY']\n",
" level = row['LEVEL']\n",
" phase = row['PHASE']\n",
" key = f\"{study}_L{level}_P{phase}\"\n",
" subset = df_min_max_normalised[\n",
" (df_min_max_normalised['STUDY'] == study) & \n",
" (df_min_max_normalised['LEVEL'] == level) & \n",
" (df_min_max_normalised['PHASE'] == phase)\n",
" ]\n",
" subsets[key] = subset\n",
" subset_sizes[key] = len(subset)\n",
"\n",
"# Output subset sizes\n",
"print(\"Number of samples per subset:\")\n",
"print(\"=\" * 40)\n",
"for key, size in subset_sizes.items():\n",
" print(f\"{key}: {size} samples\")\n",
"print(\"=\" * 40)\n",
"print(f\"Total number of subsets: {len(subsets)}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c7fdeb5c",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"# Function to categorize subsets\n",
"def categorize_subset(key):\n",
" \"\"\"Categorizes a subset as 'low' or 'high' based on the given logic\"\"\"\n",
" parts = key.split('_')\n",
" study = parts[0]\n",
" level = int(parts[1][1:]) # 'L1' -> 1\n",
" phase = parts[2][1:] # 'Pbaseline' -> 'baseline'\n",
" \n",
" # LOW: baseline OR (n-back with level 1 or 4)\n",
" if phase == \"baseline\":\n",
" return 'low'\n",
" elif study == \"n-back\" and level in [1, 4]:\n",
" return 'low'\n",
" \n",
" # HIGH: (n-back with level 2,3,5,6 and phase train/test) OR (k-drive not baseline)\n",
" elif study == \"n-back\" and level in [2, 3, 5, 6] and phase in [\"train\", \"test\"]:\n",
" return 'high'\n",
" elif study == \"k-drive\" and phase != \"baseline\":\n",
" return 'high'\n",
" \n",
" return None\n",
"\n",
"# Categorize subsets\n",
"low_subsets = {}\n",
"high_subsets = {}\n",
"\n",
"for key, subset in subsets.items():\n",
" category = categorize_subset(key)\n",
" if category == 'low':\n",
" low_subsets[key] = subset\n",
" elif category == 'high':\n",
" high_subsets[key] = subset\n",
"\n",
"# Output statistics\n",
"print(\"\\n\" + \"=\" * 50)\n",
"print(\"SUBSET CATEGORIZATION\")\n",
"print(\"=\" * 50)\n",
"\n",
"print(\"\\nLOW subsets (Blue):\")\n",
"print(\"-\" * 50)\n",
"low_total = 0\n",
"for key in sorted(low_subsets.keys()):\n",
" size = subset_sizes[key]\n",
" low_total += size\n",
" print(f\" {key}: {size} samples\")\n",
"print(f\"{'TOTAL LOW:':<30} {low_total} samples\")\n",
"print(f\"{'NUMBER OF LOW SUBSETS:':<30} {len(low_subsets)}\")\n",
"\n",
"print(\"\\nHIGH subsets (Red):\")\n",
"print(\"-\" * 50)\n",
"high_total = 0\n",
"for key in sorted(high_subsets.keys()):\n",
" size = subset_sizes[key]\n",
" high_total += size\n",
" print(f\" {key}: {size} samples\")\n",
"print(f\"{'TOTAL HIGH:':<30} {high_total} samples\")\n",
"print(f\"{'NUMBER OF HIGH SUBSETS:':<30} {len(high_subsets)}\")\n",
"\n",
"print(\"\\n\" + \"=\" * 50)\n",
"print(f\"TOTAL SAMPLES: {low_total + high_total}\")\n",
"print(f\"TOTAL SUBSETS: {len(low_subsets) + len(high_subsets)}\")\n",
"print(\"=\" * 50)\n",
"\n",
"# Find minimum subset size\n",
"min_subset_size = min(subset_sizes.values())\n",
"print(f\"\\nMinimum subset size: {min_subset_size}\")\n",
"\n",
"# Number of points to plot per subset (50% of minimum size)\n",
"sampling_factor = 1\n",
"n_samples_per_subset = int(sampling_factor * min_subset_size)\n",
"print(f\"Number of randomly drawn points per subset: {n_samples_per_subset}\")"
]
},
{
"cell_type": "markdown",
"id": "ff363fc5",
"metadata": {},
"source": [
"### Plot"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3a9d9163",
"metadata": {},
"outputs": [],
"source": [
"# Create comparison plots\n",
"fig, axes = plt.subplots(len(signal_columns), 1, figsize=(14, 4 * len(signal_columns)))\n",
"\n",
"# If only one signal column exists, convert axes to list\n",
"if len(signal_columns) == 1:\n",
" axes = [axes]\n",
"\n",
"# Create a plot for each signal column\n",
"for i, signal_col in enumerate(signal_columns):\n",
" ax = axes[i]\n",
" \n",
" y_pos = 0\n",
" labels = []\n",
" \n",
" # First plot all LOW subsets (sorted, blue)\n",
" for label in sorted(low_subsets.keys()):\n",
" subset = low_subsets[label]\n",
" if len(subset) > 0 and signal_col in subset.columns:\n",
" # Draw random sample\n",
" n_samples = min(n_samples_per_subset, len(subset))\n",
" sampled_data = subset[signal_col].sample(n=n_samples, random_state=42)\n",
" \n",
" # Calculate mean and median\n",
" mean_val = subset[signal_col].mean()\n",
" median_val = subset[signal_col].median()\n",
" \n",
" # Plot points in blue\n",
" ax.scatter(sampled_data, [y_pos] * len(sampled_data), \n",
" alpha=0.5, s=30, color='blue')\n",
" \n",
" # Mean as black cross\n",
" ax.plot(mean_val, y_pos, 'x', markersize=12, markeredgewidth=3, \n",
" color='black', zorder=5)\n",
" \n",
" # Median as brown cross\n",
" ax.plot(median_val, y_pos, 'x', markersize=12, markeredgewidth=3, \n",
" color='brown', zorder=5)\n",
" \n",
" labels.append(f\"{label} (n={subset_sizes[label]})\")\n",
" y_pos += 1\n",
" \n",
" # Separation line between LOW and HIGH\n",
" if len(low_subsets) > 0 and len(high_subsets) > 0:\n",
" ax.axhline(y=y_pos - 0.5, color='gray', linestyle='--', linewidth=2, alpha=0.7)\n",
" \n",
" # Then plot all HIGH subsets (sorted, red)\n",
" for label in sorted(high_subsets.keys()):\n",
" subset = high_subsets[label]\n",
" if len(subset) > 0 and signal_col in subset.columns:\n",
" # Draw random sample\n",
" n_samples = min(n_samples_per_subset, len(subset))\n",
" sampled_data = subset[signal_col].sample(n=n_samples, random_state=42)\n",
" \n",
" # Calculate mean and median\n",
" mean_val = subset[signal_col].mean()\n",
" median_val = subset[signal_col].median()\n",
" \n",
" # Plot points in red\n",
" ax.scatter(sampled_data, [y_pos] * len(sampled_data), \n",
" alpha=0.5, s=30, color='red')\n",
" \n",
" # Mean as black cross\n",
" ax.plot(mean_val, y_pos, 'x', markersize=12, markeredgewidth=3, \n",
" color='black', zorder=5)\n",
" \n",
" # Median as brown cross\n",
" ax.plot(median_val, y_pos, 'x', markersize=12, markeredgewidth=3, \n",
" color='brown', zorder=5)\n",
" \n",
" labels.append(f\"{label} (n={subset_sizes[label]})\")\n",
" y_pos += 1\n",
" \n",
" ax.set_yticks(range(len(labels)))\n",
" ax.set_yticklabels(labels)\n",
" ax.set_xlabel(f'{signal_col} value')\n",
" ax.set_title(f'{signal_col}: LOW (Blue) vs HIGH (Red) | {n_samples_per_subset} points/subset | Black X = Mean, Brown X = Median')\n",
" ax.grid(True, alpha=0.3, axis='x')\n",
" ax.axvline(0, color='gray', linestyle='--', alpha=0.5)\n",
"\n",
"plt.tight_layout()\n",
"plt.show()\n",
"\n",
"print(f\"\\nNote: {n_samples_per_subset} random points were plotted per subset.\")\n",
"print(\"Blue points = LOW subsets | Red points = HIGH subsets\")\n",
"print(\"Black 'X' = Mean of entire subset | Brown 'X' = Median of entire subset\")\n",
"print(f\"Total subsets plotted: {len(low_subsets)} LOW + {len(high_subsets)} HIGH = {len(low_subsets) + len(high_subsets)} subsets\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

@ -1,166 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "1014c5e0",
"metadata": {},
"source": [
"Imports"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e42f3011",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0a834496",
"metadata": {},
"outputs": [],
"source": [
"path =r\"C:\\Users\\micha\\FAUbox\\WS2526_Fahrsimulator_MSY (Celina Korzer)\\AU_dataset\\output_windowed.parquet\"\n",
"df = pd.read_parquet(path=path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aa4759fa",
"metadata": {},
"outputs": [],
"source": [
"high_nback = df[\n",
" (df[\"STUDY\"]==\"n-back\") &\n",
" (df[\"LEVEL\"].isin([2, 3, 5, 6])) &\n",
" (df[\"PHASE\"].isin([\"train\", \"test\"]))\n",
"]\n",
"high_nback.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a2aa0596",
"metadata": {},
"outputs": [],
"source": [
"low_all = df[\n",
" ((df[\"PHASE\"] == \"baseline\") |\n",
" ((df[\"STUDY\"] == \"n-back\") & (df[\"PHASE\"] != \"baseline\") & (df[\"LEVEL\"].isin([1,4]))))\n",
"]\n",
"print(low_all.shape)\n",
"high_kdrive = df[\n",
" (df[\"STUDY\"] == \"k-drive\") & (df[\"PHASE\"] != \"baseline\")\n",
"]\n",
"print(high_kdrive.shape)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f7d446a1",
"metadata": {},
"outputs": [],
"source": [
"print((df.shape[0]==(high_kdrive.shape[0]+high_nback.shape[0]+low_all.shape[0])))\n",
"print(df.shape[0])\n",
"print((high_kdrive.shape[0]+high_nback.shape[0]+low_all.shape[0]))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "474e144a",
"metadata": {},
"outputs": [],
"source": [
"high_all = pd.concat([high_nback, high_kdrive])\n",
"high_all.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5dd585c2",
"metadata": {},
"outputs": [],
"source": [
"df.dtypes"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0bd39d9f",
"metadata": {},
"outputs": [],
"source": [
"# Get all columns that start with 'AU'\n",
"au_columns = [col for col in low_all.columns if col.startswith('AU')]\n",
"\n",
"# Calculate number of rows and columns for subplots\n",
"n_cols = len(au_columns)\n",
"n_rows = 4\n",
"n_cols_subplot = 5\n",
"\n",
"# Create figure with subplots\n",
"fig, axes = plt.subplots(n_rows, n_cols_subplot, figsize=(20, 16))\n",
"axes = axes.flatten()\n",
"fig.suptitle('Action Unit (AU) Distributions: Low vs High', fontsize=20, fontweight='bold', y=0.995)\n",
"\n",
"# Create histogram for each AU column\n",
"for idx, col in enumerate(au_columns):\n",
" ax = axes[idx]\n",
" \n",
" # Plot overlapping histograms\n",
" ax.hist(low_all[col].dropna(), bins=30, alpha=0.6, color='blue', label='low_all', edgecolor='black')\n",
" ax.hist(high_all[col].dropna(), bins=30, alpha=0.6, color='red', label='high_all', edgecolor='black')\n",
" \n",
" # Set title and labels\n",
" ax.set_title(col, fontsize=10, fontweight='bold')\n",
" ax.set_xlabel('Value', fontsize=8)\n",
" ax.set_ylabel('Frequency', fontsize=8)\n",
" ax.legend(fontsize=8)\n",
" ax.grid(True, alpha=0.3)\n",
"\n",
"# Hide any unused subplots\n",
"for idx in range(len(au_columns), len(axes)):\n",
" axes[idx].set_visible(False)\n",
"\n",
"# Adjust layout\n",
"plt.tight_layout()\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

@ -1,157 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "aab6b326-a583-47ad-8bb7-723c2fddcc63",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# %pip install pyocclient\n",
"import yaml\n",
"import owncloud\n",
"import pandas as pd\n",
"import time"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4f42846c-27c3-4394-a40a-e22d73c2902e",
"metadata": {},
"outputs": [],
"source": [
"start = time.time()\n",
"\n",
"with open(\"../login.yaml\") as f:\n",
" cfg = yaml.safe_load(f)\n",
"url, password = cfg[0][\"url\"], cfg[1][\"password\"]\n",
"file = \"adabase-public-0022-v_0_0_2.h5py\"\n",
"oc = owncloud.Client.from_public_link(url, folder_password=password)\n",
"\n",
"\n",
"oc.get_file(file, \"tmp22.h5\")\n",
"\n",
"end = time.time()\n",
"print(end - start)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3714dec2-85d0-4f76-af46-ea45ebec2fa3",
"metadata": {},
"outputs": [],
"source": [
"start = time.time()\n",
"df_performance = pd.read_hdf(\"tmp22.h5\", \"PERFORMANCE\")\n",
"end = time.time()\n",
"print(end - start)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f50e97d0",
"metadata": {},
"outputs": [],
"source": [
"print(22)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c131c816",
"metadata": {},
"outputs": [],
"source": [
"df_performance"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6ae47e52-ad86-4f8d-b929-0080dc99f646",
"metadata": {},
"outputs": [],
"source": [
"start = time.time()\n",
"df_4_col = pd.read_hdf(\"tmp.h5\", \"SIGNALS\", mode=\"r\", columns=[\"STUDY\"], start=0, stop=1)\n",
"end = time.time()\n",
"print(end - start)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7c139f3a-ede8-4530-957d-d1bb939f6cb5",
"metadata": {},
"outputs": [],
"source": [
"df_4_col.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a68d58ea-65f2-46c4-a2b2-8c3447c715d7",
"metadata": {},
"outputs": [],
"source": [
"df_4_col.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "95aa4523-3784-4ab6-bf92-0227ce60e863",
"metadata": {},
"outputs": [],
"source": [
"df_4_col.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "defbcaf4-ad1b-453f-9b48-ab0ecfc4b5d5",
"metadata": {},
"outputs": [],
"source": [
"df_4_col.isna().sum()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "72313895-c478-44a5-9108-00b0bec01bb8",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

@ -1,213 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "8fb02733",
"metadata": {},
"source": [
"Imports"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "96f3b128",
"metadata": {},
"outputs": [],
"source": [
"import yaml\n",
"import owncloud\n",
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "markdown",
"id": "c20cee7c",
"metadata": {},
"source": [
"Connection to Owncloud"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c4c94558",
"metadata": {},
"outputs": [],
"source": [
"# Load credentials\n",
"with open(\"../login.yaml\") as f:\n",
" cfg = yaml.safe_load(f)\n",
" \n",
"url, password = cfg[0][\"url\"], cfg[1][\"password\"]\n",
"\n",
"# Connect once\n",
"oc = owncloud.Client.from_public_link(url, folder_password=password)\n",
"# File pattern\n",
"# base = \"adabase-public-{num:04d}-v_0_0_2.h5py\"\n",
"base = \"{num:04d}-*.h5py\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "07c03d07",
"metadata": {},
"outputs": [],
"source": [
"num_files = 2 # number of files to process (min: 1, max: 30)\n",
"performance_data = []\n",
"\n",
"for i in range(num_files):\n",
" file_pattern = f\"{i:04d}-*\"\n",
" \n",
" # Get list of files matching the pattern\n",
" files = oc.list('.')\n",
" matching_files = [f.get_name() for f in files if f.get_name().startswith(f\"{i:04d}-\")]\n",
" \n",
" if matching_files:\n",
" file_name = matching_files[0] # Take the first matching file\n",
" local_tmp = f\"tmp_{i:04d}.h5\"\n",
" \n",
" oc.get_file(file_name, local_tmp)\n",
" print(f\"{file_name} geöffnet\")\n",
" else:\n",
" print(f\"Keine Datei gefunden für Muster: {file_pattern}\")\n",
" # file_name = base.format(num=i)\n",
" # local_tmp = f\"tmp_{i:04d}.h5\"\n",
"\n",
" # oc.get_file(file_name, local_tmp)\n",
" # print(f\"{file_name} geöffnet\")\n",
"\n",
" # check SIGNALS table for AUs\n",
" with pd.HDFStore(local_tmp, mode=\"r\") as store:\n",
" cols = store.select(\"SIGNALS\", start=0, stop=1).columns\n",
" au_cols = [c for c in cols if c.startswith(\"AU\")]\n",
" if not au_cols:\n",
" print(f\"Subject {i} enthält keine AUs\")\n",
" continue\n",
"\n",
" # load performance table\n",
" with pd.HDFStore(local_tmp, mode=\"r\") as store:\n",
" perf_df = store.select(\"PERFORMANCE\")\n",
"\n",
" f1_cols = [c for c in [\"AUDITIVE F1\", \"VISUAL F1\", \"F1\"] if c in perf_df.columns]\n",
" if not f1_cols:\n",
" print(f\"Subject {i}: keine F1-Spalten gefunden\")\n",
" continue\n",
"\n",
" subject_entry = {\"subjectID\": i}\n",
" valid_scores = []\n",
"\n",
" # iterate rows: each (study, level, phase)\n",
" for _, row in perf_df.iterrows():\n",
" study, level, phase = row[\"STUDY\"], row[\"LEVEL\"], row[\"PHASE\"]\n",
" col_name = f\"STUDY_{study}_LEVEL_{level}_PHASE_{phase}\"\n",
"\n",
" # collect valid F1 values among the three columns\n",
" scores = [row[c] for c in f1_cols if pd.notna(row[c])]\n",
" if scores:\n",
" mean_score = float(np.mean(scores))\n",
" subject_entry[col_name] = mean_score\n",
" valid_scores.extend(scores)\n",
"\n",
" # compute overall average across all valid combinations\n",
" if valid_scores:\n",
" subject_entry[\"overall_score\"] = float(np.mean(valid_scores))\n",
" performance_data.append(subject_entry)\n",
" print(f\"Subject {i}: {len(valid_scores)} gültige Scores, Overall = {subject_entry['overall_score']:.3f}\")\n",
" else:\n",
" print(f\"Subject {i}: keine gültigen F1-Scores\")\n",
"\n",
"# build dataframe\n",
"if performance_data:\n",
" performance_df = pd.DataFrame(performance_data)\n",
" combination_cols = sorted([c for c in performance_df.columns if c.startswith(\"STUDY_\")])\n",
" final_cols = [\"subjectID\", \"overall_score\"] + combination_cols\n",
" performance_df = performance_df[final_cols]\n",
" performance_df.to_csv(\"n_au_performance.csv\", index=False)\n",
"\n",
" print(f\"\\nGesamt Subjects mit Action Units: {len(performance_df)}\")\n",
"else:\n",
" print(\"Keine gültigen Daten gefunden.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0bcaf065",
"metadata": {},
"outputs": [],
"source": [
"performance_df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "db95eea7",
"metadata": {},
"outputs": [],
"source": [
"with pd.HDFStore(\"tmp_0000.h5\", mode=\"r\") as store:\n",
" md = store.select(\"META\")\n",
"print(\"File 0:\")\n",
"print(md)\n",
"with pd.HDFStore(\"tmp_0001.h5\", mode=\"r\") as store:\n",
" md = store.select(\"META\")\n",
"print(\"File 1\")\n",
"print(md)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8067036b",
"metadata": {},
"outputs": [],
"source": [
"pd.set_option('display.max_columns', None)\n",
"pd.set_option('display.max_rows', None)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f18e7385",
"metadata": {},
"outputs": [],
"source": [
"with pd.HDFStore(\"tmp_0000.h5\", mode=\"r\") as store:\n",
" md = store.select(\"SIGNALS\", start=0, stop=1)\n",
"print(\"File 0:\")\n",
"md.head()\n",
"# with pd.HDFStore(\"tmp_0001.h5\", mode=\"r\",start=0, stop=1) as store:\n",
"# md = store.select(\"SIGNALS\")\n",
"# print(\"File 1\")\n",
"# print(md.columns)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}