diff --git a/EDA/EDA.ipynb b/EDA/EDA.ipynb
deleted file mode 100644
index 76844a5..0000000
--- a/EDA/EDA.ipynb
+++ /dev/null
@@ -1,259 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "7440a5b3",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import h5py\n",
-    "import os\n",
-    "import warnings\n",
-    "import pandas as pd\n",
-    "import numpy as np\n",
-    "import matplotlib.pyplot as plt\n",
-    "from pathlib import Path"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "2401aaef",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "file_path = \"adabase-public-0020-v_0_0_2.h5py\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "46280999",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "SKT_SR = 100\n",
-    "ECG_SR = 500\n",
-    "RSP_SR = 250\n",
-    "EMG_SR = 1000\n",
-    "EDA_SR = 500\n",
-    "EYE_SR = 250"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e23eb552",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df_signals = pd.read_hdf(file_path, \"SIGNALS\", mode=\"r\")\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "b7f494d1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "pd.set_option('display.max_columns', None)\n",
-    "pd.set_option('display.max_rows', None)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "dd2f4d84",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "settings = df_signals[['STUDY', 'PHASE', 'LEVEL']]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "1699ddc2",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "settings.value_counts()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "a4731c56",
-   "metadata": {},
-   "source": [
-    "Action units"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "9db0b4b2",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df_signals.columns"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "3ceccc89",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "au_data = df_signals.iloc[:, -20:]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "3d4ee088",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "au_data.tail()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "5d85a8cb",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(au_data.shape)\n",
-    "print(au_data.isna().sum())"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "efff356f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "clean_au_data = au_data.dropna()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "42ed1bcd",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "clean_au_data.shape"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "2c7c3f14",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "for i in range(len(clean_au_data.columns)):\n",
-    "    print(clean_au_data.iloc[:, i].unique())"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "332740a8",
-   "metadata": {},
-   "source": [
-    "Plots"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "f30b8814",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# df_signals_ecg = pd.read_hdf(file_path, \"SIGNALS\", mode=\"r\", columns=[\"STUDY\", \"LEVEL\", \"PHASE\", 'RAW_ECG_I'])\n",
-    "df_signals_ecg = df_signals[[\"STUDY\", \"LEVEL\", \"PHASE\", 'RAW_ECG_I']]\n",
-    "df_signals_ecg.shape"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "ee80fd79",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "study_filter = df_signals[\"STUDY\"] == \"n-back\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "3ef29446",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "fig, ax = plt.subplots(figsize=(16, 2))\n",
-    "# Set the number of seconds to plot\n",
-    "seconds = 20\n",
-    "# Get the ECG signal data\n",
-    "ecg_signal = df_signals.loc[study_filter, \"RAW_ECG_I\"].dropna()\n",
-    "# Set the x-axis limits to the number of samples in the specified time range\n",
-    "num_samples = ECG_SR * seconds\n",
-    "# Plot the ECG signal\n",
-    "ax.plot(ecg_signal.index[:num_samples]/1000, ecg_signal[:num_samples]);\n",
-    "ax.set_title(\"ECG I\");\n",
-    "ax.set_xlabel('Seconds');\n",
-    "\n",
-    "# Set figure size with a 16:2 aspect ratio\n",
-    "fig, ax = plt.subplots(figsize=(16, 2))\n",
-    "# Set the time range to plot\n",
-    "start_second = 0\n",
-    "end_second = 60*30\n",
-    "# Get the EYE signal data - we replace inf with nan to get the original signal.\n",
-    "eye_left_signal = df_signals.loc[study_filter, \"LEFT_PUPIL_DIAMETER\"].dropna()\n",
-    "eye_right_signal = df_signals.loc[study_filter, \"RIGHT_PUPIL_DIAMETER\"].dropna()\n",
-    "# eye_left_signal = df_signals.loc[:, \"LEFT_PUPIL_DIAMETER\"].replace([np.inf], np.nan)\n",
-    "# eye_right_signal = df_signals.loc[:, \"RIGHT_PUPIL_DIAMETER\"].replace([np.inf], np.nan)\n",
-    "# Set the x-axis limits to the number of samples in the specified time range\n",
-    "num_samples_start = EYE_SR * start_second\n",
-    "num_samples_end = EYE_SR * end_second\n",
-    "ax.plot(eye_left_signal.index[num_samples_start:num_samples_end]/1000, eye_left_signal[num_samples_start:num_samples_end], label=\"Left\")\n",
-    "ax.plot(eye_right_signal.index[num_samples_start:num_samples_end]/1000, eye_right_signal[num_samples_start:num_samples_end], label=\"Right\")\n",
-    "ax.set_title(\"Pupil Dilation\")\n",
-    "ax.set_xlabel('Seconds')\n",
-    "ax.legend()"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "base",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.5"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
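The plotting cells above convert a time range into a sample range by hand (`num_samples = ECG_SR * seconds`). A minimal sketch of that conversion as a reusable helper — `slice_seconds` is a hypothetical name, and it assumes the series is contiguous and gap-free at the stated sampling rate:

import pandas as pd

def slice_seconds(signal: pd.Series, sr: int, start_s: float, end_s: float) -> pd.Series:
    # Convert seconds to sample positions for a fixed-rate, gap-free signal.
    return signal.iloc[int(sr * start_s):int(sr * end_s)]

# e.g. the first 20 s of the ECG trace at 500 Hz:
# slice_seconds(ecg_signal, 500, 0, 20)
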
"metadata": {}, - "outputs": [], - "source": [ - "study_filter = df_signals[\"STUDY\"] == \"n-back\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3ef29446", - "metadata": {}, - "outputs": [], - "source": [ - "fig, ax = plt.subplots(figsize=(16, 2))\n", - "# Set the number of seconds to plot\n", - "seconds = 20\n", - "# Get the ECG signal data\n", - "ecg_signal = df_signals.loc[study_filter, \"RAW_ECG_I\"].dropna()\n", - "# Set the x-axis limits to the number of samples in the specified time range\n", - "num_samples = ECG_SR * seconds\n", - "# Plot the ECG signal\n", - "ax.plot(ecg_signal.index[:num_samples]/1000, ecg_signal[:num_samples]);\n", - "ax.set_title(\"ECG I\");\n", - "ax.set_xlabel('Seconds');\n", - "# Set figure size with a 16:6 aspect ratio\n", - "fig, ax = plt.subplots(figsize=(16, 2))\n", - "# Set the number of seconds to plot\n", - "start_second = 0\n", - "end_second = 60*30\n", - "# Get the EYE signal data - we replace inf with nan to get the original signal.␣\n", - "\n", - "eye_left_signal = df_signals.loc[study_filter, \"LEFT_PUPIL_DIAMETER\"].dropna()\n", - "eye_right_signal = df_signals.loc[study_filter, \"RIGHT_PUPIL_DIAMETER\"].dropna()\n", - "#eye_left_signal = df_signals.loc[:, \"LEFT_PUPIL_DIAMETER\"].replace([np.inf],␣\n", - "\n", - "#eye_right_signal = df_signals.loc[:, \"RIGHT_PUPIL_DIAMETER\"].replace([np.inf],␣\n", - "\n", - "# Set the x-axis limits to the number of samples in the specified time range\n", - "num_samples_start = EYE_SR * start_second\n", - "num_samples_end = EYE_SR * end_second\n", - "ax.plot(eye_left_signal.index[num_samples_start:num_samples_end]/1000,eye_left_signal[num_samples_start:num_samples_end], label=\"Left\")\n", - "ax.plot(eye_right_signal.index[num_samples_start:num_samples_end]/1000,eye_right_signal[num_samples_start:num_samples_end], label=\"Right\")\n", - "ax.set_title(\"Pupil Dilation\")\n", - "ax.set_xlabel('Seconds')\n", - "ax.legend()\n", - "\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "base", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.5" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/EDA/distribution_plots.ipynb b/EDA/distribution_plots.ipynb deleted file mode 100644 index a73877a..0000000 --- a/EDA/distribution_plots.ipynb +++ /dev/null @@ -1,625 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "89d81009", - "metadata": {}, - "source": [ - "### Imports" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7440a5b3", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "import matplotlib.pyplot as plt\n", - "from pathlib import Path\n", - "from sklearn.preprocessing import StandardScaler, MinMaxScaler" - ] - }, - { - "cell_type": "markdown", - "id": "09b7d707", - "metadata": {}, - "source": [ - "### Config" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2401aaef", - "metadata": {}, - "outputs": [], - "source": [ - "dataset_path = Path(r\"/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/combined_dataset_25hz.parquet\")\n", - "# dataset_path = Path(r\"/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/60s_combined_dataset_25hz.parquet\")\n", - "# dataset_path = 
-  {
-   "cell_type": "markdown",
-   "id": "d5e9c67a",
-   "metadata": {},
-   "source": [
-    "MAD"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "592291ef",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def calculate_mad_params(df, columns):\n",
-    "    \"\"\"\n",
-    "    Calculate median and MAD parameters for each column.\n",
-    "    This should be run ONLY on the training data.\n",
-    "\n",
-    "    Returns a dictionary: {col: (median, mad)}\n",
-    "    \"\"\"\n",
-    "    params = {}\n",
-    "    for col in columns:\n",
-    "        median = df[col].median()\n",
-    "        mad = np.median(np.abs(df[col] - median))\n",
-    "        params[col] = (median, mad)\n",
-    "    return params\n",
-    "\n",
-    "\n",
-    "def apply_mad_filter(df, params, threshold=3.5):\n",
-    "    \"\"\"\n",
-    "    Apply MAD-based outlier removal using precomputed parameters.\n",
-    "    Works on training, validation, and test data.\n",
-    "\n",
-    "    df: DataFrame to filter\n",
-    "    params: dictionary {col: (median, mad)} from training data\n",
-    "    threshold: cutoff for the robust z-score\n",
-    "    \"\"\"\n",
-    "    df_clean = df.copy()\n",
-    "\n",
-    "    for col, (median, mad) in params.items():\n",
-    "        if mad == 0:\n",
-    "            continue  # no spread; nothing to remove for this column\n",
-    "\n",
-    "        robust_z = 0.6745 * (df_clean[col] - median) / mad\n",
-    "        outlier_mask = np.abs(robust_z) > threshold\n",
-    "\n",
-    "        # Replace outliers with the training median, only in this column\n",
-    "        df_clean.loc[outlier_mask, col] = median\n",
-    "\n",
-    "    print(df_clean.shape)\n",
-    "    return df_clean"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "4ddad4a8",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "if FILTER_MAD:\n",
-    "    mad_params = calculate_mad_params(df, all_signal_columns)\n",
-    "    df = apply_mad_filter(df, mad_params, THRESHOLD)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "89387879",
-   "metadata": {},
-   "source": [
-    "Normalizer"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "9c129cdd",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def fit_normalizer(train_data, au_columns, method='standard', scope='global'):\n",
-    "    \"\"\"\n",
-    "    Fit normalization scalers on training data.\n",
-    "\n",
-    "    Parameters:\n",
-    "    -----------\n",
-    "    train_data : pd.DataFrame\n",
-    "        Training dataframe with AU columns and subjectID\n",
-    "    au_columns : list\n",
-    "        List of AU column names to normalize\n",
-    "    method : str, default='standard'\n",
-    "        Normalization method: 'standard' for StandardScaler or 'minmax' for MinMaxScaler\n",
-    "    scope : str, default='global'\n",
-    "        Normalization scope: 'subject' for per-subject or 'global' for across all subjects\n",
-    "\n",
-    "    Returns:\n",
-    "    --------\n",
-    "    dict\n",
-    "        Dictionary containing fitted scalers and statistics for new subjects\n",
-    "    \"\"\"\n",
-    "    if method == 'standard':\n",
-    "        Scaler = StandardScaler\n",
-    "    elif method == 'minmax':\n",
-    "        Scaler = MinMaxScaler\n",
-    "    else:\n",
-    "        raise ValueError(\"method must be 'standard' or 'minmax'\")\n",
-    "\n",
-    "    scalers = {}\n",
-    "    if scope == 'subject':\n",
-    "        # Fit one scaler per subject\n",
-    "        subject_stats = []\n",
-    "\n",
-    "        for subject in train_data['subjectID'].unique():\n",
-    "            subject_mask = train_data['subjectID'] == subject\n",
-    "            scaler = Scaler()\n",
-    "            scaler.fit(train_data.loc[subject_mask, au_columns].values)\n",
-    "            scalers[subject] = scaler\n",
-    "\n",
-    "            # Store statistics for averaging\n",
-    "            if method == 'standard':\n",
-    "                subject_stats.append({\n",
-    "                    'mean': scaler.mean_,\n",
-    "                    'std': scaler.scale_\n",
-    "                })\n",
-    "            elif method == 'minmax':\n",
-    "                subject_stats.append({\n",
-    "                    'min': scaler.data_min_,\n",
-    "                    'max': scaler.data_max_\n",
-    "                })\n",
-    "\n",
-    "        # Calculate average statistics for new subjects\n",
-    "        if method == 'standard':\n",
-    "            avg_mean = np.mean([s['mean'] for s in subject_stats], axis=0)\n",
-    "            avg_std = np.mean([s['std'] for s in subject_stats], axis=0)\n",
-    "            fallback_scaler = StandardScaler()\n",
-    "            fallback_scaler.mean_ = avg_mean\n",
-    "            fallback_scaler.scale_ = avg_std\n",
-    "            fallback_scaler.var_ = avg_std ** 2\n",
-    "            fallback_scaler.n_features_in_ = len(au_columns)\n",
-    "        elif method == 'minmax':\n",
-    "            avg_min = np.mean([s['min'] for s in subject_stats], axis=0)\n",
-    "            avg_max = np.mean([s['max'] for s in subject_stats], axis=0)\n",
-    "            fallback_scaler = MinMaxScaler()\n",
-    "            fallback_scaler.data_min_ = avg_min\n",
-    "            fallback_scaler.data_max_ = avg_max\n",
-    "            fallback_scaler.data_range_ = avg_max - avg_min\n",
-    "            fallback_scaler.scale_ = 1.0 / fallback_scaler.data_range_\n",
-    "            fallback_scaler.min_ = -avg_min * fallback_scaler.scale_\n",
-    "            fallback_scaler.n_features_in_ = len(au_columns)\n",
-    "\n",
-    "        scalers['_fallback'] = fallback_scaler\n",
-    "\n",
-    "    elif scope == 'global':\n",
-    "        # Fit one scaler for all subjects\n",
-    "        scaler = Scaler()\n",
-    "        scaler.fit(train_data[au_columns].values)\n",
-    "        scalers['global'] = scaler\n",
-    "\n",
-    "    else:\n",
-    "        raise ValueError(\"scope must be 'subject' or 'global'\")\n",
-    "\n",
-    "    return {'scalers': scalers, 'method': method, 'scope': scope}"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "9cfabd37",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def apply_normalizer(data, columns, normalizer_dict):\n",
-    "    \"\"\"\n",
-    "    Apply fitted normalization scalers to data.\n",
-    "\n",
-    "    Parameters:\n",
-    "    -----------\n",
-    "    data : pd.DataFrame\n",
-    "        Dataframe with the columns to normalize and subjectID\n",
-    "    columns : list\n",
-    "        List of column names to normalize\n",
-    "    normalizer_dict : dict\n",
-    "        Dictionary containing fitted scalers from fit_normalizer()\n",
-    "\n",
-    "    Returns:\n",
-    "    --------\n",
-    "    pd.DataFrame\n",
-    "        DataFrame with normalized columns\n",
-    "    \"\"\"\n",
-    "    normalized_data = data.copy()\n",
-    "    scalers = normalizer_dict['scalers']\n",
-    "    scope = normalizer_dict['scope']\n",
-    "    normalized_data[columns] = normalized_data[columns].astype(np.float64)\n",
-    "\n",
-    "    if scope == 'subject':\n",
-    "        # Apply per-subject normalization\n",
-    "        for subject in data['subjectID'].unique():\n",
-    "            subject_mask = data['subjectID'] == subject\n",
-    "\n",
-    "            # Use the subject's scaler if available, otherwise use the fallback\n",
-    "            if subject in scalers:\n",
-    "                scaler = scalers[subject]\n",
-    "            else:\n",
-    "                # Use the averaged scaler for new subjects\n",
-    "                scaler = scalers['_fallback']\n",
-    "                print(f\"Info: Subject {subject} not in training data. Using averaged scaler from training subjects.\")\n",
-    "\n",
-    "            normalized_data.loc[subject_mask, columns] = scaler.transform(\n",
-    "                data.loc[subject_mask, columns].values\n",
-    "            )\n",
-    "\n",
-    "    elif scope == 'global':\n",
-    "        # Apply global normalization\n",
-    "        scaler = scalers['global']\n",
-    "        normalized_data[columns] = scaler.transform(data[columns].values)\n",
-    "\n",
-    "    return normalized_data"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "4dbbebf7",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "scaler = fit_normalizer(df, all_signal_columns, method=METHOD, scope=SCOPE)\n",
-    "df_min_max_normalised = apply_normalizer(df, all_signal_columns, scaler)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "6b9b2ae8",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "a = df_min_max_normalised[['STUDY', 'LEVEL', 'PHASE']]\n",
-    "print(a.dtypes)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e3e1bc34",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Define signal columns (adjust only once)\n",
-    "signal_columns = all_signal_columns\n",
-    "\n",
-    "# Get all unique combinations of STUDY, LEVEL and PHASE\n",
-    "unique_combinations = df_min_max_normalised[['STUDY', 'LEVEL', 'PHASE']].drop_duplicates().reset_index(drop=True)\n",
-    "\n",
-    "# Dictionary to store subsets\n",
-    "subsets = {}\n",
-    "subset_sizes = {}\n",
-    "\n",
-    "for idx, row in unique_combinations.iterrows():\n",
-    "    study = row['STUDY']\n",
-    "    level = row['LEVEL']\n",
-    "    phase = row['PHASE']\n",
-    "    key = f\"{study}_L{level}_P{phase}\"\n",
-    "    subset = df_min_max_normalised[\n",
-    "        (df_min_max_normalised['STUDY'] == study) &\n",
-    "        (df_min_max_normalised['LEVEL'] == level) &\n",
-    "        (df_min_max_normalised['PHASE'] == phase)\n",
-    "    ]\n",
-    "    subsets[key] = subset\n",
-    "    subset_sizes[key] = len(subset)\n",
-    "\n",
-    "# Output subset sizes\n",
-    "print(\"Number of samples per subset:\")\n",
-    "print(\"=\" * 40)\n",
-    "for key, size in subset_sizes.items():\n",
-    "    print(f\"{key}: {size} samples\")\n",
-    "print(\"=\" * 40)\n",
-    "print(f\"Total number of subsets: {len(subsets)}\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "c7fdeb5c",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Function to categorize subsets\n",
-    "def categorize_subset(key):\n",
-    "    \"\"\"Categorizes a subset as 'low' or 'high' based on the given logic\"\"\"\n",
-    "    parts = key.split('_')\n",
-    "    study = parts[0]\n",
-    "    level = int(parts[1][1:])  # 'L1' -> 1\n",
-    "    phase = parts[2][1:]  # 'Pbaseline' -> 'baseline'\n",
-    "\n",
-    "    # LOW: baseline OR (n-back with level 1 or 4)\n",
-    "    if phase == \"baseline\":\n",
-    "        return 'low'\n",
-    "    elif study == \"n-back\" and level in [1, 4]:\n",
-    "        return 'low'\n",
-    "\n",
-    "    # HIGH: (n-back with level 2,3,5,6 and phase train/test) OR (k-drive not baseline)\n",
-    "    elif study == \"n-back\" and level in [2, 3, 5, 6] and phase in [\"train\", \"test\"]:\n",
-    "        return 'high'\n",
-    "    elif study == \"k-drive\" and phase != \"baseline\":\n",
-    "        return 'high'\n",
-    "\n",
-    "    return None\n",
-    "\n",
-    "# Categorize subsets\n",
-    "low_subsets = {}\n",
-    "high_subsets = {}\n",
-    "\n",
-    "for key, subset in subsets.items():\n",
-    "    category = categorize_subset(key)\n",
-    "    if category == 'low':\n",
-    "        low_subsets[key] = subset\n",
-    "    elif category == 'high':\n",
-    "        high_subsets[key] = subset\n",
-    "\n",
-    "# Output statistics\n",
-    "print(\"\\n\" + \"=\" * 50)\n",
-    "print(\"SUBSET CATEGORIZATION\")\n",
-    "print(\"=\" * 50)\n",
-    "\n",
-    "print(\"\\nLOW subsets (Blue):\")\n",
-    "print(\"-\" * 50)\n",
-    "low_total = 0\n",
-    "for key in sorted(low_subsets.keys()):\n",
-    "    size = subset_sizes[key]\n",
-    "    low_total += size\n",
-    "    print(f\"  {key}: {size} samples\")\n",
-    "print(f\"{'TOTAL LOW:':<30} {low_total} samples\")\n",
-    "print(f\"{'NUMBER OF LOW SUBSETS:':<30} {len(low_subsets)}\")\n",
-    "\n",
-    "print(\"\\nHIGH subsets (Red):\")\n",
-    "print(\"-\" * 50)\n",
-    "high_total = 0\n",
-    "for key in sorted(high_subsets.keys()):\n",
-    "    size = subset_sizes[key]\n",
-    "    high_total += size\n",
-    "    print(f\"  {key}: {size} samples\")\n",
-    "print(f\"{'TOTAL HIGH:':<30} {high_total} samples\")\n",
-    "print(f\"{'NUMBER OF HIGH SUBSETS:':<30} {len(high_subsets)}\")\n",
-    "\n",
-    "print(\"\\n\" + \"=\" * 50)\n",
-    "print(f\"TOTAL SAMPLES: {low_total + high_total}\")\n",
-    "print(f\"TOTAL SUBSETS: {len(low_subsets) + len(high_subsets)}\")\n",
-    "print(\"=\" * 50)\n",
-    "\n",
-    "# Find minimum subset size\n",
-    "min_subset_size = min(subset_sizes.values())\n",
-    "print(f\"\\nMinimum subset size: {min_subset_size}\")\n",
-    "\n",
-    "# Number of points to plot per subset (sampling_factor x minimum subset size)\n",
-    "sampling_factor = 1\n",
-    "n_samples_per_subset = int(sampling_factor * min_subset_size)\n",
-    "print(f\"Number of randomly drawn points per subset: {n_samples_per_subset}\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "ff363fc5",
-   "metadata": {},
-   "source": [
-    "### Plot"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "3a9d9163",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Create comparison plots\n",
-    "fig, axes = plt.subplots(len(signal_columns), 1, figsize=(14, 4 * len(signal_columns)))\n",
-    "\n",
-    "# If only one signal column exists, convert axes to list\n",
-    "if len(signal_columns) == 1:\n",
-    "    axes = [axes]\n",
-    "\n",
-    "# Create a plot for each signal column\n",
-    "for i, signal_col in enumerate(signal_columns):\n",
-    "    ax = axes[i]\n",
-    "\n",
-    "    y_pos = 0\n",
-    "    labels = []\n",
-    "\n",
-    "    # First plot all LOW subsets (sorted, blue)\n",
-    "    for label in sorted(low_subsets.keys()):\n",
-    "        subset = low_subsets[label]\n",
-    "        if len(subset) > 0 and signal_col in subset.columns:\n",
-    "            # Draw a random sample\n",
-    "            n_samples = min(n_samples_per_subset, len(subset))\n",
-    "            sampled_data = subset[signal_col].sample(n=n_samples, random_state=42)\n",
-    "\n",
-    "            # Calculate mean and median\n",
-    "            mean_val = subset[signal_col].mean()\n",
-    "            median_val = subset[signal_col].median()\n",
-    "\n",
-    "            # Plot points in blue\n",
-    "            ax.scatter(sampled_data, [y_pos] * len(sampled_data),\n",
-    "                       alpha=0.5, s=30, color='blue')\n",
-    "\n",
-    "            # Mean as black cross\n",
-    "            ax.plot(mean_val, y_pos, 'x', markersize=12, markeredgewidth=3,\n",
-    "                    color='black', zorder=5)\n",
-    "\n",
-    "            # Median as brown cross\n",
-    "            ax.plot(median_val, y_pos, 'x', markersize=12, markeredgewidth=3,\n",
-    "                    color='brown', zorder=5)\n",
-    "\n",
-    "            labels.append(f\"{label} (n={subset_sizes[label]})\")\n",
-    "            y_pos += 1\n",
-    "\n",
-    "    # Separation line between LOW and HIGH\n",
-    "    if len(low_subsets) > 0 and len(high_subsets) > 0:\n",
-    "        ax.axhline(y=y_pos - 0.5, color='gray', linestyle='--', linewidth=2, alpha=0.7)\n",
-    "\n",
-    "    # Then plot all HIGH subsets (sorted, red)\n",
-    "    for label in sorted(high_subsets.keys()):\n",
-    "        subset = high_subsets[label]\n",
-    "        if len(subset) > 0 and signal_col in subset.columns:\n",
-    "            # Draw a random sample\n",
-    "            n_samples = min(n_samples_per_subset, len(subset))\n",
-    "            sampled_data = subset[signal_col].sample(n=n_samples, random_state=42)\n",
-    "\n",
-    "            # Calculate mean and median\n",
-    "            mean_val = subset[signal_col].mean()\n",
-    "            median_val = subset[signal_col].median()\n",
-    "\n",
-    "            # Plot points in red\n",
-    "            ax.scatter(sampled_data, [y_pos] * len(sampled_data),\n",
-    "                       alpha=0.5, s=30, color='red')\n",
-    "\n",
-    "            # Mean as black cross\n",
-    "            ax.plot(mean_val, y_pos, 'x', markersize=12, markeredgewidth=3,\n",
-    "                    color='black', zorder=5)\n",
-    "\n",
-    "            # Median as brown cross\n",
-    "            ax.plot(median_val, y_pos, 'x', markersize=12, markeredgewidth=3,\n",
-    "                    color='brown', zorder=5)\n",
-    "\n",
-    "            labels.append(f\"{label} (n={subset_sizes[label]})\")\n",
-    "            y_pos += 1\n",
-    "\n",
-    "    ax.set_yticks(range(len(labels)))\n",
-    "    ax.set_yticklabels(labels)\n",
-    "    ax.set_xlabel(f'{signal_col} value')\n",
-    "    ax.set_title(f'{signal_col}: LOW (Blue) vs HIGH (Red) | {n_samples_per_subset} points/subset | Black X = Mean, Brown X = Median')\n",
-    "    ax.grid(True, alpha=0.3, axis='x')\n",
-    "    ax.axvline(0, color='gray', linestyle='--', alpha=0.5)\n",
-    "\n",
-    "plt.tight_layout()\n",
-    "plt.show()\n",
-    "\n",
-    "print(f\"\\nNote: {n_samples_per_subset} random points were plotted per subset.\")\n",
-    "print(\"Blue points = LOW subsets | Red points = HIGH subsets\")\n",
-    "print(\"Black 'X' = Mean of entire subset | Brown 'X' = Median of entire subset\")\n",
-    "print(f\"Total subsets plotted: {len(low_subsets)} LOW + {len(high_subsets)} HIGH = {len(low_subsets) + len(high_subsets)} subsets\")"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.12.10"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
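The MAD filter above replaces values whose robust z-score 0.6745 * (x - median) / MAD exceeds the threshold with the training median; the 0.6745 factor rescales MAD so it is comparable to a standard deviation under normality. A small self-contained check with toy numbers (not taken from the dataset):

import numpy as np

x = np.array([10.0, 10.5, 9.8, 10.2, 30.0])  # toy column with one outlier
median = np.median(x)                         # 10.2
mad = np.median(np.abs(x - median))           # 0.3
robust_z = 0.6745 * (x - median) / mad        # [-0.45, 0.67, -0.90, 0.0, 44.52]
x_clean = np.where(np.abs(robust_z) > 3.5, median, x)
print(x_clean)                                # [10.  10.5  9.8 10.2 10.2] -> only 30.0 replaced
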
diff --git a/EDA/histogramms.ipynb b/EDA/histogramms.ipynb
deleted file mode 100644
index e35055f..0000000
--- a/EDA/histogramms.ipynb
+++ /dev/null
@@ -1,166 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "1014c5e0",
-   "metadata": {},
-   "source": [
-    "Imports"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e42f3011",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import pandas as pd\n",
-    "import numpy as np\n",
-    "import matplotlib.pyplot as plt"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "0a834496",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "path = r\"C:\\\\Users\\\\micha\\\\FAUbox\\\\WS2526_Fahrsimulator_MSY (Celina Korzer)\\\\AU_dataset\\\\output_windowed.parquet\"\n",
-    "df = pd.read_parquet(path=path)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "aa4759fa",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "high_nback = df[\n",
-    "    (df[\"STUDY\"] == \"n-back\") &\n",
-    "    (df[\"LEVEL\"].isin([2, 3, 5, 6])) &\n",
-    "    (df[\"PHASE\"].isin([\"train\", \"test\"]))\n",
-    "]\n",
-    "high_nback.shape"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a2aa0596",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "low_all = df[\n",
-    "    ((df[\"PHASE\"] == \"baseline\") |\n",
-    "     ((df[\"STUDY\"] == \"n-back\") & (df[\"PHASE\"] != \"baseline\") & (df[\"LEVEL\"].isin([1, 4]))))\n",
-    "]\n",
-    "print(low_all.shape)\n",
-    "\n",
-    "high_kdrive = df[\n",
-    "    (df[\"STUDY\"] == \"k-drive\") & (df[\"PHASE\"] != \"baseline\")\n",
-    "]\n",
-    "print(high_kdrive.shape)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "f7d446a1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(df.shape[0] == (high_kdrive.shape[0] + high_nback.shape[0] + low_all.shape[0]))\n",
-    "print(df.shape[0])\n",
-    "print(high_kdrive.shape[0] + high_nback.shape[0] + low_all.shape[0])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "474e144a",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "high_all = pd.concat([high_nback, high_kdrive])\n",
-    "high_all.shape"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "5dd585c2",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df.dtypes"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "0bd39d9f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Get all columns that start with 'AU'\n",
-    "au_columns = [col for col in low_all.columns if col.startswith('AU')]\n",
-    "\n",
-    "# Fixed 4x5 subplot grid (enough for all AU columns)\n",
-    "n_rows = 4\n",
-    "n_cols_subplot = 5\n",
-    "\n",
-    "# Create figure with subplots\n",
-    "fig, axes = plt.subplots(n_rows, n_cols_subplot, figsize=(20, 16))\n",
-    "axes = axes.flatten()\n",
-    "fig.suptitle('Action Unit (AU) Distributions: Low vs High', fontsize=20, fontweight='bold', y=0.995)\n",
-    "\n",
-    "# Create histogram for each AU column\n",
-    "for idx, col in enumerate(au_columns):\n",
-    "    ax = axes[idx]\n",
-    "\n",
-    "    # Plot overlapping histograms\n",
-    "    ax.hist(low_all[col].dropna(), bins=30, alpha=0.6, color='blue', label='low_all', edgecolor='black')\n",
-    "    ax.hist(high_all[col].dropna(), bins=30, alpha=0.6, color='red', label='high_all', edgecolor='black')\n",
-    "\n",
-    "    # Set title and labels\n",
-    "    ax.set_title(col, fontsize=10, fontweight='bold')\n",
-    "    ax.set_xlabel('Value', fontsize=8)\n",
-    "    ax.set_ylabel('Frequency', fontsize=8)\n",
-    "    ax.legend(fontsize=8)\n",
-    "    ax.grid(True, alpha=0.3)\n",
-    "\n",
-    "# Hide any unused subplots\n",
-    "for idx in range(len(au_columns), len(axes)):\n",
-    "    axes[idx].set_visible(False)\n",
-    "\n",
-    "# Adjust layout\n",
-    "plt.tight_layout()\n",
-    "plt.show()"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "base",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.5"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
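The count comparison above (df.shape[0] equals the three subset sizes summed) confirms that the sizes add up, but not that the three masks are actually disjoint. A stricter sketch, assuming df has a unique index:

import pandas as pd

combined = pd.concat([high_nback, high_kdrive, low_all])
# Overlapping masks would reuse an index label; a duplicate therefore means overlap.
assert not combined.index.duplicated().any(), "subsets overlap"
# Matching sorted indices means the subsets cover every row exactly once.
assert combined.index.sort_values().equals(df.index.sort_values()), "rows missing"
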
diff --git a/EDA/owncloud.ipynb b/EDA/owncloud.ipynb
deleted file mode 100644
index 68572b0..0000000
--- a/EDA/owncloud.ipynb
+++ /dev/null
@@ -1,157 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "aab6b326-a583-47ad-8bb7-723c2fddcc63",
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [],
-   "source": [
-    "# %pip install pyocclient\n",
-    "import yaml\n",
-    "import owncloud\n",
-    "import pandas as pd\n",
-    "import time"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "4f42846c-27c3-4394-a40a-e22d73c2902e",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "start = time.time()\n",
-    "\n",
-    "with open(\"../login.yaml\") as f:\n",
-    "    cfg = yaml.safe_load(f)\n",
-    "url, password = cfg[0][\"url\"], cfg[1][\"password\"]\n",
-    "file = \"adabase-public-0022-v_0_0_2.h5py\"\n",
-    "oc = owncloud.Client.from_public_link(url, folder_password=password)\n",
-    "\n",
-    "oc.get_file(file, \"tmp22.h5\")\n",
-    "\n",
-    "end = time.time()\n",
-    "print(end - start)\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "3714dec2-85d0-4f76-af46-ea45ebec2fa3",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "start = time.time()\n",
-    "df_performance = pd.read_hdf(\"tmp22.h5\", \"PERFORMANCE\")\n",
-    "end = time.time()\n",
-    "print(end - start)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "f50e97d0",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(22)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "c131c816",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df_performance"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "6ae47e52-ad86-4f8d-b929-0080dc99f646",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "start = time.time()\n",
-    "df_4_col = pd.read_hdf(\"tmp22.h5\", \"SIGNALS\", mode=\"r\", columns=[\"STUDY\"], start=0, stop=1)\n",
-    "end = time.time()\n",
-    "print(end - start)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "7c139f3a-ede8-4530-957d-d1bb939f6cb5",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df_4_col.head()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a68d58ea-65f2-46c4-a2b2-8c3447c715d7",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df_4_col.shape"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "95aa4523-3784-4ab6-bf92-0227ce60e863",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df_4_col.info()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "defbcaf4-ad1b-453f-9b48-ab0ecfc4b5d5",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df_4_col.isna().sum()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "72313895-c478-44a5-9108-00b0bec01bb8",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "base",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.5"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
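The `columns`/`start`/`stop` arguments used above only work when the HDF5 key was written in PyTables "table" format; a fixed-format store must be read whole. A sketch of a cheap preview read against the downloaded file (column names assumed from the SIGNALS table used elsewhere in this repo):

import pandas as pd

# Read just three columns of the first five rows instead of the full table.
preview = pd.read_hdf("tmp22.h5", "SIGNALS", mode="r",
                      columns=["STUDY", "LEVEL", "PHASE"], start=0, stop=5)
print(preview)
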
\"adabase-public-0022-v_0_0_2.h5py\"\n", - "oc = owncloud.Client.from_public_link(url, folder_password=password)\n", - "\n", - "\n", - "oc.get_file(file, \"tmp22.h5\")\n", - "\n", - "end = time.time()\n", - "print(end - start)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3714dec2-85d0-4f76-af46-ea45ebec2fa3", - "metadata": {}, - "outputs": [], - "source": [ - "start = time.time()\n", - "df_performance = pd.read_hdf(\"tmp22.h5\", \"PERFORMANCE\")\n", - "end = time.time()\n", - "print(end - start)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f50e97d0", - "metadata": {}, - "outputs": [], - "source": [ - "print(22)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c131c816", - "metadata": {}, - "outputs": [], - "source": [ - "df_performance" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6ae47e52-ad86-4f8d-b929-0080dc99f646", - "metadata": {}, - "outputs": [], - "source": [ - "start = time.time()\n", - "df_4_col = pd.read_hdf(\"tmp.h5\", \"SIGNALS\", mode=\"r\", columns=[\"STUDY\"], start=0, stop=1)\n", - "end = time.time()\n", - "print(end - start)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7c139f3a-ede8-4530-957d-d1bb939f6cb5", - "metadata": {}, - "outputs": [], - "source": [ - "df_4_col.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a68d58ea-65f2-46c4-a2b2-8c3447c715d7", - "metadata": {}, - "outputs": [], - "source": [ - "df_4_col.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "95aa4523-3784-4ab6-bf92-0227ce60e863", - "metadata": {}, - "outputs": [], - "source": [ - "df_4_col.info()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "defbcaf4-ad1b-453f-9b48-ab0ecfc4b5d5", - "metadata": {}, - "outputs": [], - "source": [ - "df_4_col.isna().sum()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "72313895-c478-44a5-9108-00b0bec01bb8", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "base", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.5" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/EDA/researchOnSubjectPerformance.ipynb b/EDA/researchOnSubjectPerformance.ipynb deleted file mode 100644 index 5a53635..0000000 --- a/EDA/researchOnSubjectPerformance.ipynb +++ /dev/null @@ -1,213 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "8fb02733", - "metadata": {}, - "source": [ - "Imports" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "96f3b128", - "metadata": {}, - "outputs": [], - "source": [ - "import yaml\n", - "import owncloud\n", - "import pandas as pd\n", - "import numpy as np" - ] - }, - { - "cell_type": "markdown", - "id": "c20cee7c", - "metadata": {}, - "source": [ - "Connection to Owncloud" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c4c94558", - "metadata": {}, - "outputs": [], - "source": [ - "# Load credentials\n", - "with open(\"../login.yaml\") as f:\n", - " cfg = yaml.safe_load(f)\n", - " \n", - "url, password = cfg[0][\"url\"], cfg[1][\"password\"]\n", - "\n", - "# Connect once\n", - "oc = owncloud.Client.from_public_link(url, folder_password=password)\n", 
- "# File pattern\n", - "# base = \"adabase-public-{num:04d}-v_0_0_2.h5py\"\n", - "base = \"{num:04d}-*.h5py\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "07c03d07", - "metadata": {}, - "outputs": [], - "source": [ - "num_files = 2 # number of files to process (min: 1, max: 30)\n", - "performance_data = []\n", - "\n", - "for i in range(num_files):\n", - " file_pattern = f\"{i:04d}-*\"\n", - " \n", - " # Get list of files matching the pattern\n", - " files = oc.list('.')\n", - " matching_files = [f.get_name() for f in files if f.get_name().startswith(f\"{i:04d}-\")]\n", - " \n", - " if matching_files:\n", - " file_name = matching_files[0] # Take the first matching file\n", - " local_tmp = f\"tmp_{i:04d}.h5\"\n", - " \n", - " oc.get_file(file_name, local_tmp)\n", - " print(f\"{file_name} geöffnet\")\n", - " else:\n", - " print(f\"Keine Datei gefunden für Muster: {file_pattern}\")\n", - " # file_name = base.format(num=i)\n", - " # local_tmp = f\"tmp_{i:04d}.h5\"\n", - "\n", - " # oc.get_file(file_name, local_tmp)\n", - " # print(f\"{file_name} geöffnet\")\n", - "\n", - " # check SIGNALS table for AUs\n", - " with pd.HDFStore(local_tmp, mode=\"r\") as store:\n", - " cols = store.select(\"SIGNALS\", start=0, stop=1).columns\n", - " au_cols = [c for c in cols if c.startswith(\"AU\")]\n", - " if not au_cols:\n", - " print(f\"Subject {i} enthält keine AUs\")\n", - " continue\n", - "\n", - " # load performance table\n", - " with pd.HDFStore(local_tmp, mode=\"r\") as store:\n", - " perf_df = store.select(\"PERFORMANCE\")\n", - "\n", - " f1_cols = [c for c in [\"AUDITIVE F1\", \"VISUAL F1\", \"F1\"] if c in perf_df.columns]\n", - " if not f1_cols:\n", - " print(f\"Subject {i}: keine F1-Spalten gefunden\")\n", - " continue\n", - "\n", - " subject_entry = {\"subjectID\": i}\n", - " valid_scores = []\n", - "\n", - " # iterate rows: each (study, level, phase)\n", - " for _, row in perf_df.iterrows():\n", - " study, level, phase = row[\"STUDY\"], row[\"LEVEL\"], row[\"PHASE\"]\n", - " col_name = f\"STUDY_{study}_LEVEL_{level}_PHASE_{phase}\"\n", - "\n", - " # collect valid F1 values among the three columns\n", - " scores = [row[c] for c in f1_cols if pd.notna(row[c])]\n", - " if scores:\n", - " mean_score = float(np.mean(scores))\n", - " subject_entry[col_name] = mean_score\n", - " valid_scores.extend(scores)\n", - "\n", - " # compute overall average across all valid combinations\n", - " if valid_scores:\n", - " subject_entry[\"overall_score\"] = float(np.mean(valid_scores))\n", - " performance_data.append(subject_entry)\n", - " print(f\"Subject {i}: {len(valid_scores)} gültige Scores, Overall = {subject_entry['overall_score']:.3f}\")\n", - " else:\n", - " print(f\"Subject {i}: keine gültigen F1-Scores\")\n", - "\n", - "# build dataframe\n", - "if performance_data:\n", - " performance_df = pd.DataFrame(performance_data)\n", - " combination_cols = sorted([c for c in performance_df.columns if c.startswith(\"STUDY_\")])\n", - " final_cols = [\"subjectID\", \"overall_score\"] + combination_cols\n", - " performance_df = performance_df[final_cols]\n", - " performance_df.to_csv(\"n_au_performance.csv\", index=False)\n", - "\n", - " print(f\"\\nGesamt Subjects mit Action Units: {len(performance_df)}\")\n", - "else:\n", - " print(\"Keine gültigen Daten gefunden.\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0bcaf065", - "metadata": {}, - "outputs": [], - "source": [ - "performance_df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - 
"id": "db95eea7", - "metadata": {}, - "outputs": [], - "source": [ - "with pd.HDFStore(\"tmp_0000.h5\", mode=\"r\") as store:\n", - " md = store.select(\"META\")\n", - "print(\"File 0:\")\n", - "print(md)\n", - "with pd.HDFStore(\"tmp_0001.h5\", mode=\"r\") as store:\n", - " md = store.select(\"META\")\n", - "print(\"File 1\")\n", - "print(md)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8067036b", - "metadata": {}, - "outputs": [], - "source": [ - "pd.set_option('display.max_columns', None)\n", - "pd.set_option('display.max_rows', None)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f18e7385", - "metadata": {}, - "outputs": [], - "source": [ - "with pd.HDFStore(\"tmp_0000.h5\", mode=\"r\") as store:\n", - " md = store.select(\"SIGNALS\", start=0, stop=1)\n", - "print(\"File 0:\")\n", - "md.head()\n", - "# with pd.HDFStore(\"tmp_0001.h5\", mode=\"r\",start=0, stop=1) as store:\n", - "# md = store.select(\"SIGNALS\")\n", - "# print(\"File 1\")\n", - "# print(md.columns)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "base", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.5" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}