{ "cells": [ { "cell_type": "markdown", "id": "89d81009", "metadata": {}, "source": [ "### Imports" ] }, { "cell_type": "code", "execution_count": null, "id": "7440a5b3", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "from pathlib import Path\n", "from sklearn.preprocessing import StandardScaler, MinMaxScaler" ] }, { "cell_type": "markdown", "id": "09b7d707", "metadata": {}, "source": [ "### Config" ] }, { "cell_type": "code", "execution_count": null, "id": "2401aaef", "metadata": {}, "outputs": [], "source": [ "dataset_path = Path(r\"/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/combined_dataset_25hz.parquet\")\n", "# dataset_path = Path(r\"/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/60s_combined_dataset_25hz.parquet\")\n", "# dataset_path = Path(r\"/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/120s_combined_dataset_25hz.parquet\")" ] }, { "cell_type": "code", "execution_count": null, "id": "0282b0b1", "metadata": {}, "outputs": [], "source": [ "FILTER_MAD = True\n", "THRESHOLD = 3.5\n", "METHOD = 'minmax'\n", "SCOPE = 'subject'\n", "FILTER_SUBSETS = True" ] }, { "cell_type": "markdown", "id": "a8f1716b", "metadata": {}, "source": [ "### Calculations" ] }, { "cell_type": "code", "execution_count": null, "id": "ac32444a", "metadata": {}, "outputs": [], "source": [ "df = pd.read_parquet(dataset_path)\n", "df.shape" ] }, { "cell_type": "code", "execution_count": null, "id": "3ba4401c", "metadata": {}, "outputs": [], "source": [ "if(FILTER_SUBSETS):\n", " # Special filter: Keep only specific subsets\n", "# - k-drive L1 baseline\n", "# - n-back L1 baseline \n", "# - k-drive test with levels 1, 2, 3\n", "\n", " df = df[\n", " (\n", " # k-drive L1 baseline\n", " ((df['STUDY'] == 'k-drive') & \n", " (df['LEVEL'] == 1) & \n", " (df['PHASE'] == 'baseline'))\n", " ) | \n", " (\n", " # n-back L1 baseline\n", " ((df['STUDY'] == 'n-back') & \n", " (df['LEVEL'] == 1) & \n", " (df['PHASE'] == 'baseline'))\n", " ) | \n", " (\n", " # k-drive test with levels 1, 2, 3\n", " ((df['STUDY'] == 'k-drive') & \n", " (df['LEVEL'].isin([1, 2, 3])) & \n", " (df['PHASE'] == 'test'))\n", " )].copy()\n", "\n", "print(f\"Filtered dataframe shape: {df.shape}\")\n", "print(f\"Remaining subsets: {df.groupby(['STUDY', 'LEVEL', 'PHASE']).size()}\")" ] }, { "cell_type": "code", "execution_count": null, "id": "77dbd6df", "metadata": {}, "outputs": [], "source": [ "face_au_cols = [c for c in df.columns if c.startswith(\"FACE_AU\")]\n", "eye_cols = ['Fix_count_short_66_150', 'Fix_count_medium_300_500',\n", " 'Fix_count_long_gt_1000', 'Fix_count_100', 'Fix_mean_duration',\n", " 'Fix_median_duration', 'Sac_count', 'Sac_mean_amp', 'Sac_mean_dur',\n", " 'Sac_median_dur', 'Blink_count', 'Blink_mean_dur', 'Blink_median_dur',\n", " 'Pupil_mean', 'Pupil_IPA']\n", "eye_cols_without_blink = ['Fix_count_short_66_150', 'Fix_count_medium_300_500',\n", " 'Fix_count_long_gt_1000', 'Fix_count_100', 'Fix_mean_duration',\n", " 'Fix_median_duration', 'Sac_count', 'Sac_mean_amp', 'Sac_mean_dur',\n", " 'Sac_median_dur', 'Pupil_mean', 'Pupil_IPA']\n", "print(len(eye_cols))\n", "all_signal_columns = eye_cols+face_au_cols\n", "print(len(all_signal_columns))" ] }, { "cell_type": "markdown", "id": "d5e9c67a", "metadata": {}, "source": [ "MAD" ] }, { "cell_type": "code", "execution_count": null, "id": "592291ef", "metadata": {}, "outputs": [], "source": [ "def calculate_mad_params(df, columns):\n", " \"\"\"\n", " Calculate median and MAD 
{ "cell_type": "code", "execution_count": null, "id": "592291ef", "metadata": {}, "outputs": [], "source": [ "def calculate_mad_params(df, columns):\n", "    \"\"\"\n", "    Calculate the median and MAD parameters for each column.\n", "    This should be run ONLY on the training data.\n", "\n", "    Returns a dictionary: {col: (median, mad)}\n", "    \"\"\"\n", "    params = {}\n", "    for col in columns:\n", "        median = df[col].median()\n", "        mad = np.median(np.abs(df[col] - median))\n", "        params[col] = (median, mad)\n", "    return params\n", "\n", "\n", "def apply_mad_filter(df, params, threshold=3.5):\n", "    \"\"\"\n", "    Apply MAD-based outlier handling using precomputed parameters.\n", "    Outliers are replaced by the column median, so no rows are removed.\n", "    Works on training, validation, and test data.\n", "\n", "    df: DataFrame to filter\n", "    params: dictionary {col: (median, mad)} from training data\n", "    threshold: cutoff for the robust z-score\n", "    \"\"\"\n", "    df_clean = df.copy()\n", "\n", "    for col, (median, mad) in params.items():\n", "        if mad == 0:\n", "            continue  # no spread; nothing to replace in this column\n", "\n", "        robust_z = 0.6745 * (df_clean[col] - median) / mad\n", "        outlier_mask = np.abs(robust_z) > threshold\n", "\n", "        # Replace outliers with the median, only in this specific column\n", "        df_clean.loc[outlier_mask, col] = median\n", "\n", "    return df_clean" ] },
{ "cell_type": "code", "execution_count": null, "id": "4ddad4a8", "metadata": {}, "outputs": [], "source": [ "if FILTER_MAD:\n", "    mad_params = calculate_mad_params(df, all_signal_columns)\n", "    df = apply_mad_filter(df, mad_params, THRESHOLD)" ] },
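{ "cell_type": "markdown", "id": "1b4d8e32", "metadata": {}, "source": [ "Optional sanity check (a sketch, not part of the original pipeline): after the MAD step no column should exceed the robust z threshold, because out-of-range values were replaced by the column median." ] },
{ "cell_type": "code", "execution_count": null, "id": "2c5e9f43", "metadata": {}, "outputs": [], "source": [ "if FILTER_MAD:\n", "    # Spot-check a few columns by recomputing the robust z-score with the stored parameters\n", "    for col in all_signal_columns[:3]:\n", "        median, mad = mad_params[col]\n", "        if mad == 0:\n", "            continue\n", "        robust_z = 0.6745 * (df[col] - median) / mad\n", "        print(f\"{col}: max |robust z| after filtering = {robust_z.abs().max():.2f}\")" ] },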
{ "cell_type": "markdown", "id": "89387879", "metadata": {}, "source": [ "Normalizer" ] },
{ "cell_type": "code", "execution_count": null, "id": "9c129cdd", "metadata": {}, "outputs": [], "source": [ "def fit_normalizer(train_data, au_columns, method='standard', scope='global'):\n", "    \"\"\"\n", "    Fit normalization scalers on training data.\n", "\n", "    Parameters:\n", "    -----------\n", "    train_data : pd.DataFrame\n", "        Training dataframe with the signal columns and subjectID\n", "    au_columns : list\n", "        List of column names to normalize (AU and eye-tracking features)\n", "    method : str, default='standard'\n", "        Normalization method: 'standard' for StandardScaler or 'minmax' for MinMaxScaler\n", "    scope : str, default='global'\n", "        Normalization scope: 'subject' for per-subject or 'global' for across all subjects\n", "\n", "    Returns:\n", "    --------\n", "    dict\n", "        Dictionary containing fitted scalers and statistics for new subjects\n", "    \"\"\"\n", "    if method == 'standard':\n", "        Scaler = StandardScaler\n", "    elif method == 'minmax':\n", "        Scaler = MinMaxScaler\n", "    else:\n", "        raise ValueError(\"method must be 'standard' or 'minmax'\")\n", "\n", "    scalers = {}\n", "    if scope == 'subject':\n", "        # Fit one scaler per subject\n", "        subject_stats = []\n", "\n", "        for subject in train_data['subjectID'].unique():\n", "            subject_mask = train_data['subjectID'] == subject\n", "            scaler = Scaler()\n", "            scaler.fit(train_data.loc[subject_mask, au_columns].values)\n", "            scalers[subject] = scaler\n", "\n", "            # Store statistics for averaging\n", "            if method == 'standard':\n", "                subject_stats.append({\n", "                    'mean': scaler.mean_,\n", "                    'std': scaler.scale_\n", "                })\n", "            elif method == 'minmax':\n", "                subject_stats.append({\n", "                    'min': scaler.data_min_,\n", "                    'max': scaler.data_max_\n", "                })\n", "\n", "        # Build a fallback scaler from averaged statistics for subjects not seen during fitting\n", "        if method == 'standard':\n", "            avg_mean = np.mean([s['mean'] for s in subject_stats], axis=0)\n", "            avg_std = np.mean([s['std'] for s in subject_stats], axis=0)\n", "            fallback_scaler = StandardScaler()\n", "            fallback_scaler.mean_ = avg_mean\n", "            fallback_scaler.scale_ = avg_std\n", "            fallback_scaler.var_ = avg_std ** 2\n", "            fallback_scaler.n_features_in_ = len(au_columns)\n", "        elif method == 'minmax':\n", "            avg_min = np.mean([s['min'] for s in subject_stats], axis=0)\n", "            avg_max = np.mean([s['max'] for s in subject_stats], axis=0)\n", "            fallback_scaler = MinMaxScaler()\n", "            fallback_scaler.data_min_ = avg_min\n", "            fallback_scaler.data_max_ = avg_max\n", "            fallback_scaler.data_range_ = avg_max - avg_min\n", "            fallback_scaler.scale_ = 1.0 / fallback_scaler.data_range_\n", "            fallback_scaler.min_ = -avg_min * fallback_scaler.scale_\n", "            fallback_scaler.n_features_in_ = len(au_columns)\n", "\n", "        scalers['_fallback'] = fallback_scaler\n", "\n", "    elif scope == 'global':\n", "        # Fit one scaler for all subjects\n", "        scaler = Scaler()\n", "        scaler.fit(train_data[au_columns].values)\n", "        scalers['global'] = scaler\n", "\n", "    else:\n", "        raise ValueError(\"scope must be 'subject' or 'global'\")\n", "\n", "    return {'scalers': scalers, 'method': method, 'scope': scope}" ] },
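{ "cell_type": "markdown", "id": "3d6fa054", "metadata": {}, "source": [ "With `scope='subject'` one scaler is fitted per `subjectID`; subjects that were not seen during fitting fall back to a scaler assembled from averaged per-subject statistics (stored under the key `'_fallback'`). For `method='minmax'` this fallback amounts to $x' = (x - \\overline{\\min}) / (\\overline{\\max} - \\overline{\\min})$, where the bars denote averages of the per-subject minima and maxima." ] },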
{ "cell_type": "code", "execution_count": null, "id": "9cfabd37", "metadata": {}, "outputs": [], "source": [ "def apply_normalizer(data, columns, normalizer_dict):\n", "    \"\"\"\n", "    Apply fitted normalization scalers to data.\n", "\n", "    Parameters:\n", "    -----------\n", "    data : pd.DataFrame\n", "        Dataframe with the signal columns and subjectID\n", "    columns : list\n", "        List of column names to normalize\n", "    normalizer_dict : dict\n", "        Dictionary containing fitted scalers from fit_normalizer()\n", "\n", "    Returns:\n", "    --------\n", "    pd.DataFrame\n", "        DataFrame with normalized columns\n", "    \"\"\"\n", "    normalized_data = data.copy()\n", "    scalers = normalizer_dict['scalers']\n", "    scope = normalizer_dict['scope']\n", "    normalized_data[columns] = normalized_data[columns].astype(np.float64)\n", "\n", "    if scope == 'subject':\n", "        # Apply per-subject normalization\n", "        for subject in data['subjectID'].unique():\n", "            subject_mask = data['subjectID'] == subject\n", "\n", "            # Use the subject's scaler if available, otherwise use the fallback\n", "            if subject in scalers:\n", "                scaler = scalers[subject]\n", "            else:\n", "                # Use the averaged scaler for new subjects\n", "                scaler = scalers['_fallback']\n", "                print(f\"Info: Subject {subject} not in training data. Using averaged scaler from training subjects.\")\n", "\n", "            normalized_data.loc[subject_mask, columns] = scaler.transform(\n", "                data.loc[subject_mask, columns].values\n", "            )\n", "\n", "    elif scope == 'global':\n", "        # Apply global normalization\n", "        scaler = scalers['global']\n", "        normalized_data[columns] = scaler.transform(data[columns].values)\n", "\n", "    return normalized_data" ] },
{ "cell_type": "code", "execution_count": null, "id": "4dbbebf7", "metadata": {}, "outputs": [], "source": [ "normalizer = fit_normalizer(df, all_signal_columns, method=METHOD, scope=SCOPE)\n", "df_min_max_normalised = apply_normalizer(df, all_signal_columns, normalizer)" ] },
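{ "cell_type": "markdown", "id": "4e70b165", "metadata": {}, "source": [ "Optional check (a sketch): because the scalers are fitted on and applied to the same dataframe here, `METHOD = 'minmax'` should map every signal column into [0, 1], up to floating-point error." ] },
{ "cell_type": "code", "execution_count": null, "id": "5f81c276", "metadata": {}, "outputs": [], "source": [ "if METHOD == 'minmax':\n", "    normalized = df_min_max_normalised[all_signal_columns]\n", "    print(\"overall min:\", normalized.min().min())\n", "    print(\"overall max:\", normalized.max().max())" ] },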
{ "cell_type": "code", "execution_count": null, "id": "6b9b2ae8", "metadata": {}, "outputs": [], "source": [ "# Check dtypes of the grouping columns\n", "a = df_min_max_normalised[['STUDY', 'LEVEL', 'PHASE']]\n", "print(a.dtypes)" ] },
{ "cell_type": "code", "execution_count": null, "id": "e3e1bc34", "metadata": {}, "outputs": [], "source": [ "# Define signal columns (adjust only once)\n", "signal_columns = all_signal_columns\n", "\n", "# Get all unique combinations of STUDY, LEVEL and PHASE\n", "unique_combinations = df_min_max_normalised[['STUDY', 'LEVEL', 'PHASE']].drop_duplicates().reset_index(drop=True)\n", "\n", "# Dictionaries to store subsets and their sizes\n", "subsets = {}\n", "subset_sizes = {}\n", "\n", "for idx, row in unique_combinations.iterrows():\n", "    study = row['STUDY']\n", "    level = row['LEVEL']\n", "    phase = row['PHASE']\n", "    key = f\"{study}_L{level}_P{phase}\"\n", "    subset = df_min_max_normalised[\n", "        (df_min_max_normalised['STUDY'] == study) &\n", "        (df_min_max_normalised['LEVEL'] == level) &\n", "        (df_min_max_normalised['PHASE'] == phase)\n", "    ]\n", "    subsets[key] = subset\n", "    subset_sizes[key] = len(subset)\n", "\n", "# Output subset sizes\n", "print(\"Number of samples per subset:\")\n", "print(\"=\" * 40)\n", "for key, size in subset_sizes.items():\n", "    print(f\"{key}: {size} samples\")\n", "print(\"=\" * 40)\n", "print(f\"Total number of subsets: {len(subsets)}\")" ] },
{ "cell_type": "code", "execution_count": null, "id": "c7fdeb5c", "metadata": {}, "outputs": [], "source": [ "# Function to categorize subsets\n", "def categorize_subset(key):\n", "    \"\"\"Categorize a subset key as 'low' or 'high' workload.\"\"\"\n", "    parts = key.split('_')\n", "    study = parts[0]\n", "    level = int(parts[1][1:])  # 'L1' -> 1\n", "    phase = parts[2][1:]       # 'Pbaseline' -> 'baseline'\n", "\n", "    # LOW: baseline OR (n-back with level 1 or 4)\n", "    if phase == \"baseline\":\n", "        return 'low'\n", "    elif study == \"n-back\" and level in [1, 4]:\n", "        return 'low'\n", "\n", "    # HIGH: (n-back with level 2, 3, 5, 6 and phase train/test) OR (k-drive not baseline)\n", "    elif study == \"n-back\" and level in [2, 3, 5, 6] and phase in [\"train\", \"test\"]:\n", "        return 'high'\n", "    elif study == \"k-drive\" and phase != \"baseline\":\n", "        return 'high'\n", "\n", "    return None\n", "\n", "# Categorize subsets\n", "low_subsets = {}\n", "high_subsets = {}\n", "\n", "for key, subset in subsets.items():\n", "    category = categorize_subset(key)\n", "    if category == 'low':\n", "        low_subsets[key] = subset\n", "    elif category == 'high':\n", "        high_subsets[key] = subset\n", "\n", "# Output statistics\n", "print(\"\\n\" + \"=\" * 50)\n", "print(\"SUBSET CATEGORIZATION\")\n", "print(\"=\" * 50)\n", "\n", "print(\"\\nLOW subsets (Blue):\")\n", "print(\"-\" * 50)\n", "low_total = 0\n", "for key in sorted(low_subsets.keys()):\n", "    size = subset_sizes[key]\n", "    low_total += size\n", "    print(f\"  {key}: {size} samples\")\n", "print(f\"{'TOTAL LOW:':<30} {low_total} samples\")\n", "print(f\"{'NUMBER OF LOW SUBSETS:':<30} {len(low_subsets)}\")\n", "\n", "print(\"\\nHIGH subsets (Red):\")\n", "print(\"-\" * 50)\n", "high_total = 0\n", "for key in sorted(high_subsets.keys()):\n", "    size = subset_sizes[key]\n", "    high_total += size\n", "    print(f\"  {key}: {size} samples\")\n", "print(f\"{'TOTAL HIGH:':<30} {high_total} samples\")\n", "print(f\"{'NUMBER OF HIGH SUBSETS:':<30} {len(high_subsets)}\")\n", "\n", "print(\"\\n\" + \"=\" * 50)\n", "print(f\"TOTAL SAMPLES: {low_total + high_total}\")\n", "print(f\"TOTAL SUBSETS: {len(low_subsets) + len(high_subsets)}\")\n", "print(\"=\" * 50)\n", "\n", "# Find minimum subset size\n", "min_subset_size = min(subset_sizes.values())\n", "print(f\"\\nMinimum subset size: {min_subset_size}\")\n", "\n", "# Number of points to plot per subset (sampling_factor x minimum subset size)\n", "sampling_factor = 1\n", "n_samples_per_subset = int(sampling_factor * min_subset_size)\n", "print(f\"Number of randomly drawn points per subset: {n_samples_per_subset}\")" ] },
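{ "cell_type": "markdown", "id": "6a92d387", "metadata": {}, "source": [ "Quick tabular summary (a sketch added for orientation): per-subset mean, median and count of the first signal column, as a numeric complement to the scatter plots below." ] },
{ "cell_type": "code", "execution_count": null, "id": "7ba3e498", "metadata": {}, "outputs": [], "source": [ "summary = (\n", "    df_min_max_normalised\n", "    .groupby(['STUDY', 'LEVEL', 'PHASE'])[signal_columns[0]]\n", "    .agg(['mean', 'median', 'count'])\n", ")\n", "print(summary)" ] },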
{ "cell_type": "markdown", "id": "ff363fc5", "metadata": {}, "source": [ "### Plot" ] },
{ "cell_type": "code", "execution_count": null, "id": "3a9d9163", "metadata": {}, "outputs": [], "source": [ "# Create comparison plots\n", "fig, axes = plt.subplots(len(signal_columns), 1, figsize=(14, 4 * len(signal_columns)))\n", "\n", "# If only one signal column exists, convert axes to a list\n", "if len(signal_columns) == 1:\n", "    axes = [axes]\n", "\n", "# Create a plot for each signal column\n", "for i, signal_col in enumerate(signal_columns):\n", "    ax = axes[i]\n", "\n", "    y_pos = 0\n", "    labels = []\n", "\n", "    # First plot all LOW subsets (sorted, blue)\n", "    for label in sorted(low_subsets.keys()):\n", "        subset = low_subsets[label]\n", "        if len(subset) > 0 and signal_col in subset.columns:\n", "            # Draw random sample\n", "            n_samples = min(n_samples_per_subset, len(subset))\n", "            sampled_data = subset[signal_col].sample(n=n_samples, random_state=42)\n", "\n", "            # Calculate mean and median\n", "            mean_val = subset[signal_col].mean()\n", "            median_val = subset[signal_col].median()\n", "\n", "            # Plot points in blue\n", "            ax.scatter(sampled_data, [y_pos] * len(sampled_data),\n", "                       alpha=0.5, s=30, color='blue')\n", "\n", "            # Mean as black cross\n", "            ax.plot(mean_val, y_pos, 'x', markersize=12, markeredgewidth=3,\n", "                    color='black', zorder=5)\n", "\n", "            # Median as brown cross\n", "            ax.plot(median_val, y_pos, 'x', markersize=12, markeredgewidth=3,\n", "                    color='brown', zorder=5)\n", "\n", "            labels.append(f\"{label} (n={subset_sizes[label]})\")\n", "            y_pos += 1\n", "\n", "    # Separation line between LOW and HIGH\n", "    if len(low_subsets) > 0 and len(high_subsets) > 0:\n", "        ax.axhline(y=y_pos - 0.5, color='gray', linestyle='--', linewidth=2, alpha=0.7)\n", "\n", "    # Then plot all HIGH subsets (sorted, red)\n", "    for label in sorted(high_subsets.keys()):\n", "        subset = high_subsets[label]\n", "        if len(subset) > 0 and signal_col in subset.columns:\n", "            # Draw random sample\n", "            n_samples = min(n_samples_per_subset, len(subset))\n", "            sampled_data = subset[signal_col].sample(n=n_samples, random_state=42)\n", "\n", "            # Calculate mean and median\n", "            mean_val = subset[signal_col].mean()\n", "            median_val = subset[signal_col].median()\n", "\n", "            # Plot points in red\n", "            ax.scatter(sampled_data, [y_pos] * len(sampled_data),\n", "                       alpha=0.5, s=30, color='red')\n", "\n", "            # Mean as black cross\n", "            ax.plot(mean_val, y_pos, 'x', markersize=12, markeredgewidth=3,\n", "                    color='black', zorder=5)\n", "\n", "            # Median as brown cross\n", "            ax.plot(median_val, y_pos, 'x', markersize=12, markeredgewidth=3,\n", "                    color='brown', zorder=5)\n", "\n", "            labels.append(f\"{label} (n={subset_sizes[label]})\")\n", "            y_pos += 1\n", "\n", "    ax.set_yticks(range(len(labels)))\n", "    ax.set_yticklabels(labels)\n", "    ax.set_xlabel(f'{signal_col} value')\n", "    ax.set_title(f'{signal_col}: LOW (Blue) vs HIGH (Red) | {n_samples_per_subset} points/subset | Black X = Mean, Brown X = Median')\n", "    ax.grid(True, alpha=0.3, axis='x')\n", "    ax.axvline(0, color='gray', linestyle='--', alpha=0.5)\n", "\n", "plt.tight_layout()\n", "plt.show()\n", "\n", "print(f\"\\nNote: {n_samples_per_subset} random points were plotted per subset.\")\n", "print(\"Blue points = LOW subsets | Red points = HIGH subsets\")\n", "print(\"Black 'X' = Mean of entire subset | Brown 'X' = Median of entire subset\")\n", "print(f\"Total subsets plotted: {len(low_subsets)} LOW + {len(high_subsets)} HIGH = {len(low_subsets) + len(high_subsets)} subsets\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.10" } }, "nbformat": 4, "nbformat_minor": 5 }