Compare commits: deployment...main (1 commit)

| Author | SHA1 | Date |
|---|---|---|
|  | 9951d8b4f9 |  |
.gitignore (vendored): 1 change
@@ -3,5 +3,4 @@
!*.py
!*.ipynb
!*.md
!*.parquet
!.gitignore
EDA/EDA.ipynb (new file, 259 lines)
@@ -0,0 +1,259 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "7440a5b3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import h5py\n",
|
||||
"import os\n",
|
||||
"import warnings\n",
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"from pathlib import Path"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "2401aaef",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"file_path = \"adabase-public-0020-v_0_0_2.h5py\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "46280999",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"SKT_SR = 100\n",
|
||||
"ECG_SR = 500\n",
|
||||
"RSP_SR = 250\n",
|
||||
"EMG_SR = 1000\n",
|
||||
"EDA_SR = 500\n",
|
||||
"EYE_SR = 250"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e23eb552",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df_signals = pd.read_hdf(file_path, \"SIGNALS\", mode=\"r\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b7f494d1",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pd.set_option('display.max_columns', None)\n",
|
||||
"pd.set_option('display.max_rows', None)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "dd2f4d84",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"settings = df_signals[['STUDY','PHASE','LEVEL']]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "1699ddc2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"settings.value_counts()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a4731c56",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Actions units"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9db0b4b2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df_signals.columns"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3ceccc89",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"au_data = df_signals.iloc[:,-20:]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3d4ee088",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"au_data.tail()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5d85a8cb",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(au_data.shape)\n",
|
||||
"print(au_data.isna().sum())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "efff356f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"clean_au_data = au_data.dropna()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "42ed1bcd",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"clean_au_data.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "2c7c3f14",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"for i in range(len(clean_au_data.columns)):\n",
|
||||
" print(clean_au_data.iloc[:,i].unique())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "332740a8",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Plots"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "f30b8814",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# df_signals_ecg = pd.read_hdf(file_path, \"SIGNALS\", mode=\"r\", columns=[\"STUDY\",\"LEVEL\", \"PHASE\", 'RAW_ECG_I'])\n",
|
||||
"df_signals_ecg = df_signals[[\"STUDY\",\"LEVEL\", \"PHASE\", 'RAW_ECG_I']]\n",
|
||||
"df_signals_ecg.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ee80fd79",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"study_filter = df_signals[\"STUDY\"] == \"n-back\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3ef29446",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"fig, ax = plt.subplots(figsize=(16, 2))\n",
|
||||
"# Set the number of seconds to plot\n",
|
||||
"seconds = 20\n",
|
||||
"# Get the ECG signal data\n",
|
||||
"ecg_signal = df_signals.loc[study_filter, \"RAW_ECG_I\"].dropna()\n",
|
||||
"# Set the x-axis limits to the number of samples in the specified time range\n",
|
||||
"num_samples = ECG_SR * seconds\n",
|
||||
"# Plot the ECG signal\n",
|
||||
"ax.plot(ecg_signal.index[:num_samples]/1000, ecg_signal[:num_samples]);\n",
|
||||
"ax.set_title(\"ECG I\");\n",
|
||||
"ax.set_xlabel('Seconds');\n",
|
||||
"# Set figure size with a 16:6 aspect ratio\n",
|
||||
"fig, ax = plt.subplots(figsize=(16, 2))\n",
|
||||
"# Set the number of seconds to plot\n",
|
||||
"start_second = 0\n",
|
||||
"end_second = 60*30\n",
|
||||
"# Get the EYE signal data - we replace inf with nan to get the original signal.␣\n",
|
||||
"\n",
|
||||
"eye_left_signal = df_signals.loc[study_filter, \"LEFT_PUPIL_DIAMETER\"].dropna()\n",
|
||||
"eye_right_signal = df_signals.loc[study_filter, \"RIGHT_PUPIL_DIAMETER\"].dropna()\n",
|
||||
"#eye_left_signal = df_signals.loc[:, \"LEFT_PUPIL_DIAMETER\"].replace([np.inf],␣\n",
|
||||
"\n",
|
||||
"#eye_right_signal = df_signals.loc[:, \"RIGHT_PUPIL_DIAMETER\"].replace([np.inf],␣\n",
|
||||
"\n",
|
||||
"# Set the x-axis limits to the number of samples in the specified time range\n",
|
||||
"num_samples_start = EYE_SR * start_second\n",
|
||||
"num_samples_end = EYE_SR * end_second\n",
|
||||
"ax.plot(eye_left_signal.index[num_samples_start:num_samples_end]/1000,eye_left_signal[num_samples_start:num_samples_end], label=\"Left\")\n",
|
||||
"ax.plot(eye_right_signal.index[num_samples_start:num_samples_end]/1000,eye_right_signal[num_samples_start:num_samples_end], label=\"Right\")\n",
|
||||
"ax.set_title(\"Pupil Dilation\")\n",
|
||||
"ax.set_xlabel('Seconds')\n",
|
||||
"ax.legend()\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "base",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
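A minimal standalone sketch of the seconds-to-samples slicing the EDA notebook above uses for its ECG plot; the sampling rate, column name, and millisecond index follow the notebook, and the file path is the same placeholder the notebook uses:

```python
# Sketch only: plot the first N seconds of one ADABase signal, assuming (as in the
# notebook above) that the "SIGNALS" table is indexed in milliseconds and that
# RAW_ECG_I is sampled at 500 Hz.
import pandas as pd
import matplotlib.pyplot as plt

ECG_SR = 500                                     # samples per second (from the notebook)
seconds = 20                                     # window length to plot
file_path = "adabase-public-0020-v_0_0_2.h5py"   # placeholder path, as in the notebook

df_signals = pd.read_hdf(file_path, "SIGNALS", mode="r")
ecg = df_signals["RAW_ECG_I"].dropna()

num_samples = ECG_SR * seconds                   # seconds -> number of samples
fig, ax = plt.subplots(figsize=(16, 2))
ax.plot(ecg.index[:num_samples] / 1000, ecg.iloc[:num_samples])  # ms index -> seconds
ax.set_title("ECG I")
ax.set_xlabel("Seconds")
plt.show()
```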
|
||||
EDA/distribution_plots.ipynb (new file, 625 lines)
@@ -0,0 +1,625 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "89d81009",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Imports"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7440a5b3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"from pathlib import Path\n",
|
||||
"from sklearn.preprocessing import StandardScaler, MinMaxScaler"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "09b7d707",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Config"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "2401aaef",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"dataset_path = Path(r\"/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/combined_dataset_25hz.parquet\")\n",
|
||||
"# dataset_path = Path(r\"/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/60s_combined_dataset_25hz.parquet\")\n",
|
||||
"# dataset_path = Path(r\"/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/120s_combined_dataset_25hz.parquet\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0282b0b1",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"FILTER_MAD = True\n",
|
||||
"THRESHOLD = 3.5\n",
|
||||
"METHOD = 'minmax'\n",
|
||||
"SCOPE = 'subject'\n",
|
||||
"FILTER_SUBSETS = True"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a8f1716b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Calculations"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ac32444a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df = pd.read_parquet(dataset_path)\n",
|
||||
"df.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3ba4401c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"if(FILTER_SUBSETS):\n",
|
||||
" # Special filter: Keep only specific subsets\n",
|
||||
"# - k-drive L1 baseline\n",
|
||||
"# - n-back L1 baseline \n",
|
||||
"# - k-drive test with levels 1, 2, 3\n",
|
||||
"\n",
|
||||
" df = df[\n",
|
||||
" (\n",
|
||||
" # k-drive L1 baseline\n",
|
||||
" ((df['STUDY'] == 'k-drive') & \n",
|
||||
" (df['LEVEL'] == 1) & \n",
|
||||
" (df['PHASE'] == 'baseline'))\n",
|
||||
" ) | \n",
|
||||
" (\n",
|
||||
" # n-back L1 baseline\n",
|
||||
" ((df['STUDY'] == 'n-back') & \n",
|
||||
" (df['LEVEL'] == 1) & \n",
|
||||
" (df['PHASE'] == 'baseline'))\n",
|
||||
" ) | \n",
|
||||
" (\n",
|
||||
" # k-drive test with levels 1, 2, 3\n",
|
||||
" ((df['STUDY'] == 'k-drive') & \n",
|
||||
" (df['LEVEL'].isin([1, 2, 3])) & \n",
|
||||
" (df['PHASE'] == 'test'))\n",
|
||||
" )].copy()\n",
|
||||
"\n",
|
||||
"print(f\"Filtered dataframe shape: {df.shape}\")\n",
|
||||
"print(f\"Remaining subsets: {df.groupby(['STUDY', 'LEVEL', 'PHASE']).size()}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "77dbd6df",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"face_au_cols = [c for c in df.columns if c.startswith(\"FACE_AU\")]\n",
|
||||
"eye_cols = ['Fix_count_short_66_150', 'Fix_count_medium_300_500',\n",
|
||||
" 'Fix_count_long_gt_1000', 'Fix_count_100', 'Fix_mean_duration',\n",
|
||||
" 'Fix_median_duration', 'Sac_count', 'Sac_mean_amp', 'Sac_mean_dur',\n",
|
||||
" 'Sac_median_dur', 'Blink_count', 'Blink_mean_dur', 'Blink_median_dur',\n",
|
||||
" 'Pupil_mean', 'Pupil_IPA']\n",
|
||||
"eye_cols_without_blink = ['Fix_count_short_66_150', 'Fix_count_medium_300_500',\n",
|
||||
" 'Fix_count_long_gt_1000', 'Fix_count_100', 'Fix_mean_duration',\n",
|
||||
" 'Fix_median_duration', 'Sac_count', 'Sac_mean_amp', 'Sac_mean_dur',\n",
|
||||
" 'Sac_median_dur', 'Pupil_mean', 'Pupil_IPA']\n",
|
||||
"print(len(eye_cols))\n",
|
||||
"all_signal_columns = eye_cols+face_au_cols\n",
|
||||
"print(len(all_signal_columns))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d5e9c67a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"MAD"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "592291ef",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def calculate_mad_params(df, columns):\n",
|
||||
" \"\"\"\n",
|
||||
" Calculate median and MAD parameters for each column.\n",
|
||||
" This should be run ONLY on the training data.\n",
|
||||
" \n",
|
||||
" Returns a dictionary: {col: (median, mad)}\n",
|
||||
" \"\"\"\n",
|
||||
" params = {}\n",
|
||||
" for col in columns:\n",
|
||||
" median = df[col].median()\n",
|
||||
" mad = np.median(np.abs(df[col] - median))\n",
|
||||
" params[col] = (median, mad)\n",
|
||||
" return params\n",
|
||||
"def apply_mad_filter(df, params, threshold=3.5):\n",
|
||||
" \"\"\"\n",
|
||||
" Apply MAD-based outlier removal using precomputed parameters.\n",
|
||||
" Works on training, validation, and test data.\n",
|
||||
" \n",
|
||||
" df: DataFrame to filter\n",
|
||||
" params: dictionary {col: (median, mad)} from training data\n",
|
||||
" threshold: cutoff for robust Z-score\n",
|
||||
" \"\"\"\n",
|
||||
" df_clean = df.copy()\n",
|
||||
"\n",
|
||||
" for col, (median, mad) in params.items():\n",
|
||||
" if mad == 0:\n",
|
||||
" continue # no spread; nothing to remove for this column\n",
|
||||
"\n",
|
||||
" robust_z = 0.6745 * (df_clean[col] - median) / mad\n",
|
||||
" outlier_mask = np.abs(robust_z) > threshold\n",
|
||||
"\n",
|
||||
" # Remove values only in this specific column\n",
|
||||
" df_clean.loc[outlier_mask, col] = median\n",
|
||||
" \n",
|
||||
" \n",
|
||||
" print(df_clean.shape)\n",
|
||||
" return df_clean"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4ddad4a8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"if(FILTER_MAD):\n",
|
||||
" mad_params = calculate_mad_params(df, all_signal_columns)\n",
|
||||
" df = apply_mad_filter(df, mad_params, THRESHOLD)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "89387879",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Normalizer"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9c129cdd",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def fit_normalizer(train_data, au_columns, method='standard', scope='global'):\n",
|
||||
" \"\"\"\n",
|
||||
" Fit normalization scalers on training data.\n",
|
||||
" \n",
|
||||
" Parameters:\n",
|
||||
" -----------\n",
|
||||
" train_data : pd.DataFrame\n",
|
||||
" Training dataframe with AU columns and subjectID\n",
|
||||
" au_columns : list\n",
|
||||
" List of AU column names to normalize\n",
|
||||
" method : str, default='standard'\n",
|
||||
" Normalization method: 'standard' for StandardScaler or 'minmax' for MinMaxScaler\n",
|
||||
" scope : str, default='global'\n",
|
||||
" Normalization scope: 'subject' for per-subject or 'global' for across all subjects\n",
|
||||
" \n",
|
||||
" Returns:\n",
|
||||
" --------\n",
|
||||
" dict\n",
|
||||
" Dictionary containing fitted scalers and statistics for new subjects\n",
|
||||
" \"\"\"\n",
|
||||
" if method == 'standard':\n",
|
||||
" Scaler = StandardScaler\n",
|
||||
" elif method == 'minmax':\n",
|
||||
" Scaler = MinMaxScaler\n",
|
||||
" else:\n",
|
||||
" raise ValueError(\"method must be 'standard' or 'minmax'\")\n",
|
||||
" \n",
|
||||
" scalers = {}\n",
|
||||
" if scope == 'subject':\n",
|
||||
" # Fit one scaler per subject\n",
|
||||
" subject_stats = []\n",
|
||||
" \n",
|
||||
" for subject in train_data['subjectID'].unique():\n",
|
||||
" subject_mask = train_data['subjectID'] == subject\n",
|
||||
" scaler = Scaler()\n",
|
||||
" scaler.fit(train_data.loc[subject_mask, au_columns].values)\n",
|
||||
" scalers[subject] = scaler\n",
|
||||
" \n",
|
||||
" # Store statistics for averaging\n",
|
||||
" if method == 'standard':\n",
|
||||
" subject_stats.append({\n",
|
||||
" 'mean': scaler.mean_,\n",
|
||||
" 'std': scaler.scale_\n",
|
||||
" })\n",
|
||||
" elif method == 'minmax':\n",
|
||||
" subject_stats.append({\n",
|
||||
" 'min': scaler.data_min_,\n",
|
||||
" 'max': scaler.data_max_\n",
|
||||
" })\n",
|
||||
" \n",
|
||||
" # Calculate average statistics for new subjects\n",
|
||||
" if method == 'standard':\n",
|
||||
" avg_mean = np.mean([s['mean'] for s in subject_stats], axis=0)\n",
|
||||
" avg_std = np.mean([s['std'] for s in subject_stats], axis=0)\n",
|
||||
" fallback_scaler = StandardScaler()\n",
|
||||
" fallback_scaler.mean_ = avg_mean\n",
|
||||
" fallback_scaler.scale_ = avg_std\n",
|
||||
" fallback_scaler.var_ = avg_std ** 2\n",
|
||||
" fallback_scaler.n_features_in_ = len(au_columns)\n",
|
||||
" elif method == 'minmax':\n",
|
||||
" avg_min = np.mean([s['min'] for s in subject_stats], axis=0)\n",
|
||||
" avg_max = np.mean([s['max'] for s in subject_stats], axis=0)\n",
|
||||
" fallback_scaler = MinMaxScaler()\n",
|
||||
" fallback_scaler.data_min_ = avg_min\n",
|
||||
" fallback_scaler.data_max_ = avg_max\n",
|
||||
" fallback_scaler.data_range_ = avg_max - avg_min\n",
|
||||
" fallback_scaler.scale_ = 1.0 / fallback_scaler.data_range_\n",
|
||||
" fallback_scaler.min_ = -avg_min * fallback_scaler.scale_\n",
|
||||
" fallback_scaler.n_features_in_ = len(au_columns)\n",
|
||||
" \n",
|
||||
" scalers['_fallback'] = fallback_scaler\n",
|
||||
" \n",
|
||||
" elif scope == 'global':\n",
|
||||
" # Fit one scaler for all subjects\n",
|
||||
" scaler = Scaler()\n",
|
||||
" scaler.fit(train_data[au_columns].values)\n",
|
||||
" scalers['global'] = scaler\n",
|
||||
" \n",
|
||||
" else:\n",
|
||||
" raise ValueError(\"scope must be 'subject' or 'global'\")\n",
|
||||
" \n",
|
||||
" return {'scalers': scalers, 'method': method, 'scope': scope}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9cfabd37",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def apply_normalizer(data, columns, normalizer_dict):\n",
|
||||
" \"\"\"\n",
|
||||
" Apply fitted normalization scalers to data.\n",
|
||||
" \n",
|
||||
" Parameters:\n",
|
||||
" -----------\n",
|
||||
" data : pd.DataFrame\n",
|
||||
" Dataframe with AU columns and subjectID\n",
|
||||
" au_columns : list\n",
|
||||
" List of AU column names to normalize\n",
|
||||
" normalizer_dict : dict\n",
|
||||
" Dictionary containing fitted scalers from fit_normalizer()\n",
|
||||
" \n",
|
||||
" Returns:\n",
|
||||
" --------\n",
|
||||
" pd.DataFrame\n",
|
||||
" DataFrame with normalized AU columns\n",
|
||||
" \"\"\"\n",
|
||||
" normalized_data = data.copy()\n",
|
||||
" scalers = normalizer_dict['scalers']\n",
|
||||
" scope = normalizer_dict['scope']\n",
|
||||
" normalized_data[columns] = normalized_data[columns].astype(np.float64)\n",
|
||||
"\n",
|
||||
" if scope == 'subject':\n",
|
||||
" # Apply per-subject normalization\n",
|
||||
" for subject in data['subjectID'].unique():\n",
|
||||
" subject_mask = data['subjectID'] == subject\n",
|
||||
" \n",
|
||||
" # Use the subject's scaler if available, otherwise use fallback\n",
|
||||
" if subject in scalers:\n",
|
||||
" scaler = scalers[subject]\n",
|
||||
" else:\n",
|
||||
" # Use averaged scaler for new subjects\n",
|
||||
" scaler = scalers['_fallback']\n",
|
||||
" print(f\"Info: Subject {subject} not in training data. Using averaged scaler from training subjects.\")\n",
|
||||
" \n",
|
||||
" normalized_data.loc[subject_mask, columns] = scaler.transform(\n",
|
||||
" data.loc[subject_mask, columns].values\n",
|
||||
" )\n",
|
||||
" \n",
|
||||
" elif scope == 'global':\n",
|
||||
" # Apply global normalization\n",
|
||||
" scaler = scalers['global']\n",
|
||||
" normalized_data[columns] = scaler.transform(data[columns].values)\n",
|
||||
" \n",
|
||||
" return normalized_data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4dbbebf7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"scaler = fit_normalizer(df, all_signal_columns, method=METHOD, scope=SCOPE)\n",
|
||||
"df_min_max_normalised = apply_normalizer(df, all_signal_columns, scaler)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "6b9b2ae8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"a= df_min_max_normalised[['STUDY','LEVEL','PHASE']]\n",
|
||||
"print(a.dtypes)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e3e1bc34",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Define signal columns (adjust only once)\n",
|
||||
"signal_columns = all_signal_columns\n",
|
||||
"\n",
|
||||
"# Get all unique combinations of STUDY, LEVEL and PHASE\n",
|
||||
"unique_combinations = df_min_max_normalised[['STUDY', 'LEVEL', 'PHASE']].drop_duplicates().reset_index(drop=True)\n",
|
||||
"\n",
|
||||
"# Dictionary to store subsets\n",
|
||||
"subsets = {}\n",
|
||||
"subset_sizes = {}\n",
|
||||
"\n",
|
||||
"for idx, row in unique_combinations.iterrows():\n",
|
||||
" study = row['STUDY']\n",
|
||||
" level = row['LEVEL']\n",
|
||||
" phase = row['PHASE']\n",
|
||||
" key = f\"{study}_L{level}_P{phase}\"\n",
|
||||
" subset = df_min_max_normalised[\n",
|
||||
" (df_min_max_normalised['STUDY'] == study) & \n",
|
||||
" (df_min_max_normalised['LEVEL'] == level) & \n",
|
||||
" (df_min_max_normalised['PHASE'] == phase)\n",
|
||||
" ]\n",
|
||||
" subsets[key] = subset\n",
|
||||
" subset_sizes[key] = len(subset)\n",
|
||||
"\n",
|
||||
"# Output subset sizes\n",
|
||||
"print(\"Number of samples per subset:\")\n",
|
||||
"print(\"=\" * 40)\n",
|
||||
"for key, size in subset_sizes.items():\n",
|
||||
" print(f\"{key}: {size} samples\")\n",
|
||||
"print(\"=\" * 40)\n",
|
||||
"print(f\"Total number of subsets: {len(subsets)}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c7fdeb5c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"\n",
|
||||
"# Function to categorize subsets\n",
|
||||
"def categorize_subset(key):\n",
|
||||
" \"\"\"Categorizes a subset as 'low' or 'high' based on the given logic\"\"\"\n",
|
||||
" parts = key.split('_')\n",
|
||||
" study = parts[0]\n",
|
||||
" level = int(parts[1][1:]) # 'L1' -> 1\n",
|
||||
" phase = parts[2][1:] # 'Pbaseline' -> 'baseline'\n",
|
||||
" \n",
|
||||
" # LOW: baseline OR (n-back with level 1 or 4)\n",
|
||||
" if phase == \"baseline\":\n",
|
||||
" return 'low'\n",
|
||||
" elif study == \"n-back\" and level in [1, 4]:\n",
|
||||
" return 'low'\n",
|
||||
" \n",
|
||||
" # HIGH: (n-back with level 2,3,5,6 and phase train/test) OR (k-drive not baseline)\n",
|
||||
" elif study == \"n-back\" and level in [2, 3, 5, 6] and phase in [\"train\", \"test\"]:\n",
|
||||
" return 'high'\n",
|
||||
" elif study == \"k-drive\" and phase != \"baseline\":\n",
|
||||
" return 'high'\n",
|
||||
" \n",
|
||||
" return None\n",
|
||||
"\n",
|
||||
"# Categorize subsets\n",
|
||||
"low_subsets = {}\n",
|
||||
"high_subsets = {}\n",
|
||||
"\n",
|
||||
"for key, subset in subsets.items():\n",
|
||||
" category = categorize_subset(key)\n",
|
||||
" if category == 'low':\n",
|
||||
" low_subsets[key] = subset\n",
|
||||
" elif category == 'high':\n",
|
||||
" high_subsets[key] = subset\n",
|
||||
"\n",
|
||||
"# Output statistics\n",
|
||||
"print(\"\\n\" + \"=\" * 50)\n",
|
||||
"print(\"SUBSET CATEGORIZATION\")\n",
|
||||
"print(\"=\" * 50)\n",
|
||||
"\n",
|
||||
"print(\"\\nLOW subsets (Blue):\")\n",
|
||||
"print(\"-\" * 50)\n",
|
||||
"low_total = 0\n",
|
||||
"for key in sorted(low_subsets.keys()):\n",
|
||||
" size = subset_sizes[key]\n",
|
||||
" low_total += size\n",
|
||||
" print(f\" {key}: {size} samples\")\n",
|
||||
"print(f\"{'TOTAL LOW:':<30} {low_total} samples\")\n",
|
||||
"print(f\"{'NUMBER OF LOW SUBSETS:':<30} {len(low_subsets)}\")\n",
|
||||
"\n",
|
||||
"print(\"\\nHIGH subsets (Red):\")\n",
|
||||
"print(\"-\" * 50)\n",
|
||||
"high_total = 0\n",
|
||||
"for key in sorted(high_subsets.keys()):\n",
|
||||
" size = subset_sizes[key]\n",
|
||||
" high_total += size\n",
|
||||
" print(f\" {key}: {size} samples\")\n",
|
||||
"print(f\"{'TOTAL HIGH:':<30} {high_total} samples\")\n",
|
||||
"print(f\"{'NUMBER OF HIGH SUBSETS:':<30} {len(high_subsets)}\")\n",
|
||||
"\n",
|
||||
"print(\"\\n\" + \"=\" * 50)\n",
|
||||
"print(f\"TOTAL SAMPLES: {low_total + high_total}\")\n",
|
||||
"print(f\"TOTAL SUBSETS: {len(low_subsets) + len(high_subsets)}\")\n",
|
||||
"print(\"=\" * 50)\n",
|
||||
"\n",
|
||||
"# Find minimum subset size\n",
|
||||
"min_subset_size = min(subset_sizes.values())\n",
|
||||
"print(f\"\\nMinimum subset size: {min_subset_size}\")\n",
|
||||
"\n",
|
||||
"# Number of points to plot per subset (50% of minimum size)\n",
|
||||
"sampling_factor = 1\n",
|
||||
"n_samples_per_subset = int(sampling_factor * min_subset_size)\n",
|
||||
"print(f\"Number of randomly drawn points per subset: {n_samples_per_subset}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ff363fc5",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Plot"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3a9d9163",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Create comparison plots\n",
|
||||
"fig, axes = plt.subplots(len(signal_columns), 1, figsize=(14, 4 * len(signal_columns)))\n",
|
||||
"\n",
|
||||
"# If only one signal column exists, convert axes to list\n",
|
||||
"if len(signal_columns) == 1:\n",
|
||||
" axes = [axes]\n",
|
||||
"\n",
|
||||
"# Create a plot for each signal column\n",
|
||||
"for i, signal_col in enumerate(signal_columns):\n",
|
||||
" ax = axes[i]\n",
|
||||
" \n",
|
||||
" y_pos = 0\n",
|
||||
" labels = []\n",
|
||||
" \n",
|
||||
" # First plot all LOW subsets (sorted, blue)\n",
|
||||
" for label in sorted(low_subsets.keys()):\n",
|
||||
" subset = low_subsets[label]\n",
|
||||
" if len(subset) > 0 and signal_col in subset.columns:\n",
|
||||
" # Draw random sample\n",
|
||||
" n_samples = min(n_samples_per_subset, len(subset))\n",
|
||||
" sampled_data = subset[signal_col].sample(n=n_samples, random_state=42)\n",
|
||||
" \n",
|
||||
" # Calculate mean and median\n",
|
||||
" mean_val = subset[signal_col].mean()\n",
|
||||
" median_val = subset[signal_col].median()\n",
|
||||
" \n",
|
||||
" # Plot points in blue\n",
|
||||
" ax.scatter(sampled_data, [y_pos] * len(sampled_data), \n",
|
||||
" alpha=0.5, s=30, color='blue')\n",
|
||||
" \n",
|
||||
" # Mean as black cross\n",
|
||||
" ax.plot(mean_val, y_pos, 'x', markersize=12, markeredgewidth=3, \n",
|
||||
" color='black', zorder=5)\n",
|
||||
" \n",
|
||||
" # Median as brown cross\n",
|
||||
" ax.plot(median_val, y_pos, 'x', markersize=12, markeredgewidth=3, \n",
|
||||
" color='brown', zorder=5)\n",
|
||||
" \n",
|
||||
" labels.append(f\"{label} (n={subset_sizes[label]})\")\n",
|
||||
" y_pos += 1\n",
|
||||
" \n",
|
||||
" # Separation line between LOW and HIGH\n",
|
||||
" if len(low_subsets) > 0 and len(high_subsets) > 0:\n",
|
||||
" ax.axhline(y=y_pos - 0.5, color='gray', linestyle='--', linewidth=2, alpha=0.7)\n",
|
||||
" \n",
|
||||
" # Then plot all HIGH subsets (sorted, red)\n",
|
||||
" for label in sorted(high_subsets.keys()):\n",
|
||||
" subset = high_subsets[label]\n",
|
||||
" if len(subset) > 0 and signal_col in subset.columns:\n",
|
||||
" # Draw random sample\n",
|
||||
" n_samples = min(n_samples_per_subset, len(subset))\n",
|
||||
" sampled_data = subset[signal_col].sample(n=n_samples, random_state=42)\n",
|
||||
" \n",
|
||||
" # Calculate mean and median\n",
|
||||
" mean_val = subset[signal_col].mean()\n",
|
||||
" median_val = subset[signal_col].median()\n",
|
||||
" \n",
|
||||
" # Plot points in red\n",
|
||||
" ax.scatter(sampled_data, [y_pos] * len(sampled_data), \n",
|
||||
" alpha=0.5, s=30, color='red')\n",
|
||||
" \n",
|
||||
" # Mean as black cross\n",
|
||||
" ax.plot(mean_val, y_pos, 'x', markersize=12, markeredgewidth=3, \n",
|
||||
" color='black', zorder=5)\n",
|
||||
" \n",
|
||||
" # Median as brown cross\n",
|
||||
" ax.plot(median_val, y_pos, 'x', markersize=12, markeredgewidth=3, \n",
|
||||
" color='brown', zorder=5)\n",
|
||||
" \n",
|
||||
" labels.append(f\"{label} (n={subset_sizes[label]})\")\n",
|
||||
" y_pos += 1\n",
|
||||
" \n",
|
||||
" ax.set_yticks(range(len(labels)))\n",
|
||||
" ax.set_yticklabels(labels)\n",
|
||||
" ax.set_xlabel(f'{signal_col} value')\n",
|
||||
" ax.set_title(f'{signal_col}: LOW (Blue) vs HIGH (Red) | {n_samples_per_subset} points/subset | Black X = Mean, Brown X = Median')\n",
|
||||
" ax.grid(True, alpha=0.3, axis='x')\n",
|
||||
" ax.axvline(0, color='gray', linestyle='--', alpha=0.5)\n",
|
||||
"\n",
|
||||
"plt.tight_layout()\n",
|
||||
"plt.show()\n",
|
||||
"\n",
|
||||
"print(f\"\\nNote: {n_samples_per_subset} random points were plotted per subset.\")\n",
|
||||
"print(\"Blue points = LOW subsets | Red points = HIGH subsets\")\n",
|
||||
"print(\"Black 'X' = Mean of entire subset | Brown 'X' = Median of entire subset\")\n",
|
||||
"print(f\"Total subsets plotted: {len(low_subsets)} LOW + {len(high_subsets)} HIGH = {len(low_subsets) + len(high_subsets)} subsets\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.10"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
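The MAD and normalizer cells above both follow a fit-on-train, apply-everywhere pattern. The sketch below condenses it with a toy dataframe; the column names and the 0.6745 robust-Z constant come from the notebook, while the data itself is made up:

```python
# Sketch: MAD-based outlier replacement fitted on the training split only, then
# applied unchanged to the test split (mirrors calculate_mad_params / apply_mad_filter above).
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
train = pd.DataFrame({"FACE_AU01": rng.normal(size=200), "Pupil_mean": rng.normal(size=200)})
test = pd.DataFrame({"FACE_AU01": rng.normal(size=50), "Pupil_mean": rng.normal(size=50)})
cols = ["FACE_AU01", "Pupil_mean"]

# fit: per-column median and MAD on the training data only
params = {c: (train[c].median(), float(np.median(np.abs(train[c] - train[c].median())))) for c in cols}

def apply_mad(df, params, threshold=3.5):
    out = df.copy()
    for col, (median, mad) in params.items():
        if mad == 0:
            continue                                   # no spread, nothing to replace
        robust_z = 0.6745 * (out[col] - median) / mad  # robust Z-score, as in the notebook
        out.loc[np.abs(robust_z) > threshold, col] = median
    return out

train_clean = apply_mad(train, params)
test_clean = apply_mad(test, params)                   # same parameters, no leakage
```

The same split applies to fit_normalizer / apply_normalizer above: fit per-subject or global scalers on the training frame, then reuse them (or the averaged fallback scaler) on unseen subjects.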
|
||||
EDA/histogramms.ipynb (new file, 166 lines)
@@ -0,0 +1,166 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1014c5e0",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Imports"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e42f3011",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"import matplotlib.pyplot as plt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0a834496",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"path =r\"C:\\Users\\micha\\FAUbox\\WS2526_Fahrsimulator_MSY (Celina Korzer)\\AU_dataset\\output_windowed.parquet\"\n",
|
||||
"df = pd.read_parquet(path=path)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "aa4759fa",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"high_nback = df[\n",
|
||||
" (df[\"STUDY\"]==\"n-back\") &\n",
|
||||
" (df[\"LEVEL\"].isin([2, 3, 5, 6])) &\n",
|
||||
" (df[\"PHASE\"].isin([\"train\", \"test\"]))\n",
|
||||
"]\n",
|
||||
"high_nback.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a2aa0596",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"low_all = df[\n",
|
||||
" ((df[\"PHASE\"] == \"baseline\") |\n",
|
||||
" ((df[\"STUDY\"] == \"n-back\") & (df[\"PHASE\"] != \"baseline\") & (df[\"LEVEL\"].isin([1,4]))))\n",
|
||||
"]\n",
|
||||
"print(low_all.shape)\n",
|
||||
"high_kdrive = df[\n",
|
||||
" (df[\"STUDY\"] == \"k-drive\") & (df[\"PHASE\"] != \"baseline\")\n",
|
||||
"]\n",
|
||||
"print(high_kdrive.shape)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "f7d446a1",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print((df.shape[0]==(high_kdrive.shape[0]+high_nback.shape[0]+low_all.shape[0])))\n",
|
||||
"print(df.shape[0])\n",
|
||||
"print((high_kdrive.shape[0]+high_nback.shape[0]+low_all.shape[0]))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "474e144a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"high_all = pd.concat([high_nback, high_kdrive])\n",
|
||||
"high_all.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5dd585c2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df.dtypes"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0bd39d9f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Get all columns that start with 'AU'\n",
|
||||
"au_columns = [col for col in low_all.columns if col.startswith('AU')]\n",
|
||||
"\n",
|
||||
"# Calculate number of rows and columns for subplots\n",
|
||||
"n_cols = len(au_columns)\n",
|
||||
"n_rows = 4\n",
|
||||
"n_cols_subplot = 5\n",
|
||||
"\n",
|
||||
"# Create figure with subplots\n",
|
||||
"fig, axes = plt.subplots(n_rows, n_cols_subplot, figsize=(20, 16))\n",
|
||||
"axes = axes.flatten()\n",
|
||||
"fig.suptitle('Action Unit (AU) Distributions: Low vs High', fontsize=20, fontweight='bold', y=0.995)\n",
|
||||
"\n",
|
||||
"# Create histogram for each AU column\n",
|
||||
"for idx, col in enumerate(au_columns):\n",
|
||||
" ax = axes[idx]\n",
|
||||
" \n",
|
||||
" # Plot overlapping histograms\n",
|
||||
" ax.hist(low_all[col].dropna(), bins=30, alpha=0.6, color='blue', label='low_all', edgecolor='black')\n",
|
||||
" ax.hist(high_all[col].dropna(), bins=30, alpha=0.6, color='red', label='high_all', edgecolor='black')\n",
|
||||
" \n",
|
||||
" # Set title and labels\n",
|
||||
" ax.set_title(col, fontsize=10, fontweight='bold')\n",
|
||||
" ax.set_xlabel('Value', fontsize=8)\n",
|
||||
" ax.set_ylabel('Frequency', fontsize=8)\n",
|
||||
" ax.legend(fontsize=8)\n",
|
||||
" ax.grid(True, alpha=0.3)\n",
|
||||
"\n",
|
||||
"# Hide any unused subplots\n",
|
||||
"for idx in range(len(au_columns), len(axes)):\n",
|
||||
" axes[idx].set_visible(False)\n",
|
||||
"\n",
|
||||
"# Adjust layout\n",
|
||||
"plt.tight_layout()\n",
|
||||
"plt.show()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "base",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
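The low/high partition built cell by cell above can also be written as one labelling helper. This is a sketch under the same rules (baseline, plus n-back levels 1 and 4, count as low; n-back levels 2/3/5/6 in train or test, and non-baseline k-drive, count as high), with column names taken from the notebook:

```python
# Sketch: binary workload label from STUDY / LEVEL / PHASE, same rules as the
# subsets built in the histogram notebook above.
import numpy as np
import pandas as pd

def label_workload(df: pd.DataFrame) -> pd.Series:
    low = (df["PHASE"] == "baseline") | (
        (df["STUDY"] == "n-back") & (df["PHASE"] != "baseline") & (df["LEVEL"].isin([1, 4]))
    )
    high = (
        (df["STUDY"] == "n-back")
        & (df["LEVEL"].isin([2, 3, 5, 6]))
        & (df["PHASE"].isin(["train", "test"]))
    ) | ((df["STUDY"] == "k-drive") & (df["PHASE"] != "baseline"))
    # rows matching neither rule stay unlabelled (NaN)
    return pd.Series(np.select([low, high], [0.0, 1.0], default=np.nan), index=df.index)
```

The notebook's consistency check (low_all, high_nback and high_kdrive together accounting for every row) corresponds to this series containing no NaNs.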
|
||||
EDA/owncloud.ipynb (new file, 157 lines)
@@ -0,0 +1,157 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "aab6b326-a583-47ad-8bb7-723c2fddcc63",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# %pip install pyocclient\n",
|
||||
"import yaml\n",
|
||||
"import owncloud\n",
|
||||
"import pandas as pd\n",
|
||||
"import time"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4f42846c-27c3-4394-a40a-e22d73c2902e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"start = time.time()\n",
|
||||
"\n",
|
||||
"with open(\"../login.yaml\") as f:\n",
|
||||
" cfg = yaml.safe_load(f)\n",
|
||||
"url, password = cfg[0][\"url\"], cfg[1][\"password\"]\n",
|
||||
"file = \"adabase-public-0022-v_0_0_2.h5py\"\n",
|
||||
"oc = owncloud.Client.from_public_link(url, folder_password=password)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"oc.get_file(file, \"tmp22.h5\")\n",
|
||||
"\n",
|
||||
"end = time.time()\n",
|
||||
"print(end - start)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3714dec2-85d0-4f76-af46-ea45ebec2fa3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"start = time.time()\n",
|
||||
"df_performance = pd.read_hdf(\"tmp22.h5\", \"PERFORMANCE\")\n",
|
||||
"end = time.time()\n",
|
||||
"print(end - start)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "f50e97d0",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(22)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c131c816",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df_performance"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "6ae47e52-ad86-4f8d-b929-0080dc99f646",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"start = time.time()\n",
|
||||
"df_4_col = pd.read_hdf(\"tmp.h5\", \"SIGNALS\", mode=\"r\", columns=[\"STUDY\"], start=0, stop=1)\n",
|
||||
"end = time.time()\n",
|
||||
"print(end - start)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7c139f3a-ede8-4530-957d-d1bb939f6cb5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df_4_col.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a68d58ea-65f2-46c4-a2b2-8c3447c715d7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df_4_col.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "95aa4523-3784-4ab6-bf92-0227ce60e863",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df_4_col.info()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "defbcaf4-ad1b-453f-9b48-ab0ecfc4b5d5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df_4_col.isna().sum()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "72313895-c478-44a5-9108-00b0bec01bb8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "base",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
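A small sketch wrapping the public-link download used above into a helper; the login.yaml layout and the client calls are exactly those in the notebook, while the function name is an assumption:

```python
# Sketch: download one ADABase file from the shared ownCloud folder, reusing the
# from_public_link / get_file calls shown in the notebook above.
import yaml
import owncloud  # pyocclient

def fetch_adabase_file(remote_name, local_name, login_path="../login.yaml"):
    with open(login_path) as f:
        cfg = yaml.safe_load(f)
    url, password = cfg[0]["url"], cfg[1]["password"]
    oc = owncloud.Client.from_public_link(url, folder_password=password)
    oc.get_file(remote_name, local_name)   # blocking download to the local path
    return local_name

# usage, matching the notebook:
# fetch_adabase_file("adabase-public-0022-v_0_0_2.h5py", "tmp22.h5")
```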
|
||||
EDA/researchOnSubjectPerformance.ipynb (new file, 213 lines)
@@ -0,0 +1,213 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "8fb02733",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Imports"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "96f3b128",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import yaml\n",
|
||||
"import owncloud\n",
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "c20cee7c",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Connection to Owncloud"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c4c94558",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Load credentials\n",
|
||||
"with open(\"../login.yaml\") as f:\n",
|
||||
" cfg = yaml.safe_load(f)\n",
|
||||
" \n",
|
||||
"url, password = cfg[0][\"url\"], cfg[1][\"password\"]\n",
|
||||
"\n",
|
||||
"# Connect once\n",
|
||||
"oc = owncloud.Client.from_public_link(url, folder_password=password)\n",
|
||||
"# File pattern\n",
|
||||
"# base = \"adabase-public-{num:04d}-v_0_0_2.h5py\"\n",
|
||||
"base = \"{num:04d}-*.h5py\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "07c03d07",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"num_files = 2 # number of files to process (min: 1, max: 30)\n",
|
||||
"performance_data = []\n",
|
||||
"\n",
|
||||
"for i in range(num_files):\n",
|
||||
" file_pattern = f\"{i:04d}-*\"\n",
|
||||
" \n",
|
||||
" # Get list of files matching the pattern\n",
|
||||
" files = oc.list('.')\n",
|
||||
" matching_files = [f.get_name() for f in files if f.get_name().startswith(f\"{i:04d}-\")]\n",
|
||||
" \n",
|
||||
" if matching_files:\n",
|
||||
" file_name = matching_files[0] # Take the first matching file\n",
|
||||
" local_tmp = f\"tmp_{i:04d}.h5\"\n",
|
||||
" \n",
|
||||
" oc.get_file(file_name, local_tmp)\n",
|
||||
" print(f\"{file_name} geöffnet\")\n",
|
||||
" else:\n",
|
||||
" print(f\"Keine Datei gefunden für Muster: {file_pattern}\")\n",
|
||||
" # file_name = base.format(num=i)\n",
|
||||
" # local_tmp = f\"tmp_{i:04d}.h5\"\n",
|
||||
"\n",
|
||||
" # oc.get_file(file_name, local_tmp)\n",
|
||||
" # print(f\"{file_name} geöffnet\")\n",
|
||||
"\n",
|
||||
" # check SIGNALS table for AUs\n",
|
||||
" with pd.HDFStore(local_tmp, mode=\"r\") as store:\n",
|
||||
" cols = store.select(\"SIGNALS\", start=0, stop=1).columns\n",
|
||||
" au_cols = [c for c in cols if c.startswith(\"AU\")]\n",
|
||||
" if not au_cols:\n",
|
||||
" print(f\"Subject {i} enthält keine AUs\")\n",
|
||||
" continue\n",
|
||||
"\n",
|
||||
" # load performance table\n",
|
||||
" with pd.HDFStore(local_tmp, mode=\"r\") as store:\n",
|
||||
" perf_df = store.select(\"PERFORMANCE\")\n",
|
||||
"\n",
|
||||
" f1_cols = [c for c in [\"AUDITIVE F1\", \"VISUAL F1\", \"F1\"] if c in perf_df.columns]\n",
|
||||
" if not f1_cols:\n",
|
||||
" print(f\"Subject {i}: keine F1-Spalten gefunden\")\n",
|
||||
" continue\n",
|
||||
"\n",
|
||||
" subject_entry = {\"subjectID\": i}\n",
|
||||
" valid_scores = []\n",
|
||||
"\n",
|
||||
" # iterate rows: each (study, level, phase)\n",
|
||||
" for _, row in perf_df.iterrows():\n",
|
||||
" study, level, phase = row[\"STUDY\"], row[\"LEVEL\"], row[\"PHASE\"]\n",
|
||||
" col_name = f\"STUDY_{study}_LEVEL_{level}_PHASE_{phase}\"\n",
|
||||
"\n",
|
||||
" # collect valid F1 values among the three columns\n",
|
||||
" scores = [row[c] for c in f1_cols if pd.notna(row[c])]\n",
|
||||
" if scores:\n",
|
||||
" mean_score = float(np.mean(scores))\n",
|
||||
" subject_entry[col_name] = mean_score\n",
|
||||
" valid_scores.extend(scores)\n",
|
||||
"\n",
|
||||
" # compute overall average across all valid combinations\n",
|
||||
" if valid_scores:\n",
|
||||
" subject_entry[\"overall_score\"] = float(np.mean(valid_scores))\n",
|
||||
" performance_data.append(subject_entry)\n",
|
||||
" print(f\"Subject {i}: {len(valid_scores)} gültige Scores, Overall = {subject_entry['overall_score']:.3f}\")\n",
|
||||
" else:\n",
|
||||
" print(f\"Subject {i}: keine gültigen F1-Scores\")\n",
|
||||
"\n",
|
||||
"# build dataframe\n",
|
||||
"if performance_data:\n",
|
||||
" performance_df = pd.DataFrame(performance_data)\n",
|
||||
" combination_cols = sorted([c for c in performance_df.columns if c.startswith(\"STUDY_\")])\n",
|
||||
" final_cols = [\"subjectID\", \"overall_score\"] + combination_cols\n",
|
||||
" performance_df = performance_df[final_cols]\n",
|
||||
" performance_df.to_csv(\"n_au_performance.csv\", index=False)\n",
|
||||
"\n",
|
||||
" print(f\"\\nGesamt Subjects mit Action Units: {len(performance_df)}\")\n",
|
||||
"else:\n",
|
||||
" print(\"Keine gültigen Daten gefunden.\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0bcaf065",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"performance_df.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "db95eea7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with pd.HDFStore(\"tmp_0000.h5\", mode=\"r\") as store:\n",
|
||||
" md = store.select(\"META\")\n",
|
||||
"print(\"File 0:\")\n",
|
||||
"print(md)\n",
|
||||
"with pd.HDFStore(\"tmp_0001.h5\", mode=\"r\") as store:\n",
|
||||
" md = store.select(\"META\")\n",
|
||||
"print(\"File 1\")\n",
|
||||
"print(md)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "8067036b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pd.set_option('display.max_columns', None)\n",
|
||||
"pd.set_option('display.max_rows', None)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "f18e7385",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with pd.HDFStore(\"tmp_0000.h5\", mode=\"r\") as store:\n",
|
||||
" md = store.select(\"SIGNALS\", start=0, stop=1)\n",
|
||||
"print(\"File 0:\")\n",
|
||||
"md.head()\n",
|
||||
"# with pd.HDFStore(\"tmp_0001.h5\", mode=\"r\",start=0, stop=1) as store:\n",
|
||||
"# md = store.select(\"SIGNALS\")\n",
|
||||
"# print(\"File 1\")\n",
|
||||
"# print(md.columns)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "base",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
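The aggregation loop above reduces each PERFORMANCE row to the mean of whichever F1 columns are present, then averages everything into an overall score. A toy sketch of that reduction for a single subject (the two-row frame is illustrative only):

```python
# Sketch: mean F1 per (STUDY, LEVEL, PHASE) plus an overall score, following the
# aggregation loop in the notebook above. The small frame is made-up example data.
import numpy as np
import pandas as pd

perf_df = pd.DataFrame({
    "STUDY": ["n-back", "n-back"],
    "LEVEL": [1, 2],
    "PHASE": ["test", "test"],
    "AUDITIVE F1": [0.90, 0.70],
    "VISUAL F1": [0.80, np.nan],
    "F1": [np.nan, 0.75],
})

f1_cols = [c for c in ["AUDITIVE F1", "VISUAL F1", "F1"] if c in perf_df.columns]
subject_entry, valid_scores = {}, []
for _, row in perf_df.iterrows():
    col_name = f"STUDY_{row['STUDY']}_LEVEL_{row['LEVEL']}_PHASE_{row['PHASE']}"
    scores = [row[c] for c in f1_cols if pd.notna(row[c])]
    if scores:
        subject_entry[col_name] = float(np.mean(scores))
        valid_scores.extend(scores)
subject_entry["overall_score"] = float(np.mean(valid_scores))
print(subject_entry)  # per-combination means plus the overall average
```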
|
||||
@@ -17,9 +17,8 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"df= pd.read_parquet(r\"C:\\Users\\micha\\FAUbox\\WS2526_Fahrsimulator_MSY (Celina Korzer)\\AU_dataset\\output_windowed.parquet\")\n",
-"print(df.shape)\n",
-"\n"
+"df= pd.read_parquet(r\" \")\n",
+"print(df.shape)"
 ]
 },
 {
|
||||
|
||||
Binary file not shown.
@@ -107,7 +107,8 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"dataset_path = Path(r\"data-paulusjafahrsimulator-gpu/new_datasets/combined_dataset_25hz.parquet\")"
+"dataset_path = Path(r\"data-paulusjafahrsimulator-gpu/new_datasets/combined_dataset_25hz.parquet\")\n",
+"# dataset_path = Path(r\"/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/120s_combined_dataset_25hz.parquet\")"
 ]
 },
 {
|
||||
@@ -475,7 +476,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"normalizer_path=Path('data-paulusjafahrsimulator-gpu/saved_models/deepsvdd_save/normalizer.pkl')"
+"normalizer_path=Path('data-paulusjafahrsimulator-gpu/saved_models/deepsvdd_save/normalizer_min_max_global.pkl')"
 ]
 },
 {
|
||||
@@ -494,7 +495,7 @@
 "print(len(eye_cols))\n",
 "all_signal_columns = face_au_cols+eye_cols\n",
 "print(len(all_signal_columns))\n",
-"normalizer = fit_normalizer(train_df, all_signal_columns, method='standard', scope='subject')\n",
+"normalizer = fit_normalizer(train_df, all_signal_columns, method='minmax', scope='global')\n",
 "save_normalizer(normalizer, normalizer_path )"
 ]
 },
|
||||
@@ -691,10 +692,10 @@
 "model = build_intermediate_fusion_autoencoder(\n",
 " input_dim_mod1=len(face_au_cols),\n",
 " input_dim_mod2=len(eye_cols),\n",
-" encoder_hidden_dim_mod1=15, # individuell\n",
-" encoder_hidden_dim_mod2=10, # individuell\n",
-" latent_dim=8,\n",
-" dropout_rate=0.3, # einstellbar\n",
+" encoder_hidden_dim_mod1=12, # individuell\n",
+" encoder_hidden_dim_mod2=8, # individuell\n",
+" latent_dim=4,\n",
+" dropout_rate=0.7, # einstellbar\n",
 " neg_slope=0.1,\n",
 " weight_decay=1e-3\n",
 ")\n",
|
||||
@@ -708,7 +709,7 @@
 " \"recon_modality_1\": 1.0,\n",
 " \"recon_modality_2\": 1.0,\n",
 " },\n",
-" optimizer=tf.keras.optimizers.Adam(1e-2)\n",
+" optimizer=tf.keras.optimizers.Adam(1e-3)\n",
 " \n",
 ")\n",
 "\n",
|
||||
@@ -739,7 +740,7 @@
 " \"recon_modality_1\": 1.0,\n",
 " \"recon_modality_2\": 1.0,\n",
 " },\n",
-" optimizer=tf.keras.optimizers.Adam(1e-5),\n",
+" optimizer=tf.keras.optimizers.Adam(1e-4),\n",
 ")\n",
 "model.fit(\n",
 " x=[X_face, X_eye],\n",
|
||||
@@ -779,7 +780,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"encoder_save_path =Path('data-paulusjafahrsimulator-gpu/saved_models/deepsvdd_save/encoder_6_deep.keras')\n",
+"encoder_save_path =Path('data-paulusjafahrsimulator-gpu/saved_models/deepsvdd_save/encoder_8_deep.keras')\n",
 "encoder.save(encoder_save_path)"
 ]
 },
|
||||
@@ -943,7 +944,7 @@
 " return get_radius_from_arrays(nu, X_face, X_eye)\n",
 "\n",
 "\n",
-"nu = 0.05\n",
+"nu = 0.25\n",
 "\n",
 "train_dataset = tf.data.Dataset.from_tensor_slices((X_face, X_eye)).shuffle(64).batch(64)\n",
 "# train_dataset = tf.data.Dataset.from_tensor_slices((X_face, X_eye))\n",
|
||||
@@ -1018,7 +1019,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"deep_svdd_save_path =Path('data-paulusjafahrsimulator-gpu/saved_models/deepsvdd_save/deep_svdd_05.keras')\n",
+"deep_svdd_save_path =Path('data-paulusjafahrsimulator-gpu/saved_models/deepsvdd_save/deep_svdd_06.keras')\n",
 "deep_svdd_net.save(deep_svdd_save_path)"
 ]
 },
|
||||
@@ -1075,6 +1076,18 @@
 "test_predictions = (test_scores > 0).astype(int)\n"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "575dddcf",
+"metadata": {},
+"outputs": [],
+"source": [
+"normal_acc = np.mean(test_predictions[y_test == 0] == 0)\n",
+"anomaly_acc = np.mean(test_predictions[y_test == 1] == 1)\n",
+"print(f'Accuracy on Test set: {accuracy_score(y_test, test_predictions)}')"
+]
+},
 {
 "cell_type": "code",
 "execution_count": null,
|
||||
|
||||
@@ -220,14 +220,637 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# SET\n",
|
||||
"threshold_mad = 100\n",
|
||||
"threshold_mad = 5\n",
|
||||
"column_praefix ='AU'\n",
|
||||
"\n",
|
||||
"au_columns = [col for col in df.columns if col.startswith(column_praefix)]\n",
|
||||
"cleaned_df = mad_outlier_removal(df,columns=au_columns, threshold=threshold_mad)\n",
|
||||
"cleaned_df = mad_outlier_removal.mad_outlier_removal(df,columns=au_columns, threshold=threshold_mad)\n",
|
||||
"print(cleaned_df.shape)\n",
|
||||
"print(df.shape)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "9a6c1732",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### TO DO\n",
|
||||
" * pipeline aus Autoencoder und SVM\n",
|
||||
" * group k fold\n",
|
||||
" * AE überpüfen, loss dokumentieren"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "877309d9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"### Variational Autoencoder with Classifier Head\n",
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"import tensorflow as tf\n",
|
||||
"from tensorflow import keras\n",
|
||||
"from tensorflow.keras import layers, Model\n",
|
||||
"from sklearn.model_selection import GroupKFold\n",
|
||||
"from sklearn.preprocessing import StandardScaler\n",
|
||||
"from sklearn.metrics import (\n",
|
||||
" accuracy_score, precision_score, recall_score, f1_score, \n",
|
||||
" roc_auc_score, confusion_matrix, classification_report\n",
|
||||
")\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"from collections import defaultdict\n",
|
||||
"\n",
|
||||
"# ============================================================================\n",
|
||||
"# 1. CREATE LABELS\n",
|
||||
"# ============================================================================\n",
|
||||
"\n",
|
||||
"# Low workload: baseline + n-back level 1,4\n",
|
||||
"low_all = cleaned_df[\n",
|
||||
" ((cleaned_df[\"PHASE\"] == \"baseline\") |\n",
|
||||
" ((cleaned_df[\"STUDY\"] == \"n-back\") & (cleaned_df[\"PHASE\"] != \"baseline\") & (cleaned_df[\"LEVEL\"].isin([1,4]))))\n",
|
||||
"].copy()\n",
|
||||
"low_all['label'] = 0\n",
|
||||
"print(f\"Low workload samples: {low_all.shape[0]}\")\n",
|
||||
"\n",
|
||||
"# High workload n-back: level 2,3,5,6\n",
|
||||
"high_nback = cleaned_df[\n",
|
||||
" (cleaned_df[\"STUDY\"]==\"n-back\") &\n",
|
||||
" (cleaned_df[\"LEVEL\"].isin([2, 3, 5, 6])) &\n",
|
||||
" (cleaned_df[\"PHASE\"].isin([\"train\", \"test\"]))\n",
|
||||
"].copy()\n",
|
||||
"high_nback['label'] = 1\n",
|
||||
"print(f\"High n-back samples: {high_nback.shape[0]}\")\n",
|
||||
"\n",
|
||||
"# High workload k-drive\n",
|
||||
"high_kdrive = cleaned_df[\n",
|
||||
" (cleaned_df[\"STUDY\"] == \"k-drive\") & (cleaned_df[\"PHASE\"] != \"baseline\")\n",
|
||||
"].copy()\n",
|
||||
"high_kdrive['label'] = 1\n",
|
||||
"print(f\"High k-drive samples: {high_kdrive.shape[0]}\")\n",
|
||||
"\n",
|
||||
"# Combine all high workload\n",
|
||||
"high_all = pd.concat([high_nback, high_kdrive])\n",
|
||||
"print(f\"Total high workload samples: {high_all.shape[0]}\")\n",
|
||||
"\n",
|
||||
"# Complete labeled dataset\n",
|
||||
"labeled_df = pd.concat([low_all, high_all]).reset_index(drop=True)\n",
|
||||
"print(f\"\\nTotal labeled samples: {labeled_df.shape[0]}\")\n",
|
||||
"print(f\"Class distribution:\\n{labeled_df['label'].value_counts()}\")\n",
|
||||
"\n",
|
||||
"# ============================================================================\n",
|
||||
"# 2. TRAIN/TEST SPLIT BY SUBJECTS\n",
|
||||
"# ============================================================================\n",
|
||||
"\n",
|
||||
"train_df = labeled_df[labeled_df['subjectID'].isin(training_subjects)].copy()\n",
|
||||
"test_df = labeled_df[labeled_df['subjectID'].isin(test_subjects)].copy()\n",
|
||||
"\n",
|
||||
"print(f\"\\nTraining subjects: {training_subjects}\")\n",
|
||||
"print(f\"Test subjects: {test_subjects}\")\n",
|
||||
"print(f\"Train samples: {train_df.shape[0]}, Test samples: {test_df.shape[0]}\")\n",
|
||||
"\n",
|
||||
"# Extract features and labels\n",
|
||||
"au_columns = [col for col in labeled_df.columns if col.startswith('AU')]\n",
|
||||
"print(f\"\\nUsing {len(au_columns)} AU features: {au_columns}\")\n",
|
||||
"\n",
|
||||
"X_train = train_df[au_columns].values\n",
|
||||
"y_train = train_df['label'].values\n",
|
||||
"groups_train = train_df['subjectID'].values\n",
|
||||
"\n",
|
||||
"X_test = test_df[au_columns].values\n",
|
||||
"y_test = test_df['label'].values\n",
|
||||
"\n",
|
||||
"# Normalize features\n",
|
||||
"scaler = StandardScaler()\n",
|
||||
"X_train_scaled = scaler.fit_transform(X_train)\n",
|
||||
"X_test_scaled = scaler.transform(X_test)\n",
|
||||
"\n",
|
||||
"print(f\"\\nTrain class distribution: {np.bincount(y_train)}\")\n",
|
||||
"print(f\"Test class distribution: {np.bincount(y_test)}\")\n",
|
||||
"\n",
|
||||
"# ============================================================================\n",
|
||||
"# 3. VAE WITH CLASSIFIER HEAD MODEL\n",
|
||||
"# ============================================================================\n",
|
||||
"\n",
|
||||
"class Sampling(layers.Layer):\n",
|
||||
" \"\"\"Reparameterization trick for VAE\"\"\"\n",
|
||||
" def call(self, inputs):\n",
|
||||
" z_mean, z_log_var = inputs\n",
|
||||
" batch = tf.shape(z_mean)[0]\n",
|
||||
" dim = tf.shape(z_mean)[1]\n",
|
||||
" epsilon = tf.random.normal(shape=(batch, dim))\n",
|
||||
" return z_mean + tf.exp(0.5 * z_log_var) * epsilon\n",
|
||||
"\n",
|
||||
"def build_vae_classifier(input_dim, latent_dim, encoder_dims=[32, 16], \n",
|
||||
" decoder_dims=[16, 32], classifier_dims=[16]):\n",
|
||||
" \"\"\"\n",
|
||||
" Build VAE with classifier head\n",
|
||||
" \n",
|
||||
" Args:\n",
|
||||
" input_dim: Number of input features (20 AUs)\n",
|
||||
" latent_dim: Dimension of latent space (2-5)\n",
|
||||
" encoder_dims: Hidden layer sizes for encoder\n",
|
||||
" decoder_dims: Hidden layer sizes for decoder\n",
|
||||
" classifier_dims: Hidden layer sizes for classifier\n",
|
||||
" \"\"\"\n",
|
||||
" \n",
|
||||
" # ---- ENCODER ----\n",
|
||||
" encoder_inputs = keras.Input(shape=(input_dim,), name='encoder_input')\n",
|
||||
" x = encoder_inputs\n",
|
||||
" \n",
|
||||
" for i, dim in enumerate(encoder_dims):\n",
|
||||
" x = layers.Dense(dim, activation='relu', name=f'encoder_dense_{i}')(x)\n",
|
||||
" x = layers.BatchNormalization(name=f'encoder_bn_{i}')(x)\n",
|
||||
" x = layers.Dropout(0.2, name=f'encoder_dropout_{i}')(x)\n",
|
||||
" \n",
|
||||
" z_mean = layers.Dense(latent_dim, name='z_mean')(x)\n",
|
||||
" z_log_var = layers.Dense(latent_dim, name='z_log_var')(x)\n",
|
||||
" z = Sampling()([z_mean, z_log_var])\n",
|
||||
" \n",
|
||||
" encoder = Model(encoder_inputs, [z_mean, z_log_var, z], name='encoder')\n",
|
||||
" \n",
|
||||
" # ---- DECODER ----\n",
|
||||
" latent_inputs = keras.Input(shape=(latent_dim,), name='latent_input')\n",
|
||||
" x = latent_inputs\n",
|
||||
" \n",
|
||||
" for i, dim in enumerate(decoder_dims):\n",
|
||||
" x = layers.Dense(dim, activation='relu', name=f'decoder_dense_{i}')(x)\n",
|
||||
" x = layers.BatchNormalization(name=f'decoder_bn_{i}')(x)\n",
|
||||
" \n",
|
||||
" decoder_outputs = layers.Dense(input_dim, activation='linear', name='decoder_output')(x)\n",
|
||||
" decoder = Model(latent_inputs, decoder_outputs, name='decoder')\n",
|
||||
" \n",
|
||||
" # ---- CLASSIFIER HEAD ----\n",
|
||||
" x = latent_inputs\n",
|
||||
" for i, dim in enumerate(classifier_dims):\n",
|
||||
" x = layers.Dense(dim, activation='relu', name=f'classifier_dense_{i}')(x)\n",
|
||||
" x = layers.Dropout(0.3, name=f'classifier_dropout_{i}')(x)\n",
|
||||
" \n",
|
||||
" classifier_output = layers.Dense(1, activation='sigmoid', name='classifier_output')(x)\n",
|
||||
" classifier = Model(latent_inputs, classifier_output, name='classifier')\n",
|
||||
" \n",
|
||||
" # ---- FULL MODEL ----\n",
|
||||
" inputs = keras.Input(shape=(input_dim,), name='vae_input')\n",
|
||||
" z_mean, z_log_var, z = encoder(inputs)\n",
|
||||
" reconstructed = decoder(z)\n",
|
||||
" classification = classifier(z)\n",
|
||||
" \n",
|
||||
" model = Model(inputs, [reconstructed, classification], name='vae_classifier')\n",
|
||||
" \n",
|
||||
" return model, encoder, decoder, classifier\n",
|
||||
"\n",
|
||||
"# ============================================================================\n",
|
||||
"# 4. CUSTOM TRAINING LOOP WITH COMBINED LOSS\n",
|
||||
"# ============================================================================\n",
|
||||
"\n",
|
||||
"class VAEClassifier(keras.Model):\n",
|
||||
" def __init__(self, encoder, decoder, classifier, **kwargs):\n",
|
||||
" super().__init__(**kwargs)\n",
|
||||
" self.encoder = encoder\n",
|
||||
" self.decoder = decoder\n",
|
||||
" self.classifier = classifier\n",
|
||||
" self.total_loss_tracker = keras.metrics.Mean(name=\"total_loss\")\n",
|
||||
" self.reconstruction_loss_tracker = keras.metrics.Mean(name=\"reconstruction_loss\")\n",
|
||||
" self.kl_loss_tracker = keras.metrics.Mean(name=\"kl_loss\")\n",
|
||||
" self.classification_loss_tracker = keras.metrics.Mean(name=\"classification_loss\")\n",
|
||||
" self.accuracy_tracker = keras.metrics.BinaryAccuracy(name=\"accuracy\")\n",
|
||||
" \n",
|
||||
" @property\n",
|
||||
" def metrics(self):\n",
|
||||
" return [\n",
|
||||
" self.total_loss_tracker,\n",
|
||||
" self.reconstruction_loss_tracker,\n",
|
||||
" self.kl_loss_tracker,\n",
|
||||
" self.classification_loss_tracker,\n",
|
||||
" self.accuracy_tracker,\n",
|
||||
" ]\n",
|
||||
" \n",
|
||||
" def train_step(self, data):\n",
|
||||
" x, y = data\n",
|
||||
" \n",
|
||||
" with tf.GradientTape() as tape:\n",
|
||||
" # Forward pass\n",
|
||||
" z_mean, z_log_var, z = self.encoder(x, training=True)\n",
|
||||
" reconstruction = self.decoder(z, training=True)\n",
|
||||
" classification = self.classifier(z, training=True)\n",
|
||||
" \n",
|
||||
" # Reconstruction loss (MSE)\n",
|
||||
" reconstruction_loss = tf.reduce_mean(\n",
|
||||
" keras.losses.mse(x, reconstruction))\n",
|
||||
" \n",
|
||||
" # KL divergence loss\n",
|
||||
" kl_loss = -0.5 * tf.reduce_mean(\n",
|
||||
" tf.reduce_sum(\n",
|
||||
" 1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var),\n",
|
||||
" axis=1\n",
|
||||
" )\n",
|
||||
" )\n",
|
||||
" \n",
|
||||
" # Classification loss (binary crossentropy)\n",
|
||||
" # Classification loss (binary crossentropy)\n",
|
||||
" classification_loss = tf.reduce_mean(\n",
|
||||
" keras.losses.binary_crossentropy(tf.expand_dims(y, -1), classification)\n",
|
||||
" )\n",
|
||||
" \n",
|
||||
" # Combined loss with weights\n",
|
||||
" total_loss = reconstruction_loss + kl_loss + classification_loss\n",
|
||||
" \n",
|
||||
" # Backpropagation\n",
|
||||
" grads = tape.gradient(total_loss, self.trainable_weights)\n",
|
||||
" self.optimizer.apply_gradients(zip(grads, self.trainable_weights))\n",
|
||||
" \n",
|
||||
" # Update metrics\n",
|
||||
" self.total_loss_tracker.update_state(total_loss)\n",
|
||||
" self.reconstruction_loss_tracker.update_state(reconstruction_loss)\n",
|
||||
" self.kl_loss_tracker.update_state(kl_loss)\n",
|
||||
" self.classification_loss_tracker.update_state(classification_loss)\n",
|
||||
" self.accuracy_tracker.update_state(y, classification)\n",
|
||||
" \n",
|
||||
" return {\n",
|
||||
" \"total_loss\": self.total_loss_tracker.result(),\n",
|
||||
" \"reconstruction_loss\": self.reconstruction_loss_tracker.result(),\n",
|
||||
" \"kl_loss\": self.kl_loss_tracker.result(),\n",
|
||||
" \"classification_loss\": self.classification_loss_tracker.result(),\n",
|
||||
" \"accuracy\": self.accuracy_tracker.result(),\n",
|
||||
" }\n",
|
||||
" \n",
|
||||
" def test_step(self, data):\n",
|
||||
" x, y = data\n",
|
||||
" \n",
|
||||
" z_mean, z_log_var, z = self.encoder(x, training=False)\n",
|
||||
" reconstruction = self.decoder(z, training=False)\n",
|
||||
" classification = self.classifier(z, training=False)\n",
|
||||
" \n",
|
||||
" # Reconstruction loss (MSE)\n",
|
||||
" reconstruction_loss = tf.reduce_mean(\n",
|
||||
" keras.losses.mse(x, reconstruction))\n",
|
||||
" kl_loss = -0.5 * tf.reduce_mean(\n",
|
||||
" tf.reduce_sum(1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var), axis=1)\n",
|
||||
" )\n",
|
||||
" # Classification loss (binary crossentropy)\n",
|
||||
" classification_loss = tf.reduce_mean(\n",
|
||||
" keras.losses.binary_crossentropy(tf.expand_dims(y, -1), classification)\n",
|
||||
" )\n",
|
||||
" total_loss = reconstruction_loss + kl_loss + classification_loss\n",
|
||||
" \n",
|
||||
" self.total_loss_tracker.update_state(total_loss)\n",
|
||||
" self.reconstruction_loss_tracker.update_state(reconstruction_loss)\n",
|
||||
" self.kl_loss_tracker.update_state(kl_loss)\n",
|
||||
" self.classification_loss_tracker.update_state(classification_loss)\n",
|
||||
" self.accuracy_tracker.update_state(y, classification)\n",
|
||||
" \n",
|
||||
" return {\n",
|
||||
" \"total_loss\": self.total_loss_tracker.result(),\n",
|
||||
" \"reconstruction_loss\": self.reconstruction_loss_tracker.result(),\n",
|
||||
" \"kl_loss\": self.kl_loss_tracker.result(),\n",
|
||||
" \"classification_loss\": self.classification_loss_tracker.result(),\n",
|
||||
" \"accuracy\": self.accuracy_tracker.result(),\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
"# ============================================================================\n",
|
||||
"# 5. GROUP K-FOLD CROSS-VALIDATION WITH GRID SEARCH\n",
|
||||
"# ============================================================================\n",
|
||||
"\n",
|
||||
"# Hyperparameter grid\n",
|
||||
"param_grid = {\n",
|
||||
" 'latent_dim': [2, 5],\n",
|
||||
" 'encoder_dims': [[32, 16], [64, 32]],\n",
|
||||
" 'learning_rate': [0.001, 0.005],\n",
|
||||
" 'batch_size': [32, 64],\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"# Generate all combinations\n",
|
||||
"from itertools import product\n",
|
||||
"keys = param_grid.keys()\n",
|
||||
"values = param_grid.values()\n",
|
||||
"param_combinations = [dict(zip(keys, v)) for v in product(*values)]\n",
|
||||
"\n",
|
||||
"print(f\"\\nTotal hyperparameter combinations: {len(param_combinations)}\")\n",
|
||||
"\n",
|
||||
"# Group K-Fold setup\n",
|
||||
"n_splits = 5\n",
|
||||
"gkf = GroupKFold(n_splits=n_splits)\n",
|
||||
"\n",
|
||||
"# Store results\n",
|
||||
"cv_results = []\n",
|
||||
"\n",
|
||||
"# Grid search with cross-validation\n",
|
||||
"for idx, params in enumerate(param_combinations):\n",
|
||||
" print(f\"\\n{'='*80}\")\n",
|
||||
" print(f\"Testing combination {idx+1}/{len(param_combinations)}: {params}\")\n",
|
||||
" print(f\"{'='*80}\")\n",
|
||||
" \n",
|
||||
" fold_results = []\n",
|
||||
" \n",
|
||||
" for fold, (train_idx, val_idx) in enumerate(gkf.split(X_train_scaled, y_train, groups_train)):\n",
|
||||
" print(f\"\\nFold {fold+1}/{n_splits}\")\n",
|
||||
" \n",
|
||||
" X_fold_train, X_fold_val = X_train_scaled[train_idx], X_train_scaled[val_idx]\n",
|
||||
" y_fold_train, y_fold_val = y_train[train_idx], y_train[val_idx]\n",
|
||||
" \n",
|
||||
" # Build model\n",
|
||||
" model, encoder, decoder, classifier = build_vae_classifier(\n",
|
||||
" input_dim=len(au_columns),\n",
|
||||
" latent_dim=params['latent_dim'],\n",
|
||||
" encoder_dims=params['encoder_dims'],\n",
|
||||
" decoder_dims=list(reversed(params['encoder_dims'])),\n",
|
||||
" classifier_dims=[16]\n",
|
||||
" )\n",
|
||||
" \n",
|
||||
" vae_classifier = VAEClassifier(encoder, decoder, classifier)\n",
|
||||
" vae_classifier.compile(optimizer=keras.optimizers.Adam(params['learning_rate']))\n",
|
||||
" \n",
|
||||
" # Early stopping\n",
|
||||
" early_stop = keras.callbacks.EarlyStopping(\n",
|
||||
" monitor='val_total_loss',\n",
|
||||
" patience=10,\n",
|
||||
" restore_best_weights=True,\n",
|
||||
" mode='min'\n",
|
||||
" )\n",
|
||||
" \n",
|
||||
" # Train\n",
|
||||
" history = vae_classifier.fit(\n",
|
||||
" X_fold_train, y_fold_train,\n",
|
||||
" validation_data=(X_fold_val, y_fold_val),\n",
|
||||
" epochs=60,\n",
|
||||
" batch_size=params['batch_size'],\n",
|
||||
" callbacks=[early_stop],\n",
|
||||
" verbose=0\n",
|
||||
" )\n",
|
||||
" \n",
|
||||
" # Evaluate on validation fold\n",
|
||||
" z_mean_val, _, _ = encoder.predict(X_fold_val, verbose=0)\n",
|
||||
" y_pred_proba = classifier.predict(z_mean_val, verbose=0).flatten()\n",
|
||||
" y_pred = (y_pred_proba > 0.5).astype(int)\n",
|
||||
" \n",
|
||||
" fold_metrics = {\n",
|
||||
" 'accuracy': accuracy_score(y_fold_val, y_pred),\n",
|
||||
" 'precision': precision_score(y_fold_val, y_pred, zero_division=0),\n",
|
||||
" 'recall': recall_score(y_fold_val, y_pred, zero_division=0),\n",
|
||||
" 'f1': f1_score(y_fold_val, y_pred, zero_division=0),\n",
|
||||
" 'roc_auc': roc_auc_score(y_fold_val, y_pred_proba),\n",
|
||||
" 'final_recon_loss': history.history['val_reconstruction_loss'][-1],\n",
|
||||
" 'final_kl_loss': history.history['val_kl_loss'][-1],\n",
|
||||
" 'final_class_loss': history.history['val_classification_loss'][-1],\n",
|
||||
" }\n",
|
||||
" \n",
|
||||
" fold_results.append(fold_metrics)\n",
|
||||
" print(f\" Accuracy: {fold_metrics['accuracy']:.4f}, F1: {fold_metrics['f1']:.4f}, AUC: {fold_metrics['roc_auc']:.4f}\")\n",
|
||||
" \n",
|
||||
" # Clear session to free memory\n",
|
||||
" keras.backend.clear_session()\n",
|
||||
" \n",
|
||||
" # Average across folds\n",
|
||||
" avg_results = {\n",
|
||||
" 'params': params,\n",
|
||||
" 'mean_accuracy': np.mean([r['accuracy'] for r in fold_results]),\n",
|
||||
" 'std_accuracy': np.std([r['accuracy'] for r in fold_results]),\n",
|
||||
" 'mean_f1': np.mean([r['f1'] for r in fold_results]),\n",
|
||||
" 'std_f1': np.std([r['f1'] for r in fold_results]),\n",
|
||||
" 'mean_roc_auc': np.mean([r['roc_auc'] for r in fold_results]),\n",
|
||||
" 'std_roc_auc': np.std([r['roc_auc'] for r in fold_results]),\n",
|
||||
" 'mean_recon_loss': np.mean([r['final_recon_loss'] for r in fold_results]),\n",
|
||||
" 'mean_kl_loss': np.mean([r['final_kl_loss'] for r in fold_results]),\n",
|
||||
" 'mean_class_loss': np.mean([r['final_class_loss'] for r in fold_results]),\n",
|
||||
" 'fold_results': fold_results\n",
|
||||
" }\n",
|
||||
" \n",
|
||||
" cv_results.append(avg_results)\n",
|
||||
" \n",
|
||||
" print(f\"\\nMean CV Accuracy: {avg_results['mean_accuracy']:.4f} ± {avg_results['std_accuracy']:.4f}\")\n",
|
||||
" print(f\"Mean CV F1: {avg_results['mean_f1']:.4f} ± {avg_results['std_f1']:.4f}\")\n",
|
||||
" print(f\"Mean CV AUC: {avg_results['mean_roc_auc']:.4f} ± {avg_results['std_roc_auc']:.4f}\")\n",
|
||||
"\n",
|
||||
"# ============================================================================\n",
|
||||
"# 6. SELECT BEST MODEL AND EVALUATE ON TEST SET\n",
|
||||
"# ============================================================================\n",
|
||||
"\n",
|
||||
"# Find best hyperparameters based on mean F1 score\n",
|
||||
"best_idx = np.argmax([r['mean_f1'] for r in cv_results])\n",
|
||||
"best_params = cv_results[best_idx]['params']\n",
|
||||
"\n",
|
||||
"print(f\"\\n{'='*80}\")\n",
|
||||
"print(\"BEST HYPERPARAMETERS (based on CV F1 score):\")\n",
|
||||
"print(f\"{'='*80}\")\n",
|
||||
"for key, value in best_params.items():\n",
|
||||
" print(f\"{key}: {value}\")\n",
|
||||
"print(f\"\\nCV Performance:\")\n",
|
||||
"print(f\" Accuracy: {cv_results[best_idx]['mean_accuracy']:.4f} ± {cv_results[best_idx]['std_accuracy']:.4f}\")\n",
|
||||
"print(f\" F1 Score: {cv_results[best_idx]['mean_f1']:.4f} ± {cv_results[best_idx]['std_f1']:.4f}\")\n",
|
||||
"print(f\" ROC-AUC: {cv_results[best_idx]['mean_roc_auc']:.4f} ± {cv_results[best_idx]['std_roc_auc']:.4f}\")\n",
|
||||
"\n",
|
||||
"# Train final model on all training data\n",
|
||||
"print(f\"\\n{'='*80}\")\n",
|
||||
"print(\"TRAINING FINAL MODEL ON ALL TRAINING DATA\")\n",
|
||||
"print(f\"{'='*80}\")\n",
|
||||
"\n",
|
||||
"final_model, final_encoder, final_decoder, final_classifier = build_vae_classifier(\n",
|
||||
" input_dim=len(au_columns),\n",
|
||||
" latent_dim=best_params['latent_dim'],\n",
|
||||
" encoder_dims=best_params['encoder_dims'],\n",
|
||||
" decoder_dims=list(reversed(best_params['encoder_dims'])),\n",
|
||||
" classifier_dims=[16]\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"final_vae_classifier = VAEClassifier(final_encoder, final_decoder, final_classifier)\n",
|
||||
"final_vae_classifier.compile(optimizer=keras.optimizers.Adam(best_params['learning_rate']))\n",
|
||||
"\n",
|
||||
"final_history = final_vae_classifier.fit(\n",
|
||||
" X_train_scaled, y_train,\n",
|
||||
" validation_split=0.2,\n",
|
||||
" epochs=100,\n",
|
||||
" batch_size=best_params['batch_size'],\n",
|
||||
" callbacks=[keras.callbacks.EarlyStopping(monitor='val_total_loss', patience=15, restore_best_weights=True, mode='min')],\n",
|
||||
" verbose=1\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Evaluate on held-out test set\n",
|
||||
"print(f\"\\n{'='*80}\")\n",
|
||||
"print(\"EVALUATION ON HELD-OUT TEST SET\")\n",
|
||||
"print(f\"{'='*80}\")\n",
|
||||
"\n",
|
||||
"z_mean_test, _, _ = final_encoder.predict(X_test_scaled, verbose=0)\n",
|
||||
"y_test_pred_proba = final_classifier.predict(z_mean_test, verbose=0).flatten()\n",
|
||||
"y_test_pred = (y_test_pred_proba > 0.5).astype(int)\n",
|
||||
"\n",
|
||||
"test_metrics = {\n",
|
||||
" 'accuracy': accuracy_score(y_test, y_test_pred),\n",
|
||||
" 'precision': precision_score(y_test, y_test_pred),\n",
|
||||
" 'recall': recall_score(y_test, y_test_pred),\n",
|
||||
" 'f1': f1_score(y_test, y_test_pred),\n",
|
||||
" 'roc_auc': roc_auc_score(y_test, y_test_pred_proba),\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"print(\"\\nTest Set Performance:\")\n",
|
||||
"for metric, value in test_metrics.items():\n",
|
||||
" print(f\" {metric.capitalize()}: {value:.4f}\")\n",
|
||||
"\n",
|
||||
"print(\"\\nConfusion Matrix:\")\n",
|
||||
"print(confusion_matrix(y_test, y_test_pred))\n",
|
||||
"\n",
|
||||
"print(\"\\nClassification Report:\")\n",
|
||||
"print(classification_report(y_test, y_test_pred, target_names=['Low Workload', 'High Workload']))\n",
|
||||
"\n",
|
||||
"# ============================================================================\n",
|
||||
"# 7. VISUALIZATION\n",
|
||||
"# ============================================================================\n",
|
||||
"\n",
|
||||
"# Plot training history\n",
|
||||
"fig, axes = plt.subplots(2, 2, figsize=(15, 10))\n",
|
||||
"\n",
|
||||
"axes[0, 0].plot(final_history.history['reconstruction_loss'], label='Train')\n",
|
||||
"axes[0, 0].plot(final_history.history['val_reconstruction_loss'], label='Val')\n",
|
||||
"axes[0, 0].set_title('Reconstruction Loss')\n",
|
||||
"axes[0, 0].set_xlabel('Epoch')\n",
|
||||
"axes[0, 0].set_ylabel('Loss')\n",
|
||||
"axes[0, 0].legend()\n",
|
||||
"axes[0, 0].grid(True)\n",
|
||||
"\n",
|
||||
"axes[0, 1].plot(final_history.history['kl_loss'], label='Train')\n",
|
||||
"axes[0, 1].plot(final_history.history['val_kl_loss'], label='Val')\n",
|
||||
"axes[0, 1].set_title('KL Divergence Loss')\n",
|
||||
"axes[0, 1].set_xlabel('Epoch')\n",
|
||||
"axes[0, 1].set_ylabel('Loss')\n",
|
||||
"axes[0, 1].legend()\n",
|
||||
"axes[0, 1].grid(True)\n",
|
||||
"\n",
|
||||
"axes[1, 0].plot(final_history.history['classification_loss'], label='Train')\n",
|
||||
"axes[1, 0].plot(final_history.history['val_classification_loss'], label='Val')\n",
|
||||
"axes[1, 0].set_title('Classification Loss')\n",
|
||||
"axes[1, 0].set_xlabel('Epoch')\n",
|
||||
"axes[1, 0].set_ylabel('Loss')\n",
|
||||
"axes[1, 0].legend()\n",
|
||||
"axes[1, 0].grid(True)\n",
|
||||
"\n",
|
||||
"axes[1, 1].plot(final_history.history['accuracy'], label='Train')\n",
|
||||
"axes[1, 1].plot(final_history.history['val_accuracy'], label='Val')\n",
|
||||
"axes[1, 1].set_title('Classification Accuracy')\n",
|
||||
"axes[1, 1].set_xlabel('Epoch')\n",
|
||||
"axes[1, 1].set_ylabel('Accuracy')\n",
|
||||
"axes[1, 1].legend()\n",
|
||||
"axes[1, 1].grid(True)\n",
|
||||
"\n",
|
||||
"plt.tight_layout()\n",
|
||||
"plt.show()\n",
|
||||
"\n",
|
||||
"# Visualize latent space (if 2D or 3D)\n",
|
||||
"if best_params['latent_dim'] == 2:\n",
|
||||
" z_mean_train, _, _ = final_encoder.predict(X_train_scaled, verbose=0)\n",
|
||||
" \n",
|
||||
" plt.figure(figsize=(10, 8))\n",
|
||||
" scatter = plt.scatter(z_mean_train[:, 0], z_mean_train[:, 1], \n",
|
||||
" c=y_train, cmap='RdYlBu', alpha=0.6, edgecolors='k')\n",
|
||||
" plt.colorbar(scatter, label='Workload (0=Low, 1=High)')\n",
|
||||
" plt.xlabel('Latent Dimension 1')\n",
|
||||
" plt.ylabel('Latent Dimension 2')\n",
|
||||
" plt.title('2D Latent Space Representation (Training Data)')\n",
|
||||
" plt.grid(True, alpha=0.3)\n",
|
||||
" plt.show()\n",
|
||||
" \n",
|
||||
" # Test set latent space\n",
|
||||
" plt.figure(figsize=(10, 8))\n",
|
||||
" scatter = plt.scatter(z_mean_test[:, 0], z_mean_test[:, 1], \n",
|
||||
" c=y_test, cmap='RdYlBu', alpha=0.6, edgecolors='k')\n",
|
||||
" plt.colorbar(scatter, label='Workload (0=Low, 1=High)')\n",
|
||||
" plt.xlabel('Latent Dimension 1')\n",
|
||||
" plt.ylabel('Latent Dimension 2')\n",
|
||||
" plt.title('2D Latent Space Representation (Test Data)')\n",
|
||||
" plt.grid(True, alpha=0.3)\n",
|
||||
" plt.show()\n",
|
||||
"\n",
|
||||
"print(\"\\n\" + \"=\"*80)\n",
|
||||
"print(\"TRAINING COMPLETE!\")\n",
|
||||
"print(\"=\"*80)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "79bcfc58",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"### Save Trained VAE Classifier Model\n",
|
||||
"from pathlib import Path\n",
|
||||
"from datetime import datetime\n",
|
||||
"\n",
|
||||
"# Define save path\n",
|
||||
"model_dir = Path(\"/home/jovyan/data-paulusjafahrsimulator-gpu/trained_models\")\n",
|
||||
"model_dir.mkdir(parents=True, exist_ok=True)\n",
|
||||
"\n",
|
||||
"timestamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n",
|
||||
"model_path = model_dir / f\"vae_classifier_{timestamp}.keras\"\n",
|
||||
"\n",
|
||||
"# Save the complete model\n",
|
||||
"final_vae_classifier.save(model_path)\n",
|
||||
"\n",
|
||||
"print(f\"Model saved to: {model_path}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d700e517",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "30d8d100",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"### Plot Confusion Matrix for Final Model\n",
|
||||
"from sklearn.metrics import ConfusionMatrixDisplay\n",
|
||||
"x = Path(\"/home/jovyan/data-paulusjafahrsimulator-gpu/trained_models/vae_classifier_20251210_230121.keras\")\n",
|
||||
"# Load the saved model\n",
|
||||
"print(f\"Loading model from: {x}\")\n",
|
||||
"# loaded_vae_classifier = tf.keras.models.load_model(x)\n",
|
||||
"loaded_vae_classifier = final_vae_classifier\n",
|
||||
"print(\"✓ Model loaded successfully!\")\n",
|
||||
"\n",
|
||||
"# Extract encoder and classifier from loaded model\n",
|
||||
"loaded_encoder = loaded_vae_classifier.encoder\n",
|
||||
"loaded_classifier = loaded_vae_classifier.classifier\n",
|
||||
"\n",
|
||||
"# Get predictions on test set\n",
|
||||
"z_mean_test, _, _ = loaded_encoder.predict(X_test_scaled, verbose=0)\n",
|
||||
"y_test_pred_proba = loaded_classifier.predict(z_mean_test, verbose=0).flatten()\n",
|
||||
"y_test_pred = (y_test_pred_proba > 0.5).astype(int)\n",
|
||||
"\n",
|
||||
"# Create and plot confusion matrix\n",
|
||||
"cm = confusion_matrix(y_test, y_test_pred)\n",
|
||||
"disp = ConfusionMatrixDisplay(confusion_matrix=cm, \n",
|
||||
" display_labels=['Low Workload', 'High Workload'])\n",
|
||||
"\n",
|
||||
"fig, ax = plt.subplots(figsize=(8, 6))\n",
|
||||
"disp.plot(ax=ax, cmap='Blues', values_format='d')\n",
|
||||
"plt.title('Confusion Matrix - Test Set (Loaded Model)')\n",
|
||||
"plt.tight_layout()\n",
|
||||
"plt.show()\n",
|
||||
"\n",
|
||||
"# Print metrics\n",
|
||||
"print(f\"\\nTest Set Performance (Loaded Model):\")\n",
|
||||
"print(f\" Accuracy: {accuracy_score(y_test, y_test_pred):.4f}\")\n",
|
||||
"print(f\" Precision: {precision_score(y_test, y_test_pred):.4f}\")\n",
|
||||
"print(f\" Recall: {recall_score(y_test, y_test_pred):.4f}\")\n",
|
||||
"print(f\" F1 Score: {f1_score(y_test, y_test_pred):.4f}\")\n",
|
||||
"print(f\" ROC-AUC: {roc_auc_score(y_test, y_test_pred_proba):.4f}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e826a998",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"TO DO\n",
|
||||
" * autoencoder langsam anfangen mit 19 schichten\n",
|
||||
" * dann AE und SVM mit hybridem training wie bei claude?!\n",
|
||||
" * dataset aus eyetracking verwenden?"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
||||
@ -1,5 +1,7 @@
|
||||
from sklearn.preprocessing import MinMaxScaler, StandardScaler
|
||||
import pandas as pd
|
||||
import pickle
|
||||
from sklearn.preprocessing import StandardScaler, MinMaxScaler
|
||||
import numpy as np
|
||||
import os
|
||||
|
||||
def fit_normalizer(train_data, au_columns, method='standard', scope='global'):
|
||||
"""
|
||||
@ -19,9 +21,8 @@ def fit_normalizer(train_data, au_columns, method='standard', scope='global'):
|
||||
Returns:
|
||||
--------
|
||||
dict
|
||||
Dictionary containing fitted scalers
|
||||
Dictionary containing fitted scalers and statistics for new subjects
|
||||
"""
|
||||
# Select scaler based on method
|
||||
if method == 'standard':
|
||||
Scaler = StandardScaler
|
||||
elif method == 'minmax':
|
||||
@ -30,19 +31,54 @@ def fit_normalizer(train_data, au_columns, method='standard', scope='global'):
|
||||
raise ValueError("method must be 'standard' or 'minmax'")
|
||||
|
||||
scalers = {}
|
||||
|
||||
if scope == 'subject':
|
||||
# Fit one scaler per subject
|
||||
subject_stats = []
|
||||
|
||||
for subject in train_data['subjectID'].unique():
|
||||
subject_mask = train_data['subjectID'] == subject
|
||||
scaler = Scaler()
|
||||
scaler.fit(train_data.loc[subject_mask, au_columns])
|
||||
scaler.fit(train_data.loc[subject_mask, au_columns].values)
|
||||
scalers[subject] = scaler
|
||||
|
||||
# Store statistics for averaging
|
||||
if method == 'standard':
|
||||
subject_stats.append({
|
||||
'mean': scaler.mean_,
|
||||
'std': scaler.scale_
|
||||
})
|
||||
elif method == 'minmax':
|
||||
subject_stats.append({
|
||||
'min': scaler.data_min_,
|
||||
'max': scaler.data_max_
|
||||
})
|
||||
|
||||
# Calculate average statistics for new subjects
|
||||
if method == 'standard':
|
||||
avg_mean = np.mean([s['mean'] for s in subject_stats], axis=0)
|
||||
avg_std = np.mean([s['std'] for s in subject_stats], axis=0)
|
||||
fallback_scaler = StandardScaler()
|
||||
fallback_scaler.mean_ = avg_mean
|
||||
fallback_scaler.scale_ = avg_std
|
||||
fallback_scaler.var_ = avg_std ** 2
|
||||
fallback_scaler.n_features_in_ = len(au_columns)
|
||||
elif method == 'minmax':
|
||||
avg_min = np.mean([s['min'] for s in subject_stats], axis=0)
|
||||
avg_max = np.mean([s['max'] for s in subject_stats], axis=0)
|
||||
fallback_scaler = MinMaxScaler()
|
||||
fallback_scaler.data_min_ = avg_min
|
||||
fallback_scaler.data_max_ = avg_max
|
||||
fallback_scaler.data_range_ = avg_max - avg_min
|
||||
fallback_scaler.scale_ = 1.0 / fallback_scaler.data_range_
|
||||
fallback_scaler.min_ = -avg_min * fallback_scaler.scale_
|
||||
fallback_scaler.n_features_in_ = len(au_columns)
|
||||
|
||||
scalers['_fallback'] = fallback_scaler
|
||||
|
||||
elif scope == 'global':
|
||||
# Fit one scaler for all subjects
|
||||
scaler = Scaler()
|
||||
scaler.fit(train_data[au_columns])
|
||||
scaler.fit(train_data[au_columns].values)
|
||||
scalers['global'] = scaler
|
||||
|
||||
else:
|
||||
@ -50,7 +86,7 @@ def fit_normalizer(train_data, au_columns, method='standard', scope='global'):
|
||||
|
||||
return {'scalers': scalers, 'method': method, 'scope': scope}
|
||||
|
||||
def apply_normalizer(data, au_columns, normalizer_dict):
|
||||
def apply_normalizer(data, columns, normalizer_dict):
|
||||
"""
|
||||
Apply fitted normalization scalers to data.
|
||||
|
||||
@ -71,28 +107,70 @@ def apply_normalizer(data, au_columns, normalizer_dict):
|
||||
normalized_data = data.copy()
|
||||
scalers = normalizer_dict['scalers']
|
||||
scope = normalizer_dict['scope']
|
||||
normalized_data[columns] = normalized_data[columns].astype(np.float64)
|
||||
|
||||
if scope == 'subject':
|
||||
# Apply per-subject normalization
|
||||
for subject in data['subjectID'].unique():
|
||||
subject_mask = data['subjectID'] == subject
|
||||
|
||||
# Use the subject's scaler if available, otherwise use a fitted scaler from training
|
||||
# Use the subject's scaler if available, otherwise use fallback
|
||||
if subject in scalers:
|
||||
scaler = scalers[subject]
|
||||
else:
|
||||
# For new subjects not seen in training, use the first available scaler
|
||||
# (This is a fallback - ideally all test subjects should be in training for subject-level normalization)
|
||||
print(f"Warning: Subject {subject} not found in training data. Using fallback scaler.")
|
||||
scaler = list(scalers.values())[0]
|
||||
# Use averaged scaler for new subjects
|
||||
scaler = scalers['_fallback']
|
||||
print(f"Info: Subject {subject} not in training data. Using averaged scaler from training subjects.")
|
||||
|
||||
normalized_data.loc[subject_mask, au_columns] = scaler.transform(
|
||||
data.loc[subject_mask, au_columns]
|
||||
normalized_data.loc[subject_mask, columns] = scaler.transform(
|
||||
data.loc[subject_mask, columns].values
|
||||
)
|
||||
|
||||
elif scope == 'global':
|
||||
# Apply global normalization
|
||||
scaler = scalers['global']
|
||||
normalized_data[au_columns] = scaler.transform(data[au_columns])
|
||||
normalized_data[columns] = scaler.transform(data[columns].values)
|
||||
|
||||
return normalized_data
|
||||
|
||||
|
||||
|
||||
def save_normalizer(normalizer_dict, filepath):
|
||||
"""
|
||||
Save fitted normalizer to disk.
|
||||
|
||||
Parameters:
|
||||
-----------
|
||||
normalizer_dict : dict
|
||||
Dictionary containing fitted scalers from fit_normalizer()
|
||||
filepath : str
|
||||
Path to save the normalizer (e.g., 'normalizer.pkl')
|
||||
"""
|
||||
# Create directory if it does not exist
|
||||
dirpath = os.path.dirname(filepath)
|
||||
if dirpath:
|
||||
os.makedirs(dirpath, exist_ok=True)
|
||||
|
||||
with open(filepath, 'wb') as f:
|
||||
pickle.dump(normalizer_dict, f)
|
||||
|
||||
print(f"Normalizer saved to {filepath}")
|
||||
|
||||
def load_normalizer(filepath):
|
||||
"""
|
||||
Load fitted normalizer from disk.
|
||||
|
||||
Parameters:
|
||||
-----------
|
||||
filepath : str
|
||||
Path to the saved normalizer file
|
||||
|
||||
Returns:
|
||||
--------
|
||||
dict
|
||||
Dictionary containing fitted scalers
|
||||
"""
|
||||
with open(filepath, 'rb') as f:
|
||||
normalizer_dict = pickle.load(f)
|
||||
print(f"Normalizer loaded from {filepath}")
|
||||
return normalizer_dict
|
||||
@ -1,11 +0,0 @@
|
||||
# from tools import db_helpers
|
||||
import sys
|
||||
|
||||
|
||||
def main():
|
||||
print(sys.version)
|
||||
# db_helpers.add_columns_to_table()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@ -1,9 +0,0 @@
|
||||
import sqlite3
|
||||
|
||||
def main():
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@ -1,211 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0d70a13f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import sys\n",
|
||||
"sys.path.append('/home/edgekit/MSY_FS/fahrsimulator_msy2526_ai/tools')\n",
|
||||
"import pandas as pd\n",
|
||||
"from pathlib import Path\n",
|
||||
"import db_helpers"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ce696366",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"database_path = Path(r\"/home/edgekit/MSY_FS/databases/rawdata.sqlite\")\n",
|
||||
"parquet_path = Path(r\"/home/edgekit/MSY_FS/fahrsimulator_msy2526_ai/files_for_testing/both_mod_0000.parquet\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b1aa9398",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"dataset = pd.read_parquet(parquet_path)\n",
|
||||
"dataset.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b183746e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"dataset.dtypes"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "24ed769d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"con, cursor = db_helpers.connect_db(database_path)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e604ed30",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df_clean = dataset.drop(columns=['subjectID','rowID', 'STUDY', 'LEVEL', 'PHASE'])\n",
|
||||
"df_first_100 = df_clean.head(200)\n",
|
||||
"df_first_100 = df_first_100.reset_index(drop=True)\n",
|
||||
"df_first_100.insert(0, '_Id', df_first_100.index + 1)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e77a812e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def pandas_to_sqlite_dtype(dtype):\n",
|
||||
" if pd.api.types.is_integer_dtype(dtype):\n",
|
||||
" return \"INTEGER\"\n",
|
||||
" if pd.api.types.is_float_dtype(dtype):\n",
|
||||
" return \"REAL\"\n",
|
||||
" if pd.api.types.is_bool_dtype(dtype):\n",
|
||||
" return \"INTEGER\"\n",
|
||||
" if pd.api.types.is_datetime64_any_dtype(dtype):\n",
|
||||
" return \"TEXT\"\n",
|
||||
" return \"TEXT\"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0e8897b2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"columns = {\n",
|
||||
" col: pandas_to_sqlite_dtype(dtype)\n",
|
||||
" for col, dtype in df_first_100.dtypes.items()\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"constraints = {\n",
|
||||
" \"_Id\": [\"NOT NULL\"]\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"primary_key = {\n",
|
||||
" \"pk_df_first_100\": [\"_Id\"]\n",
|
||||
"}\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4ab57624",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sql = db_helpers.create_table(\n",
|
||||
" conn=con,\n",
|
||||
" cursor=cursor,\n",
|
||||
" table_name=\"rawdata\",\n",
|
||||
" columns=columns,\n",
|
||||
" constraints=constraints,\n",
|
||||
" primary_key=primary_key,\n",
|
||||
" commit=True\n",
|
||||
")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "25096a7f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"columns_to_insert = {\n",
|
||||
" col: df_first_100[col].tolist()\n",
|
||||
" for col in df_first_100.columns\n",
|
||||
"}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7a5a3aa8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"db_helpers.insert_rows_into_table(\n",
|
||||
" conn=con,\n",
|
||||
" cursor=cursor,\n",
|
||||
" table_name=\"rawdata\",\n",
|
||||
" columns=columns_to_insert,\n",
|
||||
" commit=True\n",
|
||||
")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b56beae2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"a = db_helpers.get_data_from_table(conn=con, table_name='rawdata',columns_list=['*'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a4a74a9d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"a.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "da0f8737",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"db_helpers.disconnect_db(con, cursor)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "MSY_FS_env",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.12"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@ -1,43 +0,0 @@
|
||||
# Imports
|
||||
import pandas as pd
|
||||
import json
|
||||
from pathlib import Path
|
||||
import numpy as np
|
||||
|
||||
def getLastEntryFromSQLite():
|
||||
|
||||
return
|
||||
|
||||
def callModel(sample):
|
||||
prediction: np.int32 = sample # noch unklar ob jedes mal ein load oder z.B. mit Flask API
|
||||
return prediction
|
||||
|
||||
def getMessageConfig( config_file_path):
|
||||
|
||||
return dict()
|
||||
|
||||
|
||||
def buildMessage(result: np.int32, config: dict):
|
||||
# message =json...
|
||||
message = 5
|
||||
return message
|
||||
|
||||
|
||||
def sendMessage(destination, message):
|
||||
return 2
|
||||
|
||||
def main():
|
||||
config_file_path = Path("")
|
||||
config = getMessageConfig(config_file_path=config_file_path)
|
||||
|
||||
sample = getLastEntryFromSQLite()
|
||||
|
||||
prediction = callModel(sample=sample)
|
||||
|
||||
message = buildMessage(result=prediction, config=config)
|
||||
|
||||
sendMessage(config, message)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@ -1,78 +0,0 @@
|
||||
# ============================================================
|
||||
# SMALLER ENVIRONMENT - Korrigiert & Erweitert
|
||||
# Für Fahrsimulator-Projekt mit ML & IoT
|
||||
# ============================================================
|
||||
|
||||
name: smaller_env
|
||||
channels:
|
||||
- conda-forge
|
||||
- defaults
|
||||
|
||||
dependencies:
|
||||
# ====== PYTHON ======
|
||||
- python=3.8 # Kompatibel mit Jetson Nano
|
||||
|
||||
# ====== CORE DATA SCIENCE ======
|
||||
- numpy=1.19.5
|
||||
- pandas=1.3.5
|
||||
- scipy=1.7.3
|
||||
- scikit-learn=1.0.2 # sklearn ist ein Alias
|
||||
|
||||
# ====== VISUALIZATION ======
|
||||
|
||||
# ====== ML/DL SUPPORT ======
|
||||
- h5py=3.6.0
|
||||
- joblib=1.1.0
|
||||
|
||||
# ====== VIDEO PROCESSING ======
|
||||
- moviepy=1.0.3
|
||||
|
||||
# ====== MACHINE LEARNING ======
|
||||
- xgboost=1.5.2
|
||||
|
||||
# ====== FILE FORMATS ======
|
||||
- pyyaml # yaml Modul
|
||||
|
||||
# ====== IoT & COMMUNICATION (NEU) ======
|
||||
- paho-mqtt=1.6.1 # MQTT Client
|
||||
|
||||
# ====== DATABASE (NEU) ======
|
||||
# sqlite3 ist bereits in Python eingebaut!
|
||||
|
||||
# ====== UTILITIES ======
|
||||
- tqdm=4.64.1 # Progress bars
|
||||
- requests=2.28.1 # HTTP requests
|
||||
|
||||
# ====== PIP PACKAGES ======
|
||||
- pip
|
||||
- pip:
|
||||
# TensorFlow (wird separat für Jetson installiert)
|
||||
# - tensorflow==2.7.0 # Jetson: via NVIDIA repo installieren
|
||||
|
||||
# Eye-tracking Analysis
|
||||
- pygazeanalyser==0.2.0
|
||||
|
||||
|
||||
# ML Detection (falls vorhanden auf PyPI)
|
||||
# - detectors # Prüfen ob verfügbar
|
||||
# - feat # Prüfen ob verfügbar
|
||||
|
||||
# MQTT zusätzlich via pip falls conda Version Probleme macht
|
||||
# - paho-mqtt==1.6.1
|
||||
|
||||
# ============================================================
|
||||
# HINWEISE:
|
||||
# ============================================================
|
||||
|
||||
#
|
||||
# 3. TENSORFLOW FÜR JETSON:
|
||||
# Installiere nach Environment-Erstellung separat:
|
||||
# pip3 install --extra-index-url https://developer.download.nvidia.com/compute/redist/jp/v46 tensorflow==2.7.0+nv22.1
|
||||
#
|
||||
# 4. SQLITE3:
|
||||
# Ist bereits in Python eingebaut, keine Installation nötig!
|
||||
# Import: import sqlite3
|
||||
#
|
||||
# 5. MQTT:
|
||||
# paho-mqtt ist der Standard MQTT-Client für Python
|
||||
# Broker-Empfehlungen: Mosquitto, HiveMQ, EMQX
|
||||
@ -1,166 +0,0 @@
|
||||
import os
|
||||
import sqlite3
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def connect_db(path_to_file: os.PathLike) -> tuple[sqlite3.Connection, sqlite3.Cursor]:
|
||||
''' Establishes a connection with a sqlite3 database. '''
|
||||
conn = sqlite3.connect(path_to_file)
|
||||
cursor = conn.cursor()
|
||||
return conn, cursor
|
||||
|
||||
def disconnect_db(conn: sqlite3.Connection, cursor: sqlite3.Cursor, commit: bool = True) -> None:
|
||||
''' Commits all remaining changes and closes the connection with an sqlite3 database. '''
|
||||
cursor.close()
|
||||
if commit: conn.commit() # commit all pending changes made to the sqlite3 database before closing
|
||||
conn.close()
|
||||
|
||||
def create_table(
|
||||
conn: sqlite3.Connection,
|
||||
cursor: sqlite3.Cursor,
|
||||
table_name: str,
|
||||
columns: dict,
|
||||
constraints: dict,
|
||||
primary_key: dict,
|
||||
commit: bool = True
|
||||
) -> str:
|
||||
'''
|
||||
Creates a new empty table with the given columns, constraints and primary key.
|
||||
|
||||
:param columns: dict with column names (=keys) and dtypes (=values) (e.g. BIGINT, INT, ...)
|
||||
:param constraints: dict with column names (=keys) and list of constraints (=values) (like [\'NOT NULL\'(,...)])
|
||||
:param primary_key: dict with primary key name (=key) and list of attributes which combined define the table's primary key (=values, like [\'att1\'(,...)])
|
||||
'''
|
||||
assert len(primary_key.keys()) == 1
|
||||
sql = f'CREATE TABLE {table_name} (\n '
|
||||
for column,dtype in columns.items():
|
||||
sql += f'{column} {dtype}{" "+" ".join(constraints[column]) if column in constraints.keys() else ""},\n '
|
||||
if list(primary_key.keys())[0]: sql += f'CONSTRAINT {list(primary_key.keys())[0]} '
|
||||
sql += f'PRIMARY KEY ({", ".join(list(primary_key.values())[0])})\n)'
|
||||
cursor.execute(sql)
|
||||
if commit: conn.commit()
|
||||
return sql
|
||||
|
||||
def add_columns_to_table(
|
||||
conn: sqlite3.Connection,
|
||||
cursor: sqlite3.Cursor,
|
||||
table_name: str,
|
||||
columns: dict,
|
||||
constraints: dict = dict(),
|
||||
commit: bool = True
|
||||
) -> str:
|
||||
''' Adds one/multiple columns (each with a list of constraints) to the given table. '''
|
||||
sql_total = ''
|
||||
for column,dtype in columns.items(): # sqlite can only add one column per query
|
||||
sql = f'ALTER TABLE {table_name}\n '
|
||||
sql += f'ADD "{column}" {dtype}{" "+" ".join(constraints[column]) if column in constraints.keys() else ""}'
|
||||
sql_total += sql + '\n'
|
||||
cursor.execute(sql)
|
||||
if commit: conn.commit()
|
||||
return sql_total
|
||||
|
||||
|
||||
|
||||
|
||||
def insert_rows_into_table(
|
||||
conn: sqlite3.Connection,
|
||||
cursor: sqlite3.Cursor,
|
||||
table_name: str,
|
||||
columns: dict,
|
||||
commit: bool = True
|
||||
) -> str:
|
||||
'''
|
||||
Inserts values as multiple rows into the given table.
|
||||
|
||||
:param columns: dict with column names (=keys) and values to insert as lists with at least one element (=values)
|
||||
|
||||
Note: The number of given values per attribute must match the number of rows to insert!
|
||||
Note: The values for the rows must be of normal python types (e.g. list, str, int, ...) instead of e.g. numpy arrays!
|
||||
'''
|
||||
assert len(set(map(len, columns.values()))) == 1, 'ERROR: Provide equal number of values for each column!'
|
||||
assert len(set(list(map(type,columns.values())))) == 1 and isinstance(list(columns.values())[0], list), 'ERROR: Provide values as Python lists!'
|
||||
assert set([type(a) for b in list(columns.values()) for a in b]).issubset({str,int,float,bool}), 'ERROR: Provide values as basic Python data types!'
|
||||
|
||||
values = list(zip(*columns.values()))
|
||||
sql = f'INSERT INTO {table_name} ({", ".join(columns.keys())})\n VALUES ({("?,"*len(values[0]))[:-1]})'
|
||||
cursor.executemany(sql, values)
|
||||
if commit: conn.commit()
|
||||
return sql
|
||||
|
||||
def update_multiple_rows_in_table(
|
||||
conn: sqlite3.Connection,
|
||||
cursor: sqlite3.Cursor,
|
||||
table_name: str,
|
||||
new_vals: dict,
|
||||
conditions: str,
|
||||
commit: bool = True
|
||||
) -> str:
|
||||
'''
|
||||
Updates attribute values of some rows in the given table.
|
||||
|
||||
:param new_vals: dict with column names (=keys) and the new values to set (=values)
|
||||
:param conditions: string which defines all concatenated conditions (e.g. \'cond1 AND (cond2 OR cond3)\' with cond1: att1=5, ...)
|
||||
'''
|
||||
assignments = ', '.join([f'{k}={v}' for k,v in zip(new_vals.keys(), new_vals.values())])
|
||||
sql = f'UPDATE {table_name}\n SET {assignments}\n WHERE {conditions}'
|
||||
cursor.execute(sql)
|
||||
if commit: conn.commit()
|
||||
return sql
|
||||
|
||||
def delete_rows_from_table(
|
||||
conn: sqlite3.Connection,
|
||||
cursor: sqlite3.Cursor,
|
||||
table_name: str,
|
||||
conditions: str,
|
||||
commit: bool = True
|
||||
) -> str:
|
||||
'''
|
||||
Deletes rows from the given table.
|
||||
|
||||
:param conditions: string which defines all concatenated conditions (e.g. \'cond1 AND (cond2 OR cond3)\' with cond1: att1=5, ...)
|
||||
'''
|
||||
sql = f'DELETE FROM {table_name} WHERE {conditions}'
|
||||
cursor.execute(sql)
|
||||
if commit: conn.commit()
|
||||
return sql
|
||||
|
||||
|
||||
|
||||
def get_data_from_table(
|
||||
conn: sqlite3.Connection,
|
||||
table_name: str,
|
||||
columns_list: list = ['*'],
|
||||
aggregations: [None,dict] = None,
|
||||
where_conditions: [None,str] = None,
|
||||
order_by: [None, dict] = None,
|
||||
limit: [None, int] = None,
|
||||
offset: [None, int] = None
|
||||
) -> pd.DataFrame:
|
||||
'''
|
||||
Helper function which returns (if desired: aggregated) contents from the given table as a pandas DataFrame. The rows can be filtered by providing the condition as a string.
|
||||
|
||||
:param columns_list: use if no aggregation is needed to select which columns to get from the table
|
||||
:param (optional) aggregations: use to apply aggregations on the data from the table; dictionary with column(s) as key(s) and aggregation(s) as corresponding value(s) (e.g. {'col1': 'MIN', 'col2': 'AVG', ...} or {'*': 'COUNT'})
|
||||
:param (optional) where_conditions: string which defines all concatenated conditions (e.g. \'cond1 AND (cond2 OR cond3)\' with cond1: att1=5, ...) applied on table.
|
||||
:param (optional) order_by: dict defining the ordering of the outputs with column(s) as key(s) and ordering as corresponding value(s) (e.g. {'col1': 'ASC'})
|
||||
:param (optional) limit: use to limit the number of returned rows
|
||||
:param (optional) offset: use to skip the first n rows before displaying
|
||||
|
||||
Note: If aggregations is set, the columns_list is ignored.
|
||||
Note: Get all data as a DataFrame with get_data_from_table(conn, table_name).
|
||||
Note: If one output is wanted (e.g. count(*) or similar), get it with get_data_from_table(...).iloc[0,0] from the DataFrame.
|
||||
'''
|
||||
assert columns_list or aggregations
|
||||
|
||||
if aggregations:
|
||||
selection = [f'{agg}({col})' for col,agg in aggregations.items()]
|
||||
else:
|
||||
selection = columns_list
|
||||
selection = ", ".join(selection)
|
||||
where_conditions = 'WHERE ' + where_conditions if where_conditions else ''
|
||||
order_by = 'ORDER BY ' + ', '.join([f'{k} {v}' for k,v in order_by.items()]) if order_by else ''
|
||||
limit = f'LIMIT {limit}' if limit else ''
|
||||
offset = f'OFFSET {offset}' if offset else ''
|
||||
|
||||
sql = f'SELECT {selection} FROM {table_name} {where_conditions} {order_by} {limit} {offset}'
|
||||
return pd.read_sql_query(sql, conn)
|
||||
Loading…
x
Reference in New Issue
Block a user