Compare commits
No commits in common. "de0084dc09acbed3c3a2ecfe8e284bcb3026d844" and "3d8c7c6639610cfb40de88fc2694abd248dd85c0" have entirely different histories.
de0084dc09
...
3d8c7c6639
@ -1,13 +1,5 @@
|
|||||||
{
|
{
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "cc08936c",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"## Insights into the dataset with histograms and scatter plots"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "1014c5e0",
|
"id": "1014c5e0",
|
||||||
@ -25,8 +17,7 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"import pandas as pd\n",
|
"import pandas as pd\n",
|
||||||
"import numpy as np\n",
|
"import numpy as np\n",
|
||||||
"import matplotlib.pyplot as plt\n",
|
"import matplotlib.pyplot as plt"
|
||||||
"from pathlib import Path"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -36,7 +27,7 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"path = Path(r\"/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/50s_25Hz_dataset.parquet\")\n",
|
"path =r\"C:\\Users\\micha\\FAUbox\\WS2526_Fahrsimulator_MSY (Celina Korzer)\\AU_dataset\\output_windowed.parquet\"\n",
|
||||||
"df = pd.read_parquet(path=path)"
|
"df = pd.read_parquet(path=path)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@ -113,27 +104,21 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"face_au_cols = [c for c in low_all.columns if c.startswith(\"FACE_AU\")]\n",
|
"# Get all columns that start with 'AU'\n",
|
||||||
"eye_cols = ['Fix_count_short_66_150', 'Fix_count_medium_300_500',\n",
|
"au_columns = [col for col in low_all.columns if col.startswith('AU')]\n",
|
||||||
" 'Fix_count_long_gt_1000', 'Fix_count_100', 'Fix_mean_duration',\n",
|
|
||||||
" 'Fix_median_duration', 'Sac_count', 'Sac_mean_amp', 'Sac_mean_dur',\n",
|
|
||||||
" 'Sac_median_dur', 'Blink_count', 'Blink_mean_dur', 'Blink_median_dur',\n",
|
|
||||||
" 'Pupil_mean', 'Pupil_IPA']\n",
|
|
||||||
"\n",
|
|
||||||
"cols = face_au_cols+eye_cols\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
"# Calculate number of rows and columns for subplots\n",
|
"# Calculate number of rows and columns for subplots\n",
|
||||||
"n_cols = len(cols)\n",
|
"n_cols = len(au_columns)\n",
|
||||||
"n_rows = 7\n",
|
"n_rows = 4\n",
|
||||||
"n_cols_subplot = 5\n",
|
"n_cols_subplot = 5\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Create figure with subplots\n",
|
"# Create figure with subplots\n",
|
||||||
"fig, axes = plt.subplots(n_rows, n_cols_subplot, figsize=(20, 16))\n",
|
"fig, axes = plt.subplots(n_rows, n_cols_subplot, figsize=(20, 16))\n",
|
||||||
"axes = axes.flatten()\n",
|
"axes = axes.flatten()\n",
|
||||||
"fig.suptitle('Feature Distributions: Low vs High', fontsize=20, fontweight='bold', y=0.995)\n",
|
"fig.suptitle('Action Unit (AU) Distributions: Low vs High', fontsize=20, fontweight='bold', y=0.995)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Create histogram for each feature column\n",
|
"# Create histogram for each AU column\n",
|
||||||
"for idx, col in enumerate(cols):\n",
|
"for idx, col in enumerate(au_columns):\n",
|
||||||
" ax = axes[idx]\n",
|
" ax = axes[idx]\n",
|
||||||
" \n",
|
" \n",
|
||||||
" # Plot overlapping histograms\n",
|
" # Plot overlapping histograms\n",
|
||||||
@ -148,48 +133,18 @@
|
|||||||
" ax.grid(True, alpha=0.3)\n",
|
" ax.grid(True, alpha=0.3)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Hide any unused subplots\n",
|
"# Hide any unused subplots\n",
|
||||||
"for idx in range(len(cols), len(axes)):\n",
|
"for idx in range(len(au_columns), len(axes)):\n",
|
||||||
" axes[idx].set_visible(False)\n",
|
" axes[idx].set_visible(False)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Adjust layout\n",
|
"# Adjust layout\n",
|
||||||
"plt.tight_layout()\n",
|
"plt.tight_layout()\n",
|
||||||
"plt.show()"
|
"plt.show()"
|
||||||
]
|
]
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "6cd53cdb",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Create figure with subplots\n",
|
|
||||||
"fig, axes = plt.subplots(n_rows, n_cols_subplot, figsize=(20, 16))\n",
|
|
||||||
"axes = axes.flatten()\n",
|
|
||||||
"fig.suptitle('Feature Scatter: Low vs High', fontsize=20, fontweight='bold', y=0.995)\n",
|
|
||||||
"\n",
|
|
||||||
"for idx, col in enumerate(cols):\n",
|
|
||||||
" ax = axes[idx]\n",
|
|
||||||
"\n",
|
|
||||||
" # Scatterplots\n",
|
|
||||||
" ax.scatter(range(len(low_all[col])), low_all[col], alpha=0.6, color='blue', label='low_all', s=10)\n",
|
|
||||||
" ax.scatter(range(len(high_all[col])), high_all[col], alpha=0.6, color='red', label='high_all', s=10)\n",
|
|
||||||
"\n",
|
|
||||||
" ax.set_title(col, fontsize=10, fontweight='bold')\n",
|
|
||||||
" ax.set_xlabel('Sample index', fontsize=8)\n",
|
|
||||||
" ax.set_ylabel('Value', fontsize=8)\n",
|
|
||||||
" ax.legend(fontsize=8)\n",
|
|
||||||
" ax.grid(True, alpha=0.3)\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"plt.tight_layout()\n",
|
|
||||||
"plt.show()"
|
|
||||||
]
|
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"kernelspec": {
|
"kernelspec": {
|
||||||
"display_name": "Python 3 (ipykernel)",
|
"display_name": "base",
|
||||||
"language": "python",
|
"language": "python",
|
||||||
"name": "python3"
|
"name": "python3"
|
||||||
},
|
},
|
||||||
@ -203,7 +158,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.12.10"
|
"version": "3.11.5"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
|||||||
@ -1,10 +1,8 @@
|
|||||||
import os
|
import os
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
# TODO: Set paths correctly
|
|
||||||
data_dir = Path("") # path to the directory with all .h5 files
|
|
||||||
base_dir = Path(r"") # directory to store the parquet files in
|
|
||||||
|
|
||||||
|
data_dir = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/raw_data")
|
||||||
|
|
||||||
# Get all .h5 files and sort them
|
# Get all .h5 files and sort them
|
||||||
matching_files = sorted(data_dir.glob("*.h5"))
|
matching_files = sorted(data_dir.glob("*.h5"))
|
||||||
@ -13,8 +11,8 @@ matching_files = sorted(data_dir.glob("*.h5"))
|
|||||||
CHUNK_SIZE = 50_000
|
CHUNK_SIZE = 50_000
|
||||||
|
|
||||||
for i, file_path in enumerate(matching_files):
|
for i, file_path in enumerate(matching_files):
|
||||||
print(f"Starting with subject {i}")
|
print(f"Subject {i} gestartet")
|
||||||
print(f"Opened: {file_path}")
|
print(f"{file_path} geoeffnet")
|
||||||
|
|
||||||
# Step 1: Get total number of rows and column names
|
# Step 1: Get total number of rows and column names
|
||||||
with pd.HDFStore(file_path, mode="r") as store:
|
with pd.HDFStore(file_path, mode="r") as store:
|
||||||
@ -83,7 +81,7 @@ for i, file_path in enumerate(matching_files):
|
|||||||
print(f"Final dataframe shape: {df_final.shape}")
|
print(f"Final dataframe shape: {df_final.shape}")
|
||||||
|
|
||||||
# Save to parquet
|
# Save to parquet
|
||||||
|
base_dir = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/both_mod_parquet_files")
|
||||||
os.makedirs(base_dir, exist_ok=True)
|
os.makedirs(base_dir, exist_ok=True)
|
||||||
|
|
||||||
out_name = base_dir / f"both_mod_{i:04d}.parquet"
|
out_name = base_dir / f"both_mod_{i:04d}.parquet"
|
||||||
@ -0,0 +1,91 @@
|
|||||||
|
import os
|
||||||
|
import pandas as pd
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
print(os.getcwd())
|
||||||
|
num_files = 2 # number of files to process (min: 1, max: 30)
|
||||||
|
|
||||||
|
print("connection aufgebaut")
|
||||||
|
|
||||||
|
data_dir = Path("/home/jovyan/Fahrsimulator_MSY2526_AI/EDA")
|
||||||
|
# os.chdir(data_dir)
|
||||||
|
# Get all .h5 files and sort them
|
||||||
|
matching_files = sorted(data_dir.glob("*.h5"))
|
||||||
|
|
||||||
|
# Chunk size for reading (adjust based on your RAM - 100k rows is ~50-100MB depending on columns)
|
||||||
|
CHUNK_SIZE = 100_000
|
||||||
|
|
||||||
|
for i, file_path in enumerate(matching_files):
|
||||||
|
print(f"Subject {i} gestartet")
|
||||||
|
print(f"{file_path} geoeffnet")
|
||||||
|
|
||||||
|
# Step 1: Get total number of rows and column names
|
||||||
|
with pd.HDFStore(file_path, mode="r") as store:
|
||||||
|
cols = store.select("SIGNALS", start=0, stop=1).columns
|
||||||
|
nrows = store.get_storer("SIGNALS").nrows
|
||||||
|
print(f"Total columns: {len(cols)}, Total rows: {nrows}")
|
||||||
|
|
||||||
|
# Step 2: Filter columns that start with "EYE_"
|
||||||
|
eye_cols = [c for c in cols if c.startswith("EYE_")]
|
||||||
|
print(f"eye-tracking columns found: {eye_cols}")
|
||||||
|
|
||||||
|
if len(eye_cols) == 0:
|
||||||
|
print(f"keine eye-tracking-Signale in Subject {i}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Columns to read
|
||||||
|
columns_to_read = ["STUDY", "LEVEL", "PHASE"] + eye_cols
|
||||||
|
|
||||||
|
# Step 3: Process file in chunks
|
||||||
|
chunks_to_save = []
|
||||||
|
|
||||||
|
for start_row in range(0, nrows, CHUNK_SIZE):
|
||||||
|
stop_row = min(start_row + CHUNK_SIZE, nrows)
|
||||||
|
print(f"Processing rows {start_row} to {stop_row} ({stop_row/nrows*100:.1f}%)")
|
||||||
|
|
||||||
|
# Read chunk
|
||||||
|
df_chunk = pd.read_hdf(
|
||||||
|
file_path,
|
||||||
|
key="SIGNALS",
|
||||||
|
columns=columns_to_read,
|
||||||
|
start=start_row,
|
||||||
|
stop=stop_row
|
||||||
|
)
|
||||||
|
|
||||||
|
# Add metadata columns
|
||||||
|
df_chunk["subjectID"] = i
|
||||||
|
df_chunk["rowID"] = range(start_row, stop_row)
|
||||||
|
|
||||||
|
# Clean data
|
||||||
|
df_chunk = df_chunk[df_chunk["LEVEL"] != 0]
|
||||||
|
df_chunk = df_chunk.dropna()
|
||||||
|
|
||||||
|
# Only keep non-empty chunks
|
||||||
|
if len(df_chunk) > 0:
|
||||||
|
chunks_to_save.append(df_chunk)
|
||||||
|
|
||||||
|
# Free memory
|
||||||
|
del df_chunk
|
||||||
|
|
||||||
|
print("load and cleaning done")
|
||||||
|
|
||||||
|
# Step 4: Combine all chunks and save
|
||||||
|
if chunks_to_save:
|
||||||
|
df_final = pd.concat(chunks_to_save, ignore_index=True)
|
||||||
|
print(f"Final dataframe shape: {df_final.shape}")
|
||||||
|
|
||||||
|
# Save to parquet
|
||||||
|
base_dir = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/new_ET_Parquet_files")
|
||||||
|
os.makedirs(base_dir, exist_ok=True)
|
||||||
|
|
||||||
|
out_name = base_dir / f"ET_signals_extracted_{i:04d}.parquet"
|
||||||
|
df_final.to_parquet(out_name, index=False)
|
||||||
|
print(f"Saved to {out_name}")
|
||||||
|
|
||||||
|
# Free memory
|
||||||
|
del df_final
|
||||||
|
del chunks_to_save
|
||||||
|
else:
|
||||||
|
print(f"No valid data found for Subject {i}")
|
||||||
|
|
||||||
|
print("All files processed!")
|
||||||
91
dataset_creation/chunkwise_parquet_file_creation_FACE_AU.py
Normal file
91
dataset_creation/chunkwise_parquet_file_creation_FACE_AU.py
Normal file
@ -0,0 +1,91 @@
|
|||||||
|
import os
|
||||||
|
import pandas as pd
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
print(os.getcwd())
|
||||||
|
num_files = 2 # number of files to process (min: 1, max: 30)
|
||||||
|
|
||||||
|
print("connection aufgebaut")
|
||||||
|
|
||||||
|
data_dir = Path(r"C:\Users\x\repo\UXKI\Fahrsimulator_MSY2526_AI\newTmp")
|
||||||
|
|
||||||
|
# Get all .h5 files and sort them
|
||||||
|
matching_files = sorted(data_dir.glob("*.h5"))
|
||||||
|
|
||||||
|
# Chunk size for reading (adjust based on your RAM - 100k rows is ~50-100MB depending on columns)
|
||||||
|
CHUNK_SIZE = 100_000
|
||||||
|
|
||||||
|
for i, file_path in enumerate(matching_files):
|
||||||
|
print(f"Subject {i} gestartet")
|
||||||
|
print(f"{file_path} geoeffnet")
|
||||||
|
|
||||||
|
# Step 1: Get total number of rows and column names
|
||||||
|
with pd.HDFStore(file_path, mode="r") as store:
|
||||||
|
cols = store.select("SIGNALS", start=0, stop=1).columns
|
||||||
|
nrows = store.get_storer("SIGNALS").nrows
|
||||||
|
print(f"Total columns: {len(cols)}, Total rows: {nrows}")
|
||||||
|
|
||||||
|
# Step 2: Filter columns that start with "FACE_AU"
|
||||||
|
eye_cols = [c for c in cols if c.startswith("FACE_AU")]
|
||||||
|
print(f"FACE_AU columns found: {eye_cols}")
|
||||||
|
|
||||||
|
if len(eye_cols) == 0:
|
||||||
|
print(f"keine FACE_AU-Signale in Subject {i}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Columns to read
|
||||||
|
columns_to_read = ["STUDY", "LEVEL", "PHASE"] + eye_cols
|
||||||
|
|
||||||
|
# Step 3: Process file in chunks
|
||||||
|
chunks_to_save = []
|
||||||
|
|
||||||
|
for start_row in range(0, nrows, CHUNK_SIZE):
|
||||||
|
stop_row = min(start_row + CHUNK_SIZE, nrows)
|
||||||
|
print(f"Processing rows {start_row} to {stop_row} ({stop_row/nrows*100:.1f}%)")
|
||||||
|
|
||||||
|
# Read chunk
|
||||||
|
df_chunk = pd.read_hdf(
|
||||||
|
file_path,
|
||||||
|
key="SIGNALS",
|
||||||
|
columns=columns_to_read,
|
||||||
|
start=start_row,
|
||||||
|
stop=stop_row
|
||||||
|
)
|
||||||
|
|
||||||
|
# Add metadata columns
|
||||||
|
df_chunk["subjectID"] = i
|
||||||
|
df_chunk["rowID"] = range(start_row, stop_row)
|
||||||
|
|
||||||
|
# Clean data
|
||||||
|
df_chunk = df_chunk[df_chunk["LEVEL"] != 0]
|
||||||
|
df_chunk = df_chunk.dropna()
|
||||||
|
|
||||||
|
# Only keep non-empty chunks
|
||||||
|
if len(df_chunk) > 0:
|
||||||
|
chunks_to_save.append(df_chunk)
|
||||||
|
|
||||||
|
# Free memory
|
||||||
|
del df_chunk
|
||||||
|
|
||||||
|
print("load and cleaning done")
|
||||||
|
|
||||||
|
# Step 4: Combine all chunks and save
|
||||||
|
if chunks_to_save:
|
||||||
|
df_final = pd.concat(chunks_to_save, ignore_index=True)
|
||||||
|
print(f"Final dataframe shape: {df_final.shape}")
|
||||||
|
|
||||||
|
# Save to parquet
|
||||||
|
base_dir = Path(r"C:\new_AU_parquet_files")
|
||||||
|
os.makedirs(base_dir, exist_ok=True)
|
||||||
|
|
||||||
|
out_name = base_dir / f"cleaned_{i:04d}.parquet"
|
||||||
|
df_final.to_parquet(out_name, index=False)
|
||||||
|
print(f"Saved to {out_name}")
|
||||||
|
|
||||||
|
# Free memory
|
||||||
|
del df_final
|
||||||
|
del chunks_to_save
|
||||||
|
else:
|
||||||
|
print(f"No valid data found for Subject {i}")
|
||||||
|
|
||||||
|
print("All files processed!")
|
||||||
@ -4,26 +4,27 @@ import pandas as pd
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from sklearn.preprocessing import MinMaxScaler
|
from sklearn.preprocessing import MinMaxScaler
|
||||||
from scipy.signal import welch
|
from scipy.signal import welch
|
||||||
from pygazeanalyser.detectors import fixation_detection, saccade_detection # not installed by default
|
from pygazeanalyser.detectors import fixation_detection, saccade_detection
|
||||||
|
|
||||||
|
|
||||||
##############################################################################
|
##############################################################################
|
||||||
# CONFIGURATION
|
# KONFIGURATION
|
||||||
##############################################################################
|
##############################################################################
|
||||||
INPUT_DIR = Path(r"") # directory that stores the parquet files (one file per subject)
|
INPUT_DIR = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/both_mod_parquet_files")
|
||||||
OUTPUT_FILE = Path(r"") # path for resulting dataset
|
OUTPUT_FILE = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/50s_25Hz_dataset.parquet")
|
||||||
WINDOW_SIZE_SAMPLES = 25*50 # 50s at 25Hz
|
|
||||||
STEP_SIZE_SAMPLES = 125 # 5s at 25Hz
|
WINDOW_SIZE_SAMPLES = 25*50 # 50s bei 25Hz
|
||||||
|
STEP_SIZE_SAMPLES = 125 # 5s bei 25Hz
|
||||||
SAMPLING_RATE = 25 # Hz
|
SAMPLING_RATE = 25 # Hz
|
||||||
MIN_DUR_BLINKS = 2 # x * 40ms
|
MIN_DUR_BLINKS = 2 # x * 40ms
|
||||||
|
|
||||||
|
|
||||||
##############################################################################
|
##############################################################################
|
||||||
# EYE-TRACKING FUNCTIONS
|
# EYE-TRACKING FUNKTIONEN
|
||||||
##############################################################################
|
##############################################################################
|
||||||
|
|
||||||
def clean_eye_df(df):
|
def clean_eye_df(df):
|
||||||
"""Extracts Eye-Tracking columns only and removes empty rows."""
|
"""Extrahiert nur Eye-Tracking Spalten und entfernt leere Zeilen."""
|
||||||
eye_cols = [c for c in df.columns if c.startswith("EYE_")]
|
eye_cols = [c for c in df.columns if c.startswith("EYE_")]
|
||||||
|
|
||||||
if not eye_cols:
|
if not eye_cols:
|
||||||
@ -37,7 +38,7 @@ def clean_eye_df(df):
|
|||||||
|
|
||||||
|
|
||||||
def extract_gaze_signal(df):
|
def extract_gaze_signal(df):
|
||||||
"""Extracts 2D gaze positions, masks invalid samples, and interpolates."""
|
"""Extrahiert 2D-Gaze-Positionen, maskiert ungültige Samples und interpoliert."""
|
||||||
gx_L = df["EYE_LEFT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy()
|
gx_L = df["EYE_LEFT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy()
|
||||||
gy_L = df["EYE_LEFT_GAZE_POINT_ON_DISPLAY_AREA_Y"].astype(float).copy()
|
gy_L = df["EYE_LEFT_GAZE_POINT_ON_DISPLAY_AREA_Y"].astype(float).copy()
|
||||||
gx_R = df["EYE_RIGHT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy()
|
gx_R = df["EYE_RIGHT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy()
|
||||||
@ -50,14 +51,14 @@ def extract_gaze_signal(df):
|
|||||||
for arr in [gx_L, gy_L, gx_R, gy_R]:
|
for arr in [gx_L, gy_L, gx_R, gy_R]:
|
||||||
arr.replace([np.inf, -np.inf], np.nan, inplace=True)
|
arr.replace([np.inf, -np.inf], np.nan, inplace=True)
|
||||||
|
|
||||||
# Mask invalids
|
# Ungültige maskieren
|
||||||
gx_L[~val_L] = np.nan
|
gx_L[~val_L] = np.nan
|
||||||
gy_L[~val_L] = np.nan
|
gy_L[~val_L] = np.nan
|
||||||
gx_R[~val_R] = np.nan
|
gx_R[~val_R] = np.nan
|
||||||
gy_R[~val_R] = np.nan
|
gy_R[~val_R] = np.nan
|
||||||
|
|
||||||
|
|
||||||
# Mean of both eyes
|
# Mittelwert beider Augen
|
||||||
gx = np.mean(np.column_stack([gx_L, gx_R]), axis=1)
|
gx = np.mean(np.column_stack([gx_L, gx_R]), axis=1)
|
||||||
gy = np.mean(np.column_stack([gy_L, gy_R]), axis=1)
|
gy = np.mean(np.column_stack([gy_L, gy_R]), axis=1)
|
||||||
|
|
||||||
@ -65,7 +66,7 @@ def extract_gaze_signal(df):
|
|||||||
gx = pd.Series(gx).interpolate(limit=None, limit_direction="both").bfill().ffill()
|
gx = pd.Series(gx).interpolate(limit=None, limit_direction="both").bfill().ffill()
|
||||||
gy = pd.Series(gy).interpolate(limit=None, limit_direction="both").bfill().ffill()
|
gy = pd.Series(gy).interpolate(limit=None, limit_direction="both").bfill().ffill()
|
||||||
|
|
||||||
# MinMax scaling
|
# MinMax Skalierung
|
||||||
xscaler = MinMaxScaler()
|
xscaler = MinMaxScaler()
|
||||||
gxscale = xscaler.fit_transform(gx.values.reshape(-1, 1))
|
gxscale = xscaler.fit_transform(gx.values.reshape(-1, 1))
|
||||||
|
|
||||||
@ -76,7 +77,7 @@ def extract_gaze_signal(df):
|
|||||||
|
|
||||||
|
|
||||||
def extract_pupil(df):
|
def extract_pupil(df):
|
||||||
"""Extract pupil size (average of both eyes)."""
|
"""Extrahiert Pupillengröße (beide Augen gemittelt)."""
|
||||||
pl = df["EYE_LEFT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan)
|
pl = df["EYE_LEFT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan)
|
||||||
pr = df["EYE_RIGHT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan)
|
pr = df["EYE_RIGHT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan)
|
||||||
|
|
||||||
@ -95,7 +96,7 @@ def extract_pupil(df):
|
|||||||
|
|
||||||
|
|
||||||
def detect_blinks(pupil_validity, min_duration=5):
|
def detect_blinks(pupil_validity, min_duration=5):
|
||||||
"""Detect blinks: Validity=0 → Blink."""
|
"""Erkennt Blinks: Validity=0 → Blink."""
|
||||||
blinks = []
|
blinks = []
|
||||||
start = None
|
start = None
|
||||||
|
|
||||||
@ -119,13 +120,13 @@ def compute_IPA(pupil, fs=25):
|
|||||||
|
|
||||||
def extract_eye_features_window(df_eye_window, fs=25, min_dur_blinks=2):
|
def extract_eye_features_window(df_eye_window, fs=25, min_dur_blinks=2):
|
||||||
"""
|
"""
|
||||||
Extracts eye tracking features for a single window.
|
Extrahiert Eye-Tracking Features für ein einzelnes Window.
|
||||||
Returns a dictionary containing all eye features.
|
Gibt Dictionary mit allen Eye-Features zurück.
|
||||||
"""
|
"""
|
||||||
# Gaze
|
# Gaze
|
||||||
gaze = extract_gaze_signal(df_eye_window)
|
gaze = extract_gaze_signal(df_eye_window)
|
||||||
|
|
||||||
# Pupil
|
# Pupille
|
||||||
pupil, pupil_validity = extract_pupil(df_eye_window)
|
pupil, pupil_validity = extract_pupil(df_eye_window)
|
||||||
|
|
||||||
window_size = len(df_eye_window)
|
window_size = len(df_eye_window)
|
||||||
@ -142,6 +143,7 @@ def extract_eye_features_window(df_eye_window, fs=25, min_dur_blinks=2):
|
|||||||
|
|
||||||
fixation_durations = [f[2] for f in efix if np.isfinite(f[2]) and f[2] > 0]
|
fixation_durations = [f[2] for f in efix if np.isfinite(f[2]) and f[2] > 0]
|
||||||
|
|
||||||
|
# Kategorien
|
||||||
F_short = sum(66 <= d <= 150 for d in fixation_durations)
|
F_short = sum(66 <= d <= 150 for d in fixation_durations)
|
||||||
F_medium = sum(300 <= d <= 500 for d in fixation_durations)
|
F_medium = sum(300 <= d <= 500 for d in fixation_durations)
|
||||||
F_long = sum(d >= 1000 for d in fixation_durations)
|
F_long = sum(d >= 1000 for d in fixation_durations)
|
||||||
@ -195,27 +197,27 @@ def extract_eye_features_window(df_eye_window, fs=25, min_dur_blinks=2):
|
|||||||
|
|
||||||
|
|
||||||
##############################################################################
|
##############################################################################
|
||||||
# Combined feature extraction
|
# KOMBINIERTE FEATURE-EXTRAKTION
|
||||||
##############################################################################
|
##############################################################################
|
||||||
|
|
||||||
def process_combined_features(input_dir, output_file, window_size, step_size, fs=25,min_duration_blinks=2):
|
def process_combined_features(input_dir, output_file, window_size, step_size, fs=25,min_duration_blinks=2):
|
||||||
"""
|
"""
|
||||||
Processes Parquet files with FACE_AU and EYE columns.
|
Verarbeitet Parquet-Dateien mit FACE_AU und EYE Spalten.
|
||||||
Extracts both feature sets and combines them.
|
Extrahiert beide Feature-Sets und kombiniert sie.
|
||||||
"""
|
"""
|
||||||
input_path = Path(input_dir)
|
input_path = Path(input_dir)
|
||||||
parquet_files = sorted(input_path.glob("*.parquet"))
|
parquet_files = sorted(input_path.glob("*.parquet"))
|
||||||
|
|
||||||
if not parquet_files:
|
if not parquet_files:
|
||||||
print(f"Error: No parquet-files found in {input_dir}!")
|
print(f"FEHLER: Keine Parquet-Dateien in {input_dir} gefunden!")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
print(f"\n{'='*70}")
|
print(f"\n{'='*70}")
|
||||||
print(f"Combined feature-extraction")
|
print(f"KOMBINIERTE FEATURE-EXTRAKTION")
|
||||||
print(f"{'='*70}")
|
print(f"{'='*70}")
|
||||||
print(f"Files: {len(parquet_files)}")
|
print(f"Dateien: {len(parquet_files)}")
|
||||||
print(f"Window: {window_size} Samples ({window_size/fs:.1f}s at {fs}Hz)")
|
print(f"Window: {window_size} Samples ({window_size/fs:.1f}s bei {fs}Hz)")
|
||||||
print(f"Step: {step_size} Samples ({step_size/fs:.1f}s at {fs}Hz)")
|
print(f"Step: {step_size} Samples ({step_size/fs:.1f}s bei {fs}Hz)")
|
||||||
print(f"{'='*70}\n")
|
print(f"{'='*70}\n")
|
||||||
|
|
||||||
all_windows = []
|
all_windows = []
|
||||||
@ -225,22 +227,24 @@ def process_combined_features(input_dir, output_file, window_size, step_size, fs
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
df = pd.read_parquet(parquet_file)
|
df = pd.read_parquet(parquet_file)
|
||||||
print(f" Entries: {len(df)}")
|
print(f" Einträge: {len(df)}")
|
||||||
|
|
||||||
|
|
||||||
|
# Identifiziere Spalten
|
||||||
au_columns = [col for col in df.columns if col.startswith('FACE_AU')]
|
au_columns = [col for col in df.columns if col.startswith('FACE_AU')]
|
||||||
eye_columns = [col for col in df.columns if col.startswith('EYE_')]
|
eye_columns = [col for col in df.columns if col.startswith('EYE_')]
|
||||||
|
|
||||||
print(f" AU-columns: {len(au_columns)}")
|
print(f" AU-Spalten: {len(au_columns)}")
|
||||||
print(f" Eye-columns: {len(eye_columns)}")
|
print(f" Eye-Spalten: {len(eye_columns)}")
|
||||||
|
|
||||||
has_au = len(au_columns) > 0
|
has_au = len(au_columns) > 0
|
||||||
has_eye = len(eye_columns) > 0
|
has_eye = len(eye_columns) > 0
|
||||||
|
|
||||||
if not has_au and not has_eye:
|
if not has_au and not has_eye:
|
||||||
print(f" Warning: No AU or eye tracking columns found!")
|
print(f" WARNUNG: Keine AU oder Eye Spalten gefunden!")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Group by STUDY, LEVEL, PHASE
|
# Gruppiere nach STUDY, LEVEL, PHASE
|
||||||
group_cols = [col for col in ['STUDY', 'LEVEL', 'PHASE'] if col in df.columns]
|
group_cols = [col for col in ['STUDY', 'LEVEL', 'PHASE'] if col in df.columns]
|
||||||
|
|
||||||
if group_cols:
|
if group_cols:
|
||||||
@ -254,7 +258,7 @@ def process_combined_features(input_dir, output_file, window_size, step_size, fs
|
|||||||
|
|
||||||
group_df = group_df.reset_index(drop=True)
|
group_df = group_df.reset_index(drop=True)
|
||||||
|
|
||||||
# calculate number of windows
|
# Berechne Anzahl Windows
|
||||||
num_windows = (len(group_df) - window_size) // step_size + 1
|
num_windows = (len(group_df) - window_size) // step_size + 1
|
||||||
|
|
||||||
if num_windows <= 0:
|
if num_windows <= 0:
|
||||||
@ -268,7 +272,7 @@ def process_combined_features(input_dir, output_file, window_size, step_size, fs
|
|||||||
|
|
||||||
window_df = group_df.iloc[start_idx:end_idx]
|
window_df = group_df.iloc[start_idx:end_idx]
|
||||||
|
|
||||||
# basic metadata
|
# Basis-Metadaten
|
||||||
result = {
|
result = {
|
||||||
'subjectID': window_df['subjectID'].iloc[0],
|
'subjectID': window_df['subjectID'].iloc[0],
|
||||||
'start_time': window_df['rowID'].iloc[0],
|
'start_time': window_df['rowID'].iloc[0],
|
||||||
@ -277,12 +281,12 @@ def process_combined_features(input_dir, output_file, window_size, step_size, fs
|
|||||||
'PHASE': window_df['PHASE'].iloc[0] if 'PHASE' in window_df.columns else np.nan
|
'PHASE': window_df['PHASE'].iloc[0] if 'PHASE' in window_df.columns else np.nan
|
||||||
}
|
}
|
||||||
|
|
||||||
# FACE AU features
|
# FACE AU Features
|
||||||
if has_au:
|
if has_au:
|
||||||
for au_col in au_columns:
|
for au_col in au_columns:
|
||||||
result[f'{au_col}_mean'] = window_df[au_col].mean()
|
result[f'{au_col}_mean'] = window_df[au_col].mean()
|
||||||
|
|
||||||
# Eye-tracking features
|
# Eye-Tracking Features
|
||||||
if has_eye:
|
if has_eye:
|
||||||
try:
|
try:
|
||||||
# clean dataframe from all nan rows
|
# clean dataframe from all nan rows
|
||||||
@ -292,7 +296,7 @@ def process_combined_features(input_dir, output_file, window_size, step_size, fs
|
|||||||
result.update(eye_features)
|
result.update(eye_features)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f" WARNUNG: Eye-Features fehlgeschlagen: {str(e)}")
|
print(f" WARNUNG: Eye-Features fehlgeschlagen: {str(e)}")
|
||||||
# Add NaN-values for eye-features
|
# Füge NaN-Werte für Eye-Features hinzu
|
||||||
result.update({
|
result.update({
|
||||||
"Fix_count_short_66_150": np.nan,
|
"Fix_count_short_66_150": np.nan,
|
||||||
"Fix_count_medium_300_500": np.nan,
|
"Fix_count_medium_300_500": np.nan,
|
||||||
@ -321,7 +325,7 @@ def process_combined_features(input_dir, output_file, window_size, step_size, fs
|
|||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Combine all windows
|
# Kombiniere alle Windows
|
||||||
if not all_windows:
|
if not all_windows:
|
||||||
print("\nKEINE FEATURES EXTRAHIERT!")
|
print("\nKEINE FEATURES EXTRAHIERT!")
|
||||||
return None
|
return None
|
||||||
@ -336,7 +340,7 @@ def process_combined_features(input_dir, output_file, window_size, step_size, fs
|
|||||||
print(f"Spalten: {len(result_df.columns)}")
|
print(f"Spalten: {len(result_df.columns)}")
|
||||||
print(f"Subjects: {result_df['subjectID'].nunique()}")
|
print(f"Subjects: {result_df['subjectID'].nunique()}")
|
||||||
|
|
||||||
# Save
|
# Speichern
|
||||||
output_path = Path(output_file)
|
output_path = Path(output_file)
|
||||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
result_df.to_parquet(output_file, index=False)
|
result_df.to_parquet(output_file, index=False)
|
||||||
@ -353,7 +357,7 @@ def process_combined_features(input_dir, output_file, window_size, step_size, fs
|
|||||||
|
|
||||||
def main():
|
def main():
|
||||||
print("\n" + "="*70)
|
print("\n" + "="*70)
|
||||||
print("Combined extraction (AU + EYE)")
|
print("KOMBINIERTE FEATURE-EXTRAKTION (AU + EYE)")
|
||||||
print("="*70)
|
print("="*70)
|
||||||
|
|
||||||
result = process_combined_features(
|
result = process_combined_features(
|
||||||
@ -366,16 +370,16 @@ def main():
|
|||||||
)
|
)
|
||||||
|
|
||||||
if result is not None:
|
if result is not None:
|
||||||
print("\First 5 rows:")
|
print("\nErste 5 Zeilen:")
|
||||||
print(result.head())
|
print(result.head())
|
||||||
|
|
||||||
print("\nColumns overview:")
|
print("\nSpalten-Übersicht:")
|
||||||
print(result.dtypes)
|
print(result.dtypes)
|
||||||
|
|
||||||
print("\Statistics:")
|
print("\nStatistik:")
|
||||||
print(result.describe())
|
print(result.describe())
|
||||||
|
|
||||||
print("\nDone!\n")
|
print("\n✓ FERTIG!\n")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
113
dataset_creation/create_feature_table.py
Normal file
113
dataset_creation/create_feature_table.py
Normal file
@ -0,0 +1,113 @@
|
|||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
def process_parquet_files(input_dir, output_file, window_size=1250, step_size=125):
    """
    Aggregate Parquet recordings into sliding-window means of the AU columns.

    Every ``*.parquet`` file in ``input_dir`` is read and split into
    (STUDY, LEVEL, PHASE) groups so that no window spans a transition between
    them. Inside each group a window of ``window_size`` rows is moved forward
    by ``step_size`` rows, and each window is reduced to the mean of every
    column whose name starts with ``FACE_AU``.

    The input files must provide the columns ``subjectID``, ``rowID``,
    ``STUDY``, ``LEVEL`` and ``PHASE``.

    Parameters
    ----------
    input_dir : str or Path
        Directory containing the input Parquet files.
    output_file : str or Path
        Path of the Parquet file to write. Missing parent directories are
        created.
    window_size : int
        Window length in rows (default: 1250).
    step_size : int
        Offset between consecutive window starts in rows (default: 125).

    Returns
    -------
    pandas.DataFrame or None
        One row per window (metadata of the window's first row plus one
        ``<column>_mean`` value per AU column), or ``None`` when ``input_dir``
        contains no Parquet files (nothing is written in that case).
    """
    input_path = Path(input_dir)
    parquet_files = sorted(input_path.glob("*.parquet"))

    if not parquet_files:
        print(f"Keine Parquet-Dateien in {input_dir} gefunden!")
        return None

    print(f"Gefundene Dateien: {len(parquet_files)}")

    all_windows = []

    for file_idx, parquet_file in enumerate(parquet_files, 1):
        print(f"\nVerarbeite Datei {file_idx}/{len(parquet_files)}: {parquet_file.name}")

        df = pd.read_parquet(parquet_file)
        print(f" Einträge: {len(df)}")

        # Columns that are averaged per window.
        au_columns = [col for col in df.columns if col.startswith('FACE_AU')]
        print(f" AU-Spalten: {len(au_columns)}")

        # Group by STUDY, LEVEL, PHASE so that windows never cross a transition.
        for (study_val, level_val, phase_val), level_df in df.groupby(['STUDY', 'LEVEL', 'PHASE'], sort=False):
            print(f" STUDY {study_val}, LEVEL {level_val}, PHASE {phase_val}: {len(level_df)} Einträge")

            # Positional indexing below expects a 0-based index per group.
            level_df = level_df.reset_index(drop=True)

            num_windows = (len(level_df) - window_size) // step_size + 1
            if num_windows <= 0:
                print(f" Zu wenige Einträge für Window (benötigt {window_size})")
                continue

            for i in range(num_windows):
                start_idx = i * step_size
                window_df = level_df.iloc[start_idx:start_idx + window_size]

                # Metadata is taken from the first row of the window;
                # rowID of that row serves as the window's start_time.
                result = {
                    'subjectID': window_df['subjectID'].iloc[0],
                    'start_time': window_df['rowID'].iloc[0],
                    'STUDY': window_df['STUDY'].iloc[0],
                    'LEVEL': window_df['LEVEL'].iloc[0],
                    'PHASE': window_df['PHASE'].iloc[0],
                }

                # Mean (not sum) of every AU column over the window.
                result.update(
                    {f'{au_col}_mean': window_df[au_col].mean() for au_col in au_columns}
                )

                all_windows.append(result)

            print(f" Windows erstellt: {num_windows}")

    result_df = pd.DataFrame(all_windows)

    print(f"\n{'='*60}")
    print(f"Gesamt Windows erstellt: {len(result_df)}")
    print(f"Spalten: {list(result_df.columns)}")

    # Make sure the target folder exists before writing.
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)
    result_df.to_parquet(output_file, index=False)
    print(f"\nErgebnis gespeichert in: {output_file}")

    return result_df
|
||||||
|
|
||||||
|
|
||||||
|
# Beispiel-Verwendung
|
||||||
|
if __name__ == "__main__":
    # Adjust these two paths to your environment.
    input_directory = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/new_AU_parquet_files")
    output_file = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/new_AU_dataset_mean/AU_dataset_mean.parquet")

    result = process_parquet_files(input_directory, output_file, window_size=1250, step_size=125)

    # Preview the first rows when a result was produced.
    if result is not None:
        print("\nErste 5 Zeilen des Ergebnisses:")
        print(result.head())
|
||||||
56
dataset_creation/create_multimodal_dataset_by_merge.py
Normal file
56
dataset_creation/create_multimodal_dataset_by_merge.py
Normal file
@ -0,0 +1,56 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """
    Merge two single-modality window datasets into one multimodal dataset.

    Both Parquet files are loaded, restricted to the subjects that occur in
    both, inner-joined on (subject ID, window start time) and written to a
    single Parquet file. Input/output paths and the key column names are
    configured at the top of the function body.

    Raises
    ------
    KeyError
        If one of the inputs lacks a join-key column.
    """
    # ------------------------------------------------------------------
    # User configuration
    # ------------------------------------------------------------------
    # Input parquet files (single-modality datasets)
    file_modality_1 = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/AU_dataset_mean.parquet")
    file_modality_2 = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/new_eye_dataset.parquet")

    # Output directory and file name
    output_dir = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/")
    output_file = output_dir / "merged_dataset.parquet"

    # Column names (adjust only if your schema differs)
    subject_col = "subjectID"
    time_col = "start_time"
    key_cols = [subject_col, time_col]

    # ------------------------------------------------------------------
    # Load datasets
    # ------------------------------------------------------------------
    df1 = pd.read_parquet(file_modality_1)
    df2 = pd.read_parquet(file_modality_2)

    # Fail early with a message that names the offending file.
    for source, frame in ((file_modality_1, df1), (file_modality_2, df2)):
        missing = [col for col in key_cols if col not in frame.columns]
        if missing:
            raise KeyError(f"{source} is missing join column(s): {missing}")

    # Non-key columns present in both inputs are kept twice by pandas
    # (default suffixes '_x' / '_y'); tell the user instead of doing it silently.
    shared_cols = [col for col in df1.columns if col in df2.columns and col not in key_cols]
    if shared_cols:
        print(f"Note: columns present in both inputs get '_x'/'_y' suffixes: {shared_cols}")

    # ------------------------------------------------------------------
    # Keep only subjects that appear in BOTH datasets
    # ------------------------------------------------------------------
    common_subjects = set(df1[subject_col]).intersection(df2[subject_col])

    df1 = df1[df1[subject_col].isin(common_subjects)]
    df2 = df2[df2[subject_col].isin(common_subjects)]

    # ------------------------------------------------------------------
    # Inner join on subject ID AND start_time
    # ------------------------------------------------------------------
    merged_df = pd.merge(
        df1,
        df2,
        on=key_cols,
        how="inner",
    )

    # ------------------------------------------------------------------
    # Save merged dataset
    # ------------------------------------------------------------------
    output_dir.mkdir(parents=True, exist_ok=True)
    merged_df.to_parquet(output_file, index=False)

    print(
        f"Merged {len(df1)} and {len(df2)} rows from {len(common_subjects)} common subjects "
        f"into {len(merged_df)} rows: {output_file}"
    )
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
@ -1,5 +1,6 @@
|
|||||||
|
# pip install pyocclient
|
||||||
import yaml
|
import yaml
|
||||||
import owncloud # pip install pyocclient
|
import owncloud
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import h5py
|
import h5py
|
||||||
import os
|
import os
|
||||||
@ -25,7 +26,7 @@ for i in range(num_files):
|
|||||||
|
|
||||||
# Download file from ownCloud
|
# Download file from ownCloud
|
||||||
oc.get_file(file_name, local_tmp)
|
oc.get_file(file_name, local_tmp)
|
||||||
print(f"Opened: {file_name}")
|
print(f"{file_name} geoeffnet")
|
||||||
# Load into memory and extract needed columns
|
# Load into memory and extract needed columns
|
||||||
# with h5py.File(local_tmp, "r") as f:
|
# with h5py.File(local_tmp, "r") as f:
|
||||||
# # Adjust this path depending on actual dataset layout inside .h5py file
|
# # Adjust this path depending on actual dataset layout inside .h5py file
|
||||||
@ -34,9 +35,14 @@ for i in range(num_files):
|
|||||||
with pd.HDFStore(local_tmp, mode="r") as store:
|
with pd.HDFStore(local_tmp, mode="r") as store:
|
||||||
cols = store.select("SIGNALS", start=0, stop=1).columns # get column names
|
cols = store.select("SIGNALS", start=0, stop=1).columns # get column names
|
||||||
|
|
||||||
|
# Step 2: Filter columns that start with "AU"
|
||||||
|
au_cols = [c for c in cols if c.startswith("AU")]
|
||||||
|
print(au_cols)
|
||||||
|
if len(au_cols)==0:
|
||||||
|
print(f"keine AU Signale in Subject {i}")
|
||||||
|
continue
|
||||||
# Step 3: Read only those columns (plus any others you want)
|
# Step 3: Read only those columns (plus any others you want)
|
||||||
df = pd.read_hdf(local_tmp, key="SIGNALS", columns=["STUDY", "LEVEL", "PHASE"] + cols)
|
df = pd.read_hdf(local_tmp, key="SIGNALS", columns=["STUDY", "LEVEL", "PHASE"] + au_cols)
|
||||||
|
|
||||||
|
|
||||||
print("load done")
|
print("load done")
|
||||||
@ -57,7 +63,7 @@ for i in range(num_files):
|
|||||||
|
|
||||||
|
|
||||||
# Save to parquet
|
# Save to parquet
|
||||||
os.makedirs("ParquetFiles", exist_ok=True) # TODO: change for custom directory
|
os.makedirs("ParquetFiles", exist_ok=True)
|
||||||
out_name = f"ParquetFiles/cleaned_{i:04d}.parquet"
|
out_name = f"ParquetFiles/cleaned_{i:04d}.parquet"
|
||||||
df.to_parquet(out_name, index=False)
|
df.to_parquet(out_name, index=False)
|
||||||
|
|
||||||
323
dataset_creation/eyeAlt.py
Normal file
323
dataset_creation/eyeAlt.py
Normal file
@ -0,0 +1,323 @@
|
|||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
import h5py
|
||||||
|
import yaml
|
||||||
|
import os
|
||||||
|
from sklearn.preprocessing import MinMaxScaler
|
||||||
|
from scipy.signal import welch
|
||||||
|
from pygazeanalyser.detectors import fixation_detection, saccade_detection
|
||||||
|
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
# 1. HELFERFUNKTIONEN
|
||||||
|
##############################################################################
|
||||||
|
def clean_eye_df(df):
    """
    Reduce a recording to the rows that actually carry eye-tracking data.

    Only columns whose name contains "LEFT_" or "RIGHT_" are kept, +/-inf is
    turned into NaN, rows in which all of these columns are NaN are dropped
    and the index is renumbered from 0.
    """
    eye_cols = [name for name in df.columns if "LEFT_" in name or "RIGHT_" in name]

    df_eye = (
        df[eye_cols]
        .replace([np.inf, -np.inf], np.nan)      # inf -> NaN
        .dropna(subset=eye_cols, how="all")      # keep rows with any eye value
    )

    print("Eyetracking-Zeilen vorher:", len(df))
    print("Eyetracking-Zeilen nachher:", len(df_eye))

    return df_eye.reset_index(drop=True)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_gaze_signal(df):
    """
    Build one 2D gaze trace (x, y on the display area) from both eyes.

    Per eye, +/-inf and samples whose validity flag is not 1 are set to NaN.
    The two eyes are then averaged per sample with a NaN-aware mean, so a
    sample with only one valid eye uses that eye instead of being discarded.
    Remaining gaps are linearly interpolated (up to 50 samples from either
    side) and whatever is still missing is back-/forward-filled.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain LEFT_/RIGHT_GAZE_POINT_ON_DISPLAY_AREA_X/_Y and
        LEFT_/RIGHT_GAZE_POINT_VALIDITY.

    Returns
    -------
    numpy.ndarray
        Array with one row per sample and the columns (x, y).
    """
    print("→ extract_gaze_signal(): Eingabegröße:", df.shape)

    def _valid_signal(value_col, validity_col):
        # Float copy of one gaze coordinate with inf / invalid samples as NaN.
        values = (
            df[value_col].astype(float)
            .replace([np.inf, -np.inf], np.nan)
            .to_numpy(copy=True)
        )
        values[~(df[validity_col] == 1).to_numpy()] = np.nan
        return values

    gx_L = _valid_signal("LEFT_GAZE_POINT_ON_DISPLAY_AREA_X", "LEFT_GAZE_POINT_VALIDITY")
    gy_L = _valid_signal("LEFT_GAZE_POINT_ON_DISPLAY_AREA_Y", "LEFT_GAZE_POINT_VALIDITY")
    gx_R = _valid_signal("RIGHT_GAZE_POINT_ON_DISPLAY_AREA_X", "RIGHT_GAZE_POINT_VALIDITY")
    gy_R = _valid_signal("RIGHT_GAZE_POINT_ON_DISPLAY_AREA_Y", "RIGHT_GAZE_POINT_VALIDITY")

    # NaN-aware mean of both eyes (DataFrame.mean skips NaN and yields NaN
    # only when both eyes are missing, without emitting a warning).
    gx = pd.DataFrame(np.column_stack([gx_L, gx_R])).mean(axis=1)
    gy = pd.DataFrame(np.column_stack([gy_L, gy_R])).mean(axis=1)

    # Fill gaps; the detectors downstream need a gap-free signal.
    gx = gx.interpolate(limit=50, limit_direction="both").bfill().ffill()
    gy = gy.interpolate(limit=50, limit_direction="both").bfill().ffill()

    out = np.column_stack((gx, gy))

    print("→ extract_gaze_signal(): Ausgabegröße:", out.shape)

    return out
|
||||||
|
|
||||||
|
|
||||||
|
def extract_pupil(df):
    """
    Extract the pupil diameter (mean of the available eyes) and a validity flag.

    +/-inf diameters are treated as missing. The per-sample diameter is the
    NaN-aware mean of both eyes, so a sample with only one measured eye uses
    that eye. Gaps are linearly interpolated (up to 50 samples from either
    side) and the rest is back-/forward-filled.

    Returns
    -------
    p : numpy.ndarray
        Pupil diameter per sample.
    validity : numpy.ndarray of int
        1 where at least one eye is valid, else 0. Taken from
        LEFT_/RIGHT_PUPIL_VALIDITY (== 1) when both columns exist; otherwise
        a sample counts as valid when at least one diameter is not NaN.
    """
    pl = df["LEFT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan)
    pr = df["RIGHT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan)

    vl = df.get("LEFT_PUPIL_VALIDITY")
    vr = df.get("RIGHT_PUPIL_VALIDITY")

    if vl is None or vr is None:
        # No validity columns: fall back to "diameter is present".
        validity = (~pl.isna() | ~pr.isna()).astype(int).to_numpy()
    else:
        validity = ((vl == 1) | (vr == 1)).astype(int).to_numpy()

    # NaN-aware mean of both eyes (NaN only when both are missing).
    p = pd.DataFrame(np.column_stack([pl, pr]).astype(float)).mean(axis=1)

    # Repair the remaining gaps.
    p = p.interpolate(limit=50, limit_direction="both").bfill().ffill()
    p = p.to_numpy()

    print("→ extract_pupil(): Pupillensignal Länge:", len(p))
    return p, validity
|
||||||
|
|
||||||
|
|
||||||
|
def detect_blinks(pupil_validity, min_duration=5):
    """
    Find blinks as runs of validity == 0 that are closed by a validity == 1.

    A run is reported as ``[start, end]`` (``end`` is the index of the first
    valid sample after the run) when it lasts at least ``min_duration``
    samples. A run that is still open at the end of the input is not reported.
    """
    detected = []
    run_start = None

    for idx, flag in enumerate(pupil_validity):
        if run_start is None:
            # Outside a run: only a 0 can open one.
            if flag == 0:
                run_start = idx
            continue

        if flag != 1:
            # Still inside the run.
            continue

        if idx - run_start >= min_duration:
            detected.append([run_start, idx])
        run_start = None

    return detected
|
||||||
|
|
||||||
|
|
||||||
|
def compute_IPA(pupil, fs=250):
    """
    Summed Welch power of the pupil signal in the 0.6-2.0 Hz band.

    The power spectral density is estimated with Welch's method using
    two-second segments; the PSD bins between 0.6 Hz and 2.0 Hz (inclusive)
    are added up and returned.

    NOTE(review): the original docstring calls this the Index of Pupillary
    Activity (Duchowski 2018); what the code computes is the band sum
    described above - confirm this is the intended definition.
    """
    segment_len = int(fs * 2)  # two-second segments
    freqs, psd = welch(pupil, fs=fs, nperseg=segment_len)

    in_band = (freqs >= 0.6) & (freqs <= 2.0)
    return np.sum(psd[in_band])
|
||||||
|
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
# 2. FEATURE-EXTRAKTION (HAUPTFUNKTION)
|
||||||
|
##############################################################################
|
||||||
|
|
||||||
|
def extract_eye_features(df, window_length_sec=50, fs=250):
    """
    Compute fixation, saccade, blink and pupil features per window.

    The recording is cut into consecutive, non-overlapping windows of
    ``window_length_sec`` seconds; a trailing partial window is discarded.

    Parameters
    ----------
    df : pandas.DataFrame
        Eye-tracking samples; passed on to ``extract_gaze_signal`` and
        ``extract_pupil``.
    window_length_sec : float
        Window length in seconds (default: 50).
    fs : int
        Sampling rate in Hz (default: 250).

    Returns
    -------
    pandas.DataFrame
        One row of features per complete window.
    """
    print("→ extract_eye_features(): Starte Feature-Berechnung...")
    print(" Fensterlänge W =", window_length_sec, "s")

    W = int(window_length_sec * fs)  # window length in samples

    # Gaze
    gaze = extract_gaze_signal(df)
    gx, gy = gaze[:, 0], gaze[:, 1]
    print("Gültige Werte (gx):", np.sum(~np.isnan(gx)), "von", len(gx))
    print("Range:", np.nanmin(gx), np.nanmax(gx))
    print("Gültige Werte (gy):", np.sum(~np.isnan(gy)), "von", len(gy))
    print("Range:", np.nanmin(gy), np.nanmax(gy))

    # Pupil
    pupil, pupil_validity = extract_pupil(df)

    # Window-relative time axis in ms; identical for every window.
    time_ms = np.arange(W) * 1000.0 / fs

    features = []

    for start in range(0, len(df), W):
        end = start + W
        if end > len(df):
            break  # the last, incomplete window is ignored

        w_gaze = gaze[start:end]
        w_pupil = pupil[start:end]
        w_valid = pupil_validity[start:end]

        # ----------------------------
        # FIXATIONS (PyGaze)
        # ----------------------------
        # NOTE(review): the original comment claimed mindur is 100 ms while
        # the value passed is 10 - confirm the intended minimum duration.
        _, efix = fixation_detection(
            x=w_gaze[:, 0], y=w_gaze[:, 1], time=time_ms,
            missing=0.0, maxdist=0.003, mindur=10
        )

        # Element 2 of each entry is used as the fixation duration;
        # non-finite or non-positive values are dropped.
        fixation_durations = [f[2] for f in efix if np.isfinite(f[2]) and f[2] > 0]

        # Duration categories
        F_short = sum(66 <= d <= 150 for d in fixation_durations)
        F_medium = sum(300 <= d <= 500 for d in fixation_durations)
        F_long = sum(d >= 1000 for d in fixation_durations)
        F_hundred = sum(d > 100 for d in fixation_durations)
        F_Cancel = sum(66 < d for d in fixation_durations)

        # ----------------------------
        # SACCADES
        # ----------------------------
        sac, esac = saccade_detection(
            x=w_gaze[:, 0], y=w_gaze[:, 1], time=time_ms,
            missing=0, minlen=12, maxvel=0.2, maxacc=1
        )

        sac_durations = [s[2] for s in esac]
        # Euclidean distance between elements (3, 4) and (5, 6) of each entry.
        sac_amplitudes = [((s[5]-s[3])**2 + (s[6]-s[4])**2)**0.5 for s in esac]

        # ----------------------------
        # BLINKS
        # ----------------------------
        blinks = detect_blinks(w_valid)
        blink_durations = [(b[1] - b[0]) / fs for b in blinks]  # seconds

        # ----------------------------
        # PUPIL
        # ----------------------------
        if np.all(np.isnan(w_pupil)):
            mean_pupil = np.nan
            ipa = np.nan
        else:
            mean_pupil = np.nanmean(w_pupil)
            ipa = compute_IPA(w_pupil, fs=fs)

        # ----------------------------
        # FEATURE ROW
        # ----------------------------
        features.append({
            "Fix_count_short_66_150": F_short,
            "Fix_count_medium_300_500": F_medium,
            "Fix_count_long_gt_1000": F_long,
            "Fix_count_100": F_hundred,
            "Fix_cancel": F_Cancel,
            "Fix_mean_duration": np.mean(fixation_durations) if fixation_durations else 0,
            "Fix_median_duration": np.median(fixation_durations) if fixation_durations else 0,

            "Sac_count": len(sac),
            "Sac_mean_amp": np.mean(sac_amplitudes) if sac_amplitudes else 0,
            "Sac_mean_dur": np.mean(sac_durations) if sac_durations else 0,
            "Sac_median_dur": np.median(sac_durations) if sac_durations else 0,

            "Blink_count": len(blinks),
            "Blink_mean_dur": np.mean(blink_durations) if blink_durations else 0,
            "Blink_median_dur": np.median(blink_durations) if blink_durations else 0,

            "Pupil_mean": mean_pupil,
            "Pupil_IPA": ipa
        })

    result = pd.DataFrame(features)
    print("→ extract_eye_features(): Fertig! Ergebnisgröße:", result.shape)

    return result
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
# 3. MAIN FUNKTION
|
||||||
|
##############################################################################
|
||||||
|
|
||||||
|
def main():
    """
    Run the eye-feature extraction on one recording and save the result as CSV.

    Reads ``cleaned_0001.parquet`` from the working directory, keeps the
    eye-tracking rows, extracts features in 50 s windows at 250 Hz and writes
    them to ``features4.csv``.
    """
    print("### STARTE FEATURE-EXTRAKTION ###")
    print("Aktueller Arbeitsordner:", os.getcwd())

    df = pd.read_parquet("cleaned_0001.parquet")
    print("DataFrame geladen:", df.shape)

    print("Reinige Eyetracking-Daten ...")
    df_eye = clean_eye_df(df)

    # Feature extraction
    features = extract_eye_features(df_eye, window_length_sec=50, fs=250)

    print("\n### FEATURE-MATRIX (HEAD) ###")
    print(features.head())

    # Single source of truth for the file name so the log message cannot
    # drift from the file that is actually written.
    output_csv = "features4.csv"
    print(f"\nSpeichere Output in {output_csv} ...")
    features.to_csv(output_csv, index=False)

    print("FERTIG!")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
441
dataset_creation/eye_batch_processor.py
Normal file
441
dataset_creation/eye_batch_processor.py
Normal file
@ -0,0 +1,441 @@
|
|||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
import h5py
|
||||||
|
import yaml
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
from sklearn.preprocessing import MinMaxScaler
|
||||||
|
from scipy.signal import welch
|
||||||
|
from pygazeanalyser.detectors import fixation_detection, saccade_detection
|
||||||
|
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
# KONFIGURATION - HIER ANPASSEN!
|
||||||
|
##############################################################################
|
||||||
|
# Input directory and output Parquet file for the batch run.
# NOTE(review): how these constants are consumed is not visible in this part of the file.
INPUT_DIR = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/new_ET_Parquet_files/")
OUTPUT_FILE = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/Eye_dataset_old/new_eye_dataset.parquet")

WINDOW_SIZE_SAMPLES = 12500 # Number of samples per window (e.g. 1250 = 50 s at 25 Hz, or 5 s at 250 Hz); 12500 is 50 s at 250 Hz
STEP_SIZE_SAMPLES = 1250 # Step between windows in samples (e.g. 125 = 5 s at 25 Hz, or 0.5 s at 250 Hz); 1250 is 5 s at 250 Hz
SAMPLING_RATE = 250 # Hz
|
||||||
|
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
# 1. HELFERFUNKTIONEN
|
||||||
|
##############################################################################
|
||||||
|
def clean_eye_df(df):
    """
    Reduce a recording to the rows that actually carry eye-tracking data.

    Only columns whose name starts with "EYE_" are kept, +/-inf is turned
    into NaN, rows in which all of these columns are NaN are dropped and the
    index is renumbered from 0.
    """
    eye_cols = [name for name in df.columns if name.startswith("EYE_")]

    df_eye = (
        df[eye_cols]
        .replace([np.inf, -np.inf], np.nan)      # inf -> NaN
        .dropna(subset=eye_cols, how="all")      # keep rows with any eye value
    )

    print(f" Eyetracking-Zeilen: {len(df)} → {len(df_eye)}")

    return df_eye.reset_index(drop=True)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_gaze_signal(df):
    """
    Build one min-max-scaled 2D gaze trace (x, y on the display area).

    Per eye, +/-inf and samples whose validity flag is not 1 are set to NaN.
    The two eyes are then averaged per sample with a NaN-aware mean, so a
    sample with only one valid eye uses that eye instead of being discarded.
    Remaining gaps are linearly interpolated (up to 50 samples from either
    side) and whatever is still missing is back-/forward-filled. Finally x
    and y are each scaled independently with ``MinMaxScaler`` fitted on the
    signal passed in.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain EYE_LEFT_/EYE_RIGHT_GAZE_POINT_ON_DISPLAY_AREA_X/_Y and
        EYE_LEFT_/EYE_RIGHT_GAZE_POINT_VALIDITY.

    Returns
    -------
    numpy.ndarray
        Array with one row per sample and the columns (x, y).
    """
    def _valid_signal(value_col, validity_col):
        # Float copy of one gaze coordinate with inf / invalid samples as NaN.
        values = (
            df[value_col].astype(float)
            .replace([np.inf, -np.inf], np.nan)
            .to_numpy(copy=True)
        )
        values[~(df[validity_col] == 1).to_numpy()] = np.nan
        return values

    gx_L = _valid_signal("EYE_LEFT_GAZE_POINT_ON_DISPLAY_AREA_X", "EYE_LEFT_GAZE_POINT_VALIDITY")
    gy_L = _valid_signal("EYE_LEFT_GAZE_POINT_ON_DISPLAY_AREA_Y", "EYE_LEFT_GAZE_POINT_VALIDITY")
    gx_R = _valid_signal("EYE_RIGHT_GAZE_POINT_ON_DISPLAY_AREA_X", "EYE_RIGHT_GAZE_POINT_VALIDITY")
    gy_R = _valid_signal("EYE_RIGHT_GAZE_POINT_ON_DISPLAY_AREA_Y", "EYE_RIGHT_GAZE_POINT_VALIDITY")

    # NaN-aware mean of both eyes (DataFrame.mean skips NaN and yields NaN
    # only when both eyes are missing, without emitting a warning).
    gx = pd.DataFrame(np.column_stack([gx_L, gx_R])).mean(axis=1)
    gy = pd.DataFrame(np.column_stack([gy_L, gy_R])).mean(axis=1)

    # Fill gaps; the detectors downstream need a gap-free signal.
    gx = gx.interpolate(limit=50, limit_direction="both").bfill().ffill()
    gy = gy.interpolate(limit=50, limit_direction="both").bfill().ffill()

    # Scale each axis independently.
    gxscale = MinMaxScaler().fit_transform(gx.values.reshape(-1, 1))
    gyscale = MinMaxScaler().fit_transform(gy.values.reshape(-1, 1))

    return np.column_stack((gxscale, gyscale))
|
||||||
|
|
||||||
|
|
||||||
|
def extract_pupil(df):
    """
    Extract the pupil diameter (mean of the available eyes) and a validity flag.

    +/-inf diameters are treated as missing. The per-sample diameter is the
    NaN-aware mean of both eyes, so a sample with only one measured eye uses
    that eye. Gaps are linearly interpolated (up to 50 samples from either
    side) and the rest is back-/forward-filled.

    Returns
    -------
    p : numpy.ndarray
        Pupil diameter per sample.
    validity : numpy.ndarray of int
        1 where at least one eye is valid, else 0. Taken from
        EYE_LEFT_/EYE_RIGHT_PUPIL_VALIDITY (== 1) when both columns exist;
        otherwise a sample counts as valid when at least one diameter is not
        NaN.
    """
    pl = df["EYE_LEFT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan)
    pr = df["EYE_RIGHT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan)

    vl = df.get("EYE_LEFT_PUPIL_VALIDITY")
    vr = df.get("EYE_RIGHT_PUPIL_VALIDITY")

    if vl is None or vr is None:
        # No validity columns: fall back to "diameter is present".
        validity = (~pl.isna() | ~pr.isna()).astype(int).to_numpy()
    else:
        validity = ((vl == 1) | (vr == 1)).astype(int).to_numpy()

    # NaN-aware mean of both eyes (NaN only when both are missing).
    p = pd.DataFrame(np.column_stack([pl, pr]).astype(float)).mean(axis=1)

    # Repair the remaining gaps.
    p = p.interpolate(limit=50, limit_direction="both").bfill().ffill()
    p = p.to_numpy()

    return p, validity
|
||||||
|
|
||||||
|
|
||||||
|
def detect_blinks(pupil_validity, min_duration=5):
    """
    Find blinks as runs of validity == 0 that are closed by a validity == 1.

    A run is reported as ``[start, end]`` (``end`` is the index of the first
    valid sample after the run) when it lasts at least ``min_duration``
    samples. A run that is still open at the end of the input is not reported.
    """
    found = []
    onset = None

    for pos, state in enumerate(pupil_validity):
        inside_run = onset is not None
        if not inside_run and state == 0:
            onset = pos
        elif inside_run and state == 1:
            if pos - onset >= min_duration:
                found.append([onset, pos])
            onset = None

    return found
|
||||||
|
|
||||||
|
|
||||||
|
def compute_IPA(pupil, fs=250):
    """
    Summed Welch power of the pupil signal in the 0.6-2.0 Hz band.

    The power spectral density is estimated with Welch's method using
    two-second segments; the PSD bins between 0.6 Hz and 2.0 Hz (inclusive)
    are added up and returned.

    NOTE(review): the original docstring calls this the Index of Pupillary
    Activity (Duchowski 2018); what the code computes is the band sum
    described above - confirm this is the intended definition.
    """
    freqs, psd = welch(pupil, fs=fs, nperseg=int(fs*2))  # two-second segments
    band = np.logical_and(freqs >= 0.6, freqs <= 2.0)
    return np.sum(psd[band])
|
||||||
|
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
# 2. FEATURE-EXTRAKTION MIT SLIDING WINDOW
|
||||||
|
##############################################################################
|
||||||
|
|
||||||
|
def extract_eye_features_sliding(df_eye, df_meta, window_size, step_size, fs=250):
    """
    Extract eye features with a sliding window from one level/phase segment.

    Parameters
    ----------
    df_eye : DataFrame
        Eye-tracking data (already cleaned).
    df_meta : DataFrame
        Metadata (subjectID, rowID, STUDY, LEVEL, PHASE); the row at a
        window's start position supplies that window's metadata.
    window_size : int
        Number of samples per window.
    step_size : int
        Offset between window starts in samples.
    fs : int
        Sampling rate in Hz.

    Returns
    -------
    DataFrame
        One row per window; an empty DataFrame when the segment is shorter
        than one window.
    """
    def _stat(reducer, values):
        # Reduce a list of values, falling back to 0 for an empty list.
        return reducer(values) if values else 0

    gaze = extract_gaze_signal(df_eye)
    pupil, pupil_validity = extract_pupil(df_eye)

    num_windows = (len(df_eye) - window_size) // step_size + 1
    if num_windows <= 0:
        return pd.DataFrame()

    # Window-relative time axis in ms (the same for every window).
    time_ms = np.arange(window_size) * 1000.0 / fs

    rows = []
    for window_no in range(num_windows):
        first = window_no * step_size
        last = first + window_size

        gaze_x = gaze[first:last, 0]
        gaze_y = gaze[first:last, 1]
        w_pupil = pupil[first:last]
        w_valid = pupil_validity[first:last]

        # Metadata of the window's first sample.
        meta_row = df_meta.iloc[first]

        # ---- Fixations (PyGaze) ----
        _, efix = fixation_detection(
            x=gaze_x, y=gaze_y, time=time_ms,
            missing=0.0, maxdist=0.003, mindur=10
        )
        fixation_durations = [
            entry[2] for entry in efix if np.isfinite(entry[2]) and entry[2] > 0
        ]

        # ---- Saccades ----
        sac, esac = saccade_detection(
            x=gaze_x, y=gaze_y, time=time_ms,
            missing=0, minlen=12, maxvel=0.2, maxacc=1
        )
        sac_durations = [entry[2] for entry in esac]
        sac_amplitudes = [
            ((entry[5]-entry[3])**2 + (entry[6]-entry[4])**2)**0.5 for entry in esac
        ]

        # ---- Blinks ----
        blinks = detect_blinks(w_valid)
        blink_durations = [(stop - begin) / fs for begin, stop in blinks]

        # ---- Pupil ----
        if np.all(np.isnan(w_pupil)):
            mean_pupil, ipa = np.nan, np.nan
        else:
            mean_pupil = np.nanmean(w_pupil)
            ipa = compute_IPA(w_pupil, fs=fs)

        rows.append({
            # Metadata
            'subjectID': meta_row['subjectID'],
            'start_time': meta_row['rowID'],
            'STUDY': meta_row.get('STUDY', np.nan),
            'LEVEL': meta_row.get('LEVEL', np.nan),
            'PHASE': meta_row.get('PHASE', np.nan),

            # Fixation features (duration categories)
            "Fix_count_short_66_150": sum(66 <= d <= 150 for d in fixation_durations),
            "Fix_count_medium_300_500": sum(300 <= d <= 500 for d in fixation_durations),
            "Fix_count_long_gt_1000": sum(d >= 1000 for d in fixation_durations),
            "Fix_count_100": sum(d > 100 for d in fixation_durations),
            "Fix_mean_duration": _stat(np.mean, fixation_durations),
            "Fix_median_duration": _stat(np.median, fixation_durations),

            # Saccade features
            "Sac_count": len(sac),
            "Sac_mean_amp": _stat(np.mean, sac_amplitudes),
            "Sac_mean_dur": _stat(np.mean, sac_durations),
            "Sac_median_dur": _stat(np.median, sac_durations),

            # Blink features
            "Blink_count": len(blinks),
            "Blink_mean_dur": _stat(np.mean, blink_durations),
            "Blink_median_dur": _stat(np.median, blink_durations),

            # Pupil features
            "Pupil_mean": mean_pupil,
            "Pupil_IPA": ipa
        })

    return pd.DataFrame(rows)
|
||||||
|
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
# 3. BATCH-VERARBEITUNG
|
||||||
|
##############################################################################
|
||||||
|
|
||||||
|
def process_parquet_directory(input_dir, output_file, window_size, step_size, fs=250):
    """
    Extract windowed eye-tracking features from every Parquet file in a directory.

    Each ``*.parquet`` file directly inside ``input_dir`` is loaded, reduced to
    its eye-tracking rows with ``clean_eye_df`` and handed to
    ``extract_eye_features_sliding`` -- once per (STUDY, LEVEL, PHASE) group
    when at least one of those columns exists, otherwise once for the whole
    file. All resulting feature tables are concatenated and written to
    ``output_file`` as Parquet.

    Parameters
    ----------
    input_dir : str
        Directory containing the input Parquet files.
    output_file : str
        Path of the Parquet file to write; missing parent folders are created.
    window_size : int
        Window length in samples.
    step_size : int
        Step between consecutive windows in samples.
    fs : int
        Sampling rate in Hz.

    Returns
    -------
    pandas.DataFrame or None
        The combined feature table, or ``None`` if no input file was found or
        no window could be extracted.
    """
    files = sorted(Path(input_dir).glob("*.parquet"))
    if not files:
        print(f"FEHLER: Keine Parquet-Dateien in {input_dir} gefunden!")
        return None

    total = len(files)
    print(f"\n{'='*70}")
    print(f"STARTE BATCH-VERARBEITUNG")
    print(f"{'='*70}")
    print(f"Gefundene Dateien: {total}")
    print(f"Window Size: {window_size} Samples ({window_size/fs:.1f}s bei {fs}Hz)")
    print(f"Step Size: {step_size} Samples ({step_size/fs:.1f}s bei {fs}Hz)")
    print(f"{'='*70}\n")

    collected = []
    for number, file_path in enumerate(files, 1):
        print(f"\n[{number}/{total}] Verarbeite: {file_path.name}")

        # A failure in one file is reported and must not abort the whole batch.
        try:
            raw = pd.read_parquet(file_path)
            print(f" Einträge geladen: {len(raw)}")

            # Both identifier columns are mandatory for the feature rows.
            absent = [name for name in ('subjectID', 'rowID') if name not in raw.columns]
            if absent:
                print(f" WARNUNG: Fehlende Spalten: {absent} - Überspringe Datei")
                continue

            eye = clean_eye_df(raw)
            if len(eye) == 0:
                print(f" WARNUNG: Keine gültigen Eye-Tracking-Daten - Überspringe Datei")
                continue

            # Optional condition labels, kept in the fixed order STUDY, LEVEL, PHASE.
            label_cols = [name for name in ('STUDY', 'LEVEL', 'PHASE') if name in raw.columns]

            # NOTE(review): `.iloc[eye.index]` is a positional lookup, so this
            # presumably relies on clean_eye_df keeping the original row
            # positions as its index -- confirm against that helper.
            meta = raw[['subjectID', 'rowID'] + label_cols].iloc[eye.index].reset_index(drop=True)

            if not label_cols:
                # No condition labels: treat the whole file as one recording.
                print(f" Keine Gruppierungsspalten gefunden")
                windows = extract_eye_features_sliding(eye, meta, window_size, step_size, fs)
                if windows.empty:
                    print(f" → Zu wenige Daten")
                else:
                    collected.append(windows)
                    print(f" → {len(windows)} Windows erstellt")
                continue

            print(f" Gruppiere nach: {', '.join(label_cols)}")
            for key, meta_part in meta.groupby(label_cols, sort=False):
                # `meta` has a fresh 0..n-1 index, so its labels double as
                # positions into `eye`.
                eye_part = eye.iloc[meta_part.index].reset_index(drop=True)
                meta_part = meta_part.reset_index(drop=True)

                print(f" Gruppe {key}: {len(eye_part)} Samples", end=" → ")

                windows = extract_eye_features_sliding(
                    eye_part, meta_part, window_size, step_size, fs
                )
                if windows.empty:
                    print("Zu wenige Daten")
                else:
                    collected.append(windows)
                    print(f"{len(windows)} Windows")

        except Exception as error:
            print(f" FEHLER bei Verarbeitung: {str(error)}")
            import traceback
            traceback.print_exc()

    if not collected:
        print("\nKEINE FEATURES EXTRAHIERT!")
        return None

    print(f"\n{'='*70}")
    print(f"ZUSAMMENFASSUNG")
    print(f"{'='*70}")

    final_df = pd.concat(collected, ignore_index=True)

    print(f"Gesamt Windows: {len(final_df)}")
    print(f"Spalten: {len(final_df.columns)}")
    print(f"Subjects: {final_df['subjectID'].nunique()}")

    # Make sure the destination folder exists before writing.
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)
    final_df.to_parquet(output_file, index=False)

    print(f"\n✓ Ergebnis gespeichert: {output_file}")
    print(f"{'='*70}\n")

    return final_df
|
||||||
|
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
# 4. MAIN
|
||||||
|
##############################################################################
|
||||||
|
|
||||||
|
def main():
    """
    Run the batch feature extraction using the module-level settings
    (INPUT_DIR, OUTPUT_FILE, WINDOW_SIZE_SAMPLES, STEP_SIZE_SAMPLES,
    SAMPLING_RATE) and print a short preview of the resulting table.
    """
    print("\n" + "="*70)
    print("EYE-TRACKING FEATURE EXTRAKTION - BATCH MODE")
    print("="*70)

    settings = dict(
        input_dir=INPUT_DIR,
        output_file=OUTPUT_FILE,
        window_size=WINDOW_SIZE_SAMPLES,
        step_size=STEP_SIZE_SAMPLES,
        fs=SAMPLING_RATE,
    )
    result = process_parquet_directory(**settings)

    # A result of None means that nothing was extracted; skip the preview then.
    if result is not None:
        print("\nErste 5 Zeilen des Ergebnisses:")
        print(result.head())

        print("\nSpalten-Übersicht:")
        print(result.columns.tolist())

        print("\nDatentypen:")
        print(result.dtypes)

    print("\n✓ FERTIG!\n")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
323
dataset_creation/eyetrackingFeatures.py
Normal file
323
dataset_creation/eyetrackingFeatures.py
Normal file
@ -0,0 +1,323 @@
|
|||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
import h5py
|
||||||
|
import yaml
|
||||||
|
import owncloud
|
||||||
|
import os
|
||||||
|
from sklearn.preprocessing import MinMaxScaler
|
||||||
|
from scipy.signal import welch
|
||||||
|
from pygazeanalyser.detectors import fixation_detection, saccade_detection
|
||||||
|
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
# 1. HELFERFUNKTIONEN
|
||||||
|
##############################################################################
|
||||||
|
def clean_eye_df(df):
    """
    Reduce a mixed-sensor frame to the rows that carry eye-tracking data.

    Only columns whose name contains ``"EYE_"`` are kept. Infinite values are
    turned into NaN, and rows in which every eye-tracking column is NaN are
    dropped. The row counts before and after are printed.

    Returns
    -------
    pandas.DataFrame
        The eye-tracking columns of the remaining rows with a fresh
        0..n-1 index.
    """
    tracker_columns = [name for name in df.columns if "EYE_" in name]

    cleaned = (
        df[tracker_columns]
        .replace([np.inf, -np.inf], np.nan)                 # inf -> NaN
        .dropna(subset=tracker_columns, how="all")          # drop rows without any eye data
    )

    print("Eyetracking-Zeilen vorher:", len(df))
    print("Eyetracking-Zeilen nachher:", len(cleaned))

    return cleaned.reset_index(drop=True)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_gaze_signal(df):
    """
    Build a 2D gaze signal from the per-eye on-display gaze columns.

    Processing steps:
      1. read left/right x and y as float, turn +/-inf into NaN,
      2. set samples whose validity flag is not 1 to NaN,
      3. average both eyes per sample, ignoring NaN (a sample is only NaN
         when both eyes are missing),
      4. fill gaps by interpolation (at most 50 consecutive samples per
         direction) followed by backward/forward fill,
      5. min-max scale x and y independently.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain EYE_{LEFT,RIGHT}_GAZE_POINT_ON_DISPLAY_AREA_{X,Y} and
        EYE_{LEFT,RIGHT}_GAZE_POINT_VALIDITY.

    Returns
    -------
    numpy.ndarray
        Array of shape (len(df), 2); column 0 is the scaled x signal,
        column 1 the scaled y signal.
    """
    print("→ extract_gaze_signal(): Eingabegröße:", df.shape)

    def _eye_channel(value_col, validity_col):
        # One eye / one axis as float; +/-inf and invalid samples become NaN.
        values = df[value_col].astype(float).replace([np.inf, -np.inf], np.nan)
        return values.where(df[validity_col] == 1)

    def _binocular_mean(left, right):
        # DataFrame.mean skips NaN by default, so a sample with one valid eye
        # keeps that eye's value; only samples missing in both eyes stay NaN.
        both = pd.DataFrame({"left": left.to_numpy(), "right": right.to_numpy()})
        return both.mean(axis=1)

    def _fill_gaps(signal):
        # PyGaze needs a gap-free signal.
        return signal.interpolate(limit=50, limit_direction="both").bfill().ffill()

    gx = _fill_gaps(_binocular_mean(
        _eye_channel("EYE_LEFT_GAZE_POINT_ON_DISPLAY_AREA_X", "EYE_LEFT_GAZE_POINT_VALIDITY"),
        _eye_channel("EYE_RIGHT_GAZE_POINT_ON_DISPLAY_AREA_X", "EYE_RIGHT_GAZE_POINT_VALIDITY"),
    ))
    gy = _fill_gaps(_binocular_mean(
        _eye_channel("EYE_LEFT_GAZE_POINT_ON_DISPLAY_AREA_Y", "EYE_LEFT_GAZE_POINT_VALIDITY"),
        _eye_channel("EYE_RIGHT_GAZE_POINT_ON_DISPLAY_AREA_Y", "EYE_RIGHT_GAZE_POINT_VALIDITY"),
    ))

    # Each axis gets its own scaler. The y scaler must be fitted on gy
    # (it was previously fitted on gx, which made the y output a copy of x).
    gxscale = MinMaxScaler().fit_transform(gx.to_numpy().reshape(-1, 1))
    gyscale = MinMaxScaler().fit_transform(gy.to_numpy().reshape(-1, 1))

    print("xmax ymax", gxscale.max(), gyscale.max())

    out = np.column_stack((gxscale, gyscale))

    print("→ extract_gaze_signal(): Ausgabegröße:", out.shape)

    return out
|
||||||
|
|
||||||
|
|
||||||
|
def extract_pupil(df):
    """
    Extract a binocular pupil-diameter signal and a per-sample validity flag.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain EYE_LEFT_PUPIL_DIAMETER and EYE_RIGHT_PUPIL_DIAMETER.
        EYE_LEFT_PUPIL_VALIDITY / EYE_RIGHT_PUPIL_VALIDITY are used when both
        are present.

    Returns
    -------
    p : numpy.ndarray
        Mean of the available pupil diameters per sample. A sample with only
        one usable eye keeps that eye's value. Samples missing in both eyes
        are filled by interpolation (at most 50 consecutive samples per
        direction) followed by backward/forward fill.
    validity : numpy.ndarray of int
        1 where at least one eye is valid, else 0. Without validity columns,
        "valid" means that at least one diameter is not NaN.
    """
    # +/-inf is treated like a missing measurement.
    pl = df["EYE_LEFT_PUPIL_DIAMETER"].astype(float).replace([np.inf, -np.inf], np.nan)
    pr = df["EYE_RIGHT_PUPIL_DIAMETER"].astype(float).replace([np.inf, -np.inf], np.nan)

    vl = df.get("EYE_LEFT_PUPIL_VALIDITY")
    vr = df.get("EYE_RIGHT_PUPIL_VALIDITY")

    if vl is None or vr is None:
        # No validity columns: fall back to "a diameter value exists".
        validity = (~pl.isna() | ~pr.isna()).astype(int).to_numpy()
    else:
        # Valid as soon as one of the two eyes reports validity 1.
        validity = ((vl == 1) | (vr == 1)).astype(int).to_numpy()

    # NaN-aware mean: DataFrame.mean skips NaN by default, so one missing eye
    # no longer turns the whole sample into NaN (np.mean did exactly that).
    both = pd.DataFrame({"left": pl.to_numpy(), "right": pr.to_numpy()})
    p = both.mean(axis=1)

    # Repair the samples that are missing in both eyes.
    p = p.interpolate(limit=50, limit_direction="both").bfill().ffill()
    p = p.to_numpy()

    print("→ extract_pupil(): Pupillensignal Länge:", len(p))
    return p, validity
|
||||||
|
|
||||||
|
|
||||||
|
def detect_blinks(pupil_validity, min_duration=5):
    """
    Detect blinks as runs of invalid pupil samples.

    A run starts at the first sample with validity 0 and ends at the next
    sample with validity 1. Runs shorter than ``min_duration`` samples are
    discarded. A run that is still open when the sequence ends is closed at
    the end of the sequence, so a blink cut off by the window boundary is
    counted as well.

    Parameters
    ----------
    pupil_validity : iterable of int
        Per-sample validity flags (1 = valid, 0 = invalid).
    min_duration : int
        Minimum run length in samples for a run to count as a blink.

    Returns
    -------
    list of [int, int]
        ``[start, end]`` index pairs; ``end`` is exclusive (index of the first
        valid sample after the run, or the sequence length for a trailing run).
    """
    blinks = []
    start = None
    length = 0

    for i, v in enumerate(pupil_validity):
        length = i + 1
        if v == 0 and start is None:
            start = i
        elif v == 1 and start is not None:
            if i - start >= min_duration:
                blinks.append([start, i])
            start = None

    # The sequence ended while a run was still open: close it at the end.
    if start is not None and length - start >= min_duration:
        blinks.append([start, length])

    return blinks
|
||||||
|
|
||||||
|
|
||||||
|
def compute_IPA(pupil, fs=250):
    """
    Pupil activity measure: summed Welch power in the 0.6-2.0 Hz band.

    The code sums the Welch power spectral density estimate of the pupil
    signal over all frequency bins between 0.6 Hz and 2.0 Hz (inclusive).
    NOTE(review): the original docstring labels this as the Index of
    Pupillary Activity after Duchowski 2018; what is computed here is a
    spectral band-power sum -- confirm that this proxy is intended.

    Parameters
    ----------
    pupil : array-like
        Pupil-diameter signal of one window.
    fs : int
        Sampling rate in Hz.

    Returns
    -------
    float
        Sum of the PSD values inside the band, or NaN for an empty signal.
    """
    samples = np.asarray(pupil, dtype=float)
    n_samples = samples.shape[-1] if samples.ndim else 0
    if n_samples == 0:
        # Nothing to analyse; use the same "not available" marker as the callers.
        return np.nan

    # Target segment length is 2 seconds. Never ask for more samples than the
    # signal has, otherwise SciPy warns and silently shortens the segment.
    nperseg = min(int(fs * 2), n_samples)
    f, Pxx = welch(samples, fs=fs, nperseg=nperseg)

    hf_band = (f >= 0.6) & (f <= 2.0)
    ipa = np.sum(Pxx[hf_band])

    return ipa
|
||||||
|
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
# 2. FEATURE-EXTRAKTION (HAUPTFUNKTION)
|
||||||
|
##############################################################################
|
||||||
|
|
||||||
|
def extract_eye_features(df, window_length_sec=50, fs=250):
    """
    Compute eye-tracking features on consecutive, non-overlapping windows.

    Parameters
    ----------
    df : pandas.DataFrame
        Tobii eye-tracking frame (as returned by ``clean_eye_df``).
    window_length_sec : float
        Window length in seconds.
    fs : int
        Sampling rate in Hz.

    Returns
    -------
    pandas.DataFrame
        One row per complete window with fixation, saccade, blink and pupil
        features. A trailing incomplete window is ignored. Fixation and
        saccade durations are in milliseconds (they come from the millisecond
        time axis passed to PyGaze); blink durations are in seconds.
    """
    print("→ extract_eye_features(): Starte Feature-Berechnung...")
    print(" Fensterlänge W =", window_length_sec, "s")

    W = int(window_length_sec * fs)  # window size in samples

    # Gaze
    gaze = extract_gaze_signal(df)
    gx, gy = gaze[:, 0], gaze[:, 1]
    print("Gültige Werte (gx):", np.sum(~np.isnan(gx)), "von", len(gx))
    print("Range:", np.nanmin(gx), np.nanmax(gx))
    print("Gültige Werte (gy):", np.sum(~np.isnan(gy)), "von", len(gy))
    print("Range:", np.nanmin(gy), np.nanmax(gy))

    # Pupil
    pupil, pupil_validity = extract_pupil(df)

    # Time axis in ms relative to the window start; identical for every
    # window, so it is built once instead of once per iteration.
    time_ms = np.arange(W) * 1000.0 / fs

    features = []

    # Only complete windows: the last start index still leaves W samples.
    for start in range(0, len(df) - W + 1, W):
        end = start + W

        w_gaze = gaze[start:end]
        w_pupil = pupil[start:end]
        w_valid = pupil_validity[start:end]

        # ----------------------------
        # FIXATIONS (PyGaze)
        # ----------------------------
        # NOTE(review): gaze is min-max scaled, so its minimum is exactly 0.0,
        # the same value that is passed as `missing` -- confirm that PyGaze
        # does not discard those samples.
        fix, efix = fixation_detection(
            x=w_gaze[:, 0], y=w_gaze[:, 1], time=time_ms,
            missing=0.0, maxdist=0.001, mindur=65  # minimum fixation duration: 65 ms
        )

        if start == 0:
            print("DEBUG fix raw:", fix[:10])

        # PyGaze may return malformed entries; keep only finite, positive
        # durations (element 2 of each finished fixation).
        fixation_durations = [f[2] for f in efix if np.isfinite(f[2]) and f[2] > 0]

        # Duration categories as defined in the paper (milliseconds)
        F_short = sum(66 <= d <= 150 for d in fixation_durations)
        F_medium = sum(300 <= d <= 500 for d in fixation_durations)
        F_long = sum(d >= 1000 for d in fixation_durations)
        F_hundred = sum(d > 100 for d in fixation_durations)
        F_Cancel = sum(66 < d for d in fixation_durations)

        # ----------------------------
        # SACCADES
        # ----------------------------
        sac, esac = saccade_detection(
            x=w_gaze[:, 0], y=w_gaze[:, 1], time=time_ms, missing=0, minlen=12, maxvel=0.2, maxacc=1
        )

        sac_durations = [s[2] for s in esac]
        # Euclidean distance between elements (3, 4) and (5, 6) of each saccade
        # -- presumably start and end position; verify against PyGaze.
        sac_amplitudes = [((s[5]-s[3])**2 + (s[6]-s[4])**2)**0.5 for s in esac]

        # ----------------------------
        # BLINKS
        # ----------------------------
        blinks = detect_blinks(w_valid)
        blink_durations = [(b[1] - b[0]) / fs for b in blinks]  # seconds

        # ----------------------------
        # PUPIL
        # ----------------------------
        if np.all(np.isnan(w_pupil)):
            mean_pupil = np.nan
            ipa = np.nan
        else:
            mean_pupil = np.nanmean(w_pupil)
            ipa = compute_IPA(w_pupil, fs=fs)

        # ----------------------------
        # FILL FEATURE TABLE
        # ----------------------------
        features.append({
            "Fix_count_short_66_150": F_short,
            "Fix_count_medium_300_500": F_medium,
            "Fix_count_long_gt_1000": F_long,
            "Fix_count_100": F_hundred,
            "Fix_cancel": F_Cancel,
            "Fix_mean_duration": np.mean(fixation_durations) if fixation_durations else 0,
            "Fix_median_duration": np.median(fixation_durations) if fixation_durations else 0,

            "Sac_count": len(sac),
            "Sac_mean_amp": np.mean(sac_amplitudes) if sac_amplitudes else 0,
            "Sac_mean_dur": np.mean(sac_durations) if sac_durations else 0,
            "Sac_median_dur": np.median(sac_durations) if sac_durations else 0,

            "Blink_count": len(blinks),
            "Blink_mean_dur": np.mean(blink_durations) if blink_durations else 0,
            "Blink_median_dur": np.median(blink_durations) if blink_durations else 0,

            "Pupil_mean": mean_pupil,
            "Pupil_IPA": ipa
        })

    result = pd.DataFrame(features)
    print("→ extract_eye_features(): Fertig! Ergebnisgröße:", result.shape)

    return result
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
# 3. MAIN FUNKTION
|
||||||
|
##############################################################################
|
||||||
|
|
||||||
|
def main():
    """
    Load the signal table from ``tmp22.h5`` (key ``SIGNALS``), keep the
    eye-tracking rows, extract features on 50 s windows at 250 Hz and write
    them to a CSV file.
    """
    print("### STARTE FEATURE-EXTRAKTION ###")
    print("Aktueller Arbeitsordner:", os.getcwd())

    df = pd.read_hdf("tmp22.h5", "SIGNALS", mode="r")
    print("DataFrame geladen:", df.shape)

    print("Reinige Eyetracking-Daten ...")
    df_eye = clean_eye_df(df)

    # Feature extraction
    features = extract_eye_features(df_eye, window_length_sec=50, fs=250)

    print("\n### FEATURE-MATRIX (HEAD) ###")
    print(features.head())

    # One name for both the log line and the write call, so the message can
    # no longer announce a different file than the one that is written.
    output_csv = "features2.csv"
    print(f"\nSpeichere Output in {output_csv} ...")
    features.to_csv(output_csv, index=False)

    print("FERTIG!")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
@ -1,79 +1,72 @@
|
|||||||
import math
|
import math
|
||||||
|
|
||||||
|
def fixation_radius_normalized(theta_deg: float,
|
||||||
def fixation_radius_normalized(
|
|
||||||
theta_deg: float,
|
|
||||||
distance_cm: float,
|
distance_cm: float,
|
||||||
screen_width_cm: float,
|
screen_width_cm: float,
|
||||||
screen_height_cm: float,
|
screen_height_cm: float,
|
||||||
resolution_x: int,
|
resolution_x: int,
|
||||||
resolution_y: int,
|
resolution_y: int,
|
||||||
method: str = "max",
|
method: str = "max"):
|
||||||
):
|
|
||||||
"""
|
"""
|
||||||
Compute the PyGaze fixation radius for normalized gaze data in [0, 1].
|
Berechnet den PyGaze-Fixationsradius für normierte Gaze-Daten in [0,1].
|
||||||
"""
|
"""
|
||||||
# Visual angle to physical distance (cm)
|
# Schritt 1: visueller Winkel → physische Distanz (cm)
|
||||||
delta_cm = 2 * distance_cm * math.tan(math.radians(theta_deg) / 2)
|
delta_cm = 2 * distance_cm * math.tan(math.radians(theta_deg) / 2)
|
||||||
|
|
||||||
# Physical distance to pixels
|
# Schritt 2: physische Distanz → Pixel
|
||||||
delta_px_x = delta_cm * (resolution_x / screen_width_cm)
|
delta_px_x = delta_cm * (resolution_x / screen_width_cm)
|
||||||
delta_px_y = delta_cm * (resolution_y / screen_height_cm)
|
delta_px_y = delta_cm * (resolution_y / screen_height_cm)
|
||||||
|
|
||||||
# Pixel radius
|
# Pixelradius
|
||||||
if method == "max":
|
if method == "max":
|
||||||
r_px = max(delta_px_x, delta_px_y)
|
r_px = max(delta_px_x, delta_px_y)
|
||||||
else:
|
else:
|
||||||
r_px = math.sqrt(delta_px_x**2 + delta_px_y**2)
|
r_px = math.sqrt(delta_px_x**2 + delta_px_y**2)
|
||||||
|
|
||||||
# Pixel radius to normalized radius
|
# Schritt 3: Pixelradius → normierter Radius
|
||||||
r_norm_x = r_px / resolution_x
|
r_norm_x = r_px / resolution_x
|
||||||
r_norm_y = r_px / resolution_y
|
r_norm_y = r_px / resolution_y
|
||||||
|
|
||||||
if method == "max":
|
if method == "max":
|
||||||
return max(r_norm_x, r_norm_y)
|
return max(r_norm_x, r_norm_y)
|
||||||
|
else:
|
||||||
return math.sqrt(r_norm_x**2 + r_norm_y**2)
|
return math.sqrt(r_norm_x**2 + r_norm_y**2)
|
||||||
|
|
||||||
|
|
||||||
def run_example():
|
|
||||||
# Example: 55" 4k monitor
|
|
||||||
screen_width_cm = 3 * 121.8
|
|
||||||
screen_height_cm = 68.5
|
|
||||||
resolution_x = 3 * 3840
|
|
||||||
resolution_y = 2160
|
|
||||||
distance_to_screen_cm = 120
|
|
||||||
max_angle = 1.0
|
|
||||||
|
|
||||||
maxdist_px = fixation_radius_normalized(
|
|
||||||
theta_deg=max_angle,
|
|
||||||
|
|
||||||
|
# Beispiel: 55" 4k Monitor
|
||||||
|
screen_width_cm = 3*121.8
|
||||||
|
screen_height_cm = 68.5
|
||||||
|
resolution_x = 3*3840
|
||||||
|
resolution_y = 2160
|
||||||
|
distance_to_screen_cm = 120
|
||||||
|
method = 'max'
|
||||||
|
max_angle= 1.0
|
||||||
|
|
||||||
|
maxdist_px = fixation_radius_normalized(theta_deg=max_angle,
|
||||||
distance_cm=distance_to_screen_cm,
|
distance_cm=distance_to_screen_cm,
|
||||||
screen_width_cm=screen_width_cm,
|
screen_width_cm=screen_width_cm,
|
||||||
screen_height_cm=screen_height_cm,
|
screen_height_cm=screen_height_cm,
|
||||||
resolution_x=resolution_x,
|
resolution_x=resolution_x,
|
||||||
resolution_y=resolution_y,
|
resolution_y=resolution_y,
|
||||||
method="max",
|
method=method)
|
||||||
)
|
|
||||||
print("PyGaze max_dist (max):", maxdist_px)
|
|
||||||
|
|
||||||
maxdist_px = fixation_radius_normalized(
|
print("PyGaze max_dist (max):", maxdist_px)
|
||||||
theta_deg=max_angle,
|
|
||||||
|
method = 'euclid'
|
||||||
|
maxdist_px = fixation_radius_normalized(theta_deg=max_angle,
|
||||||
distance_cm=distance_to_screen_cm,
|
distance_cm=distance_to_screen_cm,
|
||||||
screen_width_cm=screen_width_cm,
|
screen_width_cm=screen_width_cm,
|
||||||
screen_height_cm=screen_height_cm,
|
screen_height_cm=screen_height_cm,
|
||||||
resolution_x=resolution_x,
|
resolution_x=resolution_x,
|
||||||
resolution_y=resolution_y,
|
resolution_y=resolution_y,
|
||||||
method="euclid",
|
method=method)
|
||||||
)
|
|
||||||
print("PyGaze max_dist (euclid):", maxdist_px)
|
|
||||||
|
|
||||||
|
print("PyGaze max_dist (euclid):", maxdist_px)
|
||||||
|
|
||||||
def main():
|
# Passt noch nicht zu der Breite
|
||||||
run_example()
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
||||||
|
|
||||||
# Reference
|
|
||||||
# https://osdoc.cogsci.nl/4.0/de/visualangle/
|
# https://osdoc.cogsci.nl/4.0/de/visualangle/
|
||||||
# https://reference.org/facts/Visual_angle/LUw29zy7
|
# https://reference.org/facts/Visual_angle/LUw29zy7
|
||||||
155
dataset_creation/open_parquet_test.ipynb
Normal file
155
dataset_creation/open_parquet_test.ipynb
Normal file
@ -0,0 +1,155 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "2b3fface",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import pandas as pd"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "74f1f5ec",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"df= pd.read_parquet(r\" \")\n",
|
||||||
|
"print(df.shape)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "05775454",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"df.head()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "99e17328",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"df.tail()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "69e53731",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"df.info()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "3754c664",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Zeigt alle Kombinationen mit Häufigkeit\n",
|
||||||
|
"df[['STUDY', 'PHASE', 'LEVEL']].value_counts(ascending=True)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "f83b595c",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"high_nback = df[\n",
|
||||||
|
" (df[\"STUDY\"]==\"n-back\") &\n",
|
||||||
|
" (df[\"LEVEL\"].isin([2, 3, 5, 6])) &\n",
|
||||||
|
" (df[\"PHASE\"].isin([\"train\", \"test\"]))\n",
|
||||||
|
"]\n",
|
||||||
|
"high_nback.shape"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "c0940343",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"low_all = df[\n",
|
||||||
|
" ((df[\"PHASE\"] == \"baseline\") |\n",
|
||||||
|
" ((df[\"STUDY\"] == \"n-back\") & (df[\"PHASE\"] != \"baseline\") & (df[\"LEVEL\"].isin([1,4]))))\n",
|
||||||
|
"]\n",
|
||||||
|
"print(low_all.shape)\n",
|
||||||
|
"high_kdrive = df[\n",
|
||||||
|
" (df[\"STUDY\"] == \"k-drive\") & (df[\"PHASE\"] != \"baseline\")\n",
|
||||||
|
"]\n",
|
||||||
|
"print(high_kdrive.shape)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "f7ce38d3",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"print((df.shape[0]==(high_kdrive.shape[0]+high_nback.shape[0]+low_all.shape[0])))\n",
|
||||||
|
"print(df.shape[0])\n",
|
||||||
|
"print((high_kdrive.shape[0]+high_nback.shape[0]+low_all.shape[0]))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "48ba0379",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"high_all = pd.concat([high_nback, high_kdrive])\n",
|
||||||
|
"high_all.shape"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "77dda26c",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"print(f\"Gesamt: {df.shape[0]}=={low_all.shape[0]+high_all.shape[0]}\")\n",
|
||||||
|
"print(f\"Anzahl an low load Samples: {low_all.shape[0]}\")\n",
|
||||||
|
"print(f\"Anzahl an high load Samples: {high_all.shape[0]}\")\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "base",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.11.5"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
||||||
@ -28,7 +28,7 @@
|
|||||||
"sys.path.append(base_dir)\n",
|
"sys.path.append(base_dir)\n",
|
||||||
"print(base_dir)\n",
|
"print(base_dir)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"from Fahrsimulator_MSY2526_AI.model_training.tools import evaluation_tools\n",
|
"from tools import evaluation_tools\n",
|
||||||
"from sklearn.preprocessing import StandardScaler, MinMaxScaler\n",
|
"from sklearn.preprocessing import StandardScaler, MinMaxScaler\n",
|
||||||
"from sklearn.ensemble import IsolationForest\n",
|
"from sklearn.ensemble import IsolationForest\n",
|
||||||
"from sklearn.model_selection import GridSearchCV, KFold\n",
|
"from sklearn.model_selection import GridSearchCV, KFold\n",
|
||||||
@ -52,7 +52,7 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"data_path = Path(r\"/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/50s_25Hz_dataset.parquet\")"
|
"data_path = Path(r\"C:\\Users\\micha\\FAUbox\\WS2526_Fahrsimulator_MSY (Celina Korzer)\\AU_dataset\\output_windowed.parquet\")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -301,26 +301,20 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"au_columns = [col for col in low_all.columns if \"face\" in col.lower()] \n",
|
"# Cell 2: Get AU columns and prepare datasets\n",
|
||||||
|
"# Get all column names that start with 'AU'\n",
|
||||||
|
"au_columns = [col for col in low_all.columns if col.startswith('AU')]\n",
|
||||||
"\n",
|
"\n",
|
||||||
"eye_columns = [ \n",
|
|
||||||
" 'Fix_count_short_66_150','Fix_count_medium_300_500','Fix_count_long_gt_1000', \n",
|
|
||||||
" 'Fix_count_100','Fix_mean_duration','Fix_median_duration', \n",
|
|
||||||
" 'Sac_count','Sac_mean_amp','Sac_mean_dur','Sac_median_dur', \n",
|
|
||||||
" 'Blink_count','Blink_mean_dur','Blink_median_dur', \n",
|
|
||||||
" 'Pupil_mean','Pupil_IPA' \n",
|
|
||||||
"] \n",
|
|
||||||
"cols = au_columns +eye_columns\n",
|
|
||||||
"# Prepare training data (only normal/low data)\n",
|
"# Prepare training data (only normal/low data)\n",
|
||||||
"train_data = low_all[low_all['subjectID'].isin(train_subjects)][['subjectID'] + cols].copy()\n",
|
"train_data = low_all[low_all['subjectID'].isin(train_subjects)][['subjectID'] + au_columns].copy()\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Prepare validation data (normal and anomaly)\n",
|
"# Prepare validation data (normal and anomaly)\n",
|
||||||
"val_normal_data = low_all[low_all['subjectID'].isin(val_subjects)][['subjectID'] + cols].copy()\n",
|
"val_normal_data = low_all[low_all['subjectID'].isin(val_subjects)][['subjectID'] + au_columns].copy()\n",
|
||||||
"val_high_data = high_all[high_all['subjectID'].isin(val_subjects)][['subjectID'] + cols].copy()\n",
|
"val_high_data = high_all[high_all['subjectID'].isin(val_subjects)][['subjectID'] + au_columns].copy()\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Prepare test data (normal and anomaly)\n",
|
"# Prepare test data (normal and anomaly)\n",
|
||||||
"test_normal_data = low_all[low_all['subjectID'].isin(test_subjects)][['subjectID'] + cols].copy()\n",
|
"test_normal_data = low_all[low_all['subjectID'].isin(test_subjects)][['subjectID'] + au_columns].copy()\n",
|
||||||
"test_high_data = high_all[high_all['subjectID'].isin(test_subjects)][['subjectID'] + cols].copy()\n",
|
"test_high_data = high_all[high_all['subjectID'].isin(test_subjects)][['subjectID'] + au_columns].copy()\n",
|
||||||
"\n",
|
"\n",
|
||||||
"print(f\"Train samples: {len(train_data)}\")\n",
|
"print(f\"Train samples: {len(train_data)}\")\n",
|
||||||
"print(f\"Val normal samples: {len(val_normal_data)}, Val high samples: {len(val_high_data)}\")\n",
|
"print(f\"Val normal samples: {len(val_normal_data)}, Val high samples: {len(val_high_data)}\")\n",
|
||||||
@ -334,8 +328,8 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Fit normalizer on training data\n",
|
"# Cell 3: Fit normalizer on training data\n",
|
||||||
"normalizer = fit_normalizer(train_data, cols, method='minmax', scope='global')\n",
|
"normalizer = fit_normalizer(train_data, au_columns, method='minmax', scope='global')\n",
|
||||||
"print(\"Normalizer fitted on training data\")"
|
"print(\"Normalizer fitted on training data\")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@ -346,12 +340,12 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Apply normalization to all datasets\n",
|
"# Cell 4: Apply normalization to all datasets\n",
|
||||||
"train_normalized = apply_normalizer(train_data, cols, normalizer)\n",
|
"train_normalized = apply_normalizer(train_data, au_columns, normalizer)\n",
|
||||||
"val_normal_normalized = apply_normalizer(val_normal_data, cols, normalizer)\n",
|
"val_normal_normalized = apply_normalizer(val_normal_data, au_columns, normalizer)\n",
|
||||||
"val_high_normalized = apply_normalizer(val_high_data, cols, normalizer)\n",
|
"val_high_normalized = apply_normalizer(val_high_data, au_columns, normalizer)\n",
|
||||||
"test_normal_normalized = apply_normalizer(test_normal_data, cols, normalizer)\n",
|
"test_normal_normalized = apply_normalizer(test_normal_data, au_columns, normalizer)\n",
|
||||||
"test_high_normalized = apply_normalizer(test_high_data, cols, normalizer)\n",
|
"test_high_normalized = apply_normalizer(test_high_data, au_columns, normalizer)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"print(\"Normalization applied to all datasets\")"
|
"print(\"Normalization applied to all datasets\")"
|
||||||
]
|
]
|
||||||
@ -363,9 +357,11 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"X_train = train_normalized[cols].copy()\n",
|
"# Cell 5: Extract AU columns and create labels for grid search\n",
|
||||||
"X_val_normal = val_normal_normalized[cols].copy()\n",
|
"# Extract only AU columns (drop subjectID)\n",
|
||||||
"X_val_high = val_high_normalized[cols].copy()\n",
|
"X_train = train_normalized[au_columns].copy()\n",
|
||||||
|
"X_val_normal = val_normal_normalized[au_columns].copy()\n",
|
||||||
|
"X_val_high = val_high_normalized[au_columns].copy()\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Combine train and validation sets for grid search\n",
|
"# Combine train and validation sets for grid search\n",
|
||||||
"X_grid_search = pd.concat([X_train, X_val_normal, X_val_high], ignore_index=True)\n",
|
"X_grid_search = pd.concat([X_train, X_val_normal, X_val_high], ignore_index=True)\n",
|
||||||
@ -420,7 +416,7 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Train final model with best parameters on training data\n",
|
"# Cell 7: Train final model with best parameters on training data\n",
|
||||||
"final_model = IsolationForest(**best_params, random_state=42)\n",
|
"final_model = IsolationForest(**best_params, random_state=42)\n",
|
||||||
"final_model.fit(X_train.values)\n",
|
"final_model.fit(X_train.values)\n",
|
||||||
"\n",
|
"\n",
|
||||||
@ -434,9 +430,9 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Prepare independent test set\n",
|
"# Cell 8: Prepare independent test set\n",
|
||||||
"X_test_normal = test_normal_normalized[cols].copy()\n",
|
"X_test_normal = test_normal_normalized[au_columns].copy()\n",
|
||||||
"X_test_high = test_high_normalized[cols].copy()\n",
|
"X_test_high = test_high_normalized[au_columns].copy()\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Combine test sets\n",
|
"# Combine test sets\n",
|
||||||
"X_test = pd.concat([X_test_normal, X_test_high], ignore_index=True)\n",
|
"X_test = pd.concat([X_test_normal, X_test_high], ignore_index=True)\n",
|
||||||
@ -487,7 +483,7 @@
|
|||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"kernelspec": {
|
"kernelspec": {
|
||||||
"display_name": "Python 3 (ipykernel)",
|
"display_name": "base",
|
||||||
"language": "python",
|
"language": "python",
|
||||||
"name": "python3"
|
"name": "python3"
|
||||||
},
|
},
|
||||||
@ -501,7 +497,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.12.10"
|
"version": "3.11.5"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user