Compare commits

..

2 Commits

14 changed files with 195 additions and 1740 deletions

View File

@ -1,5 +1,13 @@
{ {
"cells": [ "cells": [
{
"cell_type": "markdown",
"id": "cc08936c",
"metadata": {},
"source": [
"## Insights into the dataset with histogramms and scatter plots"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"id": "1014c5e0", "id": "1014c5e0",
@ -17,7 +25,8 @@
"source": [ "source": [
"import pandas as pd\n", "import pandas as pd\n",
"import numpy as np\n", "import numpy as np\n",
"import matplotlib.pyplot as plt" "import matplotlib.pyplot as plt\n",
"from pathlib import Path"
] ]
}, },
{ {
@ -27,7 +36,7 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"path =r\"C:\\Users\\micha\\FAUbox\\WS2526_Fahrsimulator_MSY (Celina Korzer)\\AU_dataset\\output_windowed.parquet\"\n", "path = Path(r\"/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/50s_25Hz_dataset.parquet\")\n",
"df = pd.read_parquet(path=path)" "df = pd.read_parquet(path=path)"
] ]
}, },
@ -104,21 +113,27 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# Get all columns that start with 'AU'\n", "face_au_cols = [c for c in low_all.columns if c.startswith(\"FACE_AU\")]\n",
"au_columns = [col for col in low_all.columns if col.startswith('AU')]\n", "eye_cols = ['Fix_count_short_66_150', 'Fix_count_medium_300_500',\n",
" 'Fix_count_long_gt_1000', 'Fix_count_100', 'Fix_mean_duration',\n",
" 'Fix_median_duration', 'Sac_count', 'Sac_mean_amp', 'Sac_mean_dur',\n",
" 'Sac_median_dur', 'Blink_count', 'Blink_mean_dur', 'Blink_median_dur',\n",
" 'Pupil_mean', 'Pupil_IPA']\n",
"\n",
"cols = face_au_cols+eye_cols\n",
"\n", "\n",
"# Calculate number of rows and columns for subplots\n", "# Calculate number of rows and columns for subplots\n",
"n_cols = len(au_columns)\n", "n_cols = len(cols)\n",
"n_rows = 4\n", "n_rows = 7\n",
"n_cols_subplot = 5\n", "n_cols_subplot = 5\n",
"\n", "\n",
"# Create figure with subplots\n", "# Create figure with subplots\n",
"fig, axes = plt.subplots(n_rows, n_cols_subplot, figsize=(20, 16))\n", "fig, axes = plt.subplots(n_rows, n_cols_subplot, figsize=(20, 16))\n",
"axes = axes.flatten()\n", "axes = axes.flatten()\n",
"fig.suptitle('Action Unit (AU) Distributions: Low vs High', fontsize=20, fontweight='bold', y=0.995)\n", "fig.suptitle('Feature Distributions: Low vs High', fontsize=20, fontweight='bold', y=0.995)\n",
"\n", "\n",
"# Create histogram for each AU column\n", "# Create histogram for each AU column\n",
"for idx, col in enumerate(au_columns):\n", "for idx, col in enumerate(cols):\n",
" ax = axes[idx]\n", " ax = axes[idx]\n",
" \n", " \n",
" # Plot overlapping histograms\n", " # Plot overlapping histograms\n",
@ -133,18 +148,48 @@
" ax.grid(True, alpha=0.3)\n", " ax.grid(True, alpha=0.3)\n",
"\n", "\n",
"# Hide any unused subplots\n", "# Hide any unused subplots\n",
"for idx in range(len(au_columns), len(axes)):\n", "for idx in range(len(cols), len(axes)):\n",
" axes[idx].set_visible(False)\n", " axes[idx].set_visible(False)\n",
"\n", "\n",
"# Adjust layout\n", "# Adjust layout\n",
"plt.tight_layout()\n", "plt.tight_layout()\n",
"plt.show()" "plt.show()"
] ]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6cd53cdb",
"metadata": {},
"outputs": [],
"source": [
"# Create figure with subplots\n",
"fig, axes = plt.subplots(n_rows, n_cols_subplot, figsize=(20, 16))\n",
"axes = axes.flatten()\n",
"fig.suptitle('Feature Scatter: Low vs High', fontsize=20, fontweight='bold', y=0.995)\n",
"\n",
"for idx, col in enumerate(cols):\n",
" ax = axes[idx]\n",
"\n",
" # Scatterplots\n",
" ax.scatter(range(len(low_all[col])), low_all[col], alpha=0.6, color='blue', label='low_all', s=10)\n",
" ax.scatter(range(len(high_all[col])), high_all[col], alpha=0.6, color='red', label='high_all', s=10)\n",
"\n",
" ax.set_title(col, fontsize=10, fontweight='bold')\n",
" ax.set_xlabel('Sample index', fontsize=8)\n",
" ax.set_ylabel('Value', fontsize=8)\n",
" ax.legend(fontsize=8)\n",
" ax.grid(True, alpha=0.3)\n",
"\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
} }
], ],
"metadata": { "metadata": {
"kernelspec": { "kernelspec": {
"display_name": "base", "display_name": "Python 3 (ipykernel)",
"language": "python", "language": "python",
"name": "python3" "name": "python3"
}, },
@ -158,7 +203,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.11.5" "version": "3.12.10"
} }
}, },
"nbformat": 4, "nbformat": 4,

View File

@ -1,91 +0,0 @@
import os
import pandas as pd
from pathlib import Path

print(os.getcwd())
print("connection aufgebaut")

# Directory that holds the per-subject .h5 recordings.
data_dir = Path("/home/jovyan/Fahrsimulator_MSY2526_AI/EDA")

# Get all .h5 files and sort them so subject indices stay stable across runs.
matching_files = sorted(data_dir.glob("*.h5"))

# Chunk size for reading (adjust based on your RAM - 100k rows is
# ~50-100MB depending on columns).
CHUNK_SIZE = 100_000

for i, file_path in enumerate(matching_files):
    print(f"Subject {i} gestartet")
    print(f"{file_path} geoeffnet")

    # Step 1: Get total number of rows and column names without loading data.
    with pd.HDFStore(file_path, mode="r") as store:
        cols = store.select("SIGNALS", start=0, stop=1).columns
        nrows = store.get_storer("SIGNALS").nrows
    print(f"Total columns: {len(cols)}, Total rows: {nrows}")

    # Step 2: Filter columns that start with "EYE_" (eye-tracking signals).
    # NOTE: the original comment claimed "FACE_AU" — the code filters "EYE_".
    eye_cols = [c for c in cols if c.startswith("EYE_")]
    print(f"eye-tracking columns found: {eye_cols}")
    if len(eye_cols) == 0:
        print(f"keine eye-tracking-Signale in Subject {i}")
        continue

    # Columns to read: experiment metadata plus the eye-tracking signals.
    columns_to_read = ["STUDY", "LEVEL", "PHASE"] + eye_cols

    # Step 3: Process file in chunks to keep peak memory bounded.
    chunks_to_save = []
    for start_row in range(0, nrows, CHUNK_SIZE):
        stop_row = min(start_row + CHUNK_SIZE, nrows)
        print(f"Processing rows {start_row} to {stop_row} ({stop_row/nrows*100:.1f}%)")

        # Read chunk
        df_chunk = pd.read_hdf(
            file_path,
            key="SIGNALS",
            columns=columns_to_read,
            start=start_row,
            stop=stop_row
        )

        # Add metadata columns (subject index and absolute row position).
        df_chunk["subjectID"] = i
        df_chunk["rowID"] = range(start_row, stop_row)

        # Clean data: drop LEVEL==0 rows and incomplete samples.
        df_chunk = df_chunk[df_chunk["LEVEL"] != 0]
        df_chunk = df_chunk.dropna()

        # Only keep non-empty chunks
        if len(df_chunk) > 0:
            chunks_to_save.append(df_chunk)

        # Free memory
        del df_chunk

    print("load and cleaning done")

    # Step 4: Combine all chunks and save one parquet file per subject.
    if chunks_to_save:
        df_final = pd.concat(chunks_to_save, ignore_index=True)
        print(f"Final dataframe shape: {df_final.shape}")

        # Save to parquet
        base_dir = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/new_ET_Parquet_files")
        os.makedirs(base_dir, exist_ok=True)
        out_name = base_dir / f"ET_signals_extracted_{i:04d}.parquet"
        df_final.to_parquet(out_name, index=False)
        print(f"Saved to {out_name}")

        # Free memory
        del df_final
        del chunks_to_save
    else:
        print(f"No valid data found for Subject {i}")

print("All files processed!")

View File

@ -1,91 +0,0 @@
import os
import pandas as pd
from pathlib import Path

print(os.getcwd())
print("connection aufgebaut")

# Directory that holds the per-subject .h5 recordings.
data_dir = Path(r"C:\Users\x\repo\UXKI\Fahrsimulator_MSY2526_AI\newTmp")

# Get all .h5 files and sort them so subject indices stay stable across runs.
matching_files = sorted(data_dir.glob("*.h5"))

# Chunk size for reading (adjust based on your RAM - 100k rows is
# ~50-100MB depending on columns).
CHUNK_SIZE = 100_000

for i, file_path in enumerate(matching_files):
    print(f"Subject {i} gestartet")
    print(f"{file_path} geoeffnet")

    # Step 1: Get total number of rows and column names without loading data.
    with pd.HDFStore(file_path, mode="r") as store:
        cols = store.select("SIGNALS", start=0, stop=1).columns
        nrows = store.get_storer("SIGNALS").nrows
    print(f"Total columns: {len(cols)}, Total rows: {nrows}")

    # Step 2: Filter columns that start with "FACE_AU" (facial action units).
    # Renamed from the misleading `eye_cols` — these are AU columns.
    au_cols = [c for c in cols if c.startswith("FACE_AU")]
    print(f"FACE_AU columns found: {au_cols}")
    if len(au_cols) == 0:
        print(f"keine FACE_AU-Signale in Subject {i}")
        continue

    # Columns to read: experiment metadata plus the AU signals.
    columns_to_read = ["STUDY", "LEVEL", "PHASE"] + au_cols

    # Step 3: Process file in chunks to keep peak memory bounded.
    chunks_to_save = []
    for start_row in range(0, nrows, CHUNK_SIZE):
        stop_row = min(start_row + CHUNK_SIZE, nrows)
        print(f"Processing rows {start_row} to {stop_row} ({stop_row/nrows*100:.1f}%)")

        # Read chunk
        df_chunk = pd.read_hdf(
            file_path,
            key="SIGNALS",
            columns=columns_to_read,
            start=start_row,
            stop=stop_row
        )

        # Add metadata columns (subject index and absolute row position).
        df_chunk["subjectID"] = i
        df_chunk["rowID"] = range(start_row, stop_row)

        # Clean data: drop LEVEL==0 rows and incomplete samples.
        df_chunk = df_chunk[df_chunk["LEVEL"] != 0]
        df_chunk = df_chunk.dropna()

        # Only keep non-empty chunks
        if len(df_chunk) > 0:
            chunks_to_save.append(df_chunk)

        # Free memory
        del df_chunk

    print("load and cleaning done")

    # Step 4: Combine all chunks and save one parquet file per subject.
    if chunks_to_save:
        df_final = pd.concat(chunks_to_save, ignore_index=True)
        print(f"Final dataframe shape: {df_final.shape}")

        # Save to parquet
        base_dir = Path(r"C:\new_AU_parquet_files")
        os.makedirs(base_dir, exist_ok=True)
        out_name = base_dir / f"cleaned_{i:04d}.parquet"
        df_final.to_parquet(out_name, index=False)
        print(f"Saved to {out_name}")

        # Free memory
        del df_final
        del chunks_to_save
    else:
        print(f"No valid data found for Subject {i}")

print("All files processed!")

View File

@ -4,27 +4,26 @@ import pandas as pd
from pathlib import Path from pathlib import Path
from sklearn.preprocessing import MinMaxScaler from sklearn.preprocessing import MinMaxScaler
from scipy.signal import welch from scipy.signal import welch
from pygazeanalyser.detectors import fixation_detection, saccade_detection from pygazeanalyser.detectors import fixation_detection, saccade_detection # not installed by default
############################################################################## ##############################################################################
# KONFIGURATION # CONFIGURATION
############################################################################## ##############################################################################
INPUT_DIR = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/both_mod_parquet_files") INPUT_DIR = Path(r"") # directory that stores the parquet files (one file per subject)
OUTPUT_FILE = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/50s_25Hz_dataset.parquet") OUTPUT_FILE = Path(r"") # path for resulting dataset
WINDOW_SIZE_SAMPLES = 25*50 # 50s at 25Hz
WINDOW_SIZE_SAMPLES = 25*50 # 50s bei 25Hz STEP_SIZE_SAMPLES = 125 # 5s at 25Hz
STEP_SIZE_SAMPLES = 125 # 5s bei 25Hz
SAMPLING_RATE = 25 # Hz SAMPLING_RATE = 25 # Hz
MIN_DUR_BLINKS = 2 # x * 40ms MIN_DUR_BLINKS = 2 # x * 40ms
############################################################################## ##############################################################################
# EYE-TRACKING FUNKTIONEN # EYE-TRACKING FUNCTIONS
############################################################################## ##############################################################################
def clean_eye_df(df): def clean_eye_df(df):
"""Extrahiert nur Eye-Tracking Spalten und entfernt leere Zeilen.""" """Extracts Eye-Tracking columns only and removes empty rows."""
eye_cols = [c for c in df.columns if c.startswith("EYE_")] eye_cols = [c for c in df.columns if c.startswith("EYE_")]
if not eye_cols: if not eye_cols:
@ -38,7 +37,7 @@ def clean_eye_df(df):
def extract_gaze_signal(df): def extract_gaze_signal(df):
"""Extrahiert 2D-Gaze-Positionen, maskiert ungültige Samples und interpoliert.""" """Extracts 2D gaze positions, masks invalid samples, and interpolates."""
gx_L = df["EYE_LEFT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy() gx_L = df["EYE_LEFT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy()
gy_L = df["EYE_LEFT_GAZE_POINT_ON_DISPLAY_AREA_Y"].astype(float).copy() gy_L = df["EYE_LEFT_GAZE_POINT_ON_DISPLAY_AREA_Y"].astype(float).copy()
gx_R = df["EYE_RIGHT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy() gx_R = df["EYE_RIGHT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy()
@ -51,14 +50,14 @@ def extract_gaze_signal(df):
for arr in [gx_L, gy_L, gx_R, gy_R]: for arr in [gx_L, gy_L, gx_R, gy_R]:
arr.replace([np.inf, -np.inf], np.nan, inplace=True) arr.replace([np.inf, -np.inf], np.nan, inplace=True)
# Ungültige maskieren # Mask invalids
gx_L[~val_L] = np.nan gx_L[~val_L] = np.nan
gy_L[~val_L] = np.nan gy_L[~val_L] = np.nan
gx_R[~val_R] = np.nan gx_R[~val_R] = np.nan
gy_R[~val_R] = np.nan gy_R[~val_R] = np.nan
# Mittelwert beider Augen # Mean of both eyes
gx = np.mean(np.column_stack([gx_L, gx_R]), axis=1) gx = np.mean(np.column_stack([gx_L, gx_R]), axis=1)
gy = np.mean(np.column_stack([gy_L, gy_R]), axis=1) gy = np.mean(np.column_stack([gy_L, gy_R]), axis=1)
@ -66,7 +65,7 @@ def extract_gaze_signal(df):
gx = pd.Series(gx).interpolate(limit=None, limit_direction="both").bfill().ffill() gx = pd.Series(gx).interpolate(limit=None, limit_direction="both").bfill().ffill()
gy = pd.Series(gy).interpolate(limit=None, limit_direction="both").bfill().ffill() gy = pd.Series(gy).interpolate(limit=None, limit_direction="both").bfill().ffill()
# MinMax Skalierung # MinMax scaling
xscaler = MinMaxScaler() xscaler = MinMaxScaler()
gxscale = xscaler.fit_transform(gx.values.reshape(-1, 1)) gxscale = xscaler.fit_transform(gx.values.reshape(-1, 1))
@ -77,7 +76,7 @@ def extract_gaze_signal(df):
def extract_pupil(df): def extract_pupil(df):
"""Extrahiert Pupillengröße (beide Augen gemittelt).""" """Extract pupil size (average of both eyes)."""
pl = df["EYE_LEFT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan) pl = df["EYE_LEFT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan)
pr = df["EYE_RIGHT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan) pr = df["EYE_RIGHT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan)
@ -96,7 +95,7 @@ def extract_pupil(df):
def detect_blinks(pupil_validity, min_duration=5): def detect_blinks(pupil_validity, min_duration=5):
"""Erkennt Blinks: Validity=0 → Blink.""" """Detect blinks: Validity=0 → Blink."""
blinks = [] blinks = []
start = None start = None
@ -120,13 +119,13 @@ def compute_IPA(pupil, fs=25):
def extract_eye_features_window(df_eye_window, fs=25, min_dur_blinks=2): def extract_eye_features_window(df_eye_window, fs=25, min_dur_blinks=2):
""" """
Extrahiert Eye-Tracking Features für ein einzelnes Window. Extracts eye tracking features for a single window.
Gibt Dictionary mit allen Eye-Features zurück. Returns a dictionary containing all eye features.
""" """
# Gaze # Gaze
gaze = extract_gaze_signal(df_eye_window) gaze = extract_gaze_signal(df_eye_window)
# Pupille # Pupil
pupil, pupil_validity = extract_pupil(df_eye_window) pupil, pupil_validity = extract_pupil(df_eye_window)
window_size = len(df_eye_window) window_size = len(df_eye_window)
@ -143,7 +142,6 @@ def extract_eye_features_window(df_eye_window, fs=25, min_dur_blinks=2):
fixation_durations = [f[2] for f in efix if np.isfinite(f[2]) and f[2] > 0] fixation_durations = [f[2] for f in efix if np.isfinite(f[2]) and f[2] > 0]
# Kategorien
F_short = sum(66 <= d <= 150 for d in fixation_durations) F_short = sum(66 <= d <= 150 for d in fixation_durations)
F_medium = sum(300 <= d <= 500 for d in fixation_durations) F_medium = sum(300 <= d <= 500 for d in fixation_durations)
F_long = sum(d >= 1000 for d in fixation_durations) F_long = sum(d >= 1000 for d in fixation_durations)
@ -197,27 +195,27 @@ def extract_eye_features_window(df_eye_window, fs=25, min_dur_blinks=2):
############################################################################## ##############################################################################
# KOMBINIERTE FEATURE-EXTRAKTION # Combined feature extraction
############################################################################## ##############################################################################
def process_combined_features(input_dir, output_file, window_size, step_size, fs=25,min_duration_blinks=2): def process_combined_features(input_dir, output_file, window_size, step_size, fs=25,min_duration_blinks=2):
""" """
Verarbeitet Parquet-Dateien mit FACE_AU und EYE Spalten. Processes Parquet files with FACE_AU and EYE columns.
Extrahiert beide Feature-Sets und kombiniert sie. Extracts both feature sets and combines them.
""" """
input_path = Path(input_dir) input_path = Path(input_dir)
parquet_files = sorted(input_path.glob("*.parquet")) parquet_files = sorted(input_path.glob("*.parquet"))
if not parquet_files: if not parquet_files:
print(f"FEHLER: Keine Parquet-Dateien in {input_dir} gefunden!") print(f"Error: No parquet-files found in {input_dir}!")
return None return None
print(f"\n{'='*70}") print(f"\n{'='*70}")
print(f"KOMBINIERTE FEATURE-EXTRAKTION") print(f"Combined feature-extraction")
print(f"{'='*70}") print(f"{'='*70}")
print(f"Dateien: {len(parquet_files)}") print(f"Files: {len(parquet_files)}")
print(f"Window: {window_size} Samples ({window_size/fs:.1f}s bei {fs}Hz)") print(f"Window: {window_size} Samples ({window_size/fs:.1f}s at {fs}Hz)")
print(f"Step: {step_size} Samples ({step_size/fs:.1f}s bei {fs}Hz)") print(f"Step: {step_size} Samples ({step_size/fs:.1f}s at {fs}Hz)")
print(f"{'='*70}\n") print(f"{'='*70}\n")
all_windows = [] all_windows = []
@ -227,24 +225,22 @@ def process_combined_features(input_dir, output_file, window_size, step_size, fs
try: try:
df = pd.read_parquet(parquet_file) df = pd.read_parquet(parquet_file)
print(f" Einträge: {len(df)}") print(f" Entries: {len(df)}")
# Identifiziere Spalten
au_columns = [col for col in df.columns if col.startswith('FACE_AU')] au_columns = [col for col in df.columns if col.startswith('FACE_AU')]
eye_columns = [col for col in df.columns if col.startswith('EYE_')] eye_columns = [col for col in df.columns if col.startswith('EYE_')]
print(f" AU-Spalten: {len(au_columns)}") print(f" AU-columns: {len(au_columns)}")
print(f" Eye-Spalten: {len(eye_columns)}") print(f" Eye-columns: {len(eye_columns)}")
has_au = len(au_columns) > 0 has_au = len(au_columns) > 0
has_eye = len(eye_columns) > 0 has_eye = len(eye_columns) > 0
if not has_au and not has_eye: if not has_au and not has_eye:
print(f" WARNUNG: Keine AU oder Eye Spalten gefunden!") print(f" Warning: No AU or eye tracking columns found!")
continue continue
# Gruppiere nach STUDY, LEVEL, PHASE # Group by STUDY, LEVEL, PHASE
group_cols = [col for col in ['STUDY', 'LEVEL', 'PHASE'] if col in df.columns] group_cols = [col for col in ['STUDY', 'LEVEL', 'PHASE'] if col in df.columns]
if group_cols: if group_cols:
@ -258,7 +254,7 @@ def process_combined_features(input_dir, output_file, window_size, step_size, fs
group_df = group_df.reset_index(drop=True) group_df = group_df.reset_index(drop=True)
# Berechne Anzahl Windows # calculate number of windows
num_windows = (len(group_df) - window_size) // step_size + 1 num_windows = (len(group_df) - window_size) // step_size + 1
if num_windows <= 0: if num_windows <= 0:
@ -272,7 +268,7 @@ def process_combined_features(input_dir, output_file, window_size, step_size, fs
window_df = group_df.iloc[start_idx:end_idx] window_df = group_df.iloc[start_idx:end_idx]
# Basis-Metadaten # basic metadata
result = { result = {
'subjectID': window_df['subjectID'].iloc[0], 'subjectID': window_df['subjectID'].iloc[0],
'start_time': window_df['rowID'].iloc[0], 'start_time': window_df['rowID'].iloc[0],
@ -281,12 +277,12 @@ def process_combined_features(input_dir, output_file, window_size, step_size, fs
'PHASE': window_df['PHASE'].iloc[0] if 'PHASE' in window_df.columns else np.nan 'PHASE': window_df['PHASE'].iloc[0] if 'PHASE' in window_df.columns else np.nan
} }
# FACE AU Features # FACE AU features
if has_au: if has_au:
for au_col in au_columns: for au_col in au_columns:
result[f'{au_col}_mean'] = window_df[au_col].mean() result[f'{au_col}_mean'] = window_df[au_col].mean()
# Eye-Tracking Features # Eye-tracking features
if has_eye: if has_eye:
try: try:
# clean dataframe from all nan rows # clean dataframe from all nan rows
@ -296,7 +292,7 @@ def process_combined_features(input_dir, output_file, window_size, step_size, fs
result.update(eye_features) result.update(eye_features)
except Exception as e: except Exception as e:
print(f" WARNUNG: Eye-Features fehlgeschlagen: {str(e)}") print(f" WARNUNG: Eye-Features fehlgeschlagen: {str(e)}")
# Füge NaN-Werte für Eye-Features hinzu # Add NaN-values for eye-features
result.update({ result.update({
"Fix_count_short_66_150": np.nan, "Fix_count_short_66_150": np.nan,
"Fix_count_medium_300_500": np.nan, "Fix_count_medium_300_500": np.nan,
@ -325,7 +321,7 @@ def process_combined_features(input_dir, output_file, window_size, step_size, fs
traceback.print_exc() traceback.print_exc()
continue continue
# Kombiniere alle Windows # Combine all windows
if not all_windows: if not all_windows:
print("\nKEINE FEATURES EXTRAHIERT!") print("\nKEINE FEATURES EXTRAHIERT!")
return None return None
@ -340,7 +336,7 @@ def process_combined_features(input_dir, output_file, window_size, step_size, fs
print(f"Spalten: {len(result_df.columns)}") print(f"Spalten: {len(result_df.columns)}")
print(f"Subjects: {result_df['subjectID'].nunique()}") print(f"Subjects: {result_df['subjectID'].nunique()}")
# Speichern # Save
output_path = Path(output_file) output_path = Path(output_file)
output_path.parent.mkdir(parents=True, exist_ok=True) output_path.parent.mkdir(parents=True, exist_ok=True)
result_df.to_parquet(output_file, index=False) result_df.to_parquet(output_file, index=False)
@ -357,7 +353,7 @@ def process_combined_features(input_dir, output_file, window_size, step_size, fs
def main(): def main():
print("\n" + "="*70) print("\n" + "="*70)
print("KOMBINIERTE FEATURE-EXTRAKTION (AU + EYE)") print("Combined extraction (AU + EYE)")
print("="*70) print("="*70)
result = process_combined_features( result = process_combined_features(
@ -370,16 +366,16 @@ def main():
) )
if result is not None: if result is not None:
print("\nErste 5 Zeilen:") print("\First 5 rows:")
print(result.head()) print(result.head())
print("\nSpalten-Übersicht:") print("\nColumns overview:")
print(result.dtypes) print(result.dtypes)
print("\nStatistik:") print("\Statistics:")
print(result.describe()) print(result.describe())
print("\n✓ FERTIG!\n") print("\nDone!\n")
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -1,113 +0,0 @@
import pandas as pd
import numpy as np
from pathlib import Path


def process_parquet_files(input_dir, output_file, window_size=1250, step_size=125):
    """
    Process parquet files with sliding-window aggregation.

    Parameters
    ----------
    input_dir : str or Path
        Directory containing the per-subject parquet files.
    output_file : str or Path
        Path of the output parquet file.
    window_size : int
        Sliding-window size in samples (default: 1250 = 50 s at 25 Hz).
        (The old docstring claimed 3000 — that did not match the signature.)
    step_size : int
        Step size in samples (default: 125 = 5 s at 25 Hz).

    Returns
    -------
    pandas.DataFrame or None
        One row per window with mean FACE_AU features, or None if no
        parquet files were found.
    """
    input_path = Path(input_dir)
    parquet_files = sorted(input_path.glob("*.parquet"))
    if not parquet_files:
        print(f"Keine Parquet-Dateien in {input_dir} gefunden!")
        return
    print(f"Gefundene Dateien: {len(parquet_files)}")

    all_windows = []
    for file_idx, parquet_file in enumerate(parquet_files):
        print(f"\nVerarbeite Datei {file_idx + 1}/{len(parquet_files)}: {parquet_file.name}")

        # Load parquet file
        df = pd.read_parquet(parquet_file)
        print(f" Einträge: {len(df)}")

        # Identify action-unit columns
        au_columns = [col for col in df.columns if col.startswith('FACE_AU')]
        print(f" AU-Spalten: {len(au_columns)}")

        # Group by STUDY, LEVEL, PHASE so windows never span condition boundaries
        for (study_val, level_val, phase_val), level_df in df.groupby(['STUDY', 'LEVEL', 'PHASE'], sort=False):
            print(f" STUDY {study_val}, LEVEL {level_val}, PHASE {phase_val}: {len(level_df)} Einträge")

            # Reset index for correct positional slicing
            level_df = level_df.reset_index(drop=True)

            # Sliding window over this group
            num_windows = (len(level_df) - window_size) // step_size + 1
            if num_windows <= 0:
                print(f" Zu wenige Einträge für Window (benötigt {window_size})")
                continue

            for i in range(num_windows):
                start_idx = i * step_size
                end_idx = start_idx + window_size
                window_df = level_df.iloc[start_idx:end_idx]

                # Aggregated result for this window
                result = {
                    'subjectID': window_df['subjectID'].iloc[0],
                    'start_time': window_df['rowID'].iloc[0],  # rowID as start_time
                    'STUDY': window_df['STUDY'].iloc[0],
                    'LEVEL': window_df['LEVEL'].iloc[0],
                    'PHASE': window_df['PHASE'].iloc[0]
                }

                # Mean of every AU column over the window
                for au_col in au_columns:
                    result[f'{au_col}_mean'] = window_df[au_col].mean()

                all_windows.append(result)
            print(f" Windows erstellt: {num_windows}")

    # Final DataFrame
    result_df = pd.DataFrame(all_windows)
    print(f"\n{'='*60}")
    print(f"Gesamt Windows erstellt: {len(result_df)}")
    print(f"Spalten: {list(result_df.columns)}")

    # Save result
    result_df.to_parquet(output_file, index=False)
    print(f"\nErgebnis gespeichert in: {output_file}")
    return result_df


# Example usage
if __name__ == "__main__":
    # Adjust these paths to your environment
    input_directory = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/new_AU_parquet_files")
    output_file = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/new_AU_dataset_mean/AU_dataset_mean.parquet")
    result = process_parquet_files(
        input_dir=input_directory,
        output_file=output_file,
        window_size=1250,
        step_size=125
    )
    # Show first rows
    if result is not None:
        print("\nErste 5 Zeilen des Ergebnisses:")
        print(result.head())

View File

@ -1,56 +0,0 @@
from pathlib import Path
import pandas as pd


def main():
    """Merge two single-modality window datasets on subject ID and start time.

    USER CONFIGURATION: adjust the input files, output directory, and
    column names below before running.
    """
    # Input parquet files (single-modality datasets)
    file_modality_1 = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/AU_dataset_mean.parquet")
    file_modality_2 = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/new_eye_dataset.parquet")

    # Output directory and file name
    output_dir = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/")
    output_file = output_dir / "merged_dataset.parquet"

    # Column names (adjust only if your schema differs)
    subject_col = "subjectID"
    time_col = "start_time"

    # Load both modality datasets.
    modality_a = pd.read_parquet(file_modality_1)
    modality_b = pd.read_parquet(file_modality_2)

    # Restrict both frames to subjects present in BOTH datasets.
    shared_subjects = set(modality_a[subject_col]) & set(modality_b[subject_col])
    modality_a = modality_a[modality_a[subject_col].isin(shared_subjects)]
    modality_b = modality_b[modality_b[subject_col].isin(shared_subjects)]

    # Inner join on subject ID AND window start time.
    merged_df = modality_a.merge(modality_b, on=[subject_col, time_col], how="inner")

    # Persist the merged dataset.
    output_dir.mkdir(parents=True, exist_ok=True)
    merged_df.to_parquet(output_file, index=False)


if __name__ == "__main__":
    main()

View File

@ -1,6 +1,5 @@
# pip install pyocclient
import yaml import yaml
import owncloud import owncloud # pip install pyocclient
import pandas as pd import pandas as pd
import h5py import h5py
import os import os
@ -26,7 +25,7 @@ for i in range(num_files):
# Download file from ownCloud # Download file from ownCloud
oc.get_file(file_name, local_tmp) oc.get_file(file_name, local_tmp)
print(f"{file_name} geoeffnet") print(f"Opened: {file_name}")
# Load into memory and extract needed columns # Load into memory and extract needed columns
# with h5py.File(local_tmp, "r") as f: # with h5py.File(local_tmp, "r") as f:
# # Adjust this path depending on actual dataset layout inside .h5py file # # Adjust this path depending on actual dataset layout inside .h5py file
@ -35,14 +34,9 @@ for i in range(num_files):
with pd.HDFStore(local_tmp, mode="r") as store: with pd.HDFStore(local_tmp, mode="r") as store:
cols = store.select("SIGNALS", start=0, stop=1).columns # get column names cols = store.select("SIGNALS", start=0, stop=1).columns # get column names
# Step 2: Filter columns that start with "AU"
au_cols = [c for c in cols if c.startswith("AU")]
print(au_cols)
if len(au_cols)==0:
print(f"keine AU Signale in Subject {i}")
continue
# Step 3: Read only those columns (plus any others you want) # Step 3: Read only those columns (plus any others you want)
df = pd.read_hdf(local_tmp, key="SIGNALS", columns=["STUDY", "LEVEL", "PHASE"] + au_cols) df = pd.read_hdf(local_tmp, key="SIGNALS", columns=["STUDY", "LEVEL", "PHASE"] + cols)
print("load done") print("load done")
@ -63,7 +57,7 @@ for i in range(num_files):
# Save to parquet # Save to parquet
os.makedirs("ParquetFiles", exist_ok=True) os.makedirs("ParquetFiles", exist_ok=True) # TODO: change for custom directory
out_name = f"ParquetFiles/cleaned_{i:04d}.parquet" out_name = f"ParquetFiles/cleaned_{i:04d}.parquet"
df.to_parquet(out_name, index=False) df.to_parquet(out_name, index=False)

View File

@ -1,323 +0,0 @@
import numpy as np
import pandas as pd
import h5py
import yaml
import os
from sklearn.preprocessing import MinMaxScaler
from scipy.signal import welch
from pygazeanalyser.detectors import fixation_detection, saccade_detection
##############################################################################
# 1. HELFERFUNKTIONEN
##############################################################################
def clean_eye_df(df):
    """Keep only eye-tracking columns and drop rows without any real eye data.

    Works around the main DataFrame containing NaN-only rows that belong to
    other sensors. Inf values (Tobii blink markers) are converted to NaN
    first. Returns a re-indexed DataFrame of the LEFT_/RIGHT_ columns.
    """
    tracking_cols = [col for col in df.columns if "LEFT_" in col or "RIGHT_" in col]
    cleaned = (
        df[tracking_cols]
        .replace([np.inf, -np.inf], np.nan)       # Inf markers -> missing
        .dropna(subset=tracking_cols, how="all")  # rows with no eye data at all
    )
    print("Eyetracking-Zeilen vorher:", len(df))
    print("Eyetracking-Zeilen nachher:", len(cleaned))
    # Reset index so downstream positional slicing works
    return cleaned.reset_index(drop=True)
def extract_gaze_signal(df):
    """Extract 2D on-display gaze positions averaged over both eyes.

    Invalid samples (validity != 1) and Inf values are masked to NaN, gaps
    of up to 50 samples are interpolated, and the edges are back-/forward-
    filled. Returns an (N, 2) array of x/y coordinates.

    NOTE(review): np.mean (not nanmean) is used here, so a sample where
    only one eye is valid becomes NaN and relies on interpolation.
    """
    print("→ extract_gaze_signal(): Eingabegröße:", df.shape)

    left_x = df["LEFT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy()
    left_y = df["LEFT_GAZE_POINT_ON_DISPLAY_AREA_Y"].astype(float).copy()
    right_x = df["RIGHT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy()
    right_y = df["RIGHT_GAZE_POINT_ON_DISPLAY_AREA_Y"].astype(float).copy()

    # Validity columns (1 = valid)
    valid_left = df["LEFT_GAZE_POINT_VALIDITY"] == 1
    valid_right = df["RIGHT_GAZE_POINT_VALIDITY"] == 1

    # Tobii reports Inf during blinks: turn those into NaN, then mask
    # everything the tracker flagged as invalid.
    for series, valid in ((left_x, valid_left), (left_y, valid_left),
                          (right_x, valid_right), (right_y, valid_right)):
        series.replace([np.inf, -np.inf], np.nan, inplace=True)
        series[~valid] = np.nan

    # Per-sample mean of the two eyes (NaN if either eye is NaN).
    gaze_x = np.mean(np.column_stack([left_x, right_x]), axis=1)
    gaze_y = np.mean(np.column_stack([left_y, right_y]), axis=1)

    # Bridge gaps of up to 50 samples, then pad edges (important for PyGaze!).
    gaze_x = pd.Series(gaze_x).interpolate(limit=50, limit_direction="both").bfill().ffill()
    gaze_y = pd.Series(gaze_y).interpolate(limit=50, limit_direction="both").bfill().ffill()

    out = np.column_stack((gaze_x, gaze_y))
    print("→ extract_gaze_signal(): Ausgabegröße:", out.shape)
    return out
def extract_pupil(df):
    """Extract the pupil diameter (mean of both eyes) plus a validity mask.

    Returns
    -------
    p : numpy.ndarray
        Interpolated pupil diameter per sample.
    validity : numpy.ndarray of int
        1 where at least one eye delivered a valid pupil sample, else 0.
    """
    pl = df["LEFT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan)
    pr = df["RIGHT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan)
    vl = df.get("LEFT_PUPIL_VALIDITY")
    vr = df.get("RIGHT_PUPIL_VALIDITY")
    if vl is None or vr is None:
        # No validity columns: fall back to the heuristic
        # "valid if at least one diameter is not NaN".
        validity = (~pl.isna() | ~pr.isna()).astype(int).to_numpy()
    else:
        # Valid if at least one eye is flagged valid.
        validity = ((vl == 1) | (vr == 1)).astype(int).to_numpy()
    # Mean of the available pupils. BUGFIX: nanmean keeps one-eyed samples
    # usable; np.mean yielded NaN whenever either eye was missing.
    p = np.nanmean(np.column_stack([pl, pr]), axis=1)
    # Repair remaining gaps (inf/NaN) by interpolation.
    p = pd.Series(p).interpolate(limit=50, limit_direction="both").bfill().ffill()
    p = p.to_numpy()
    print("→ extract_pupil(): Pupillensignal Länge:", len(p))
    return p, validity
def detect_blinks(pupil_validity, min_duration=5):
    """Detect blinks as runs of invalid samples (validity == 0).

    Parameters
    ----------
    pupil_validity : sequence of int
        1 for a valid sample, 0 for invalid (blink candidate).
    min_duration : int
        Minimum run length in samples for a run to count as a blink.

    Returns
    -------
    list of [start, end]
        Sample indices of each blink; ``end`` is exclusive.
    """
    blinks = []
    start = None
    for i, v in enumerate(pupil_validity):
        if v == 0 and start is None:
            start = i
        elif v == 1 and start is not None:
            if i - start >= min_duration:
                blinks.append([start, i])
            start = None
    # BUGFIX: a blink that runs to the end of the signal was silently
    # dropped before; close it explicitly.
    if start is not None and len(pupil_validity) - start >= min_duration:
        blinks.append([start, len(pupil_validity)])
    return blinks
def compute_IPA(pupil, fs=250):
"""
IPA = Index of Pupillary Activity (nach Duchowski 2018).
Hochfrequenzanteile der Pupillenzeitreihe.
"""
f, Pxx = welch(pupil, fs=fs, nperseg=int(fs*2)) # 2 Sekunden Fenster
hf_band = (f >= 0.6) & (f <= 2.0)
ipa = np.sum(Pxx[hf_band])
return ipa
##############################################################################
# 2. FEATURE-EXTRAKTION (HAUPTFUNKTION)
##############################################################################
def extract_eye_features(df, window_length_sec=50, fs=250):
    """Compute eye-tracking features over non-overlapping windows.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned Tobii DataFrame (see ``clean_eye_df``).
    window_length_sec : int
        Window size in seconds.
    fs : int
        Sampling rate in Hz.

    Returns
    -------
    pandas.DataFrame
        One row of fixation/saccade/blink/pupil features per window.
        A trailing partial window is ignored.

    Note: the noisy per-window debug prints (time axis, raw fixation
    entries) were removed; they flooded stdout on every window.
    """
    print("→ extract_eye_features(): Starte Feature-Berechnung...")
    print("   Fensterlänge W =", window_length_sec, "s")
    W = int(window_length_sec * fs)  # window size in samples
    # Gaze signal (x, y).
    gaze = extract_gaze_signal(df)
    gx, gy = gaze[:, 0], gaze[:, 1]
    print("Gültige Werte (gx):", np.sum(~np.isnan(gx)), "von", len(gx))
    print("Range:", np.nanmin(gx), np.nanmax(gx))
    print("Gültige Werte (gy):", np.sum(~np.isnan(gy)), "von", len(gy))
    print("Range:", np.nanmin(gy), np.nanmax(gy))
    # Pupil signal with per-sample validity mask.
    pupil, pupil_validity = extract_pupil(df)
    features = []
    # Non-overlapping windows; the trailing partial window is skipped.
    for start in range(0, len(df), W):
        end = start + W
        if end > len(df):
            break
        w_gaze = gaze[start:end]
        w_pupil = pupil[start:end]
        w_valid = pupil_validity[start:end]
        # ----------------------------
        # FIXATIONS (PyGaze)
        # ----------------------------
        time_ms = np.arange(W) * 1000.0 / fs  # window-local time axis in ms
        fix, efix = fixation_detection(
            x=w_gaze[:, 0], y=w_gaze[:, 1], time=time_ms,
            missing=0.0, maxdist=0.003, mindur=10
        )
        # PyGaze may return malformed entries; keep only finite, positive
        # durations (f[2] is the fixation duration in ms).
        fixation_durations = [f[2] for f in efix if np.isfinite(f[2]) and f[2] > 0]
        # Duration categories as defined in the paper.
        F_short = sum(66 <= d <= 150 for d in fixation_durations)
        F_medium = sum(300 <= d <= 500 for d in fixation_durations)
        F_long = sum(d >= 1000 for d in fixation_durations)
        F_hundred = sum(d > 100 for d in fixation_durations)
        F_Cancel = sum(66 < d for d in fixation_durations)
        # ----------------------------
        # SACCADES
        # ----------------------------
        sac, esac = saccade_detection(
            x=w_gaze[:, 0], y=w_gaze[:, 1], time=time_ms,
            missing=0, minlen=12, maxvel=0.2, maxacc=1
        )
        # s[2] is the saccade duration; (s[3], s[4]) -> (s[5], s[6]) are
        # the start and end coordinates.
        sac_durations = [s[2] for s in esac]
        sac_amplitudes = [((s[5] - s[3]) ** 2 + (s[6] - s[4]) ** 2) ** 0.5 for s in esac]
        # ----------------------------
        # BLINKS
        # ----------------------------
        blinks = detect_blinks(w_valid)
        blink_durations = [(b[1] - b[0]) / fs for b in blinks]  # seconds
        # ----------------------------
        # PUPIL
        # ----------------------------
        if np.all(np.isnan(w_pupil)):
            mean_pupil = np.nan
            ipa = np.nan
        else:
            mean_pupil = np.nanmean(w_pupil)
            ipa = compute_IPA(w_pupil, fs=fs)
        # ----------------------------
        # FEATURE ROW
        # ----------------------------
        features.append({
            "Fix_count_short_66_150": F_short,
            "Fix_count_medium_300_500": F_medium,
            "Fix_count_long_gt_1000": F_long,
            "Fix_count_100": F_hundred,
            "Fix_cancel": F_Cancel,
            "Fix_mean_duration": np.mean(fixation_durations) if fixation_durations else 0,
            "Fix_median_duration": np.median(fixation_durations) if fixation_durations else 0,
            "Sac_count": len(sac),
            "Sac_mean_amp": np.mean(sac_amplitudes) if sac_amplitudes else 0,
            "Sac_mean_dur": np.mean(sac_durations) if sac_durations else 0,
            "Sac_median_dur": np.median(sac_durations) if sac_durations else 0,
            "Blink_count": len(blinks),
            "Blink_mean_dur": np.mean(blink_durations) if blink_durations else 0,
            "Blink_median_dur": np.median(blink_durations) if blink_durations else 0,
            "Pupil_mean": mean_pupil,
            "Pupil_IPA": ipa
        })
    result = pd.DataFrame(features)
    print("→ extract_eye_features(): Fertig! Ergebnisgröße:", result.shape)
    return result
##############################################################################
# 3. MAIN FUNKTION
##############################################################################
def main():
    """Run the full eye-tracking pipeline on a single parquet file."""
    print("### STARTE FEATURE-EXTRAKTION ###")
    print("Aktueller Arbeitsordner:", os.getcwd())
    # Load the pre-cleaned recording.
    df = pd.read_parquet("cleaned_0001.parquet")
    print("DataFrame geladen:", df.shape)
    # Keep only rows with real eye-tracking data.
    print("Reinige Eyetracking-Daten ...")
    df_eye = clean_eye_df(df)
    # Windowed feature extraction (50 s windows at 250 Hz).
    features = extract_eye_features(df_eye, window_length_sec=50, fs=250)
    print("\n### FEATURE-MATRIX (HEAD) ###")
    print(features.head())
    print("\nSpeichere Output in features.csv ...")
    features.to_csv("features4.csv", index=False)
    print("FERTIG!")
if __name__ == "__main__":
    main()

View File

@ -1,441 +0,0 @@
import numpy as np
import pandas as pd
import h5py
import yaml
import os
from pathlib import Path
from sklearn.preprocessing import MinMaxScaler
from scipy.signal import welch
from pygazeanalyser.detectors import fixation_detection, saccade_detection
##############################################################################
# CONFIGURATION - ADJUST HERE!
##############################################################################
INPUT_DIR = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/new_ET_Parquet_files/")
OUTPUT_FILE = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/Eye_dataset_old/new_eye_dataset.parquet")
WINDOW_SIZE_SAMPLES = 12500  # samples per window (12500 = 50 s at 250 Hz)
STEP_SIZE_SAMPLES = 1250  # step between window starts (1250 = 5 s at 250 Hz)
SAMPLING_RATE = 250  # Hz
##############################################################################
# 1. HELFERFUNKTIONEN
##############################################################################
def clean_eye_df(df):
    """Drop rows that carry no real eye-tracking data.

    The merged sensor DataFrame contains NaN-only rows belonging to other
    sensors; keep only the EYE_* columns and the rows where at least one
    of them is populated.
    """
    eye_cols = [c for c in df.columns if c.startswith("EYE_")]
    # inf values (Tobii blink artefacts) count as missing as well
    cleaned = df[eye_cols].replace([np.inf, -np.inf], np.nan)
    cleaned = cleaned.dropna(subset=eye_cols, how="all")
    print(f" Eyetracking-Zeilen: {len(df)}{len(cleaned)}")
    # Reset the index so positions run 0..n-1
    return cleaned.reset_index(drop=True)
def extract_gaze_signal(df):
    """Extract the 2D gaze position on the display, min-max scaled to [0, 1].

    Averages left and right eye, masks invalid samples via the validity
    flags, interpolates gaps, and min-max scales each axis so PyGaze
    thresholds can be given in normalized units.

    Returns
    -------
    numpy.ndarray of shape (len(df), 2)
        Columns are (x, y), each scaled to [0, 1] over this recording.
    """
    # Gaze columns, forced to float so NaN masking works.
    gx_L = df["EYE_LEFT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy()
    gy_L = df["EYE_LEFT_GAZE_POINT_ON_DISPLAY_AREA_Y"].astype(float).copy()
    gx_R = df["EYE_RIGHT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy()
    gy_R = df["EYE_RIGHT_GAZE_POINT_ON_DISPLAY_AREA_Y"].astype(float).copy()
    # Validity columns (1 = valid sample).
    val_L = (df["EYE_LEFT_GAZE_POINT_VALIDITY"] == 1)
    val_R = (df["EYE_RIGHT_GAZE_POINT_VALIDITY"] == 1)
    # Tobii reports +/-inf during blinks; treat those as missing.
    gx_L = gx_L.replace([np.inf, -np.inf], np.nan)
    gy_L = gy_L.replace([np.inf, -np.inf], np.nan)
    gx_R = gx_R.replace([np.inf, -np.inf], np.nan)
    gy_R = gy_R.replace([np.inf, -np.inf], np.nan)
    # Mask invalid samples.
    gx_L[~val_L] = np.nan
    gy_L[~val_L] = np.nan
    gx_R[~val_R] = np.nan
    gy_R[~val_R] = np.nan
    # Per-sample mean of both eyes. BUGFIX: use nanmean (as the original
    # comment intended) so a sample with only one valid eye stays usable;
    # np.mean turned such samples into NaN.
    gx = np.nanmean(np.column_stack([gx_L, gx_R]), axis=1)
    gy = np.nanmean(np.column_stack([gy_L, gy_R]), axis=1)
    # Interpolate gaps (important for PyGaze!).
    gx = pd.Series(gx).interpolate(limit=50, limit_direction="both").bfill().ffill()
    gy = pd.Series(gy).interpolate(limit=50, limit_direction="both").bfill().ffill()
    # Min-max scale each axis independently to [0, 1].
    xscaler = MinMaxScaler()
    gxscale = xscaler.fit_transform(gx.values.reshape(-1, 1))
    yscaler = MinMaxScaler()
    gyscale = yscaler.fit_transform(gy.values.reshape(-1, 1))
    out = np.column_stack((gxscale, gyscale))
    return out
def extract_pupil(df):
    """Extract the pupil diameter (mean of both eyes) plus a validity mask.

    Returns
    -------
    p : numpy.ndarray
        Interpolated pupil diameter per sample.
    validity : numpy.ndarray of int
        1 where at least one eye delivered a valid pupil sample, else 0.
    """
    pl = df["EYE_LEFT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan)
    pr = df["EYE_RIGHT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan)
    vl = df.get("EYE_LEFT_PUPIL_VALIDITY")
    vr = df.get("EYE_RIGHT_PUPIL_VALIDITY")
    if vl is None or vr is None:
        # No validity columns: valid if at least one diameter is not NaN.
        validity = (~pl.isna() | ~pr.isna()).astype(int).to_numpy()
    else:
        # Valid if at least one eye is flagged valid.
        validity = ((vl == 1) | (vr == 1)).astype(int).to_numpy()
    # Mean of the available pupils. BUGFIX: nanmean keeps one-eyed samples
    # usable; np.mean yielded NaN whenever either eye was missing.
    p = np.nanmean(np.column_stack([pl, pr]), axis=1)
    # Repair remaining gaps (inf/NaN) by interpolation.
    p = pd.Series(p).interpolate(limit=50, limit_direction="both").bfill().ffill()
    p = p.to_numpy()
    return p, validity
def detect_blinks(pupil_validity, min_duration=5):
    """Detect blinks as runs of invalid samples (validity == 0).

    Parameters
    ----------
    pupil_validity : sequence of int
        1 for a valid sample, 0 for invalid (blink candidate).
    min_duration : int
        Minimum run length in samples for a run to count as a blink.

    Returns
    -------
    list of [start, end]
        Sample indices of each blink; ``end`` is exclusive.
    """
    blinks = []
    start = None
    for i, v in enumerate(pupil_validity):
        if v == 0 and start is None:
            start = i
        elif v == 1 and start is not None:
            if i - start >= min_duration:
                blinks.append([start, i])
            start = None
    # BUGFIX: a blink that runs to the end of the signal was silently
    # dropped before; close it explicitly.
    if start is not None and len(pupil_validity) - start >= min_duration:
        blinks.append([start, len(pupil_validity)])
    return blinks
def compute_IPA(pupil, fs=250):
"""
IPA = Index of Pupillary Activity (nach Duchowski 2018).
Hochfrequenzanteile der Pupillenzeitreihe.
"""
f, Pxx = welch(pupil, fs=fs, nperseg=int(fs*2)) # 2 Sekunden Fenster
hf_band = (f >= 0.6) & (f <= 2.0)
ipa = np.sum(Pxx[hf_band])
return ipa
##############################################################################
# 2. FEATURE-EXTRAKTION MIT SLIDING WINDOW
##############################################################################
def extract_eye_features_sliding(df_eye, df_meta, window_size, step_size, fs=250):
    """
    Extract eye-tracking features with a sliding window from a single
    level/phase segment.

    Parameters
    ----------
    df_eye : DataFrame
        Eye-tracking data (already cleaned), row-aligned with df_meta.
    df_meta : DataFrame
        Metadata (subjectID, rowID, STUDY, LEVEL, PHASE).
    window_size : int
        Number of samples per window.
    step_size : int
        Step between window starts, in samples (windows overlap when
        step_size < window_size).
    fs : int
        Sampling rate in Hz.

    Returns
    -------
    DataFrame
        One feature row per window (metadata + fixation/saccade/blink/
        pupil features); empty DataFrame if shorter than one window.
    """
    # Gaze signal (x, y) and pupil signal with per-sample validity mask.
    gaze = extract_gaze_signal(df_eye)
    pupil, pupil_validity = extract_pupil(df_eye)
    features = []
    num_windows = (len(df_eye) - window_size) // step_size + 1
    if num_windows <= 0:
        # Not enough samples for a single window.
        return pd.DataFrame()
    for i in range(num_windows):
        start_idx = i * step_size
        end_idx = start_idx + window_size
        w_gaze = gaze[start_idx:end_idx]
        w_pupil = pupil[start_idx:end_idx]
        w_valid = pupil_validity[start_idx:end_idx]
        # Metadata for this window, taken from the window's first sample.
        meta_row = df_meta.iloc[start_idx]
        # ----------------------------
        # FIXATIONS (PyGaze)
        # ----------------------------
        # Window-local time axis in milliseconds.
        time_ms = np.arange(window_size) * 1000.0 / fs
        fix, efix = fixation_detection(
            x=w_gaze[:, 0], y=w_gaze[:, 1], time=time_ms,
            missing=0.0, maxdist=0.003, mindur=10
        )
        # PyGaze may return malformed entries; keep only finite, positive
        # durations (f[2] is the fixation duration).
        fixation_durations = []
        for f in efix:
            if np.isfinite(f[2]) and f[2] > 0:
                fixation_durations.append(f[2])
        # Duration categories as defined in the paper (durations in ms).
        F_short = sum(66 <= d <= 150 for d in fixation_durations)
        F_medium = sum(300 <= d <= 500 for d in fixation_durations)
        F_long = sum(d >= 1000 for d in fixation_durations)
        F_hundred = sum(d > 100 for d in fixation_durations)
        # F_Cancel = sum(66 < d for d in fixation_durations)
        # ----------------------------
        # SACCADES
        # ----------------------------
        sac, esac = saccade_detection(
            x=w_gaze[:, 0], y=w_gaze[:, 1], time=time_ms,
            missing=0, minlen=12, maxvel=0.2, maxacc=1
        )
        # s[2] is the saccade duration; (s[3], s[4]) -> (s[5], s[6]) are
        # the start and end coordinates.
        sac_durations = [s[2] for s in esac]
        sac_amplitudes = [((s[5]-s[3])**2 + (s[6]-s[4])**2)**0.5 for s in esac]
        # ----------------------------
        # BLINKS
        # ----------------------------
        blinks = detect_blinks(w_valid)
        # Blink duration in seconds.
        blink_durations = [(b[1] - b[0]) / fs for b in blinks]
        # ----------------------------
        # PUPIL
        # ----------------------------
        if np.all(np.isnan(w_pupil)):
            mean_pupil = np.nan
            ipa = np.nan
        else:
            mean_pupil = np.nanmean(w_pupil)
            ipa = compute_IPA(w_pupil, fs=fs)
        # ----------------------------
        # FEATURE DICTIONARY
        # ----------------------------
        features.append({
            # Metadata
            'subjectID': meta_row['subjectID'],
            'start_time': meta_row['rowID'],
            'STUDY': meta_row.get('STUDY', np.nan),
            'LEVEL': meta_row.get('LEVEL', np.nan),
            'PHASE': meta_row.get('PHASE', np.nan),
            # Fixation features
            "Fix_count_short_66_150": F_short,
            "Fix_count_medium_300_500": F_medium,
            "Fix_count_long_gt_1000": F_long,
            "Fix_count_100": F_hundred,
            # "Fix_cancel": F_Cancel,
            "Fix_mean_duration": np.mean(fixation_durations) if fixation_durations else 0,
            "Fix_median_duration": np.median(fixation_durations) if fixation_durations else 0,
            # Saccade features
            "Sac_count": len(sac),
            "Sac_mean_amp": np.mean(sac_amplitudes) if sac_amplitudes else 0,
            "Sac_mean_dur": np.mean(sac_durations) if sac_durations else 0,
            "Sac_median_dur": np.median(sac_durations) if sac_durations else 0,
            # Blink features
            "Blink_count": len(blinks),
            "Blink_mean_dur": np.mean(blink_durations) if blink_durations else 0,
            "Blink_median_dur": np.median(blink_durations) if blink_durations else 0,
            # Pupil features
            "Pupil_mean": mean_pupil,
            "Pupil_IPA": ipa
        })
    return pd.DataFrame(features)
##############################################################################
# 3. BATCH-VERARBEITUNG
##############################################################################
def process_parquet_directory(input_dir, output_file, window_size, step_size, fs=250):
    """
    Process every parquet file in a directory and write one combined
    feature table.

    Parameters
    ----------
    input_dir : str or Path
        Directory containing the per-recording parquet files.
    output_file : str or Path
        Destination parquet file for the combined feature table.
    window_size : int
        Window size in samples.
    step_size : int
        Step between window starts, in samples.
    fs : int
        Sampling rate in Hz.

    Returns
    -------
    DataFrame or None
        Combined feature table, or None if nothing could be extracted.
    """
    input_path = Path(input_dir)
    parquet_files = sorted(input_path.glob("*.parquet"))
    if not parquet_files:
        print(f"FEHLER: Keine Parquet-Dateien in {input_dir} gefunden!")
        return
    print(f"\n{'='*70}")
    print(f"STARTE BATCH-VERARBEITUNG")
    print(f"{'='*70}")
    print(f"Gefundene Dateien: {len(parquet_files)}")
    print(f"Window Size: {window_size} Samples ({window_size/fs:.1f}s bei {fs}Hz)")
    print(f"Step Size: {step_size} Samples ({step_size/fs:.1f}s bei {fs}Hz)")
    print(f"{'='*70}\n")
    all_features = []
    for file_idx, parquet_file in enumerate(parquet_files, 1):
        print(f"\n[{file_idx}/{len(parquet_files)}] Verarbeite: {parquet_file.name}")
        try:
            # Load one recording.
            df = pd.read_parquet(parquet_file)
            print(f" Einträge geladen: {len(df)}")
            # Skip files that lack the required metadata columns.
            required_cols = ['subjectID', 'rowID']
            missing_cols = [col for col in required_cols if col not in df.columns]
            if missing_cols:
                print(f" WARNUNG: Fehlende Spalten: {missing_cols} - Überspringe Datei")
                continue
            # Keep only rows with real eye-tracking data.
            df_eye = clean_eye_df(df)
            if len(df_eye) == 0:
                print(f" WARNUNG: Keine gültigen Eye-Tracking-Daten - Überspringe Datei")
                continue
            # Metadata columns present in this file.
            meta_cols = ['subjectID', 'rowID']
            if 'STUDY' in df.columns:
                meta_cols.append('STUDY')
            if 'LEVEL' in df.columns:
                meta_cols.append('LEVEL')
            if 'PHASE' in df.columns:
                meta_cols.append('PHASE')
            # NOTE(review): clean_eye_df resets the index, so df_eye.index is
            # 0..n-1 and this .iloc takes the FIRST n rows of df rather than
            # the rows that actually survived cleaning — verify alignment
            # when cleaning drops rows that are not a prefix of df.
            df_meta = df[meta_cols].iloc[df_eye.index].reset_index(drop=True)
            # Group by STUDY/LEVEL/PHASE where available so windows never
            # span a segment boundary.
            group_cols = [col for col in ['STUDY', 'LEVEL', 'PHASE'] if col in df_meta.columns]
            if group_cols:
                print(f" Gruppiere nach: {', '.join(group_cols)}")
                for group_vals, group_df in df_meta.groupby(group_cols, sort=False):
                    # group_df.index holds positions into df_meta, which is
                    # row-aligned with df_eye (both reset above).
                    group_eye = df_eye.iloc[group_df.index].reset_index(drop=True)
                    group_meta = group_df.reset_index(drop=True)
                    print(f" Gruppe {group_vals}: {len(group_eye)} Samples", end="")
                    features_df = extract_eye_features_sliding(
                        group_eye, group_meta, window_size, step_size, fs
                    )
                    if not features_df.empty:
                        all_features.append(features_df)
                        print(f"{len(features_df)} Windows")
                    else:
                        print("Zu wenige Daten")
            else:
                # No grouping columns: process the whole recording as one segment.
                print(f" Keine Gruppierungsspalten gefunden")
                features_df = extract_eye_features_sliding(
                    df_eye, df_meta, window_size, step_size, fs
                )
                if not features_df.empty:
                    all_features.append(features_df)
                    print(f"{len(features_df)} Windows erstellt")
                else:
                    print(f" → Zu wenige Daten")
        except Exception as e:
            # Best-effort batch run: report the failure and continue with
            # the next file.
            print(f" FEHLER bei Verarbeitung: {str(e)}")
            import traceback
            traceback.print_exc()
            continue
    # Combine all per-segment feature tables.
    if not all_features:
        print("\nKEINE FEATURES EXTRAHIERT!")
        return None
    print(f"\n{'='*70}")
    print(f"ZUSAMMENFASSUNG")
    print(f"{'='*70}")
    final_df = pd.concat(all_features, ignore_index=True)
    print(f"Gesamt Windows: {len(final_df)}")
    print(f"Spalten: {len(final_df.columns)}")
    print(f"Subjects: {final_df['subjectID'].nunique()}")
    # Persist the combined table.
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    final_df.to_parquet(output_file, index=False)
    print(f"\n✓ Ergebnis gespeichert: {output_file}")
    print(f"{'='*70}\n")
    return final_df
##############################################################################
# 4. MAIN
##############################################################################
def main():
    """Entry point: run the batch extraction with the module-level config."""
    print("\n" + "="*70)
    print("EYE-TRACKING FEATURE EXTRAKTION - BATCH MODE")
    print("="*70)
    result = process_parquet_directory(
        input_dir=INPUT_DIR,
        output_file=OUTPUT_FILE,
        window_size=WINDOW_SIZE_SAMPLES,
        step_size=STEP_SIZE_SAMPLES,
        fs=SAMPLING_RATE
    )
    # Nothing extracted: process_parquet_directory already reported why.
    if result is None:
        return
    print("\nErste 5 Zeilen des Ergebnisses:")
    print(result.head())
    print("\nSpalten-Übersicht:")
    print(result.columns.tolist())
    print("\nDatentypen:")
    print(result.dtypes)
    print("\n✓ FERTIG!\n")
if __name__ == "__main__":
    main()

View File

@ -1,323 +0,0 @@
import numpy as np
import pandas as pd
import h5py
import yaml
import owncloud
import os
from sklearn.preprocessing import MinMaxScaler
from scipy.signal import welch
from pygazeanalyser.detectors import fixation_detection, saccade_detection
##############################################################################
# 1. HELFERFUNKTIONEN
##############################################################################
def clean_eye_df(df):
    """Remove rows that contain no real eye-tracking data.

    The main DataFrame has NaN-only rows belonging to other sensors; keep
    only the EYE_ columns and the rows where at least one of them has data.
    """
    eye_cols = [c for c in df.columns if "EYE_" in c]
    # inf values (Tobii blink artefacts) count as missing as well
    cleaned = df[eye_cols].replace([np.inf, -np.inf], np.nan)
    cleaned = cleaned.dropna(subset=eye_cols, how="all")
    print("Eyetracking-Zeilen vorher:", len(df))
    print("Eyetracking-Zeilen nachher:", len(cleaned))
    # Reset the index so positions run 0..n-1
    return cleaned.reset_index(drop=True)
def extract_gaze_signal(df):
    """Extract the 2D gaze position on the display, min-max scaled to [0, 1].

    Averages left and right eye, masks invalid samples via the validity
    flags, interpolates gaps, and min-max scales each axis independently.

    Returns
    -------
    numpy.ndarray of shape (len(df), 2)
        Columns are (x, y), each scaled to [0, 1] over this recording.
    """
    print("→ extract_gaze_signal(): Eingabegröße:", df.shape)
    # Gaze columns, forced to float so NaN masking works.
    gx_L = df["EYE_LEFT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy()
    gy_L = df["EYE_LEFT_GAZE_POINT_ON_DISPLAY_AREA_Y"].astype(float).copy()
    gx_R = df["EYE_RIGHT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy()
    gy_R = df["EYE_RIGHT_GAZE_POINT_ON_DISPLAY_AREA_Y"].astype(float).copy()
    # Validity columns (1 = valid sample).
    val_L = (df["EYE_LEFT_GAZE_POINT_VALIDITY"] == 1)
    val_R = (df["EYE_RIGHT_GAZE_POINT_VALIDITY"] == 1)
    # Tobii reports +/-inf during blinks; treat those as missing.
    gx_L = gx_L.replace([np.inf, -np.inf], np.nan)
    gy_L = gy_L.replace([np.inf, -np.inf], np.nan)
    gx_R = gx_R.replace([np.inf, -np.inf], np.nan)
    gy_R = gy_R.replace([np.inf, -np.inf], np.nan)
    # Mask invalid samples.
    gx_L[~val_L] = np.nan
    gy_L[~val_L] = np.nan
    gx_R[~val_R] = np.nan
    gy_R[~val_R] = np.nan
    # Per-sample mean of both eyes. BUGFIX: use nanmean (as the original
    # comment intended) so a sample with only one valid eye stays usable;
    # np.mean turned such samples into NaN.
    gx = np.nanmean(np.column_stack([gx_L, gx_R]), axis=1)
    gy = np.nanmean(np.column_stack([gy_L, gy_R]), axis=1)
    # Interpolate gaps (important for PyGaze!).
    gx = pd.Series(gx).interpolate(limit=50, limit_direction="both").bfill().ffill()
    gy = pd.Series(gy).interpolate(limit=50, limit_direction="both").bfill().ffill()
    xscaler = MinMaxScaler()
    gxscale = xscaler.fit_transform(gx.values.reshape(-1, 1))
    yscaler = MinMaxScaler()
    # BUGFIX: the y scaler was fitted on gx, so the second output column
    # contained scaled x values instead of y.
    gyscale = yscaler.fit_transform(gy.values.reshape(-1, 1))
    print("xmax ymax", gxscale.max(), gyscale.max())
    out = np.column_stack((gxscale, gyscale))
    print("→ extract_gaze_signal(): Ausgabegröße:", out.shape)
    return out
def extract_pupil(df):
    """Extract the pupil diameter (mean of both eyes) plus a validity mask.

    Returns
    -------
    p : numpy.ndarray
        Interpolated pupil diameter per sample.
    validity : numpy.ndarray of int
        1 where at least one eye delivered a valid pupil sample, else 0.
    """
    pl = df["EYE_LEFT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan)
    pr = df["EYE_RIGHT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan)
    vl = df.get("EYE_LEFT_PUPIL_VALIDITY")
    vr = df.get("EYE_RIGHT_PUPIL_VALIDITY")
    if vl is None or vr is None:
        # No validity columns: rough heuristic — valid if at least one
        # diameter is not NaN.
        validity = (~pl.isna() | ~pr.isna()).astype(int).to_numpy()
    else:
        # If present: 1 when at least one eye is flagged valid.
        validity = ( (vl == 1) | (vr == 1) ).astype(int).to_numpy()
    # Mean of the available pupils. BUGFIX: nanmean keeps one-eyed samples
    # usable; np.mean yielded NaN whenever either eye was missing.
    p = np.nanmean(np.column_stack([pl, pr]), axis=1)
    # Repair remaining gaps (inf/NaN) by interpolation.
    p = pd.Series(p).interpolate(limit=50, limit_direction="both").bfill().ffill()
    p = p.to_numpy()
    print("→ extract_pupil(): Pupillensignal Länge:", len(p))
    return p, validity
def detect_blinks(pupil_validity, min_duration=5):
    """Detect blinks as runs of invalid samples (validity == 0).

    Parameters
    ----------
    pupil_validity : sequence of int
        1 for a valid sample, 0 for invalid (blink candidate).
    min_duration : int
        Minimum run length in samples for a run to count as a blink.

    Returns
    -------
    list of [start, end]
        Sample indices of each blink; ``end`` is exclusive.
    """
    blinks = []
    start = None
    for i, v in enumerate(pupil_validity):
        if v == 0 and start is None:
            start = i
        elif v == 1 and start is not None:
            if i - start >= min_duration:
                blinks.append([start, i])
            start = None
    # BUGFIX: a blink that runs to the end of the signal was silently
    # dropped before; close it explicitly.
    if start is not None and len(pupil_validity) - start >= min_duration:
        blinks.append([start, len(pupil_validity)])
    return blinks
def compute_IPA(pupil, fs=250):
"""
IPA = Index of Pupillary Activity (nach Duchowski 2018).
Hochfrequenzanteile der Pupillenzeitreihe.
"""
f, Pxx = welch(pupil, fs=fs, nperseg=int(fs*2)) # 2 Sekunden Fenster
hf_band = (f >= 0.6) & (f <= 2.0)
ipa = np.sum(Pxx[hf_band])
return ipa
##############################################################################
# 2. FEATURE-EXTRAKTION (HAUPTFUNKTION)
##############################################################################
def extract_eye_features(df, window_length_sec=50, fs=250):
    """Compute eye-tracking features over non-overlapping windows.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned Tobii DataFrame (see ``clean_eye_df``).
    window_length_sec : int
        Window size in seconds.
    fs : int
        Sampling rate in Hz.

    Returns
    -------
    pandas.DataFrame
        One row of fixation/saccade/blink/pupil features per window.
        A trailing partial window is ignored.

    Note: the noisy per-window debug prints (time axis, raw fixation
    entries) were removed; they flooded stdout on every window.
    """
    print("→ extract_eye_features(): Starte Feature-Berechnung...")
    print("   Fensterlänge W =", window_length_sec, "s")
    W = int(window_length_sec * fs)  # window size in samples
    # Gaze signal (x, y).
    gaze = extract_gaze_signal(df)
    gx, gy = gaze[:, 0], gaze[:, 1]
    print("Gültige Werte (gx):", np.sum(~np.isnan(gx)), "von", len(gx))
    print("Range:", np.nanmin(gx), np.nanmax(gx))
    print("Gültige Werte (gy):", np.sum(~np.isnan(gy)), "von", len(gy))
    print("Range:", np.nanmin(gy), np.nanmax(gy))
    # Pupil signal with per-sample validity mask.
    pupil, pupil_validity = extract_pupil(df)
    features = []
    # Non-overlapping windows; the trailing partial window is skipped.
    for start in range(0, len(df), W):
        end = start + W
        if end > len(df):
            break
        w_gaze = gaze[start:end]
        w_pupil = pupil[start:end]
        w_valid = pupil_validity[start:end]
        # ----------------------------
        # FIXATIONS (PyGaze)
        # ----------------------------
        time_ms = np.arange(W) * 1000.0 / fs  # window-local time axis in ms
        fix, efix = fixation_detection(
            x=w_gaze[:, 0], y=w_gaze[:, 1], time=time_ms,
            missing=0.0, maxdist=0.001, mindur=65
        )
        # PyGaze may return malformed entries; keep only finite, positive
        # durations (f[2] is the fixation duration in ms).
        fixation_durations = [f[2] for f in efix if np.isfinite(f[2]) and f[2] > 0]
        # Duration categories as defined in the paper.
        F_short = sum(66 <= d <= 150 for d in fixation_durations)
        F_medium = sum(300 <= d <= 500 for d in fixation_durations)
        F_long = sum(d >= 1000 for d in fixation_durations)
        F_hundred = sum(d > 100 for d in fixation_durations)
        F_Cancel = sum(66 < d for d in fixation_durations)
        # ----------------------------
        # SACCADES
        # ----------------------------
        sac, esac = saccade_detection(
            x=w_gaze[:, 0], y=w_gaze[:, 1], time=time_ms, missing=0, minlen=12, maxvel=0.2, maxacc=1
        )
        # s[2] is the saccade duration; (s[3], s[4]) -> (s[5], s[6]) are
        # the start and end coordinates.
        sac_durations = [s[2] for s in esac]
        sac_amplitudes = [((s[5] - s[3]) ** 2 + (s[6] - s[4]) ** 2) ** 0.5 for s in esac]
        # ----------------------------
        # BLINKS
        # ----------------------------
        blinks = detect_blinks(w_valid)
        blink_durations = [(b[1] - b[0]) / fs for b in blinks]  # seconds
        # ----------------------------
        # PUPIL
        # ----------------------------
        if np.all(np.isnan(w_pupil)):
            mean_pupil = np.nan
            ipa = np.nan
        else:
            mean_pupil = np.nanmean(w_pupil)
            ipa = compute_IPA(w_pupil, fs=fs)
        # ----------------------------
        # FEATURE ROW
        # ----------------------------
        features.append({
            "Fix_count_short_66_150": F_short,
            "Fix_count_medium_300_500": F_medium,
            "Fix_count_long_gt_1000": F_long,
            "Fix_count_100": F_hundred,
            "Fix_cancel": F_Cancel,
            "Fix_mean_duration": np.mean(fixation_durations) if fixation_durations else 0,
            "Fix_median_duration": np.median(fixation_durations) if fixation_durations else 0,
            "Sac_count": len(sac),
            "Sac_mean_amp": np.mean(sac_amplitudes) if sac_amplitudes else 0,
            "Sac_mean_dur": np.mean(sac_durations) if sac_durations else 0,
            "Sac_median_dur": np.median(sac_durations) if sac_durations else 0,
            "Blink_count": len(blinks),
            "Blink_mean_dur": np.mean(blink_durations) if blink_durations else 0,
            "Blink_median_dur": np.median(blink_durations) if blink_durations else 0,
            "Pupil_mean": mean_pupil,
            "Pupil_IPA": ipa
        })
    result = pd.DataFrame(features)
    print("→ extract_eye_features(): Fertig! Ergebnisgröße:", result.shape)
    return result
##############################################################################
# 3. MAIN FUNKTION
##############################################################################
def main():
    """Run the full eye-tracking pipeline on the HDF5 signals file."""
    print("### STARTE FEATURE-EXTRAKTION ###")
    print("Aktueller Arbeitsordner:", os.getcwd())
    # Load the merged sensor signals.
    df = pd.read_hdf("tmp22.h5", "SIGNALS", mode="r")
    print("DataFrame geladen:", df.shape)
    # Keep only rows with real eye-tracking data.
    print("Reinige Eyetracking-Daten ...")
    df_eye = clean_eye_df(df)
    # Windowed feature extraction (50 s windows at 250 Hz).
    features = extract_eye_features(df_eye, window_length_sec=50, fs=250)
    print("\n### FEATURE-MATRIX (HEAD) ###")
    print(features.head())
    print("\nSpeichere Output in features.csv ...")
    features.to_csv("features2.csv", index=False)
    print("FERTIG!")
if __name__ == "__main__":
    main()

View File

@ -1,72 +1,79 @@
import math


def fixation_radius_normalized(
    theta_deg: float,
    distance_cm: float,
    screen_width_cm: float,
    screen_height_cm: float,
    resolution_x: int,
    resolution_y: int,
    method: str = "max",
):
    """Compute the PyGaze fixation radius for normalized gaze data in [0, 1].

    Converts a visual angle (degrees) into a radius in normalized display
    coordinates, given the viewing distance and the screen geometry.

    Parameters
    ----------
    theta_deg : float
        Visual angle in degrees.
    distance_cm : float
        Viewing distance from the screen in cm.
    screen_width_cm, screen_height_cm : float
        Physical screen size in cm.
    resolution_x, resolution_y : int
        Screen resolution in pixels.
    method : str
        "max" takes the larger of the per-axis radii; anything else
        combines them euclideanly.
    """
    # Visual angle -> physical distance on the screen (cm).
    delta_cm = 2 * distance_cm * math.tan(math.radians(theta_deg) / 2)
    # Physical distance -> pixels (per axis, since pixel pitch differs).
    delta_px_x = delta_cm * (resolution_x / screen_width_cm)
    delta_px_y = delta_cm * (resolution_y / screen_height_cm)
    # Pixel radius.
    if method == "max":
        r_px = max(delta_px_x, delta_px_y)
    else:
        r_px = math.sqrt(delta_px_x**2 + delta_px_y**2)
    # Pixel radius -> normalized radius.
    r_norm_x = r_px / resolution_x
    r_norm_y = r_px / resolution_y
    if method == "max":
        return max(r_norm_x, r_norm_y)
    return math.sqrt(r_norm_x**2 + r_norm_y**2)


def run_example():
    """Print the fixation radius for the lab setup (3x 55-inch 4k monitors)."""
    screen_width_cm = 3 * 121.8
    screen_height_cm = 68.5
    resolution_x = 3 * 3840
    resolution_y = 2160
    distance_to_screen_cm = 120
    max_angle = 1.0
    maxdist_px = fixation_radius_normalized(
        theta_deg=max_angle,
        distance_cm=distance_to_screen_cm,
        screen_width_cm=screen_width_cm,
        screen_height_cm=screen_height_cm,
        resolution_x=resolution_x,
        resolution_y=resolution_y,
        method="max",
    )
    print("PyGaze max_dist (max):", maxdist_px)
    maxdist_px = fixation_radius_normalized(
        theta_deg=max_angle,
        distance_cm=distance_to_screen_cm,
        screen_width_cm=screen_width_cm,
        screen_height_cm=screen_height_cm,
        resolution_x=resolution_x,
        resolution_y=resolution_y,
        method="euclid",
    )
    print("PyGaze max_dist (euclid):", maxdist_px)


# References:
# https://osdoc.cogsci.nl/4.0/de/visualangle/
# https://reference.org/facts/Visual_angle/LUw29zy7
def main():
    run_example()


if __name__ == "__main__":
    main()

View File

@ -1,155 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "2b3fface",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "74f1f5ec",
"metadata": {},
"outputs": [],
"source": [
"df= pd.read_parquet(r\" \")\n",
"print(df.shape)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "05775454",
"metadata": {},
"outputs": [],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "99e17328",
"metadata": {},
"outputs": [],
"source": [
"df.tail()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "69e53731",
"metadata": {},
"outputs": [],
"source": [
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3754c664",
"metadata": {},
"outputs": [],
"source": [
"# Zeigt alle Kombinationen mit Häufigkeit\n",
"df[['STUDY', 'PHASE', 'LEVEL']].value_counts(ascending=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f83b595c",
"metadata": {},
"outputs": [],
"source": [
"high_nback = df[\n",
" (df[\"STUDY\"]==\"n-back\") &\n",
" (df[\"LEVEL\"].isin([2, 3, 5, 6])) &\n",
" (df[\"PHASE\"].isin([\"train\", \"test\"]))\n",
"]\n",
"high_nback.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c0940343",
"metadata": {},
"outputs": [],
"source": [
"low_all = df[\n",
" ((df[\"PHASE\"] == \"baseline\") |\n",
" ((df[\"STUDY\"] == \"n-back\") & (df[\"PHASE\"] != \"baseline\") & (df[\"LEVEL\"].isin([1,4]))))\n",
"]\n",
"print(low_all.shape)\n",
"high_kdrive = df[\n",
" (df[\"STUDY\"] == \"k-drive\") & (df[\"PHASE\"] != \"baseline\")\n",
"]\n",
"print(high_kdrive.shape)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f7ce38d3",
"metadata": {},
"outputs": [],
"source": [
"print((df.shape[0]==(high_kdrive.shape[0]+high_nback.shape[0]+low_all.shape[0])))\n",
"print(df.shape[0])\n",
"print((high_kdrive.shape[0]+high_nback.shape[0]+low_all.shape[0]))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "48ba0379",
"metadata": {},
"outputs": [],
"source": [
"high_all = pd.concat([high_nback, high_kdrive])\n",
"high_all.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "77dda26c",
"metadata": {},
"outputs": [],
"source": [
"print(f\"Gesamt: {df.shape[0]}=={low_all.shape[0]+high_all.shape[0]}\")\n",
"print(f\"Anzahl an low load Samples: {low_all.shape[0]}\")\n",
"print(f\"Anzahl an high load Samples: {high_all.shape[0]}\")\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -1,8 +1,10 @@
import os import os
import pandas as pd import pandas as pd
from pathlib import Path from pathlib import Path
# TODO: Set paths correctly
data_dir = Path("") # path to the directory with all .h5 files
base_dir = Path(r"") # directory to store the parquet files in
data_dir = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/raw_data")
# Get all .h5 files and sort them # Get all .h5 files and sort them
matching_files = sorted(data_dir.glob("*.h5")) matching_files = sorted(data_dir.glob("*.h5"))
@ -11,8 +13,8 @@ matching_files = sorted(data_dir.glob("*.h5"))
CHUNK_SIZE = 50_000 CHUNK_SIZE = 50_000
for i, file_path in enumerate(matching_files): for i, file_path in enumerate(matching_files):
print(f"Subject {i} gestartet") print(f"Starting with subject {i}")
print(f"{file_path} geoeffnet") print(f"Opened: {file_path}")
# Step 1: Get total number of rows and column names # Step 1: Get total number of rows and column names
with pd.HDFStore(file_path, mode="r") as store: with pd.HDFStore(file_path, mode="r") as store:
@ -81,7 +83,7 @@ for i, file_path in enumerate(matching_files):
print(f"Final dataframe shape: {df_final.shape}") print(f"Final dataframe shape: {df_final.shape}")
# Save to parquet # Save to parquet
base_dir = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/both_mod_parquet_files")
os.makedirs(base_dir, exist_ok=True) os.makedirs(base_dir, exist_ok=True)
out_name = base_dir / f"both_mod_{i:04d}.parquet" out_name = base_dir / f"both_mod_{i:04d}.parquet"

View File

@ -28,7 +28,7 @@
"sys.path.append(base_dir)\n", "sys.path.append(base_dir)\n",
"print(base_dir)\n", "print(base_dir)\n",
"\n", "\n",
"from tools import evaluation_tools\n", "from Fahrsimulator_MSY2526_AI.model_training.tools import evaluation_tools\n",
"from sklearn.preprocessing import StandardScaler, MinMaxScaler\n", "from sklearn.preprocessing import StandardScaler, MinMaxScaler\n",
"from sklearn.ensemble import IsolationForest\n", "from sklearn.ensemble import IsolationForest\n",
"from sklearn.model_selection import GridSearchCV, KFold\n", "from sklearn.model_selection import GridSearchCV, KFold\n",
@ -52,7 +52,7 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"data_path = Path(r\"C:\\Users\\micha\\FAUbox\\WS2526_Fahrsimulator_MSY (Celina Korzer)\\AU_dataset\\output_windowed.parquet\")" "data_path = Path(r\"/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/50s_25Hz_dataset.parquet\")"
] ]
}, },
{ {
@ -301,20 +301,26 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# Cell 2: Get AU columns and prepare datasets\n", "au_columns = [col for col in low_all.columns if \"face\" in col.lower()] \n",
"# Get all column names that start with 'AU'\n",
"au_columns = [col for col in low_all.columns if col.startswith('AU')]\n",
"\n", "\n",
"eye_columns = [ \n",
" 'Fix_count_short_66_150','Fix_count_medium_300_500','Fix_count_long_gt_1000', \n",
" 'Fix_count_100','Fix_mean_duration','Fix_median_duration', \n",
" 'Sac_count','Sac_mean_amp','Sac_mean_dur','Sac_median_dur', \n",
" 'Blink_count','Blink_mean_dur','Blink_median_dur', \n",
" 'Pupil_mean','Pupil_IPA' \n",
"] \n",
"cols = au_columns +eye_columns\n",
"# Prepare training data (only normal/low data)\n", "# Prepare training data (only normal/low data)\n",
"train_data = low_all[low_all['subjectID'].isin(train_subjects)][['subjectID'] + au_columns].copy()\n", "train_data = low_all[low_all['subjectID'].isin(train_subjects)][['subjectID'] + cols].copy()\n",
"\n", "\n",
"# Prepare validation data (normal and anomaly)\n", "# Prepare validation data (normal and anomaly)\n",
"val_normal_data = low_all[low_all['subjectID'].isin(val_subjects)][['subjectID'] + au_columns].copy()\n", "val_normal_data = low_all[low_all['subjectID'].isin(val_subjects)][['subjectID'] + cols].copy()\n",
"val_high_data = high_all[high_all['subjectID'].isin(val_subjects)][['subjectID'] + au_columns].copy()\n", "val_high_data = high_all[high_all['subjectID'].isin(val_subjects)][['subjectID'] + cols].copy()\n",
"\n", "\n",
"# Prepare test data (normal and anomaly)\n", "# Prepare test data (normal and anomaly)\n",
"test_normal_data = low_all[low_all['subjectID'].isin(test_subjects)][['subjectID'] + au_columns].copy()\n", "test_normal_data = low_all[low_all['subjectID'].isin(test_subjects)][['subjectID'] + cols].copy()\n",
"test_high_data = high_all[high_all['subjectID'].isin(test_subjects)][['subjectID'] + au_columns].copy()\n", "test_high_data = high_all[high_all['subjectID'].isin(test_subjects)][['subjectID'] + cols].copy()\n",
"\n", "\n",
"print(f\"Train samples: {len(train_data)}\")\n", "print(f\"Train samples: {len(train_data)}\")\n",
"print(f\"Val normal samples: {len(val_normal_data)}, Val high samples: {len(val_high_data)}\")\n", "print(f\"Val normal samples: {len(val_normal_data)}, Val high samples: {len(val_high_data)}\")\n",
@ -328,8 +334,8 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# Cell 3: Fit normalizer on training data\n", "# Fit normalizer on training data\n",
"normalizer = fit_normalizer(train_data, au_columns, method='minmax', scope='global')\n", "normalizer = fit_normalizer(train_data, cols, method='minmax', scope='global')\n",
"print(\"Normalizer fitted on training data\")" "print(\"Normalizer fitted on training data\")"
] ]
}, },
@ -340,12 +346,12 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# Cell 4: Apply normalization to all datasets\n", "# Apply normalization to all datasets\n",
"train_normalized = apply_normalizer(train_data, au_columns, normalizer)\n", "train_normalized = apply_normalizer(train_data, cols, normalizer)\n",
"val_normal_normalized = apply_normalizer(val_normal_data, au_columns, normalizer)\n", "val_normal_normalized = apply_normalizer(val_normal_data, cols, normalizer)\n",
"val_high_normalized = apply_normalizer(val_high_data, au_columns, normalizer)\n", "val_high_normalized = apply_normalizer(val_high_data, cols, normalizer)\n",
"test_normal_normalized = apply_normalizer(test_normal_data, au_columns, normalizer)\n", "test_normal_normalized = apply_normalizer(test_normal_data, cols, normalizer)\n",
"test_high_normalized = apply_normalizer(test_high_data, au_columns, normalizer)\n", "test_high_normalized = apply_normalizer(test_high_data, cols, normalizer)\n",
"\n", "\n",
"print(\"Normalization applied to all datasets\")" "print(\"Normalization applied to all datasets\")"
] ]
@ -357,11 +363,9 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# Cell 5: Extract AU columns and create labels for grid search\n", "X_train = train_normalized[cols].copy()\n",
"# Extract only AU columns (drop subjectID)\n", "X_val_normal = val_normal_normalized[cols].copy()\n",
"X_train = train_normalized[au_columns].copy()\n", "X_val_high = val_high_normalized[cols].copy()\n",
"X_val_normal = val_normal_normalized[au_columns].copy()\n",
"X_val_high = val_high_normalized[au_columns].copy()\n",
"\n", "\n",
"# Combine train and validation sets for grid search\n", "# Combine train and validation sets for grid search\n",
"X_grid_search = pd.concat([X_train, X_val_normal, X_val_high], ignore_index=True)\n", "X_grid_search = pd.concat([X_train, X_val_normal, X_val_high], ignore_index=True)\n",
@ -416,7 +420,7 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# Cell 7: Train final model with best parameters on training data\n", "# Train final model with best parameters on training data\n",
"final_model = IsolationForest(**best_params, random_state=42)\n", "final_model = IsolationForest(**best_params, random_state=42)\n",
"final_model.fit(X_train.values)\n", "final_model.fit(X_train.values)\n",
"\n", "\n",
@ -430,9 +434,9 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# Cell 8: Prepare independent test set\n", "# Prepare independent test set\n",
"X_test_normal = test_normal_normalized[au_columns].copy()\n", "X_test_normal = test_normal_normalized[cols].copy()\n",
"X_test_high = test_high_normalized[au_columns].copy()\n", "X_test_high = test_high_normalized[cols].copy()\n",
"\n", "\n",
"# Combine test sets\n", "# Combine test sets\n",
"X_test = pd.concat([X_test_normal, X_test_high], ignore_index=True)\n", "X_test = pd.concat([X_test_normal, X_test_high], ignore_index=True)\n",
@ -483,7 +487,7 @@
], ],
"metadata": { "metadata": {
"kernelspec": { "kernelspec": {
"display_name": "base", "display_name": "Python 3 (ipykernel)",
"language": "python", "language": "python",
"name": "python3" "name": "python3"
}, },
@ -497,7 +501,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.11.5" "version": "3.12.10"
} }
}, },
"nbformat": 4, "nbformat": 4,