minor fixes in dataset creation

changed paths to paulusja ... directory
changed feature extraction for AUs to mean instead of sum
added v1 of the dataset merge script (still needs to be adjusted)
Michael Weig 2025-12-18 13:04:11 +01:00
parent 87c5e21daf
commit b8bebc0944
6 changed files with 171 additions and 18 deletions

View File

@@ -0,0 +1,91 @@
import os
import pandas as pd
from pathlib import Path

print(os.getcwd())

num_files = 2  # number of files to process (min: 1, max: 30)
print("connection established")

data_dir = Path("/home/jovyan/Fahrsimulator_MSY2526_AI/EDA")
# os.chdir(data_dir)

# Get all .h5 files and sort them
matching_files = sorted(data_dir.glob("*.h5"))

# Chunk size for reading (adjust based on your RAM - 100k rows is ~50-100MB depending on columns)
CHUNK_SIZE = 100_000

for i, file_path in enumerate(matching_files):
    print(f"Subject {i} started")
    print(f"{file_path} opened")

    # Step 1: Get total number of rows and column names
    with pd.HDFStore(file_path, mode="r") as store:
        cols = store.select("SIGNALS", start=0, stop=1).columns
        nrows = store.get_storer("SIGNALS").nrows
    print(f"Total columns: {len(cols)}, Total rows: {nrows}")

    # Step 2: Filter columns that start with "EYE_"
    eye_cols = [c for c in cols if c.startswith("EYE_")]
    print(f"eye-tracking columns found: {eye_cols}")
    if len(eye_cols) == 0:
        print(f"no eye-tracking signals in Subject {i}")
        continue

    # Columns to read
    columns_to_read = ["STUDY", "LEVEL", "PHASE"] + eye_cols

    # Step 3: Process file in chunks
    chunks_to_save = []
    for start_row in range(0, nrows, CHUNK_SIZE):
        stop_row = min(start_row + CHUNK_SIZE, nrows)
        print(f"Processing rows {start_row} to {stop_row} ({stop_row/nrows*100:.1f}%)")

        # Read chunk
        df_chunk = pd.read_hdf(
            file_path,
            key="SIGNALS",
            columns=columns_to_read,
            start=start_row,
            stop=stop_row
        )

        # Add metadata columns
        df_chunk["subjectID"] = i
        df_chunk["rowID"] = range(start_row, stop_row)

        # Clean data
        df_chunk = df_chunk[df_chunk["LEVEL"] != 0]
        df_chunk = df_chunk.dropna()

        # Only keep non-empty chunks
        if len(df_chunk) > 0:
            chunks_to_save.append(df_chunk)

        # Free memory
        del df_chunk

    print("load and cleaning done")

    # Step 4: Combine all chunks and save
    if chunks_to_save:
        df_final = pd.concat(chunks_to_save, ignore_index=True)
        print(f"Final dataframe shape: {df_final.shape}")

        # Save to parquet
        base_dir = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/new_ET_Parquet_files")
        os.makedirs(base_dir, exist_ok=True)
        out_name = base_dir / f"ET_signals_extracted_{i:04d}.parquet"
        df_final.to_parquet(out_name, index=False)
        print(f"Saved to {out_name}")

        # Free memory
        del df_final
        del chunks_to_save
    else:
        print(f"No valid data found for Subject {i}")

print("All files processed!")

View File

@@ -71,7 +71,8 @@ def process_parquet_files(input_dir, output_file, window_size=1250, step_size=12
        # Sum all AU columns
        for au_col in au_columns:
-           result[f'{au_col}_sum'] = window_df[au_col].sum()
+           # result[f'{au_col}_sum'] = window_df[au_col].sum()
+           result[f'{au_col}_mean'] = window_df[au_col].mean()

        all_windows.append(result)
@@ -94,8 +95,8 @@ def process_parquet_files(input_dir, output_file, window_size=1250, step_size=12
# Example usage
if __name__ == "__main__":
    # Adjust to your paths
-   input_directory = r"C:\Users\x\FAUbox\WS2526_Fahrsimulator_MSY (Celina Korzer)\new_AU_parquet_files"
-   output_file = r"C:\Users\x\FAUbox\WS2526_Fahrsimulator_MSY (Celina Korzer)\new_AU_dataset\AU_dataset.parquet"
+   input_directory = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/new_AU_parquet_files")
+   output_file = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/new_AU_dataset_mean/AU_dataset_mean.parquet")
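The switch from sum to mean makes the per-window AU features independent of the number of samples in a window. A toy illustration of the changed lines (not part of the commit; the AU column names and values are made up, while window_df, au_columns and result mirror the names used in process_parquet_files):

import pandas as pd

# Made-up window with two AU columns.
window_df = pd.DataFrame({
    "FACE_AU01": [0.25, 0.5, 0.75, 1.0],
    "FACE_AU12": [0.0, 0.25, 0.25, 0.5],
})
au_columns = [c for c in window_df.columns if c.startswith("FACE_AU")]

result = {}
for au_col in au_columns:
    # Old feature: grows with the window length.
    # result[f"{au_col}_sum"] = window_df[au_col].sum()
    # New feature: comparable across windows of different lengths.
    result[f"{au_col}_mean"] = window_df[au_col].mean()

print(result)  # FACE_AU01_mean = 0.625, FACE_AU12_mean = 0.25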

View File

@@ -0,0 +1,56 @@
from pathlib import Path
import pandas as pd


def main():
    """
    USER CONFIGURATION
    ------------------
    Specify input files and output directory here.
    """
    # Input parquet files (single-modality datasets)
    file_modality_1 = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/AU_dataset_mean.parquet")
    file_modality_2 = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/new_eye_dataset.parquet")

    # Output directory and file name
    output_dir = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/")
    output_file = output_dir / "merged_dataset.parquet"

    # Column names (adjust only if your schema differs)
    subject_col = "subjectID"
    time_col = "start_time"

    # ------------------------------------------------------------------
    # Load datasets
    # ------------------------------------------------------------------
    df1 = pd.read_parquet(file_modality_1)
    df2 = pd.read_parquet(file_modality_2)

    # ------------------------------------------------------------------
    # Keep only subjects that appear in BOTH datasets
    # ------------------------------------------------------------------
    common_subjects = set(df1[subject_col]).intersection(df2[subject_col])
    df1 = df1[df1[subject_col].isin(common_subjects)]
    df2 = df2[df2[subject_col].isin(common_subjects)]

    # ------------------------------------------------------------------
    # Inner join on subject ID AND start_time
    # ------------------------------------------------------------------
    merged_df = pd.merge(
        df1,
        df2,
        on=[subject_col, time_col],
        how="inner",
    )

    # ------------------------------------------------------------------
    # Save merged dataset
    # ------------------------------------------------------------------
    output_dir.mkdir(parents=True, exist_ok=True)
    merged_df.to_parquet(output_file, index=False)


if __name__ == "__main__":
    main()
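Because the merge script is still a first version, a short check of the merged file can catch the usual pitfalls: duplicated (subjectID, start_time) pairs inflating the inner join, and non-key columns that exist in both inputs being suffixed with _x/_y by pandas. A minimal sketch (not part of the commit), assuming the output path from above:

import pandas as pd

merged = pd.read_parquet(
    "/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/merged_dataset.parquet"
)
print("merged shape:", merged.shape)

# Duplicated key pairs would indicate a many-to-many join.
n_dupes = merged.duplicated(subset=["subjectID", "start_time"]).sum()
print("duplicate (subjectID, start_time) pairs:", n_dupes)

# Non-key columns present in both inputs are suffixed by pandas.
suffixed = [c for c in merged.columns if c.endswith(("_x", "_y"))]
print("suffixed columns from overlapping names:", suffixed)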

View File

@@ -2,7 +2,6 @@ import numpy as np
import pandas as pd
import h5py
import yaml
-import owncloud
import os
from sklearn.preprocessing import MinMaxScaler
from scipy.signal import welch

View File

@@ -12,8 +12,8 @@ from pygazeanalyser.detectors import fixation_detection, saccade_detection
##############################################################################
# CONFIGURATION - ADJUST HERE!
##############################################################################
-INPUT_DIR = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/parquet_Eye_features_old/")
-OUTPUT_FILE = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/Eye_dataset_old/eye_dataset_old.parquet")
+INPUT_DIR = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/new_ET_Parquet_files/")
+OUTPUT_FILE = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/Eye_dataset_old/new_eye_dataset.parquet")

WINDOW_SIZE_SAMPLES = 12500  # number of samples per window (e.g. 1250 = 50 s at 25 Hz, or 5 s at 250 Hz)
STEP_SIZE_SAMPLES = 1250     # step size (e.g. 125 = 5 s at 25 Hz, or 0.5 s at 250 Hz)
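Assuming the new EYE_ signals are sampled at 250 Hz (an assumption; the comments above only give 25 Hz and 250 Hz as examples), the configured values correspond to 50 s windows advanced in 5 s steps:

SAMPLING_RATE_HZ = 250  # assumed sampling rate of the EYE_ signals

print(12500 / SAMPLING_RATE_HZ)  # 50.0 s per window
print(1250 / SAMPLING_RATE_HZ)   # 5.0 s step, i.e. 90% overlap between consecutive windows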
@@ -28,7 +28,7 @@ def clean_eye_df(df):
    Removes all rows that do not contain real eye-tracking data.
    Solves the problem that the main DataFrame contains NaN rows for other sensors.
    """
-   eye_cols = [c for c in df.columns if ("LEFT_" in c or "RIGHT_" in c)]
+   eye_cols = [c for c in df.columns if c.startswith("EYE_")]
    df_eye = df[eye_cols]

    # INF → NaN
@@ -48,14 +48,14 @@ def extract_gaze_signal(df):
    masks invalid samples and interpolates gaps.
    """
    # Gaze columns
-   gx_L = df["LEFT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy()
-   gy_L = df["LEFT_GAZE_POINT_ON_DISPLAY_AREA_Y"].astype(float).copy()
-   gx_R = df["RIGHT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy()
-   gy_R = df["RIGHT_GAZE_POINT_ON_DISPLAY_AREA_Y"].astype(float).copy()
+   gx_L = df["EYE_LEFT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy()
+   gy_L = df["EYE_LEFT_GAZE_POINT_ON_DISPLAY_AREA_Y"].astype(float).copy()
+   gx_R = df["EYE_RIGHT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy()
+   gy_R = df["EYE_RIGHT_GAZE_POINT_ON_DISPLAY_AREA_Y"].astype(float).copy()

    # Validity columns (1 = valid)
-   val_L = (df["LEFT_GAZE_POINT_VALIDITY"] == 1)
-   val_R = (df["RIGHT_GAZE_POINT_VALIDITY"] == 1)
+   val_L = (df["EYE_LEFT_GAZE_POINT_VALIDITY"] == 1)
+   val_R = (df["EYE_RIGHT_GAZE_POINT_VALIDITY"] == 1)

    # Replace inf with NaN (occurs with Tobii during blinks)
    gx_L.replace([np.inf, -np.inf], np.nan, inplace=True)
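The renaming from LEFT_*/RIGHT_* to EYE_LEFT_*/EYE_RIGHT_* is hard-coded in the hunks above and below. If older parquet files without the EYE_ prefix are still in use, a small helper could resolve either schema; this is a hypothetical sketch, and resolve_column is not part of the repository:

def resolve_column(df, stem):
    """Return the column for `stem`, with or without the EYE_ prefix.

    Hypothetical helper for mixed old/new parquet schemas.
    """
    for candidate in (f"EYE_{stem}", stem):
        if candidate in df.columns:
            return df[candidate]
    raise KeyError(f"neither EYE_{stem} nor {stem} found")

# e.g. gx_L = resolve_column(df, "LEFT_GAZE_POINT_ON_DISPLAY_AREA_X").astype(float)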
@@ -76,18 +76,24 @@ def extract_gaze_signal(df):
    # Interpolation (important for PyGaze!)
    gx = pd.Series(gx).interpolate(limit=50, limit_direction="both").bfill().ffill()
    gy = pd.Series(gy).interpolate(limit=50, limit_direction="both").bfill().ffill()

+   xscaler = MinMaxScaler()
+   gxscale = xscaler.fit_transform(gx.values.reshape(-1, 1))
-   out = np.column_stack((gx, gy))
+   yscaler = MinMaxScaler()
+   gyscale = yscaler.fit_transform(gy.values.reshape(-1, 1))
+   out = np.column_stack((gxscale, gyscale))
    return out


def extract_pupil(df):
    """Extracts pupil size (both eyes averaged)."""
-   pl = df["LEFT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan)
-   pr = df["RIGHT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan)
+   pl = df["EYE_LEFT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan)
+   pr = df["EYE_RIGHT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan)

-   vl = df.get("LEFT_PUPIL_VALIDITY")
-   vr = df.get("RIGHT_PUPIL_VALIDITY")
+   vl = df.get("EYE_LEFT_PUPIL_VALIDITY")
+   vr = df.get("EYE_RIGHT_PUPIL_VALIDITY")

    if vl is None or vr is None:
        validity = (~pl.isna() | ~pr.isna()).astype(int).to_numpy()