Minor fixes in dataset creation

Changed paths to the paulusja ... directory.
Changed AU feature extraction from sum to mean.
Added v1 of the dataset merge script (still needs to be adjusted).
parent 87c5e21daf
commit b8bebc0944
@@ -0,0 +1,91 @@
import os
import pandas as pd
from pathlib import Path

print(os.getcwd())
num_files = 2  # number of files to process (min: 1, max: 30)

print("connection established")

data_dir = Path("/home/jovyan/Fahrsimulator_MSY2526_AI/EDA")
# os.chdir(data_dir)
# Get all .h5 files and sort them
matching_files = sorted(data_dir.glob("*.h5"))

# Chunk size for reading (adjust based on your RAM - 100k rows is ~50-100MB depending on columns)
CHUNK_SIZE = 100_000

for i, file_path in enumerate(matching_files):
    print(f"Subject {i} started")
    print(f"{file_path} opened")

    # Step 1: Get total number of rows and column names
    with pd.HDFStore(file_path, mode="r") as store:
        cols = store.select("SIGNALS", start=0, stop=1).columns
        nrows = store.get_storer("SIGNALS").nrows
    print(f"Total columns: {len(cols)}, Total rows: {nrows}")

    # Step 2: Filter columns that start with "EYE_"
    eye_cols = [c for c in cols if c.startswith("EYE_")]
    print(f"eye-tracking columns found: {eye_cols}")

    if len(eye_cols) == 0:
        print(f"no eye-tracking signals in Subject {i}")
        continue

    # Columns to read
    columns_to_read = ["STUDY", "LEVEL", "PHASE"] + eye_cols

    # Step 3: Process file in chunks
    chunks_to_save = []

    for start_row in range(0, nrows, CHUNK_SIZE):
        stop_row = min(start_row + CHUNK_SIZE, nrows)
        print(f"Processing rows {start_row} to {stop_row} ({stop_row/nrows*100:.1f}%)")

        # Read chunk
        df_chunk = pd.read_hdf(
            file_path,
            key="SIGNALS",
            columns=columns_to_read,
            start=start_row,
            stop=stop_row
        )

        # Add metadata columns
        df_chunk["subjectID"] = i
        df_chunk["rowID"] = range(start_row, stop_row)

        # Clean data
        df_chunk = df_chunk[df_chunk["LEVEL"] != 0]
        df_chunk = df_chunk.dropna()

        # Only keep non-empty chunks
        if len(df_chunk) > 0:
            chunks_to_save.append(df_chunk)

        # Free memory
        del df_chunk

    print("load and cleaning done")

    # Step 4: Combine all chunks and save
    if chunks_to_save:
        df_final = pd.concat(chunks_to_save, ignore_index=True)
        print(f"Final dataframe shape: {df_final.shape}")

        # Save to parquet
        base_dir = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/new_ET_Parquet_files")
        os.makedirs(base_dir, exist_ok=True)

        out_name = base_dir / f"ET_signals_extracted_{i:04d}.parquet"
        df_final.to_parquet(out_name, index=False)
        print(f"Saved to {out_name}")

        # Free memory
        del df_final
        del chunks_to_save
    else:
        print(f"No valid data found for Subject {i}")

print("All files processed!")
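A minimal sketch (not part of the commit) of how the per-subject parquet files written by the script above could be loaded back and sanity-checked. The output directory, file-name pattern, and column names are taken from the script; the checks themselves are illustrative only.

from pathlib import Path
import pandas as pd

# Directory and file pattern as written by the extraction script above
base_dir = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/new_ET_Parquet_files")

for parquet_path in sorted(base_dir.glob("ET_signals_extracted_*.parquet")):
    df = pd.read_parquet(parquet_path)
    eye_cols = [c for c in df.columns if c.startswith("EYE_")]
    # Each file should contain exactly one subjectID and no LEVEL == 0 rows
    print(
        f"{parquet_path.name}: {len(df)} rows, {len(eye_cols)} EYE_ columns, "
        f"subjects={df['subjectID'].unique()}, levels={sorted(df['LEVEL'].unique())}"
    )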
@@ -71,7 +71,8 @@ def process_parquet_files(input_dir, output_file, window_size=1250, step_size=12

        # Aggregate all AU columns (mean per window)
        for au_col in au_columns:
-            result[f'{au_col}_sum'] = window_df[au_col].sum()
+            # result[f'{au_col}_sum'] = window_df[au_col].sum()
+            result[f'{au_col}_mean'] = window_df[au_col].mean()

        all_windows.append(result)

@@ -94,8 +95,8 @@ def process_parquet_files(input_dir, output_file, window_size=1250, step_size=12

# Example usage
if __name__ == "__main__":
    # Adjust to your paths
-    input_directory = r"C:\Users\x\FAUbox\WS2526_Fahrsimulator_MSY (Celina Korzer)\new_AU_parquet_files"
-    output_file = r"C:\Users\x\FAUbox\WS2526_Fahrsimulator_MSY (Celina Korzer)\new_AU_dataset\AU_dataset.parquet"
+    input_directory = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/new_AU_parquet_files")
+    output_file = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/new_AU_dataset_mean/AU_dataset_mean.parquet")

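The hunk above switches the per-window AU feature from a sum to a mean. Below is a minimal, self-contained sketch of that kind of windowed aggregation; the window_size and step_size defaults mirror the function signature, while the FACE_AU column prefix and the start_time bookkeeping are assumptions for illustration, not the repository's exact implementation.

import pandas as pd

def window_au_means(df: pd.DataFrame, window_size: int = 1250, step_size: int = 1250) -> pd.DataFrame:
    """Sliding-window mean per AU column (sketch; assumes AU columns start with 'FACE_AU')."""
    au_columns = [c for c in df.columns if c.startswith("FACE_AU")]
    rows = []
    for start in range(0, len(df) - window_size + 1, step_size):
        window_df = df.iloc[start:start + window_size]
        result = {"start_time": start}  # assumed bookkeeping column, as used by the merge script below
        for au_col in au_columns:
            # mean instead of sum, matching the change in this commit
            result[f"{au_col}_mean"] = window_df[au_col].mean()
        rows.append(result)
    return pd.DataFrame(rows)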
dataset_creation/create_multimodal_dataset.py (new file, 56 lines)
@@ -0,0 +1,56 @@
from pathlib import Path
import pandas as pd


def main():
    """
    USER CONFIGURATION
    ------------------
    Specify input files and output directory here.
    """

    # Input parquet files (single-modality datasets)
    file_modality_1 = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/AU_dataset_mean.parquet")
    file_modality_2 = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/new_eye_dataset.parquet")

    # Output directory and file name
    output_dir = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/")
    output_file = output_dir / "merged_dataset.parquet"

    # Column names (adjust only if your schema differs)
    subject_col = "subjectID"
    time_col = "start_time"

    # ------------------------------------------------------------------
    # Load datasets
    # ------------------------------------------------------------------
    df1 = pd.read_parquet(file_modality_1)
    df2 = pd.read_parquet(file_modality_2)

    # ------------------------------------------------------------------
    # Keep only subjects that appear in BOTH datasets
    # ------------------------------------------------------------------
    common_subjects = set(df1[subject_col]).intersection(df2[subject_col])

    df1 = df1[df1[subject_col].isin(common_subjects)]
    df2 = df2[df2[subject_col].isin(common_subjects)]

    # ------------------------------------------------------------------
    # Inner join on subject ID AND start_time
    # ------------------------------------------------------------------
    merged_df = pd.merge(
        df1,
        df2,
        on=[subject_col, time_col],
        how="inner",
    )

    # ------------------------------------------------------------------
    # Save merged dataset
    # ------------------------------------------------------------------
    output_dir.mkdir(parents=True, exist_ok=True)
    merged_df.to_parquet(output_file, index=False)


if __name__ == "__main__":
    main()
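Since the merge above is a plain inner join on subjectID and start_time, duplicated keys in either input would multiply rows, and windows present in only one modality are silently dropped. A small optional check along these lines (not part of the commit; paths and column names taken from the script above) could be run after main() to verify the result:

import pandas as pd

merged = pd.read_parquet("/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/merged_dataset.parquet")

keys = ["subjectID", "start_time"]
n_dup = merged.duplicated(subset=keys).sum()
print(f"{len(merged)} merged rows, {merged['subjectID'].nunique()} subjects, "
      f"{n_dup} duplicated (subjectID, start_time) keys")

# Inspect per-subject coverage left after the inner join
print(merged.groupby("subjectID").size().describe())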
@@ -2,7 +2,6 @@ import numpy as np
import pandas as pd
import h5py
import yaml
import owncloud
import os
from sklearn.preprocessing import MinMaxScaler
from scipy.signal import welch

@@ -12,8 +12,8 @@ from pygazeanalyser.detectors import fixation_detection, saccade_detection
##############################################################################
# CONFIGURATION - ADJUST HERE!
##############################################################################
-INPUT_DIR = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/parquet_Eye_features_old/")
-OUTPUT_FILE = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/Eye_dataset_old/eye_dataset_old.parquet")
+INPUT_DIR = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/new_ET_Parquet_files/")
+OUTPUT_FILE = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/Eye_dataset_old/new_eye_dataset.parquet")

WINDOW_SIZE_SAMPLES = 12500  # number of samples per window (e.g. 1250 = 50 s at 25 Hz, or 5 s at 250 Hz)
STEP_SIZE_SAMPLES = 1250  # step size (e.g. 125 = 5 s at 25 Hz, or 0.5 s at 250 Hz)
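The window and step comments above convert samples to seconds by dividing by the sampling rate. The snippet below only illustrates that arithmetic; the actual recording rate is not stated in this diff, and 250 Hz is used purely as an assumed example:

# samples -> seconds at a given sampling rate (250 Hz is an assumption for illustration)
def samples_to_seconds(n_samples: int, sampling_rate_hz: float) -> float:
    return n_samples / sampling_rate_hz

print(samples_to_seconds(12500, 250.0))  # 50.0 s per window at 250 Hz
print(samples_to_seconds(1250, 250.0))   # 5.0 s step at 250 Hz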
@@ -28,7 +28,7 @@ def clean_eye_df(df):
    Removes all rows that do not contain real eye-tracking data.
    Solves the problem that the main DataFrame contains NaN rows for other sensors.
    """
-    eye_cols = [c for c in df.columns if ("LEFT_" in c or "RIGHT_" in c)]
+    eye_cols = [c for c in df.columns if c.startswith("EYE_")]
    df_eye = df[eye_cols]

    # INF → NaN
@@ -48,14 +48,14 @@ def extract_gaze_signal(df):
    masks invalid samples and interpolates gaps.
    """
    # Gaze columns
-    gx_L = df["LEFT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy()
-    gy_L = df["LEFT_GAZE_POINT_ON_DISPLAY_AREA_Y"].astype(float).copy()
-    gx_R = df["RIGHT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy()
-    gy_R = df["RIGHT_GAZE_POINT_ON_DISPLAY_AREA_Y"].astype(float).copy()
+    gx_L = df["EYE_LEFT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy()
+    gy_L = df["EYE_LEFT_GAZE_POINT_ON_DISPLAY_AREA_Y"].astype(float).copy()
+    gx_R = df["EYE_RIGHT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy()
+    gy_R = df["EYE_RIGHT_GAZE_POINT_ON_DISPLAY_AREA_Y"].astype(float).copy()

    # Validity columns (1 = valid)
-    val_L = (df["LEFT_GAZE_POINT_VALIDITY"] == 1)
-    val_R = (df["RIGHT_GAZE_POINT_VALIDITY"] == 1)
+    val_L = (df["EYE_LEFT_GAZE_POINT_VALIDITY"] == 1)
+    val_R = (df["EYE_RIGHT_GAZE_POINT_VALIDITY"] == 1)

    # Replace Inf with NaN (occurs in Tobii data during blinks)
    gx_L.replace([np.inf, -np.inf], np.nan, inplace=True)
@@ -76,18 +76,24 @@
    # Interpolation (important for PyGaze!)
    gx = pd.Series(gx).interpolate(limit=50, limit_direction="both").bfill().ffill()
    gy = pd.Series(gy).interpolate(limit=50, limit_direction="both").bfill().ffill()

-    out = np.column_stack((gx, gy))
+    xscaler = MinMaxScaler()
+    gxscale = xscaler.fit_transform(gx.values.reshape(-1, 1))
+
+    yscaler = MinMaxScaler()
+    gyscale = yscaler.fit_transform(gy.values.reshape(-1, 1))
+
+    out = np.column_stack((gxscale, gyscale))
    return out


def extract_pupil(df):
    """Extracts pupil size (averaged over both eyes)."""
-    pl = df["LEFT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan)
-    pr = df["RIGHT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan)
+    pl = df["EYE_LEFT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan)
+    pr = df["EYE_RIGHT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan)

-    vl = df.get("LEFT_PUPIL_VALIDITY")
-    vr = df.get("RIGHT_PUPIL_VALIDITY")
+    vl = df.get("EYE_LEFT_PUPIL_VALIDITY")
+    vr = df.get("EYE_RIGHT_PUPIL_VALIDITY")

    if vl is None or vr is None:
        validity = (~pl.isna() | ~pr.isna()).astype(int).to_numpy()