From b8bebc0944a1e1d1d300343134fb7185c6d50130 Mon Sep 17 00:00:00 2001
From: Michael
Date: Thu, 18 Dec 2025 13:04:11 +0100
Subject: [PATCH] minor fixes in dataset creation

changed paths to paulusja ... directory
changed feature extraction for AUs to mean instead of sum
added v1 of merge script of datasets (needs to be adjusted)
---
 ...wise_parquet_file_creation_EYE_TRACKING.py | 91 +++++++++++++++++++
 ...hunkwise_parquet_file_creation_FACE_AU.py} |  0
 dataset_creation/create_feature_table.py      |  7 +-
 dataset_creation/create_multimodal_dataset.py | 56 ++++++++++++
 dataset_creation/eyeAlt.py                    |  1 -
 dataset_creation/eye_batch_processor.py       | 34 ++++---
 6 files changed, 171 insertions(+), 18 deletions(-)
 create mode 100644 dataset_creation/chunkwise_parquet_file_creation_EYE_TRACKING.py
 rename dataset_creation/{chunkwise_parquet_file_creation.py => chunkwise_parquet_file_creation_FACE_AU.py} (100%)
 create mode 100644 dataset_creation/create_multimodal_dataset.py
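
The switch from per-window sums to per-window means for the AU features (see the create_feature_table.py hunk below) makes windows comparable even when they hold different numbers of valid samples after cleaning. A minimal sketch of that aggregation, assuming hypothetical AU columns and the 1250-sample window used elsewhere in the pipeline:

    import numpy as np
    import pandas as pd

    # Toy frame standing in for one subject's AU signals; column names are placeholders.
    df = pd.DataFrame({"AU01": np.random.rand(5000), "AU02": np.random.rand(5000)})

    window_size = 1250
    rows = []
    for start in range(0, len(df), window_size):
        window = df.iloc[start:start + window_size]
        row = {"start_row": start}
        for col in ["AU01", "AU02"]:
            # Mean rather than sum: independent of how many samples the window holds.
            row[f"{col}_mean"] = window[col].mean()
        rows.append(row)

    feature_table = pd.DataFrame(rows)
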
f"ET_signals_extracted_{i:04d}.parquet" + df_final.to_parquet(out_name, index=False) + print(f"Saved to {out_name}") + + # Free memory + del df_final + del chunks_to_save + else: + print(f"No valid data found for Subject {i}") + +print("All files processed!") \ No newline at end of file diff --git a/dataset_creation/chunkwise_parquet_file_creation.py b/dataset_creation/chunkwise_parquet_file_creation_FACE_AU.py similarity index 100% rename from dataset_creation/chunkwise_parquet_file_creation.py rename to dataset_creation/chunkwise_parquet_file_creation_FACE_AU.py diff --git a/dataset_creation/create_feature_table.py b/dataset_creation/create_feature_table.py index 86ee7b8..54e7892 100644 --- a/dataset_creation/create_feature_table.py +++ b/dataset_creation/create_feature_table.py @@ -71,7 +71,8 @@ def process_parquet_files(input_dir, output_file, window_size=1250, step_size=12 # Summiere alle AU-Spalten for au_col in au_columns: - result[f'{au_col}_sum'] = window_df[au_col].sum() + # result[f'{au_col}_sum'] = window_df[au_col].sum() + result[f'{au_col}_mean'] = window_df[au_col].mean() all_windows.append(result) @@ -94,8 +95,8 @@ def process_parquet_files(input_dir, output_file, window_size=1250, step_size=12 # Beispiel-Verwendung if __name__ == "__main__": # Anpassen an deine Pfade - input_directory = r"C:\Users\x\FAUbox\WS2526_Fahrsimulator_MSY (Celina Korzer)\new_AU_parquet_files" - output_file = r"C:\Users\x\FAUbox\WS2526_Fahrsimulator_MSY (Celina Korzer)\new_AU_dataset\AU_dataset.parquet" + input_directory = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/new_AU_parquet_files") + output_file = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/new_AU_dataset_mean/AU_dataset_mean.parquet") diff --git a/dataset_creation/create_multimodal_dataset.py b/dataset_creation/create_multimodal_dataset.py new file mode 100644 index 0000000..a81a242 --- /dev/null +++ b/dataset_creation/create_multimodal_dataset.py @@ -0,0 +1,56 @@ +from pathlib import Path +import pandas as pd + + +def main(): + """ + USER CONFIGURATION + ------------------ + Specify input files and output directory here. 
+ """ + + # Input parquet files (single-modality datasets) + file_modality_1 = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/AU_dataset_mean.parquet") + file_modality_2 = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/new_eye_dataset.parquet") + + # Output directory and file name + output_dir = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/") + output_file = output_dir / "merged_dataset.parquet" + + # Column names (adjust only if your schema differs) + subject_col = "subjectID" + time_col = "start_time" + + # ------------------------------------------------------------------ + # Load datasets + # ------------------------------------------------------------------ + df1 = pd.read_parquet(file_modality_1) + df2 = pd.read_parquet(file_modality_2) + + # ------------------------------------------------------------------ + # Keep only subjects that appear in BOTH datasets + # ------------------------------------------------------------------ + common_subjects = set(df1[subject_col]).intersection(df2[subject_col]) + + df1 = df1[df1[subject_col].isin(common_subjects)] + df2 = df2[df2[subject_col].isin(common_subjects)] + + # ------------------------------------------------------------------ + # Inner join on subject ID AND start_time + # ------------------------------------------------------------------ + merged_df = pd.merge( + df1, + df2, + on=[subject_col, time_col], + how="inner", + ) + + # ------------------------------------------------------------------ + # Save merged dataset + # ------------------------------------------------------------------ + output_dir.mkdir(parents=True, exist_ok=True) + merged_df.to_parquet(output_file, index=False) + + +if __name__ == "__main__": + main() diff --git a/dataset_creation/eyeAlt.py b/dataset_creation/eyeAlt.py index 1f685a9..fef68ad 100644 --- a/dataset_creation/eyeAlt.py +++ b/dataset_creation/eyeAlt.py @@ -2,7 +2,6 @@ import numpy as np import pandas as pd import h5py import yaml -import owncloud import os from sklearn.preprocessing import MinMaxScaler from scipy.signal import welch diff --git a/dataset_creation/eye_batch_processor.py b/dataset_creation/eye_batch_processor.py index 09b906d..8192147 100644 --- a/dataset_creation/eye_batch_processor.py +++ b/dataset_creation/eye_batch_processor.py @@ -12,8 +12,8 @@ from pygazeanalyser.detectors import fixation_detection, saccade_detection ############################################################################## # KONFIGURATION - HIER ANPASSEN! ############################################################################## -INPUT_DIR = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/parquet_Eye_features_old/") -OUTPUT_FILE = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/Eye_dataset_old/eye_dataset_old.parquet") +INPUT_DIR = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/new_ET_Parquet_files/") +OUTPUT_FILE = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/Eye_dataset_old/new_eye_dataset.parquet") WINDOW_SIZE_SAMPLES = 12500 # Anzahl Samples pro Window (z.B. 1250 = 50s bei 25Hz, oder 5s bei 250Hz) STEP_SIZE_SAMPLES = 1250 # Schrittweite (z.B. 125 = 5s bei 25Hz, oder 0.5s bei 250Hz) @@ -28,7 +28,7 @@ def clean_eye_df(df): Entfernt alle Zeilen, die keine echten Eyetracking-Daten enthalten. Löst das Problem, dass das Haupt-DataFrame NaN-Zeilen für andere Sensoren enthält. 
""" - eye_cols = [c for c in df.columns if ("LEFT_" in c or "RIGHT_" in c)] + eye_cols = [c for c in df.columns if c.startswith("EYE_")] df_eye = df[eye_cols] # INF → NaN @@ -48,14 +48,14 @@ def extract_gaze_signal(df): maskiert ungültige Samples und interpoliert Lücken. """ # Gaze-Spalten - gx_L = df["LEFT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy() - gy_L = df["LEFT_GAZE_POINT_ON_DISPLAY_AREA_Y"].astype(float).copy() - gx_R = df["RIGHT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy() - gy_R = df["RIGHT_GAZE_POINT_ON_DISPLAY_AREA_Y"].astype(float).copy() + gx_L = df["EYE_LEFT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy() + gy_L = df["EYE_LEFT_GAZE_POINT_ON_DISPLAY_AREA_Y"].astype(float).copy() + gx_R = df["EYE_RIGHT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy() + gy_R = df["EYE_RIGHT_GAZE_POINT_ON_DISPLAY_AREA_Y"].astype(float).copy() # Validity-Spalten (1 = gültig) - val_L = (df["LEFT_GAZE_POINT_VALIDITY"] == 1) - val_R = (df["RIGHT_GAZE_POINT_VALIDITY"] == 1) + val_L = (df["EYE_LEFT_GAZE_POINT_VALIDITY"] == 1) + val_R = (df["EYE_RIGHT_GAZE_POINT_VALIDITY"] == 1) # Inf ersetzen mit NaN (kommt bei Tobii bei Blinks vor) gx_L.replace([np.inf, -np.inf], np.nan, inplace=True) @@ -76,18 +76,24 @@ def extract_gaze_signal(df): # Interpolation (wichtig für PyGaze!) gx = pd.Series(gx).interpolate(limit=50, limit_direction="both").bfill().ffill() gy = pd.Series(gy).interpolate(limit=50, limit_direction="both").bfill().ffill() + + xscaler = MinMaxScaler() + gxscale = xscaler.fit_transform(gx.values.reshape(-1, 1)) - out = np.column_stack((gx, gy)) + yscaler = MinMaxScaler() + gyscale = yscaler.fit_transform(gy.values.reshape(-1, 1)) + + out = np.column_stack((gxscale, gyscale)) return out def extract_pupil(df): """Extrahiert Pupillengröße (beide Augen gemittelt).""" - pl = df["LEFT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan) - pr = df["RIGHT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan) + pl = df["EYE_LEFT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan) + pr = df["EYE_RIGHT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan) - vl = df.get("LEFT_PUPIL_VALIDITY") - vr = df.get("RIGHT_PUPIL_VALIDITY") + vl = df.get("EYE_LEFT_PUPIL_VALIDITY") + vr = df.get("EYE_RIGHT_PUPIL_VALIDITY") if vl is None or vr is None: validity = (~pl.isna() | ~pr.isna()).astype(int).to_numpy()