minor fixes in dataset creation

changed paths to paulusja ... directory
changed feature extraction for AUs to mean instead of sum
added v1 of the dataset merge script (still needs to be adjusted)
Michael Weig 2025-12-18 13:04:11 +01:00
parent 87c5e21daf
commit b8bebc0944
6 changed files with 171 additions and 18 deletions

View File

@@ -0,0 +1,91 @@
import os
import pandas as pd
from pathlib import Path

print(os.getcwd())

num_files = 2  # number of files to process (min: 1, max: 30)
print("connection established")

data_dir = Path("/home/jovyan/Fahrsimulator_MSY2526_AI/EDA")
# os.chdir(data_dir)

# Get all .h5 files and sort them
matching_files = sorted(data_dir.glob("*.h5"))

# Chunk size for reading (adjust based on your RAM - 100k rows is ~50-100MB depending on columns)
CHUNK_SIZE = 100_000

for i, file_path in enumerate(matching_files):
    print(f"Subject {i} started")
    print(f"{file_path} opened")

    # Step 1: Get total number of rows and column names
    with pd.HDFStore(file_path, mode="r") as store:
        cols = store.select("SIGNALS", start=0, stop=1).columns
        nrows = store.get_storer("SIGNALS").nrows
    print(f"Total columns: {len(cols)}, Total rows: {nrows}")

    # Step 2: Filter columns that start with "EYE_"
    eye_cols = [c for c in cols if c.startswith("EYE_")]
    print(f"eye-tracking columns found: {eye_cols}")
    if len(eye_cols) == 0:
        print(f"no eye-tracking signals in Subject {i}")
        continue

    # Columns to read
    columns_to_read = ["STUDY", "LEVEL", "PHASE"] + eye_cols

    # Step 3: Process file in chunks
    chunks_to_save = []
    for start_row in range(0, nrows, CHUNK_SIZE):
        stop_row = min(start_row + CHUNK_SIZE, nrows)
        print(f"Processing rows {start_row} to {stop_row} ({stop_row/nrows*100:.1f}%)")

        # Read chunk
        df_chunk = pd.read_hdf(
            file_path,
            key="SIGNALS",
            columns=columns_to_read,
            start=start_row,
            stop=stop_row
        )

        # Add metadata columns
        df_chunk["subjectID"] = i
        df_chunk["rowID"] = range(start_row, stop_row)

        # Clean data
        df_chunk = df_chunk[df_chunk["LEVEL"] != 0]
        df_chunk = df_chunk.dropna()

        # Only keep non-empty chunks
        if len(df_chunk) > 0:
            chunks_to_save.append(df_chunk)

        # Free memory
        del df_chunk

    print("load and cleaning done")

    # Step 4: Combine all chunks and save
    if chunks_to_save:
        df_final = pd.concat(chunks_to_save, ignore_index=True)
        print(f"Final dataframe shape: {df_final.shape}")

        # Save to parquet
        base_dir = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/new_ET_Parquet_files")
        os.makedirs(base_dir, exist_ok=True)
        out_name = base_dir / f"ET_signals_extracted_{i:04d}.parquet"
        df_final.to_parquet(out_name, index=False)
        print(f"Saved to {out_name}")

        # Free memory
        del df_final
        del chunks_to_save
    else:
        print(f"No valid data found for Subject {i}")

print("All files processed!")

View File

@@ -71,7 +71,8 @@ def process_parquet_files(input_dir, output_file, window_size=1250, step_size=12
        # Sum all AU columns
        for au_col in au_columns:
-           result[f'{au_col}_sum'] = window_df[au_col].sum()
+           # result[f'{au_col}_sum'] = window_df[au_col].sum()
+           result[f'{au_col}_mean'] = window_df[au_col].mean()

        all_windows.append(result)
@@ -94,8 +95,8 @@ def process_parquet_files(input_dir, output_file, window_size=1250, step_size=12
# Example usage
if __name__ == "__main__":
    # Adjust to your paths
-   input_directory = r"C:\Users\x\FAUbox\WS2526_Fahrsimulator_MSY (Celina Korzer)\new_AU_parquet_files"
-   output_file = r"C:\Users\x\FAUbox\WS2526_Fahrsimulator_MSY (Celina Korzer)\new_AU_dataset\AU_dataset.parquet"
+   input_directory = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/new_AU_parquet_files")
+   output_file = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/new_AU_dataset_mean/AU_dataset_mean.parquet")
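The switch from sum to mean makes the per-window AU features independent of the number of samples in a window. A toy illustration of the changed lines (not part of the commit; the AU column names and values are made up, while window_df, au_columns and result mirror the names used in process_parquet_files):

import pandas as pd

# Made-up window with two AU columns.
window_df = pd.DataFrame({
    "FACE_AU01": [0.25, 0.5, 0.75, 1.0],
    "FACE_AU12": [0.0, 0.25, 0.25, 0.5],
})
au_columns = [c for c in window_df.columns if c.startswith("FACE_AU")]

result = {}
for au_col in au_columns:
    # Old feature: grows with the window length.
    # result[f"{au_col}_sum"] = window_df[au_col].sum()
    # New feature: comparable across windows of different lengths.
    result[f"{au_col}_mean"] = window_df[au_col].mean()

print(result)  # FACE_AU01_mean = 0.625, FACE_AU12_mean = 0.25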

View File

@@ -0,0 +1,56 @@
from pathlib import Path
import pandas as pd


def main():
    """
    USER CONFIGURATION
    ------------------
    Specify input files and output directory here.
    """
    # Input parquet files (single-modality datasets)
    file_modality_1 = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/AU_dataset_mean.parquet")
    file_modality_2 = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/new_eye_dataset.parquet")

    # Output directory and file name
    output_dir = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/")
    output_file = output_dir / "merged_dataset.parquet"

    # Column names (adjust only if your schema differs)
    subject_col = "subjectID"
    time_col = "start_time"

    # ------------------------------------------------------------------
    # Load datasets
    # ------------------------------------------------------------------
    df1 = pd.read_parquet(file_modality_1)
    df2 = pd.read_parquet(file_modality_2)

    # ------------------------------------------------------------------
    # Keep only subjects that appear in BOTH datasets
    # ------------------------------------------------------------------
    common_subjects = set(df1[subject_col]).intersection(df2[subject_col])
    df1 = df1[df1[subject_col].isin(common_subjects)]
    df2 = df2[df2[subject_col].isin(common_subjects)]

    # ------------------------------------------------------------------
    # Inner join on subject ID AND start_time
    # ------------------------------------------------------------------
    merged_df = pd.merge(
        df1,
        df2,
        on=[subject_col, time_col],
        how="inner",
    )

    # ------------------------------------------------------------------
    # Save merged dataset
    # ------------------------------------------------------------------
    output_dir.mkdir(parents=True, exist_ok=True)
    merged_df.to_parquet(output_file, index=False)


if __name__ == "__main__":
    main()
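Because the merge script is still a first version, a short check of the merged file can catch the usual pitfalls: duplicated (subjectID, start_time) pairs inflating the inner join, and non-key columns that exist in both inputs being suffixed with _x/_y by pandas. A minimal sketch (not part of the commit), assuming the output path from above:

import pandas as pd

merged = pd.read_parquet(
    "/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/merged_dataset.parquet"
)
print("merged shape:", merged.shape)

# Duplicated key pairs would indicate a many-to-many join.
n_dupes = merged.duplicated(subset=["subjectID", "start_time"]).sum()
print("duplicate (subjectID, start_time) pairs:", n_dupes)

# Non-key columns present in both inputs are suffixed by pandas.
suffixed = [c for c in merged.columns if c.endswith(("_x", "_y"))]
print("suffixed columns from overlapping names:", suffixed)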

View File

@@ -2,7 +2,6 @@ import numpy as np
import pandas as pd
import h5py
import yaml
-import owncloud
import os
from sklearn.preprocessing import MinMaxScaler
from scipy.signal import welch

View File

@@ -12,8 +12,8 @@ from pygazeanalyser.detectors import fixation_detection, saccade_detection
##############################################################################
# CONFIGURATION - ADJUST HERE!
##############################################################################
-INPUT_DIR = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/parquet_Eye_features_old/")
-OUTPUT_FILE = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/Eye_dataset_old/eye_dataset_old.parquet")
+INPUT_DIR = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/new_ET_Parquet_files/")
+OUTPUT_FILE = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/Eye_dataset_old/new_eye_dataset.parquet")

WINDOW_SIZE_SAMPLES = 12500  # number of samples per window (e.g. 1250 = 50 s at 25 Hz, or 5 s at 250 Hz)
STEP_SIZE_SAMPLES = 1250     # step size (e.g. 125 = 5 s at 25 Hz, or 0.5 s at 250 Hz)
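Assuming the new EYE_ signals are sampled at 250 Hz (an assumption; the comments above only give 25 Hz and 250 Hz as examples), the configured values correspond to 50 s windows advanced in 5 s steps:

SAMPLING_RATE_HZ = 250  # assumed sampling rate of the EYE_ signals

print(12500 / SAMPLING_RATE_HZ)  # 50.0 s per window
print(1250 / SAMPLING_RATE_HZ)   # 5.0 s step, i.e. 90% overlap between consecutive windows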
@@ -28,7 +28,7 @@ def clean_eye_df(df):
    Removes all rows that do not contain real eye-tracking data.
    Solves the problem that the main DataFrame contains NaN rows for other sensors.
    """
-   eye_cols = [c for c in df.columns if ("LEFT_" in c or "RIGHT_" in c)]
+   eye_cols = [c for c in df.columns if c.startswith("EYE_")]
    df_eye = df[eye_cols]

    # INF → NaN
@@ -48,14 +48,14 @@ def extract_gaze_signal(df):
    masks invalid samples and interpolates gaps.
    """
    # Gaze columns
-   gx_L = df["LEFT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy()
-   gy_L = df["LEFT_GAZE_POINT_ON_DISPLAY_AREA_Y"].astype(float).copy()
-   gx_R = df["RIGHT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy()
-   gy_R = df["RIGHT_GAZE_POINT_ON_DISPLAY_AREA_Y"].astype(float).copy()
+   gx_L = df["EYE_LEFT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy()
+   gy_L = df["EYE_LEFT_GAZE_POINT_ON_DISPLAY_AREA_Y"].astype(float).copy()
+   gx_R = df["EYE_RIGHT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy()
+   gy_R = df["EYE_RIGHT_GAZE_POINT_ON_DISPLAY_AREA_Y"].astype(float).copy()

    # Validity columns (1 = valid)
-   val_L = (df["LEFT_GAZE_POINT_VALIDITY"] == 1)
-   val_R = (df["RIGHT_GAZE_POINT_VALIDITY"] == 1)
+   val_L = (df["EYE_LEFT_GAZE_POINT_VALIDITY"] == 1)
+   val_R = (df["EYE_RIGHT_GAZE_POINT_VALIDITY"] == 1)

    # Replace inf with NaN (occurs with Tobii during blinks)
    gx_L.replace([np.inf, -np.inf], np.nan, inplace=True)
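The renaming from LEFT_*/RIGHT_* to EYE_LEFT_*/EYE_RIGHT_* is hard-coded in the hunks above and below. If older parquet files without the EYE_ prefix are still in use, a small helper could resolve either schema; this is a hypothetical sketch, and resolve_column is not part of the repository:

def resolve_column(df, stem):
    """Return the column for `stem`, with or without the EYE_ prefix.

    Hypothetical helper for mixed old/new parquet schemas.
    """
    for candidate in (f"EYE_{stem}", stem):
        if candidate in df.columns:
            return df[candidate]
    raise KeyError(f"neither EYE_{stem} nor {stem} found")

# e.g. gx_L = resolve_column(df, "LEFT_GAZE_POINT_ON_DISPLAY_AREA_X").astype(float)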
@@ -76,18 +76,24 @@ def extract_gaze_signal(df):
    # Interpolation (important for PyGaze!)
    gx = pd.Series(gx).interpolate(limit=50, limit_direction="both").bfill().ffill()
    gy = pd.Series(gy).interpolate(limit=50, limit_direction="both").bfill().ffill()

+   xscaler = MinMaxScaler()
+   gxscale = xscaler.fit_transform(gx.values.reshape(-1, 1))
-   out = np.column_stack((gx, gy))
+   yscaler = MinMaxScaler()
+   gyscale = yscaler.fit_transform(gy.values.reshape(-1, 1))
+   out = np.column_stack((gxscale, gyscale))
    return out


def extract_pupil(df):
    """Extracts pupil size (both eyes averaged)."""
-   pl = df["LEFT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan)
-   pr = df["RIGHT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan)
+   pl = df["EYE_LEFT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan)
+   pr = df["EYE_RIGHT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan)

-   vl = df.get("LEFT_PUPIL_VALIDITY")
-   vr = df.get("RIGHT_PUPIL_VALIDITY")
+   vl = df.get("EYE_LEFT_PUPIL_VALIDITY")
+   vr = df.get("EYE_RIGHT_PUPIL_VALIDITY")

    if vl is None or vr is None:
        validity = (~pl.isna() | ~pr.isna()).astype(int).to_numpy()