diff --git a/dataset_creation/chunkwise_parquet_file_creation_EYE_TRACKING.py b/dataset_creation/chunkwise_parquet_file_creation_EYE_TRACKING.py
new file mode 100644
index 0000000..64b1ae6
--- /dev/null
+++ b/dataset_creation/chunkwise_parquet_file_creation_EYE_TRACKING.py
@@ -0,0 +1,91 @@
+import os
+import pandas as pd
+from pathlib import Path
+
+print(os.getcwd())
+num_files = 2  # number of files to process (min: 1, max: 30)
+
+print("connection established")
+
+data_dir = Path("/home/jovyan/Fahrsimulator_MSY2526_AI/EDA")
+# os.chdir(data_dir)
+# Get all .h5 files and sort them
+matching_files = sorted(data_dir.glob("*.h5"))
+
+# Chunk size for reading (adjust based on your RAM - 100k rows is ~50-100MB depending on columns)
+CHUNK_SIZE = 100_000
+
+for i, file_path in enumerate(matching_files):
+    print(f"Subject {i} started")
+    print(f"{file_path} opened")
+
+    # Step 1: Get total number of rows and column names
+    with pd.HDFStore(file_path, mode="r") as store:
+        cols = store.select("SIGNALS", start=0, stop=1).columns
+        nrows = store.get_storer("SIGNALS").nrows
+    print(f"Total columns: {len(cols)}, Total rows: {nrows}")
+
+    # Step 2: Filter columns that start with "EYE_"
+    eye_cols = [c for c in cols if c.startswith("EYE_")]
+    print(f"eye-tracking columns found: {eye_cols}")
+
+    if len(eye_cols) == 0:
+        print(f"no eye-tracking signals in Subject {i}")
+        continue
+
+    # Columns to read
+    columns_to_read = ["STUDY", "LEVEL", "PHASE"] + eye_cols
+
+    # Step 3: Process file in chunks
+    chunks_to_save = []
+
+    for start_row in range(0, nrows, CHUNK_SIZE):
+        stop_row = min(start_row + CHUNK_SIZE, nrows)
+        print(f"Processing rows {start_row} to {stop_row} ({stop_row/nrows*100:.1f}%)")
+
+        # Read chunk
+        df_chunk = pd.read_hdf(
+            file_path,
+            key="SIGNALS",
+            columns=columns_to_read,
+            start=start_row,
+            stop=stop_row
+        )
+
+        # Add metadata columns
+        df_chunk["subjectID"] = i
+        df_chunk["rowID"] = range(start_row, stop_row)
+
+        # Clean data
+        df_chunk = df_chunk[df_chunk["LEVEL"] != 0]
+        df_chunk = df_chunk.dropna()
+
+        # Only keep non-empty chunks
+        if len(df_chunk) > 0:
+            chunks_to_save.append(df_chunk)
+
+        # Free memory
+        del df_chunk
+
+    print("loading and cleaning done")
+
+    # Step 4: Combine all chunks and save
+    if chunks_to_save:
+        df_final = pd.concat(chunks_to_save, ignore_index=True)
+        print(f"Final dataframe shape: {df_final.shape}")
+
+        # Save to parquet
+        base_dir = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/new_ET_Parquet_files")
+        os.makedirs(base_dir, exist_ok=True)
+
+        out_name = base_dir / f"ET_signals_extracted_{i:04d}.parquet"
+        df_final.to_parquet(out_name, index=False)
+        print(f"Saved to {out_name}")
+
+        # Free memory
+        del df_final
+        del chunks_to_save
+    else:
+        print(f"No valid data found for Subject {i}")
+
+print("All files processed!")
\ No newline at end of file
diff --git a/dataset_creation/chunkwise_parquet_file_creation.py b/dataset_creation/chunkwise_parquet_file_creation_FACE_AU.py
similarity index 100%
rename from dataset_creation/chunkwise_parquet_file_creation.py
rename to dataset_creation/chunkwise_parquet_file_creation_FACE_AU.py
diff --git a/dataset_creation/create_feature_table.py b/dataset_creation/create_feature_table.py
index 86ee7b8..54e7892 100644
--- a/dataset_creation/create_feature_table.py
+++ b/dataset_creation/create_feature_table.py
@@ -71,7 +71,8 @@ def process_parquet_files(input_dir, output_file, window_size=1250, step_size=12
 
             # Summiere alle AU-Spalten
             for au_col in au_columns:
-                result[f'{au_col}_sum'] = window_df[au_col].sum()
+                # result[f'{au_col}_sum'] = window_df[au_col].sum()
+                result[f'{au_col}_mean'] = window_df[au_col].mean()
 
             all_windows.append(result)
 
@@ -94,8 +95,8 @@ def process_parquet_files(input_dir, output_file, window_size=1250, step_size=12
 # Beispiel-Verwendung
 if __name__ == "__main__":
     # Anpassen an deine Pfade
-    input_directory = r"C:\Users\x\FAUbox\WS2526_Fahrsimulator_MSY (Celina Korzer)\new_AU_parquet_files"
-    output_file = r"C:\Users\x\FAUbox\WS2526_Fahrsimulator_MSY (Celina Korzer)\new_AU_dataset\AU_dataset.parquet"
+    input_directory = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/new_AU_parquet_files")
+    output_file = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/new_AU_dataset_mean/AU_dataset_mean.parquet")
 
 
 
diff --git a/dataset_creation/create_multimodal_dataset.py b/dataset_creation/create_multimodal_dataset.py
new file mode 100644
index 0000000..a81a242
--- /dev/null
+++ b/dataset_creation/create_multimodal_dataset.py
@@ -0,0 +1,56 @@
+from pathlib import Path
+import pandas as pd
+
+
+def main():
+    """
+    USER CONFIGURATION
+    ------------------
+    Specify input files and output directory here.
+    """
+
+    # Input parquet files (single-modality datasets)
+    file_modality_1 = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/AU_dataset_mean.parquet")
+    file_modality_2 = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/new_eye_dataset.parquet")
+
+    # Output directory and file name
+    output_dir = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/")
+    output_file = output_dir / "merged_dataset.parquet"
+
+    # Column names (adjust only if your schema differs)
+    subject_col = "subjectID"
+    time_col = "start_time"
+
+    # ------------------------------------------------------------------
+    # Load datasets
+    # ------------------------------------------------------------------
+    df1 = pd.read_parquet(file_modality_1)
+    df2 = pd.read_parquet(file_modality_2)
+
+    # ------------------------------------------------------------------
+    # Keep only subjects that appear in BOTH datasets
+    # ------------------------------------------------------------------
+    common_subjects = set(df1[subject_col]).intersection(df2[subject_col])
+
+    df1 = df1[df1[subject_col].isin(common_subjects)]
+    df2 = df2[df2[subject_col].isin(common_subjects)]
+
+    # ------------------------------------------------------------------
+    # Inner join on subject ID AND start_time
+    # ------------------------------------------------------------------
+    merged_df = pd.merge(
+        df1,
+        df2,
+        on=[subject_col, time_col],
+        how="inner",
+    )
+
+    # ------------------------------------------------------------------
+    # Save merged dataset
+    # ------------------------------------------------------------------
+    output_dir.mkdir(parents=True, exist_ok=True)
+    merged_df.to_parquet(output_file, index=False)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/dataset_creation/eyeAlt.py b/dataset_creation/eyeAlt.py
index 1f685a9..fef68ad 100644
--- a/dataset_creation/eyeAlt.py
+++ b/dataset_creation/eyeAlt.py
@@ -2,7 +2,6 @@ import numpy as np
 import pandas as pd
 import h5py
 import yaml
-import owncloud
 import os
 from sklearn.preprocessing import MinMaxScaler
 from scipy.signal import welch
diff --git a/dataset_creation/eye_batch_processor.py b/dataset_creation/eye_batch_processor.py
index 09b906d..8192147 100644
--- a/dataset_creation/eye_batch_processor.py
+++ b/dataset_creation/eye_batch_processor.py
@@ -12,8 +12,8 @@ from pygazeanalyser.detectors import fixation_detection, saccade_detection
 ##############################################################################
 # KONFIGURATION - HIER ANPASSEN!
 ##############################################################################
-INPUT_DIR = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/parquet_Eye_features_old/")
-OUTPUT_FILE = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/Eye_dataset_old/eye_dataset_old.parquet")
+INPUT_DIR = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/new_ET_Parquet_files/")
+OUTPUT_FILE = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/Eye_dataset_old/new_eye_dataset.parquet")
 
 WINDOW_SIZE_SAMPLES = 12500  # Anzahl Samples pro Window (z.B. 1250 = 50s bei 25Hz, oder 5s bei 250Hz)
 STEP_SIZE_SAMPLES = 1250  # Schrittweite (z.B. 125 = 5s bei 25Hz, oder 0.5s bei 250Hz)
@@ -28,7 +28,7 @@ def clean_eye_df(df):
     Entfernt alle Zeilen, die keine echten Eyetracking-Daten enthalten.
     Löst das Problem, dass das Haupt-DataFrame NaN-Zeilen für andere Sensoren enthält.
     """
-    eye_cols = [c for c in df.columns if ("LEFT_" in c or "RIGHT_" in c)]
+    eye_cols = [c for c in df.columns if c.startswith("EYE_")]
     df_eye = df[eye_cols]
 
     # INF → NaN
@@ -48,14 +48,14 @@ def extract_gaze_signal(df):
     maskiert ungültige Samples und interpoliert Lücken.
     """
     # Gaze-Spalten
-    gx_L = df["LEFT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy()
-    gy_L = df["LEFT_GAZE_POINT_ON_DISPLAY_AREA_Y"].astype(float).copy()
-    gx_R = df["RIGHT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy()
-    gy_R = df["RIGHT_GAZE_POINT_ON_DISPLAY_AREA_Y"].astype(float).copy()
+    gx_L = df["EYE_LEFT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy()
+    gy_L = df["EYE_LEFT_GAZE_POINT_ON_DISPLAY_AREA_Y"].astype(float).copy()
+    gx_R = df["EYE_RIGHT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy()
+    gy_R = df["EYE_RIGHT_GAZE_POINT_ON_DISPLAY_AREA_Y"].astype(float).copy()
 
     # Validity-Spalten (1 = gültig)
-    val_L = (df["LEFT_GAZE_POINT_VALIDITY"] == 1)
-    val_R = (df["RIGHT_GAZE_POINT_VALIDITY"] == 1)
+    val_L = (df["EYE_LEFT_GAZE_POINT_VALIDITY"] == 1)
+    val_R = (df["EYE_RIGHT_GAZE_POINT_VALIDITY"] == 1)
 
     # Inf ersetzen mit NaN (kommt bei Tobii bei Blinks vor)
     gx_L.replace([np.inf, -np.inf], np.nan, inplace=True)
@@ -76,18 +76,24 @@ def extract_gaze_signal(df):
     # Interpolation (wichtig für PyGaze!)
     gx = pd.Series(gx).interpolate(limit=50, limit_direction="both").bfill().ffill()
     gy = pd.Series(gy).interpolate(limit=50, limit_direction="both").bfill().ffill()
+
+    xscaler = MinMaxScaler()
+    gxscale = xscaler.fit_transform(gx.values.reshape(-1, 1))
 
-    out = np.column_stack((gx, gy))
+    yscaler = MinMaxScaler()
+    gyscale = yscaler.fit_transform(gy.values.reshape(-1, 1))
+
+    out = np.column_stack((gxscale, gyscale))
     return out
 
 
 def extract_pupil(df):
     """Extrahiert Pupillengröße (beide Augen gemittelt)."""
-    pl = df["LEFT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan)
-    pr = df["RIGHT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan)
+    pl = df["EYE_LEFT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan)
+    pr = df["EYE_RIGHT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan)
 
-    vl = df.get("LEFT_PUPIL_VALIDITY")
-    vr = df.get("RIGHT_PUPIL_VALIDITY")
+    vl = df.get("EYE_LEFT_PUPIL_VALIDITY")
+    vr = df.get("EYE_RIGHT_PUPIL_VALIDITY")
 
     if vl is None or vr is None:
         validity = (~pl.isna() | ~pr.isna()).astype(int).to_numpy()