"""Merge two single-modality parquet datasets on subject ID and start time."""

from pathlib import Path

import pandas as pd

# ----------------------------------------------------------------------
# USER CONFIGURATION
# ----------------------------------------------------------------------
# Defaults used when ``main()`` is called without arguments. Adjust them
# here, or pass different values to ``main()`` directly.
DATA_DIR = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/")

# Input parquet files (single-modality datasets)
DEFAULT_FILE_MODALITY_1 = DATA_DIR / "AU_dataset_mean.parquet"
DEFAULT_FILE_MODALITY_2 = DATA_DIR / "new_eye_dataset.parquet"

# Output file
DEFAULT_OUTPUT_FILE = DATA_DIR / "merged_dataset.parquet"

# Column names (adjust only if your schema differs)
DEFAULT_SUBJECT_COL = "subjectID"
DEFAULT_TIME_COL = "start_time"


def merge_modalities(
    df1: pd.DataFrame,
    df2: pd.DataFrame,
    subject_col: str = DEFAULT_SUBJECT_COL,
    time_col: str = DEFAULT_TIME_COL,
) -> pd.DataFrame:
    """Inner-join two modality frames on subject ID and start time.

    Rows are first restricted to subjects present in both frames, then the
    frames are inner-joined on ``[subject_col, time_col]``. Row order follows
    the left frame (``df1``), as is standard for a pandas inner merge.

    Args:
        df1: First (left) modality dataset.
        df2: Second (right) modality dataset.
        subject_col: Name of the subject identifier column in both frames.
        time_col: Name of the window start-time column in both frames.

    Returns:
        A new DataFrame holding the key columns plus the remaining columns of
        both inputs. Non-key columns that exist in both frames receive
        pandas' default ``_x`` / ``_y`` suffixes.

    Raises:
        KeyError: If either frame lacks ``subject_col`` or ``time_col``.
    """
    keys = [subject_col, time_col]

    # Fail early with a message that says which dataset is missing which
    # column, instead of an opaque KeyError from deep inside pandas.
    for label, frame in (("first", df1), ("second", df2)):
        missing = [col for col in keys if col not in frame.columns]
        if missing:
            raise KeyError(
                f"{label} dataset is missing required key column(s): {missing}"
            )

    # Keep only subjects that appear in BOTH datasets. The inner join below
    # would drop the others anyway; filtering first keeps the merge inputs
    # small.
    common_subjects = set(df1[subject_col]).intersection(df2[subject_col])
    left = df1[df1[subject_col].isin(common_subjects)]
    right = df2[df2[subject_col].isin(common_subjects)]

    return pd.merge(left, right, on=keys, how="inner")


def main(
    file_modality_1: Path = DEFAULT_FILE_MODALITY_1,
    file_modality_2: Path = DEFAULT_FILE_MODALITY_2,
    output_file: Path = DEFAULT_OUTPUT_FILE,
    subject_col: str = DEFAULT_SUBJECT_COL,
    time_col: str = DEFAULT_TIME_COL,
) -> None:
    """Load two modality parquet files, merge them, and write the result.

    Called without arguments, this uses the module-level defaults from the
    USER CONFIGURATION section.

    Args:
        file_modality_1: Path to the first modality parquet file.
        file_modality_2: Path to the second modality parquet file.
        output_file: Destination parquet file; its parent directory is
            created if it does not exist.
        subject_col: Name of the subject identifier column.
        time_col: Name of the window start-time column.

    Raises:
        KeyError: If either dataset lacks one of the key columns.
    """
    # Load datasets
    df1 = pd.read_parquet(file_modality_1)
    df2 = pd.read_parquet(file_modality_2)

    # Inner join on subject ID AND start_time
    merged_df = merge_modalities(
        df1, df2, subject_col=subject_col, time_col=time_col
    )

    # Save merged dataset
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    merged_df.to_parquet(output_path, index=False)


if __name__ == "__main__":
    main()