Fahrsimulator_MSY2526_AI/dataset_creation/create_multimodal_dataset.py
Michael b8bebc0944 minor fixes in dataset creation
changed paths to paulusja ... directory
changed feature extraction for AUs to mean instead of sum
added v1 of merge script of datasets (needs to be adjusted)
2025-12-18 13:04:11 +01:00


from pathlib import Path

import pandas as pd


def main():
    """
    Merge two single-modality datasets into one multimodal dataset.

    USER CONFIGURATION
    ------------------
    Specify input files and output directory here.
    """
    # Input parquet files (single-modality datasets)
    file_modality_1 = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/AU_dataset_mean.parquet")
    file_modality_2 = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/new_eye_dataset.parquet")

    # Output directory and file name
    output_dir = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/")
    output_file = output_dir / "merged_dataset.parquet"

    # Column names (adjust only if your schema differs)
    subject_col = "subjectID"
    time_col = "start_time"
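
    # Assumption (not stated in the original script): both parquet files
    # already contain the key columns "subjectID" and "start_time" with
    # compatible dtypes; if a key column is missing or typed differently in
    # one file, the merge below will fail or match nothing.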

    # ------------------------------------------------------------------
    # Load datasets
    # ------------------------------------------------------------------
    df1 = pd.read_parquet(file_modality_1)
    df2 = pd.read_parquet(file_modality_2)

    # ------------------------------------------------------------------
    # Keep only subjects that appear in BOTH datasets
    # ------------------------------------------------------------------
    common_subjects = set(df1[subject_col]).intersection(df2[subject_col])
    df1 = df1[df1[subject_col].isin(common_subjects)]
    df2 = df2[df2[subject_col].isin(common_subjects)]
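
    # Note: the inner join below already restricts the result to subjects
    # present in both datasets, so this pre-filtering is not strictly
    # required; it mainly keeps the frames passed to pd.merge smaller.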

    # ------------------------------------------------------------------
    # Inner join on subject ID AND start_time
    # ------------------------------------------------------------------
    merged_df = pd.merge(
        df1,
        df2,
        on=[subject_col, time_col],
        how="inner",
    )
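
    # Note: with pandas' default merge behavior, non-key columns that share a
    # name in both inputs get "_x" / "_y" suffixes in the result. Passing
    # suffixes=("_au", "_eye") to pd.merge (illustrative names, not part of
    # the original script) would make the source modality of each feature
    # explicit.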

    # ------------------------------------------------------------------
    # Save merged dataset
    # ------------------------------------------------------------------
    output_dir.mkdir(parents=True, exist_ok=True)
    merged_df.to_parquet(output_file, index=False)


if __name__ == "__main__":
    main()
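
# Usage note: running this file directly (python create_multimodal_dataset.py)
# executes main(). Reading and writing parquet with pandas requires a parquet
# engine such as pyarrow or fastparquet to be available in the environment.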