Changed paths to the paulusja ... directory; changed feature extraction for AUs to use the mean instead of the sum; added v1 of the dataset merge script (needs to be adjusted).
57 lines
2.0 KiB
Python
57 lines
2.0 KiB
Python
from pathlib import Path
|
|
import pandas as pd
|
|
|
|
|
|
def _require_columns(df, columns, label):
    """Raise ``KeyError`` listing every name in *columns* that *df* lacks.

    *label* identifies the dataset in the error message so the user can tell
    which input file has the unexpected schema.
    """
    missing = [col for col in columns if col not in df.columns]
    if missing:
        raise KeyError(f"{label} is missing required column(s): {missing}")


def _merge_modalities(df1, df2, subject_col, time_col):
    """Inner-join two single-modality frames on subject ID and start time.

    Returns a new DataFrame; neither input is modified.
    """
    keys = [subject_col, time_col]
    _require_columns(df1, keys, "Modality 1 dataset")
    _require_columns(df2, keys, "Modality 2 dataset")

    # Keep only subjects that appear in BOTH datasets.
    common_subjects = set(df1[subject_col]).intersection(df2[subject_col])
    df1 = df1[df1[subject_col].isin(common_subjects)]
    df2 = df2[df2[subject_col].isin(common_subjects)]

    # Inner join on subject ID AND start time. Non-key columns that exist in
    # both frames receive pandas' default "_x"/"_y" suffixes.
    return pd.merge(df1, df2, on=keys, how="inner")


def main(
    *,
    file_modality_1: Path = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/AU_dataset_mean.parquet"),
    file_modality_2: Path = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/new_eye_dataset.parquet"),
    output_dir: Path = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/"),
    output_name: str = "merged_dataset.parquet",
    subject_col: str = "subjectID",
    time_col: str = "start_time",
) -> None:
    """Merge two single-modality parquet datasets into one parquet file.

    USER CONFIGURATION: the keyword defaults below are the values used when
    the script is run without arguments. Edit them here, or pass overrides
    when calling ``main`` from other code.

    Args:
        file_modality_1: Parquet file of the first single-modality dataset.
        file_modality_2: Parquet file of the second single-modality dataset.
        output_dir: Directory for the merged file; created if it is missing.
        output_name: File name of the merged dataset inside ``output_dir``.
        subject_col: Name of the subject ID column (adjust only if your
            schema differs).
        time_col: Name of the start-time column (adjust only if your schema
            differs).

    Raises:
        FileNotFoundError: If either input path does not exist.
        KeyError: If a dataset lacks ``subject_col`` or ``time_col``.
    """
    file_modality_1 = Path(file_modality_1)
    file_modality_2 = Path(file_modality_2)
    output_file = Path(output_dir) / output_name

    # Fail fast, before any data is loaded, and report every missing input
    # at once instead of surfacing an engine-specific error for the first.
    missing_inputs = [
        str(path)
        for path in (file_modality_1, file_modality_2)
        if not path.exists()
    ]
    if missing_inputs:
        raise FileNotFoundError(
            f"Input parquet file(s) not found: {missing_inputs}"
        )

    # ------------------------------------------------------------------
    # Load datasets
    # ------------------------------------------------------------------
    df1 = pd.read_parquet(file_modality_1)
    df2 = pd.read_parquet(file_modality_2)

    # ------------------------------------------------------------------
    # Merge on subject ID AND start time (subjects present in both only)
    # ------------------------------------------------------------------
    merged_df = _merge_modalities(df1, df2, subject_col, time_col)

    # ------------------------------------------------------------------
    # Save merged dataset
    # ------------------------------------------------------------------
    output_file.parent.mkdir(parents=True, exist_ok=True)
    merged_df.to_parquet(output_file, index=False)
|
|
|
|
|
|
# Run the merge only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|