import os
import pandas as pd
from pathlib import Path

data_dir = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/raw_data")

# Get all .h5 files and sort them
matching_files = sorted(data_dir.glob("*.h5"))

# Chunk size for reading (adjust based on your RAM; 100k rows is roughly 50-100 MB, depending on columns)
CHUNK_SIZE = 50_000

for i, file_path in enumerate(matching_files):
    print(f"Subject {i} started")
    print(f"{file_path} opened")

    # Step 1: Get total number of rows and column names
    with pd.HDFStore(file_path, mode="r") as store:
        cols = store.select("SIGNALS", start=0, stop=1).columns
        nrows = store.get_storer("SIGNALS").nrows

    print(f"Total columns: {len(cols)}, Total rows: {nrows}")
    # Step 2: Find columns that start with "FACE_AU" or "EYE_"
    face_au_cols = [c for c in cols if c.startswith("FACE_AU")]
    eye_cols = [c for c in cols if c.startswith("EYE_")]

    # Check that both prefixes matched at least one column
    if face_au_cols and eye_cols:
        print(f"FACE_AU columns found: {face_au_cols}")
        print(f"EYE_ columns found: {eye_cols}")
    else:
        missing = []
        if not face_au_cols:
            missing.append("FACE_AU")
        if not eye_cols:
            missing.append("EYE_")
        print(f"Missing columns for: {', '.join(missing)}")
        continue

    # Columns to read
    columns_to_read = ["STUDY", "LEVEL", "PHASE"] + eye_cols + face_au_cols
    # Step 3: Process file in chunks
    chunks_to_save = []
    for start_row in range(0, nrows, CHUNK_SIZE):
        stop_row = min(start_row + CHUNK_SIZE, nrows)
        print(f"Processing rows {start_row} to {stop_row} ({stop_row / nrows * 100:.1f}%)")

        # Read chunk
        df_chunk = pd.read_hdf(
            file_path,
            key="SIGNALS",
            columns=columns_to_read,
            start=start_row,
            stop=stop_row,
        )
        # print(f"[DEBUG] Before dropna: {df_chunk['EYE_LEFT_PUPIL_VALIDITY'].value_counts()}")

        # Add metadata columns
        df_chunk["subjectID"] = i
        df_chunk["rowID"] = range(start_row, stop_row)

        # Clean data: drop rows where LEVEL is 0 and rows missing any FACE_AU value
        df_chunk = df_chunk[df_chunk["LEVEL"] != 0]
        df_chunk = df_chunk.dropna(subset=face_au_cols)
        # print(f"[DEBUG] After dropna: {df_chunk['EYE_LEFT_PUPIL_VALIDITY'].value_counts()}")

        # Only keep non-empty chunks
        if len(df_chunk) > 0:
            chunks_to_save.append(df_chunk)

        # Free memory
        del df_chunk

    print("Loading and cleaning done")
    # Step 4: Combine all chunks and save
    if chunks_to_save:
        df_final = pd.concat(chunks_to_save, ignore_index=True)
        print(f"Final dataframe shape: {df_final.shape}")

        # Save to parquet
        base_dir = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/both_mod_parquet_files")
        os.makedirs(base_dir, exist_ok=True)
        out_name = base_dir / f"both_mod_{i:04d}.parquet"
        df_final.to_parquet(out_name, index=False)
        print(f"Saved to {out_name}")

        # Free memory
        del df_final
        del chunks_to_save
    else:
        print(f"No valid data found for Subject {i}")

print("All files processed!")