import os
from pathlib import Path

import pandas as pd

data_dir = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/raw_data")

# Get all .h5 files and sort them
matching_files = sorted(data_dir.glob("*.h5"))

# Chunk size for reading (adjust based on your RAM; 100k rows is roughly 50-100 MB depending on the columns)
CHUNK_SIZE = 50_000

for i, file_path in enumerate(matching_files):
    print(f"Subject {i} started")
    print(f"{file_path} opened")

    # Step 1: Get total number of rows and column names
    with pd.HDFStore(file_path, mode="r") as store:
        cols = store.select("SIGNALS", start=0, stop=1).columns
        nrows = store.get_storer("SIGNALS").nrows
    print(f"Total columns: {len(cols)}, Total rows: {nrows}")

    # Step 2: Find columns starting with the FACE_AU and EYE_ prefixes
    face_au_cols = [c for c in cols if c.startswith("FACE_AU")]
    eye_cols = [c for c in cols if c.startswith("EYE_")]

    # Check that both prefixes matched at least one column
    if face_au_cols and eye_cols:
        print(f"FACE_AU columns found: {face_au_cols}")
        print(f"EYE_ columns found: {eye_cols}")
    else:
        missing = []
        if not face_au_cols:
            missing.append("FACE_AU")
        if not eye_cols:
            missing.append("EYE_")
        print(f"Missing columns for: {', '.join(missing)}")
        continue

    # Columns to read
    columns_to_read = ["STUDY", "LEVEL", "PHASE"] + eye_cols + face_au_cols

    # Step 3: Process file in chunks
    chunks_to_save = []
    for start_row in range(0, nrows, CHUNK_SIZE):
        stop_row = min(start_row + CHUNK_SIZE, nrows)
        print(f"Processing rows {start_row} to {stop_row} ({stop_row / nrows * 100:.1f}%)")

        # Read chunk
        df_chunk = pd.read_hdf(
            file_path,
            key="SIGNALS",
            columns=columns_to_read,
            start=start_row,
            stop=stop_row,
        )
        # print(f"[DEBUG] Before dropna: {df_chunk['EYE_LEFT_PUPIL_VALIDITY'].value_counts()}")

        # Add metadata columns
        df_chunk["subjectID"] = i
        df_chunk["rowID"] = range(start_row, stop_row)

        # Clean data: drop rows outside a level and rows with missing FACE_AU values
        df_chunk = df_chunk[df_chunk["LEVEL"] != 0]
        df_chunk = df_chunk.dropna(subset=face_au_cols)
        # print(f"[DEBUG] After dropna: {df_chunk['EYE_LEFT_PUPIL_VALIDITY'].value_counts()}")

        # Only keep non-empty chunks
        if len(df_chunk) > 0:
            chunks_to_save.append(df_chunk)

        # Free memory
        del df_chunk

    print("Loading and cleaning done")

    # Step 4: Combine all chunks and save
    if chunks_to_save:
        df_final = pd.concat(chunks_to_save, ignore_index=True)
        print(f"Final dataframe shape: {df_final.shape}")

        # Save to parquet
        base_dir = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/both_mod_parquet_files")
        os.makedirs(base_dir, exist_ok=True)

        out_name = base_dir / f"both_mod_{i:04d}.parquet"
        df_final.to_parquet(out_name, index=False)
        print(f"Saved to {out_name}")

        # Free memory
        del df_final
        del chunks_to_save
    else:
        print(f"No valid data found for Subject {i}")

print("All files processed!")
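
# Optional sanity check (a minimal sketch, not part of the pipeline above): re-open one of
# the written parquet files and confirm the selected columns plus the added subjectID/rowID
# metadata are present. The file name below is a hypothetical example; uncomment and adjust
# it to a file that was actually written.
# check_path = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/both_mod_parquet_files/both_mod_0000.parquet")
# df_check = pd.read_parquet(check_path)
# print(df_check.shape)
# print(df_check[["subjectID", "rowID", "LEVEL"]].head())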