import os
import pandas as pd
from pathlib import Path

data_dir = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/raw_data")

# Get all .h5 files and sort them
matching_files = sorted(data_dir.glob("*.h5"))

# Chunk size for reading (adjust based on your RAM; 100k rows is roughly 50-100 MB, depending on columns)
CHUNK_SIZE = 50_000

for i, file_path in enumerate(matching_files):
    print(f"Subject {i} started")
    print(f"{file_path} opened")

    # Step 1: Get total number of rows and column names
    with pd.HDFStore(file_path, mode="r") as store:
        cols = store.select("SIGNALS", start=0, stop=1).columns
        nrows = store.get_storer("SIGNALS").nrows

    print(f"Total columns: {len(cols)}, Total rows: {nrows}")
    # Step 2: Find columns that start with "FACE_AU" or "EYE_"
    face_au_cols = [c for c in cols if c.startswith("FACE_AU")]
    eye_cols = [c for c in cols if c.startswith("EYE_")]

    # Check that both prefixes matched at least one column
    if face_au_cols and eye_cols:
        print(f"FACE_AU columns found: {face_au_cols}")
        print(f"EYE_ columns found: {eye_cols}")
    else:
        missing = []
        if not face_au_cols:
            missing.append("FACE_AU")
        if not eye_cols:
            missing.append("EYE_")
        print(f"Missing columns for: {', '.join(missing)}")
        continue

    # Columns to read
    columns_to_read = ["STUDY", "LEVEL", "PHASE"] + eye_cols + face_au_cols
    # Step 3: Process file in chunks
    chunks_to_save = []
    for start_row in range(0, nrows, CHUNK_SIZE):
        stop_row = min(start_row + CHUNK_SIZE, nrows)
        print(f"Processing rows {start_row} to {stop_row} ({stop_row / nrows * 100:.1f}%)")

        # Read chunk
        df_chunk = pd.read_hdf(
            file_path,
            key="SIGNALS",
            columns=columns_to_read,
            start=start_row,
            stop=stop_row,
        )
        # print(f"[DEBUG] Before dropna: {df_chunk['EYE_LEFT_PUPIL_VALIDITY'].value_counts()}")

        # Add metadata columns
        df_chunk["subjectID"] = i
        df_chunk["rowID"] = range(start_row, stop_row)

        # Clean data: drop rows where LEVEL is 0 and rows missing any FACE_AU value
        df_chunk = df_chunk[df_chunk["LEVEL"] != 0]
        df_chunk = df_chunk.dropna(subset=face_au_cols)
        # print(f"[DEBUG] After dropna: {df_chunk['EYE_LEFT_PUPIL_VALIDITY'].value_counts()}")

        # Only keep non-empty chunks
        if len(df_chunk) > 0:
            chunks_to_save.append(df_chunk)

        # Free memory
        del df_chunk

    print("Loading and cleaning done")
    # Step 4: Combine all chunks and save
    if chunks_to_save:
        df_final = pd.concat(chunks_to_save, ignore_index=True)
        print(f"Final dataframe shape: {df_final.shape}")

        # Save to parquet
        base_dir = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/both_mod_parquet_files")
        os.makedirs(base_dir, exist_ok=True)
        out_name = base_dir / f"both_mod_{i:04d}.parquet"
        df_final.to_parquet(out_name, index=False)
        print(f"Saved to {out_name}")

        # Free memory
        del df_final
        del chunks_to_save
    else:
        print(f"No valid data found for Subject {i}")

print("All files processed!")