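# Read per-subject .h5 files, keep the EYE_* and FACE_AU* signal columns,
# drop LEVEL == 0 rows and rows with missing FACE_AU values, and write
# one Parquet file per subject.
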
import os
from pathlib import Path

import pandas as pd

data_dir = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/raw_data")

# Get all .h5 files and sort them
matching_files = sorted(data_dir.glob("*.h5"))

# Chunk size for reading (adjust based on your RAM; 100k rows is ~50-100 MB depending on columns)
CHUNK_SIZE = 50_000

for i, file_path in enumerate(matching_files):
    print(f"Subject {i} started")
    print(f"Opened {file_path}")

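    # Note: reading selected columns from an HDF5 key (see Step 3) assumes that
    # "SIGNALS" was written in table format (format="table"); fixed-format stores
    # cannot be read with a column selection.
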
    # Step 1: Get total number of rows and column names
    with pd.HDFStore(file_path, mode="r") as store:
        cols = store.select("SIGNALS", start=0, stop=1).columns
        nrows = store.get_storer("SIGNALS").nrows
    print(f"Total columns: {len(cols)}, Total rows: {nrows}")

    # Step 2: Find the columns starting with the "FACE_AU" and "EYE_" prefixes
    face_au_cols = [c for c in cols if c.startswith("FACE_AU")]
    eye_cols = [c for c in cols if c.startswith("EYE_")]

    # Check that both have at least one column
    if face_au_cols and eye_cols:
        print(f"FACE_AU columns found: {face_au_cols}")
        print(f"EYE_ columns found: {eye_cols}")
    else:
        missing = []
        if not face_au_cols:
            missing.append("FACE_AU")
        if not eye_cols:
            missing.append("EYE_")
        print(f"Missing columns for: {', '.join(missing)}")
        continue

    # Columns to read
    columns_to_read = ["STUDY", "LEVEL", "PHASE"] + eye_cols + face_au_cols

    # Step 3: Process file in chunks
    chunks_to_save = []

    for start_row in range(0, nrows, CHUNK_SIZE):
        stop_row = min(start_row + CHUNK_SIZE, nrows)
        print(f"Processing rows {start_row} to {stop_row} ({stop_row / nrows * 100:.1f}%)")

        # Read chunk
        df_chunk = pd.read_hdf(
            file_path,
            key="SIGNALS",
            columns=columns_to_read,
            start=start_row,
            stop=stop_row,
        )
# print(f"[DEBUG] Vor Dropna: {df_chunk["EYE_LEFT_PUPIL_VALIDITY"].value_counts()}")
|
|
# Add metadata columns
|
|
df_chunk["subjectID"] = i
|
|
df_chunk["rowID"] = range(start_row, stop_row)
|
|
|
|
# Clean data
|
|
df_chunk = df_chunk[df_chunk["LEVEL"] != 0]
|
|
df_chunk = df_chunk.dropna(subset=face_au_cols)
|
|
|
|
# print(f"[DEBUG] Nach Dropna: {df_chunk["EYE_LEFT_PUPIL_VALIDITY"].value_counts()}")
|
|
# Only keep non-empty chunks
|
|
if len(df_chunk) > 0:
|
|
chunks_to_save.append(df_chunk)
|
|
|
|
# Free memory
|
|
del df_chunk
|
|
|
|
print("load and cleaning done")
|
|
|
|
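    # Note: DataFrame.to_parquet (used in Step 4) requires a Parquet engine
    # such as pyarrow or fastparquet to be installed in the environment.
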
    # Step 4: Combine all chunks and save
    if chunks_to_save:
        df_final = pd.concat(chunks_to_save, ignore_index=True)
        print(f"Final dataframe shape: {df_final.shape}")

        # Save to parquet
        base_dir = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/both_mod_parquet_files")
        os.makedirs(base_dir, exist_ok=True)

        out_name = base_dir / f"both_mod_{i:04d}.parquet"
        df_final.to_parquet(out_name, index=False)
        print(f"Saved to {out_name}")

        # Free memory
        del df_final
        del chunks_to_save
    else:
        print(f"No valid data found for Subject {i}")

print("All files processed!")