changed dataset creation for face au

Michael Weig 2025-11-12 16:36:35 +01:00
parent 2eddb73644
commit 58faff8f68
2 changed files with 94 additions and 3 deletions

View File

@@ -0,0 +1,91 @@
import os
import pandas as pd
from pathlib import Path

print(os.getcwd())

num_files = 2  # number of files to process (min: 1, max: 30)
print("connection established")

data_dir = Path(r"C:\Users\x\repo\UXKI\Fahrsimulator_MSY2526_AI\newTmp")

# Get all .h5 files and sort them
matching_files = sorted(data_dir.glob("*.h5"))

# Chunk size for reading (adjust based on your RAM - 100k rows is ~50-100MB depending on columns)
CHUNK_SIZE = 100_000

for i, file_path in enumerate(matching_files[:num_files]):
    print(f"Subject {i} started")
    print(f"{file_path} opened")

    # Step 1: Get total number of rows and column names
    with pd.HDFStore(file_path, mode="r") as store:
        cols = store.select("SIGNALS", start=0, stop=1).columns
        nrows = store.get_storer("SIGNALS").nrows
    print(f"Total columns: {len(cols)}, Total rows: {nrows}")

    # Step 2: Filter columns that start with "FACE_AU"
    au_cols = [c for c in cols if c.startswith("FACE_AU")]
    print(f"FACE_AU columns found: {au_cols}")
    if len(au_cols) == 0:
        print(f"No FACE_AU signals in Subject {i}")
        continue

    # Columns to read
    columns_to_read = ["STUDY", "LEVEL", "PHASE"] + au_cols

    # Step 3: Process file in chunks
    chunks_to_save = []
    for start_row in range(0, nrows, CHUNK_SIZE):
        stop_row = min(start_row + CHUNK_SIZE, nrows)
        print(f"Processing rows {start_row} to {stop_row} ({stop_row/nrows*100:.1f}%)")

        # Read chunk
        df_chunk = pd.read_hdf(
            file_path,
            key="SIGNALS",
            columns=columns_to_read,
            start=start_row,
            stop=stop_row
        )

        # Add metadata columns
        df_chunk["subjectID"] = i
        df_chunk["rowID"] = range(start_row, stop_row)

        # Clean data
        df_chunk = df_chunk[df_chunk["LEVEL"] != 0]
        df_chunk = df_chunk.dropna()

        # Only keep non-empty chunks
        if len(df_chunk) > 0:
            chunks_to_save.append(df_chunk)

        # Free memory
        del df_chunk

    print("load and cleaning done")

    # Step 4: Combine all chunks and save
    if chunks_to_save:
        df_final = pd.concat(chunks_to_save, ignore_index=True)
        print(f"Final dataframe shape: {df_final.shape}")

        # Save to parquet
        base_dir = Path(r"C:\new_AU_parquet_files")
        os.makedirs(base_dir, exist_ok=True)
        out_name = base_dir / f"cleaned_{i:04d}.parquet"
        df_final.to_parquet(out_name, index=False)
        print(f"Saved to {out_name}")

        # Free memory
        del df_final
        del chunks_to_save
    else:
        print(f"No valid data found for Subject {i}")

print("All files processed!")

View File

@@ -37,7 +37,7 @@ def process_parquet_files(input_dir, output_file, window_size=1250, step_size=12
     print(f"  Entries: {len(df)}")
     # Identify AU columns
-    au_columns = [col for col in df.columns if col.startswith('AU')]
+    au_columns = [col for col in df.columns if col.startswith('FACE_AU')]
     print(f"  AU columns: {len(au_columns)}")
     # Group by STUDY, LEVEL, PHASE (to avoid transitions)
@@ -94,8 +94,8 @@ def process_parquet_files(input_dir, output_file, window_size=1250, step_size=12
 # Example usage
 if __name__ == "__main__":
     # Adjust to your paths
-    input_directory = r"C:\Users\x\FAUbox\WS2526_Fahrsimulator_MSY (Celina Korzer)\ParquetFiles_AU"
-    output_file = r"C:\Users\x\FAUbox\WS2526_Fahrsimulator_MSY (Celina Korzer)\AU_dataset\xxoutput_windowed.parquet"
+    input_directory = r"C:\Users\x\FAUbox\WS2526_Fahrsimulator_MSY (Celina Korzer)\new_AU_parquet_files"
+    output_file = r"C:\Users\x\FAUbox\WS2526_Fahrsimulator_MSY (Celina Korzer)\new_AU_dataset\AU_dataset.parquet"
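
The body of process_parquet_files is not shown in this diff; for orientation only, the grouped sliding-window step that the changed column filter feeds into could look roughly like the sketch below. The function name window_slices and everything beyond window_size=1250 are assumptions (step_size is truncated in the hunk header above, so the value 1250 used here is just an example of non-overlapping windows).

import pandas as pd

def window_slices(df, au_columns, window_size, step_size):
    # Assumed sketch: cut fixed-length AU windows, restarting at every
    # (STUDY, LEVEL, PHASE) boundary so no window spans a transition.
    for keys, group in df.groupby(["STUDY", "LEVEL", "PHASE"], sort=False):
        values = group[au_columns].to_numpy()
        for start in range(0, len(values) - window_size + 1, step_size):
            yield keys, values[start:start + window_size]

df = pd.read_parquet(r"C:\new_AU_parquet_files\cleaned_0000.parquet")
au_columns = [c for c in df.columns if c.startswith("FACE_AU")]
windows = list(window_slices(df, au_columns, window_size=1250, step_size=1250))
print(len(windows))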