From 58faff8f68fccb3b4c3e790743e314e469d521ab Mon Sep 17 00:00:00 2001
From: Michael
Date: Wed, 12 Nov 2025 16:36:35 +0100
Subject: [PATCH] Change dataset creation for FACE_AU

---
 .../chunkwise_parquet_file_creation.py        | 91 +++++++++++++++++++
 dataset_creation/create_feature_table.py      |  6 +-
 2 files changed, 94 insertions(+), 3 deletions(-)
 create mode 100644 dataset_creation/chunkwise_parquet_file_creation.py

diff --git a/dataset_creation/chunkwise_parquet_file_creation.py b/dataset_creation/chunkwise_parquet_file_creation.py
new file mode 100644
index 0000000..667de93
--- /dev/null
+++ b/dataset_creation/chunkwise_parquet_file_creation.py
@@ -0,0 +1,91 @@
+import os
+import pandas as pd
+from pathlib import Path
+
+print(os.getcwd())
+num_files = 2  # number of files to process (min: 1, max: 30)
+
+print("connection established")
+
+data_dir = Path(r"C:\Users\x\repo\UXKI\Fahrsimulator_MSY2526_AI\newTmp")
+
+# Get all .h5 files and sort them
+matching_files = sorted(data_dir.glob("*.h5"))
+
+# Chunk size for reading (adjust based on your RAM - 100k rows is ~50-100MB depending on columns)
+CHUNK_SIZE = 100_000
+
+for i, file_path in enumerate(matching_files[:num_files]):
+    print(f"Subject {i} started")
+    print(f"{file_path} opened")
+
+    # Step 1: Get total number of rows and column names
+    with pd.HDFStore(file_path, mode="r") as store:
+        cols = store.select("SIGNALS", start=0, stop=1).columns
+        nrows = store.get_storer("SIGNALS").nrows
+    print(f"Total columns: {len(cols)}, Total rows: {nrows}")
+
+    # Step 2: Keep only columns that start with "FACE_AU"
+    face_au_cols = [c for c in cols if c.startswith("FACE_AU")]
+    print(f"FACE_AU columns found: {face_au_cols}")
+
+    if len(face_au_cols) == 0:
+        print(f"no FACE_AU signals in Subject {i}")
+        continue
+
+    # Columns to read
+    columns_to_read = ["STUDY", "LEVEL", "PHASE"] + face_au_cols
+
+    # Step 3: Process the file in chunks
+    chunks_to_save = []
+
+    for start_row in range(0, nrows, CHUNK_SIZE):
+        stop_row = min(start_row + CHUNK_SIZE, nrows)
+        print(f"Processing rows {start_row} to {stop_row} ({stop_row/nrows*100:.1f}%)")
+
+        # Read chunk
+        df_chunk = pd.read_hdf(
+            file_path,
+            key="SIGNALS",
+            columns=columns_to_read,
+            start=start_row,
+            stop=stop_row
+        )
+
+        # Add metadata columns
+        df_chunk["subjectID"] = i
+        df_chunk["rowID"] = range(start_row, stop_row)
+
+        # Clean data: drop rows with LEVEL == 0 and rows containing NaN
+        df_chunk = df_chunk[df_chunk["LEVEL"] != 0]
+        df_chunk = df_chunk.dropna()
+
+        # Only keep non-empty chunks
+        if len(df_chunk) > 0:
+            chunks_to_save.append(df_chunk)
+
+        # Free memory
+        del df_chunk
+
+    print("loading and cleaning done")
+
+    # Step 4: Combine all chunks and save
+    if chunks_to_save:
+        df_final = pd.concat(chunks_to_save, ignore_index=True)
+        print(f"Final dataframe shape: {df_final.shape}")
+
+        # Save to parquet
+        base_dir = Path(r"C:\new_AU_parquet_files")
+        os.makedirs(base_dir, exist_ok=True)
+
+        out_name = base_dir / f"cleaned_{i:04d}.parquet"
+        df_final.to_parquet(out_name, index=False)
+        print(f"Saved to {out_name}")
+
+        # Free memory
+        del df_final
+        del chunks_to_save
+    else:
+        print(f"No valid data found for Subject {i}")
+
+print("All files processed!")
\ No newline at end of file
diff --git a/dataset_creation/create_feature_table.py b/dataset_creation/create_feature_table.py
index 7fac133..86ee7b8 100644
--- a/dataset_creation/create_feature_table.py
+++ b/dataset_creation/create_feature_table.py
@@ -37,7 +37,7 @@ def process_parquet_files(input_dir, output_file, window_size=1250, step_size=12
         print(f"  Entries: {len(df)}")
 
         # Identify AU columns
-        au_columns = [col for col in df.columns if col.startswith('AU')]
+        au_columns = [col for col in df.columns if col.startswith('FACE_AU')]
         print(f"  AU columns: {len(au_columns)}")
 
         # Group by STUDY, LEVEL, PHASE (to avoid transitions)
@@ -94,8 +94,8 @@ def process_parquet_files(input_dir, output_file, window_size=1250, step_size=12
 # Example usage
 if __name__ == "__main__":
     # Adjust to your paths
-    input_directory = r"C:\Users\x\FAUbox\WS2526_Fahrsimulator_MSY (Celina Korzer)\ParquetFiles_AU"
-    output_file = r"C:\Users\x\FAUbox\WS2526_Fahrsimulator_MSY (Celina Korzer)\AU_dataset\xxoutput_windowed.parquet"
+    input_directory = r"C:\Users\x\FAUbox\WS2526_Fahrsimulator_MSY (Celina Korzer)\new_AU_parquet_files"
+    output_file = r"C:\Users\x\FAUbox\WS2526_Fahrsimulator_MSY (Celina Korzer)\new_AU_dataset\AU_dataset.parquet"
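
Note (not part of the commit): a minimal sanity-check sketch for the per-subject parquet files the new script writes. It assumes the hard-coded output directory and the cleaned_*.parquet naming from chunkwise_parquet_file_creation.py above; the file name verify_au_parquet.py is made up for illustration.

    # verify_au_parquet.py - spot-check the chunkwise output (sketch only)
    from pathlib import Path

    import pandas as pd

    base_dir = Path(r"C:\new_AU_parquet_files")  # output dir hard-coded in the new script

    for parquet_file in sorted(base_dir.glob("cleaned_*.parquet")):
        df = pd.read_parquet(parquet_file)
        au_cols = [c for c in df.columns if c.startswith("FACE_AU")]
        # Expect: at least one FACE_AU column, no NaNs, and no LEVEL == 0 rows
        assert au_cols, f"no FACE_AU columns in {parquet_file.name}"
        assert not df.isna().any().any(), f"NaNs in {parquet_file.name}"
        assert (df["LEVEL"] != 0).all(), f"LEVEL == 0 rows in {parquet_file.name}"
        print(f"{parquet_file.name}: {df.shape[0]} rows, {len(au_cols)} FACE_AU columns")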