From 58faff8f68fccb3b4c3e790743e314e469d521ab Mon Sep 17 00:00:00 2001
From: Michael
Date: Wed, 12 Nov 2025 16:36:35 +0100
Subject: [PATCH] Change dataset creation for FACE_AU

---
 .../chunkwise_parquet_file_creation.py        | 91 +++++++++++++++++++
 dataset_creation/create_feature_table.py      |  6 +-
 2 files changed, 94 insertions(+), 3 deletions(-)
 create mode 100644 dataset_creation/chunkwise_parquet_file_creation.py

diff --git a/dataset_creation/chunkwise_parquet_file_creation.py b/dataset_creation/chunkwise_parquet_file_creation.py
new file mode 100644
index 0000000..667de93
--- /dev/null
+++ b/dataset_creation/chunkwise_parquet_file_creation.py
@@ -0,0 +1,91 @@
+import os
+import pandas as pd
+from pathlib import Path
+
+print(os.getcwd())
+num_files = 2  # number of files to process (min: 1, max: 30)
+
+print("connection established")
+
+data_dir = Path(r"C:\Users\x\repo\UXKI\Fahrsimulator_MSY2526_AI\newTmp")
+
+# Get all .h5 files and sort them
+matching_files = sorted(data_dir.glob("*.h5"))
+
+# Chunk size for reading (adjust based on your RAM - 100k rows is ~50-100MB depending on columns)
+CHUNK_SIZE = 100_000
+
+for i, file_path in enumerate(matching_files[:num_files]):
+    print(f"Subject {i} started")
+    print(f"{file_path} opened")
+
+    # Step 1: Get total number of rows and column names
+    with pd.HDFStore(file_path, mode="r") as store:
+        cols = store.select("SIGNALS", start=0, stop=1).columns
+        nrows = store.get_storer("SIGNALS").nrows
+    print(f"Total columns: {len(cols)}, Total rows: {nrows}")
+
+    # Step 2: Keep only columns that start with "FACE_AU"
+    face_au_cols = [c for c in cols if c.startswith("FACE_AU")]
+    print(f"FACE_AU columns found: {face_au_cols}")
+
+    if len(face_au_cols) == 0:
+        print(f"no FACE_AU signals in Subject {i}")
+        continue
+
+    # Columns to read
+    columns_to_read = ["STUDY", "LEVEL", "PHASE"] + face_au_cols
+
+    # Step 3: Process the file in chunks
+    chunks_to_save = []
+
+    for start_row in range(0, nrows, CHUNK_SIZE):
+        stop_row = min(start_row + CHUNK_SIZE, nrows)
+        print(f"Processing rows {start_row} to {stop_row} ({stop_row/nrows*100:.1f}%)")
+
+        # Read chunk
+        df_chunk = pd.read_hdf(
+            file_path,
+            key="SIGNALS",
+            columns=columns_to_read,
+            start=start_row,
+            stop=stop_row
+        )
+
+        # Add metadata columns
+        df_chunk["subjectID"] = i
+        df_chunk["rowID"] = range(start_row, stop_row)
+
+        # Clean data: drop rows with LEVEL == 0 and rows containing NaN
+        df_chunk = df_chunk[df_chunk["LEVEL"] != 0]
+        df_chunk = df_chunk.dropna()
+
+        # Only keep non-empty chunks
+        if len(df_chunk) > 0:
+            chunks_to_save.append(df_chunk)
+
+        # Free memory
+        del df_chunk
+
+    print("loading and cleaning done")
+
+    # Step 4: Combine all chunks and save
+    if chunks_to_save:
+        df_final = pd.concat(chunks_to_save, ignore_index=True)
+        print(f"Final dataframe shape: {df_final.shape}")
+
+        # Save to parquet
+        base_dir = Path(r"C:\new_AU_parquet_files")
+        os.makedirs(base_dir, exist_ok=True)
+
+        out_name = base_dir / f"cleaned_{i:04d}.parquet"
+        df_final.to_parquet(out_name, index=False)
+        print(f"Saved to {out_name}")
+
+        # Free memory
+        del df_final
+        del chunks_to_save
+    else:
+        print(f"No valid data found for Subject {i}")
+
+print("All files processed!")
\ No newline at end of file
diff --git a/dataset_creation/create_feature_table.py b/dataset_creation/create_feature_table.py
index 7fac133..86ee7b8 100644
--- a/dataset_creation/create_feature_table.py
+++ b/dataset_creation/create_feature_table.py
@@ -37,7 +37,7 @@ def process_parquet_files(input_dir, output_file, window_size=1250, step_size=12
         print(f"  Entries: {len(df)}")
 
         # Identify AU columns
-        au_columns = [col for col in df.columns if col.startswith('AU')]
+        au_columns = [col for col in df.columns if col.startswith('FACE_AU')]
         print(f"  AU columns: {len(au_columns)}")
 
         # Group by STUDY, LEVEL, PHASE (to avoid transitions)
@@ -94,8 +94,8 @@ def process_parquet_files(input_dir, output_file, window_size=1250, step_size=12
 # Example usage
 if __name__ == "__main__":
     # Adjust to your paths
-    input_directory = r"C:\Users\x\FAUbox\WS2526_Fahrsimulator_MSY (Celina Korzer)\ParquetFiles_AU"
-    output_file = r"C:\Users\x\FAUbox\WS2526_Fahrsimulator_MSY (Celina Korzer)\AU_dataset\xxoutput_windowed.parquet"
+    input_directory = r"C:\Users\x\FAUbox\WS2526_Fahrsimulator_MSY (Celina Korzer)\new_AU_parquet_files"
+    output_file = r"C:\Users\x\FAUbox\WS2526_Fahrsimulator_MSY (Celina Korzer)\new_AU_dataset\AU_dataset.parquet"
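
Note (not part of the commit): a minimal sanity-check sketch for the per-subject parquet files the new script writes. It assumes the hard-coded output directory and the cleaned_*.parquet naming from chunkwise_parquet_file_creation.py above; the file name verify_au_parquet.py is made up for illustration.

    # verify_au_parquet.py - spot-check the chunkwise output (sketch only)
    from pathlib import Path

    import pandas as pd

    base_dir = Path(r"C:\new_AU_parquet_files")  # output dir hard-coded in the new script

    for parquet_file in sorted(base_dir.glob("cleaned_*.parquet")):
        df = pd.read_parquet(parquet_file)
        au_cols = [c for c in df.columns if c.startswith("FACE_AU")]
        # Expect: at least one FACE_AU column, no NaNs, and no LEVEL == 0 rows
        assert au_cols, f"no FACE_AU columns in {parquet_file.name}"
        assert not df.isna().any().any(), f"NaNs in {parquet_file.name}"
        assert (df["LEVEL"] != 0).all(), f"LEVEL == 0 rows in {parquet_file.name}"
        print(f"{parquet_file.name}: {df.shape[0]} rows, {len(au_cols)} FACE_AU columns")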