changed dataset creation for face au
parent 2eddb73644
commit 58faff8f68
91 dataset_creation/chunkwise_parquet_file_creation.py Normal file
@@ -0,0 +1,91 @@
import os
import pandas as pd
from pathlib import Path

print(os.getcwd())

num_files = 2  # number of files to process (min: 1, max: 30)

print("connection established")

data_dir = Path(r"C:\Users\x\repo\UXKI\Fahrsimulator_MSY2526_AI\newTmp")

# Get all .h5 files and sort them
matching_files = sorted(data_dir.glob("*.h5"))

# Chunk size for reading (adjust based on your RAM - 100k rows is ~50-100MB depending on columns)
CHUNK_SIZE = 100_000
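# Rough sizing example (hypothetical column count): 100_000 rows of ~100
# float64 signals come to about 100_000 * 100 * 8 bytes ≈ 80 MB, i.e. within
# the 50-100 MB estimate above.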

for i, file_path in enumerate(matching_files):
    print(f"Subject {i} started")
    print(f"{file_path} opened")

    # Step 1: Get total number of rows and column names
    with pd.HDFStore(file_path, mode="r") as store:
        cols = store.select("SIGNALS", start=0, stop=1).columns
        nrows = store.get_storer("SIGNALS").nrows
    print(f"Total columns: {len(cols)}, Total rows: {nrows}")
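    # Note: selecting specific columns (columns=) in the chunked pd.read_hdf
    # calls below only works if "SIGNALS" was written in table format
    # (format="table"); with a fixed-format store pandas raises a TypeError.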

    # Step 2: Filter columns that start with "FACE_AU"
    eye_cols = [c for c in cols if c.startswith("FACE_AU")]
    print(f"FACE_AU columns found: {eye_cols}")

    if len(eye_cols) == 0:
        print(f"No FACE_AU signals in Subject {i}")
        continue

    # Columns to read
    columns_to_read = ["STUDY", "LEVEL", "PHASE"] + eye_cols

    # Step 3: Process file in chunks
    chunks_to_save = []

    for start_row in range(0, nrows, CHUNK_SIZE):
        stop_row = min(start_row + CHUNK_SIZE, nrows)
        print(f"Processing rows {start_row} to {stop_row} ({stop_row/nrows*100:.1f}%)")

        # Read chunk
        df_chunk = pd.read_hdf(
            file_path,
            key="SIGNALS",
            columns=columns_to_read,
            start=start_row,
            stop=stop_row
        )

        # Add metadata columns
        df_chunk["subjectID"] = i
        df_chunk["rowID"] = range(start_row, stop_row)

        # Clean data
        df_chunk = df_chunk[df_chunk["LEVEL"] != 0]
        df_chunk = df_chunk.dropna()

        # Only keep non-empty chunks
        if len(df_chunk) > 0:
            chunks_to_save.append(df_chunk)

        # Free memory
        del df_chunk

    print("load and cleaning done")

    # Step 4: Combine all chunks and save
    if chunks_to_save:
        df_final = pd.concat(chunks_to_save, ignore_index=True)
        print(f"Final dataframe shape: {df_final.shape}")

        # Save to parquet
        base_dir = Path(r"C:\new_AU_parquet_files")
        os.makedirs(base_dir, exist_ok=True)
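        # Note: to_parquet below needs a parquet engine (pyarrow or fastparquet)
        # installed; with index=False the index is not written, so the subjectID
        # and rowID columns presumably serve to map rows back to the source .h5 files.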

        out_name = base_dir / f"cleaned_{i:04d}.parquet"
        df_final.to_parquet(out_name, index=False)
        print(f"Saved to {out_name}")

        # Free memory
        del df_final
        del chunks_to_save
    else:
        print(f"No valid data found for Subject {i}")

print("All files processed!")
@@ -37,7 +37,7 @@ def process_parquet_files(input_dir, output_file, window_size=1250, step_size=12
     print(f" Entries: {len(df)}")

     # Identify AU columns
-    au_columns = [col for col in df.columns if col.startswith('AU')]
+    au_columns = [col for col in df.columns if col.startswith('FACE_AU')]
     print(f" AU columns: {len(au_columns)}")

     # Group by STUDY, LEVEL, PHASE (to avoid transitions)
@@ -94,8 +94,8 @@ def process_parquet_files(input_dir, output_file, window_size=1250, step_size=12
 # Example usage
 if __name__ == "__main__":
     # Adjust to your paths
-    input_directory = r"C:\Users\x\FAUbox\WS2526_Fahrsimulator_MSY (Celina Korzer)\ParquetFiles_AU"
-    output_file = r"C:\Users\x\FAUbox\WS2526_Fahrsimulator_MSY (Celina Korzer)\AU_dataset\xxoutput_windowed.parquet"
+    input_directory = r"C:\Users\x\FAUbox\WS2526_Fahrsimulator_MSY (Celina Korzer)\new_AU_parquet_files"
+    output_file = r"C:\Users\x\FAUbox\WS2526_Fahrsimulator_MSY (Celina Korzer)\new_AU_dataset\AU_dataset.parquet"
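
As a quick sanity check of the per-subject files written by the new chunkwise script, something like the following could be used (a minimal sketch; it assumes pandas plus a parquet engine such as pyarrow, and reuses the cleaned_0000.parquet naming and the C:\new_AU_parquet_files output directory from the code above):

import pandas as pd
from pathlib import Path

out_dir = Path(r"C:\new_AU_parquet_files")

# Load one generated file and report its shape and FACE_AU columns
sample = pd.read_parquet(out_dir / "cleaned_0000.parquet")
print(sample.shape)
print([c for c in sample.columns if c.startswith("FACE_AU")])

# Confirm the cleaning steps held: no NaNs and no LEVEL == 0 rows
assert not sample.isna().any().any()
assert (sample["LEVEL"] != 0).all()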