From de0084dc09acbed3c3a2ecfe8e284bcb3026d844 Mon Sep 17 00:00:00 2001 From: Michael Date: Wed, 4 Mar 2026 15:09:23 +0100 Subject: [PATCH] getting rid of redundant files in dataset creation --- ...wise_parquet_file_creation_EYE_TRACKING.py | 91 ---- ...chunkwise_parquet_file_creation_FACE_AU.py | 91 ---- dataset_creation/combined_feature_creation.py | 88 ++-- dataset_creation/create_feature_table.py | 113 ----- .../create_multimodal_dataset_by_merge.py | 56 --- ... => create_parquet_files_from_owncloud.py} | 16 +- dataset_creation/eyeAlt.py | 323 ------------- dataset_creation/eye_batch_processor.py | 441 ------------------ dataset_creation/eyetrackingFeatures.py | 323 ------------- dataset_creation/maxDist.py | 99 ++-- dataset_creation/open_parquet_test.ipynb | 155 ------ ...{CPFC_both.py => parquet_file_creation.py} | 10 +- 12 files changed, 106 insertions(+), 1700 deletions(-) delete mode 100644 dataset_creation/chunkwise_parquet_file_creation_EYE_TRACKING.py delete mode 100644 dataset_creation/chunkwise_parquet_file_creation_FACE_AU.py delete mode 100644 dataset_creation/create_feature_table.py delete mode 100644 dataset_creation/create_multimodal_dataset_by_merge.py rename dataset_creation/{create_parquet_files.py => create_parquet_files_from_owncloud.py} (82%) delete mode 100644 dataset_creation/eyeAlt.py delete mode 100644 dataset_creation/eye_batch_processor.py delete mode 100644 dataset_creation/eyetrackingFeatures.py delete mode 100644 dataset_creation/open_parquet_test.ipynb rename dataset_creation/{CPFC_both.py => parquet_file_creation.py} (92%) diff --git a/dataset_creation/chunkwise_parquet_file_creation_EYE_TRACKING.py b/dataset_creation/chunkwise_parquet_file_creation_EYE_TRACKING.py deleted file mode 100644 index 64b1ae6..0000000 --- a/dataset_creation/chunkwise_parquet_file_creation_EYE_TRACKING.py +++ /dev/null @@ -1,91 +0,0 @@ -import os -import pandas as pd -from pathlib import Path - -print(os.getcwd()) -num_files = 2 # number of files to 
process (min: 1, max: 30) - -print("connection aufgebaut") - -data_dir = Path("/home/jovyan/Fahrsimulator_MSY2526_AI/EDA") -# os.chdir(data_dir) -# Get all .h5 files and sort them -matching_files = sorted(data_dir.glob("*.h5")) - -# Chunk size for reading (adjust based on your RAM - 100k rows is ~50-100MB depending on columns) -CHUNK_SIZE = 100_000 - -for i, file_path in enumerate(matching_files): - print(f"Subject {i} gestartet") - print(f"{file_path} geoeffnet") - - # Step 1: Get total number of rows and column names - with pd.HDFStore(file_path, mode="r") as store: - cols = store.select("SIGNALS", start=0, stop=1).columns - nrows = store.get_storer("SIGNALS").nrows - print(f"Total columns: {len(cols)}, Total rows: {nrows}") - - # Step 2: Filter columns that start with "FACE_AU" - eye_cols = [c for c in cols if c.startswith("EYE_")] - print(f"eye-tracking columns found: {eye_cols}") - - if len(eye_cols) == 0: - print(f"keine eye-tracking-Signale in Subject {i}") - continue - - # Columns to read - columns_to_read = ["STUDY", "LEVEL", "PHASE"] + eye_cols - - # Step 3: Process file in chunks - chunks_to_save = [] - - for start_row in range(0, nrows, CHUNK_SIZE): - stop_row = min(start_row + CHUNK_SIZE, nrows) - print(f"Processing rows {start_row} to {stop_row} ({stop_row/nrows*100:.1f}%)") - - # Read chunk - df_chunk = pd.read_hdf( - file_path, - key="SIGNALS", - columns=columns_to_read, - start=start_row, - stop=stop_row - ) - - # Add metadata columns - df_chunk["subjectID"] = i - df_chunk["rowID"] = range(start_row, stop_row) - - # Clean data - df_chunk = df_chunk[df_chunk["LEVEL"] != 0] - df_chunk = df_chunk.dropna() - - # Only keep non-empty chunks - if len(df_chunk) > 0: - chunks_to_save.append(df_chunk) - - # Free memory - del df_chunk - - print("load and cleaning done") - - # Step 4: Combine all chunks and save - if chunks_to_save: - df_final = pd.concat(chunks_to_save, ignore_index=True) - print(f"Final dataframe shape: {df_final.shape}") - - # Save to 
parquet - base_dir = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/new_ET_Parquet_files") - os.makedirs(base_dir, exist_ok=True) - - out_name = base_dir / f"ET_signals_extracted_{i:04d}.parquet" - df_final.to_parquet(out_name, index=False) - print(f"Saved to {out_name}") - - # Free memory - del df_final - del chunks_to_save - else: - print(f"No valid data found for Subject {i}") - -print("All files processed!") \ No newline at end of file diff --git a/dataset_creation/chunkwise_parquet_file_creation_FACE_AU.py b/dataset_creation/chunkwise_parquet_file_creation_FACE_AU.py deleted file mode 100644 index 667de93..0000000 --- a/dataset_creation/chunkwise_parquet_file_creation_FACE_AU.py +++ /dev/null @@ -1,91 +0,0 @@ -import os -import pandas as pd -from pathlib import Path - -print(os.getcwd()) -num_files = 2 # number of files to process (min: 1, max: 30) - -print("connection aufgebaut") - -data_dir = Path(r"C:\Users\x\repo\UXKI\Fahrsimulator_MSY2526_AI\newTmp") - -# Get all .h5 files and sort them -matching_files = sorted(data_dir.glob("*.h5")) - -# Chunk size for reading (adjust based on your RAM - 100k rows is ~50-100MB depending on columns) -CHUNK_SIZE = 100_000 - -for i, file_path in enumerate(matching_files): - print(f"Subject {i} gestartet") - print(f"{file_path} geoeffnet") - - # Step 1: Get total number of rows and column names - with pd.HDFStore(file_path, mode="r") as store: - cols = store.select("SIGNALS", start=0, stop=1).columns - nrows = store.get_storer("SIGNALS").nrows - print(f"Total columns: {len(cols)}, Total rows: {nrows}") - - # Step 2: Filter columns that start with "FACE_AU" - eye_cols = [c for c in cols if c.startswith("FACE_AU")] - print(f"FACE_AU columns found: {eye_cols}") - - if len(eye_cols) == 0: - print(f"keine FACE_AU-Signale in Subject {i}") - continue - - # Columns to read - columns_to_read = ["STUDY", "LEVEL", "PHASE"] + eye_cols - - # Step 3: Process file in chunks - chunks_to_save = [] - - for start_row in range(0, nrows, 
CHUNK_SIZE): - stop_row = min(start_row + CHUNK_SIZE, nrows) - print(f"Processing rows {start_row} to {stop_row} ({stop_row/nrows*100:.1f}%)") - - # Read chunk - df_chunk = pd.read_hdf( - file_path, - key="SIGNALS", - columns=columns_to_read, - start=start_row, - stop=stop_row - ) - - # Add metadata columns - df_chunk["subjectID"] = i - df_chunk["rowID"] = range(start_row, stop_row) - - # Clean data - df_chunk = df_chunk[df_chunk["LEVEL"] != 0] - df_chunk = df_chunk.dropna() - - # Only keep non-empty chunks - if len(df_chunk) > 0: - chunks_to_save.append(df_chunk) - - # Free memory - del df_chunk - - print("load and cleaning done") - - # Step 4: Combine all chunks and save - if chunks_to_save: - df_final = pd.concat(chunks_to_save, ignore_index=True) - print(f"Final dataframe shape: {df_final.shape}") - - # Save to parquet - base_dir = Path(r"C:\new_AU_parquet_files") - os.makedirs(base_dir, exist_ok=True) - - out_name = base_dir / f"cleaned_{i:04d}.parquet" - df_final.to_parquet(out_name, index=False) - print(f"Saved to {out_name}") - - # Free memory - del df_final - del chunks_to_save - else: - print(f"No valid data found for Subject {i}") - -print("All files processed!") \ No newline at end of file diff --git a/dataset_creation/combined_feature_creation.py b/dataset_creation/combined_feature_creation.py index c525a6f..6fbfccd 100644 --- a/dataset_creation/combined_feature_creation.py +++ b/dataset_creation/combined_feature_creation.py @@ -4,27 +4,26 @@ import pandas as pd from pathlib import Path from sklearn.preprocessing import MinMaxScaler from scipy.signal import welch -from pygazeanalyser.detectors import fixation_detection, saccade_detection +from pygazeanalyser.detectors import fixation_detection, saccade_detection # not installed by default ############################################################################## -# KONFIGURATION +# CONFIGURATION ############################################################################## -INPUT_DIR = 
Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/both_mod_parquet_files") -OUTPUT_FILE = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/50s_25Hz_dataset.parquet") - -WINDOW_SIZE_SAMPLES = 25*50 # 50s bei 25Hz -STEP_SIZE_SAMPLES = 125 # 5s bei 25Hz +INPUT_DIR = Path(r"") # directory that stores the parquet files (one file per subject) +OUTPUT_FILE = Path(r"") # path for resulting dataset +WINDOW_SIZE_SAMPLES = 25*50 # 50s at 25Hz +STEP_SIZE_SAMPLES = 125 # 5s at 25Hz SAMPLING_RATE = 25 # Hz MIN_DUR_BLINKS = 2 # x * 40ms ############################################################################## -# EYE-TRACKING FUNKTIONEN +# EYE-TRACKING FUNCTIONS ############################################################################## def clean_eye_df(df): - """Extrahiert nur Eye-Tracking Spalten und entfernt leere Zeilen.""" + """Extracts Eye-Tracking columns only and removes empty rows.""" eye_cols = [c for c in df.columns if c.startswith("EYE_")] if not eye_cols: @@ -38,7 +37,7 @@ def clean_eye_df(df): def extract_gaze_signal(df): - """Extrahiert 2D-Gaze-Positionen, maskiert ungültige Samples und interpoliert.""" + """Extracts 2D gaze positions, masks invalid samples, and interpolates.""" gx_L = df["EYE_LEFT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy() gy_L = df["EYE_LEFT_GAZE_POINT_ON_DISPLAY_AREA_Y"].astype(float).copy() gx_R = df["EYE_RIGHT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy() @@ -51,14 +50,14 @@ def extract_gaze_signal(df): for arr in [gx_L, gy_L, gx_R, gy_R]: arr.replace([np.inf, -np.inf], np.nan, inplace=True) - # Ungültige maskieren + # Mask invalids gx_L[~val_L] = np.nan gy_L[~val_L] = np.nan gx_R[~val_R] = np.nan gy_R[~val_R] = np.nan - # Mittelwert beider Augen + # Mean of both eyes gx = np.mean(np.column_stack([gx_L, gx_R]), axis=1) gy = np.mean(np.column_stack([gy_L, gy_R]), axis=1) @@ -66,7 +65,7 @@ def extract_gaze_signal(df): gx = pd.Series(gx).interpolate(limit=None, limit_direction="both").bfill().ffill() gy = 
pd.Series(gy).interpolate(limit=None, limit_direction="both").bfill().ffill() - # MinMax Skalierung + # MinMax scaling xscaler = MinMaxScaler() gxscale = xscaler.fit_transform(gx.values.reshape(-1, 1)) @@ -77,7 +76,7 @@ def extract_gaze_signal(df): def extract_pupil(df): - """Extrahiert Pupillengröße (beide Augen gemittelt).""" + """Extract pupil size (average of both eyes).""" pl = df["EYE_LEFT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan) pr = df["EYE_RIGHT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan) @@ -96,7 +95,7 @@ def extract_pupil(df): def detect_blinks(pupil_validity, min_duration=5): - """Erkennt Blinks: Validity=0 → Blink.""" + """Detect blinks: Validity=0 → Blink.""" blinks = [] start = None @@ -120,13 +119,13 @@ def compute_IPA(pupil, fs=25): def extract_eye_features_window(df_eye_window, fs=25, min_dur_blinks=2): """ - Extrahiert Eye-Tracking Features für ein einzelnes Window. - Gibt Dictionary mit allen Eye-Features zurück. + Extracts eye tracking features for a single window. + Returns a dictionary containing all eye features. 
""" # Gaze gaze = extract_gaze_signal(df_eye_window) - # Pupille + # Pupil pupil, pupil_validity = extract_pupil(df_eye_window) window_size = len(df_eye_window) @@ -143,7 +142,6 @@ def extract_eye_features_window(df_eye_window, fs=25, min_dur_blinks=2): fixation_durations = [f[2] for f in efix if np.isfinite(f[2]) and f[2] > 0] - # Kategorien F_short = sum(66 <= d <= 150 for d in fixation_durations) F_medium = sum(300 <= d <= 500 for d in fixation_durations) F_long = sum(d >= 1000 for d in fixation_durations) @@ -197,27 +195,27 @@ def extract_eye_features_window(df_eye_window, fs=25, min_dur_blinks=2): ############################################################################## -# KOMBINIERTE FEATURE-EXTRAKTION +# Combined feature extraction ############################################################################## def process_combined_features(input_dir, output_file, window_size, step_size, fs=25,min_duration_blinks=2): """ - Verarbeitet Parquet-Dateien mit FACE_AU und EYE Spalten. - Extrahiert beide Feature-Sets und kombiniert sie. + Processes Parquet files with FACE_AU and EYE columns. + Extracts both feature sets and combines them. 
""" input_path = Path(input_dir) parquet_files = sorted(input_path.glob("*.parquet")) if not parquet_files: - print(f"FEHLER: Keine Parquet-Dateien in {input_dir} gefunden!") + print(f"Error: No parquet-files found in {input_dir}!") return None print(f"\n{'='*70}") - print(f"KOMBINIERTE FEATURE-EXTRAKTION") + print(f"Combined feature-extraction") print(f"{'='*70}") - print(f"Dateien: {len(parquet_files)}") - print(f"Window: {window_size} Samples ({window_size/fs:.1f}s bei {fs}Hz)") - print(f"Step: {step_size} Samples ({step_size/fs:.1f}s bei {fs}Hz)") + print(f"Files: {len(parquet_files)}") + print(f"Window: {window_size} Samples ({window_size/fs:.1f}s at {fs}Hz)") + print(f"Step: {step_size} Samples ({step_size/fs:.1f}s at {fs}Hz)") print(f"{'='*70}\n") all_windows = [] @@ -227,24 +225,22 @@ def process_combined_features(input_dir, output_file, window_size, step_size, fs try: df = pd.read_parquet(parquet_file) - print(f" Einträge: {len(df)}") + print(f" Entries: {len(df)}") - - # Identifiziere Spalten au_columns = [col for col in df.columns if col.startswith('FACE_AU')] eye_columns = [col for col in df.columns if col.startswith('EYE_')] - print(f" AU-Spalten: {len(au_columns)}") - print(f" Eye-Spalten: {len(eye_columns)}") + print(f" AU-columns: {len(au_columns)}") + print(f" Eye-columns: {len(eye_columns)}") has_au = len(au_columns) > 0 has_eye = len(eye_columns) > 0 if not has_au and not has_eye: - print(f" WARNUNG: Keine AU oder Eye Spalten gefunden!") + print(f" Warning: No AU or eye tracking columns found!") continue - # Gruppiere nach STUDY, LEVEL, PHASE + # Group by STUDY, LEVEL, PHASE group_cols = [col for col in ['STUDY', 'LEVEL', 'PHASE'] if col in df.columns] if group_cols: @@ -258,7 +254,7 @@ def process_combined_features(input_dir, output_file, window_size, step_size, fs group_df = group_df.reset_index(drop=True) - # Berechne Anzahl Windows + # calculate number of windows num_windows = (len(group_df) - window_size) // step_size + 1 if num_windows <= 
0: @@ -272,7 +268,7 @@ def process_combined_features(input_dir, output_file, window_size, step_size, fs window_df = group_df.iloc[start_idx:end_idx] - # Basis-Metadaten + # basic metadata result = { 'subjectID': window_df['subjectID'].iloc[0], 'start_time': window_df['rowID'].iloc[0], @@ -281,12 +277,12 @@ def process_combined_features(input_dir, output_file, window_size, step_size, fs 'PHASE': window_df['PHASE'].iloc[0] if 'PHASE' in window_df.columns else np.nan } - # FACE AU Features + # FACE AU features if has_au: for au_col in au_columns: result[f'{au_col}_mean'] = window_df[au_col].mean() - # Eye-Tracking Features + # Eye-tracking features if has_eye: try: # clean dataframe from all nan rows @@ -296,7 +292,7 @@ def process_combined_features(input_dir, output_file, window_size, step_size, fs result.update(eye_features) except Exception as e: print(f" WARNUNG: Eye-Features fehlgeschlagen: {str(e)}") - # Füge NaN-Werte für Eye-Features hinzu + # Add NaN-values for eye-features result.update({ "Fix_count_short_66_150": np.nan, "Fix_count_medium_300_500": np.nan, @@ -325,7 +321,7 @@ def process_combined_features(input_dir, output_file, window_size, step_size, fs traceback.print_exc() continue - # Kombiniere alle Windows + # Combine all windows if not all_windows: print("\nKEINE FEATURES EXTRAHIERT!") return None @@ -340,7 +336,7 @@ def process_combined_features(input_dir, output_file, window_size, step_size, fs print(f"Spalten: {len(result_df.columns)}") print(f"Subjects: {result_df['subjectID'].nunique()}") - # Speichern + # Save output_path = Path(output_file) output_path.parent.mkdir(parents=True, exist_ok=True) result_df.to_parquet(output_file, index=False) @@ -357,7 +353,7 @@ def process_combined_features(input_dir, output_file, window_size, step_size, fs def main(): print("\n" + "="*70) - print("KOMBINIERTE FEATURE-EXTRAKTION (AU + EYE)") + print("Combined extraction (AU + EYE)") print("="*70) result = process_combined_features( @@ -370,16 +366,16 @@ def 
main(): ) if result is not None: - print("\nErste 5 Zeilen:") + print("\nFirst 5 rows:") print(result.head()) - print("\nSpalten-Übersicht:") + print("\nColumns overview:") print(result.dtypes) - print("\nStatistik:") + print("\nStatistics:") print(result.describe()) - print("\n✓ FERTIG!\n") + print("\nDone!\n") if __name__ == "__main__": diff --git a/dataset_creation/create_feature_table.py b/dataset_creation/create_feature_table.py deleted file mode 100644 index 54e7892..0000000 --- a/dataset_creation/create_feature_table.py +++ /dev/null @@ -1,113 +0,0 @@ -import pandas as pd -import numpy as np -from pathlib import Path -def process_parquet_files(input_dir, output_file, window_size=1250, step_size=125): - """ - Verarbeitet Parquet-Dateien mit Sliding Window Aggregation. - - Parameters: - ----------- - input_dir : str - Verzeichnis mit Parquet-Dateien - output_file : str - Pfad für die Ausgabe-Parquet-Datei - window_size : int - Größe des Sliding Windows (default: 3000) - step_size : int - Schrittweite in Einträgen (default: 250 = 10 Sekunden bei 25 Hz) - """ - - input_path = Path(input_dir) - parquet_files = sorted(input_path.glob("*.parquet")) - - if not parquet_files: - print(f"Keine Parquet-Dateien in {input_dir} gefunden!") - return - - print(f"Gefundene Dateien: {len(parquet_files)}") - - all_windows = [] - - for file_idx, parquet_file in enumerate(parquet_files): - print(f"\nVerarbeite Datei {file_idx + 1}/{len(parquet_files)}: {parquet_file.name}") - - # Lade Parquet-Datei - df = pd.read_parquet(parquet_file) - print(f" Einträge: {len(df)}") - - # Identifiziere AU-Spalten - au_columns = [col for col in df.columns if col.startswith('FACE_AU')] - print(f" AU-Spalten: {len(au_columns)}") - - # Gruppiere nach STUDY, LEVEL, PHASE (um Übergänge zu vermeiden) - for (study_val, level_val, phase_val), level_df in df.groupby(['STUDY', 'LEVEL', 'PHASE'], sort=False): - print(f" STUDY {study_val}, LEVEL {level_val}, PHASE {phase_val}: {len(level_df)} Einträge") - 
# Reset index für korrekte Position-Berechnung - level_df = level_df.reset_index(drop=True) - - # Sliding Window über dieses Level - num_windows = (len(level_df) - window_size) // step_size + 1 - - if num_windows <= 0: - print(f" Zu wenige Einträge für Window (benötigt {window_size})") - continue - - for i in range(num_windows): - start_idx = i * step_size - end_idx = start_idx + window_size - - window_df = level_df.iloc[start_idx:end_idx] - - # Erstelle aggregiertes Ergebnis - result = { - 'subjectID': window_df['subjectID'].iloc[0], - 'start_time': window_df['rowID'].iloc[0], # rowID als start_time - 'STUDY': window_df['STUDY'].iloc[0], - 'LEVEL': window_df['LEVEL'].iloc[0], - 'PHASE': window_df['PHASE'].iloc[0] - } - - # Summiere alle AU-Spalten - for au_col in au_columns: - # result[f'{au_col}_sum'] = window_df[au_col].sum() - result[f'{au_col}_mean'] = window_df[au_col].mean() - - all_windows.append(result) - - print(f" Windows erstellt: {num_windows}") - - # Erstelle finalen DataFrame - result_df = pd.DataFrame(all_windows) - - print(f"\n{'='*60}") - print(f"Gesamt Windows erstellt: {len(result_df)}") - print(f"Spalten: {list(result_df.columns)}") - - # Speichere Ergebnis - result_df.to_parquet(output_file, index=False) - print(f"\nErgebnis gespeichert in: {output_file}") - - return result_df - - -# Beispiel-Verwendung -if __name__ == "__main__": - # Anpassen an deine Pfade - input_directory = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/new_AU_parquet_files") - output_file = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/new_AU_dataset_mean/AU_dataset_mean.parquet") - - - - result = process_parquet_files( - input_dir=input_directory, - output_file=output_file, - window_size=1250, - step_size=125 - ) - - # Zeige erste Zeilen - if result is not None: - print("\nErste 5 Zeilen des Ergebnisses:") - print(result.head()) \ No newline at end of file diff --git a/dataset_creation/create_multimodal_dataset_by_merge.py 
b/dataset_creation/create_multimodal_dataset_by_merge.py deleted file mode 100644 index a81a242..0000000 --- a/dataset_creation/create_multimodal_dataset_by_merge.py +++ /dev/null @@ -1,56 +0,0 @@ -from pathlib import Path -import pandas as pd - - -def main(): - """ - USER CONFIGURATION - ------------------ - Specify input files and output directory here. - """ - - # Input parquet files (single-modality datasets) - file_modality_1 = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/AU_dataset_mean.parquet") - file_modality_2 = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/new_eye_dataset.parquet") - - # Output directory and file name - output_dir = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/") - output_file = output_dir / "merged_dataset.parquet" - - # Column names (adjust only if your schema differs) - subject_col = "subjectID" - time_col = "start_time" - - # ------------------------------------------------------------------ - # Load datasets - # ------------------------------------------------------------------ - df1 = pd.read_parquet(file_modality_1) - df2 = pd.read_parquet(file_modality_2) - - # ------------------------------------------------------------------ - # Keep only subjects that appear in BOTH datasets - # ------------------------------------------------------------------ - common_subjects = set(df1[subject_col]).intersection(df2[subject_col]) - - df1 = df1[df1[subject_col].isin(common_subjects)] - df2 = df2[df2[subject_col].isin(common_subjects)] - - # ------------------------------------------------------------------ - # Inner join on subject ID AND start_time - # ------------------------------------------------------------------ - merged_df = pd.merge( - df1, - df2, - on=[subject_col, time_col], - how="inner", - ) - - # ------------------------------------------------------------------ - # Save merged dataset - # ------------------------------------------------------------------ - 
output_dir.mkdir(parents=True, exist_ok=True) - merged_df.to_parquet(output_file, index=False) - - -if __name__ == "__main__": - main() diff --git a/dataset_creation/create_parquet_files.py b/dataset_creation/create_parquet_files_from_owncloud.py similarity index 82% rename from dataset_creation/create_parquet_files.py rename to dataset_creation/create_parquet_files_from_owncloud.py index 1a2fb7f..7c3880f 100644 --- a/dataset_creation/create_parquet_files.py +++ b/dataset_creation/create_parquet_files_from_owncloud.py @@ -1,6 +1,5 @@ -# pip install pyocclient import yaml -import owncloud +import owncloud # pip install pyocclient import pandas as pd import h5py import os @@ -26,7 +25,7 @@ for i in range(num_files): # Download file from ownCloud oc.get_file(file_name, local_tmp) - print(f"{file_name} geoeffnet") + print(f"Opened: {file_name}") # Load into memory and extract needed columns # with h5py.File(local_tmp, "r") as f: # # Adjust this path depending on actual dataset layout inside .h5py file @@ -35,14 +34,9 @@ for i in range(num_files): with pd.HDFStore(local_tmp, mode="r") as store: cols = store.select("SIGNALS", start=0, stop=1).columns # get column names - # Step 2: Filter columns that start with "AU" - au_cols = [c for c in cols if c.startswith("AU")] - print(au_cols) - if len(au_cols)==0: - print(f"keine AU Signale in Subject {i}") - continue + # Step 3: Read only those columns (plus any others you want) - df = pd.read_hdf(local_tmp, key="SIGNALS", columns=["STUDY", "LEVEL", "PHASE"] + au_cols) + df = pd.read_hdf(local_tmp, key="SIGNALS", columns=["STUDY", "LEVEL", "PHASE"] + cols) print("load done") @@ -63,7 +57,7 @@ for i in range(num_files): # Save to parquet - os.makedirs("ParquetFiles", exist_ok=True) + os.makedirs("ParquetFiles", exist_ok=True) # TODO: change for custom directory out_name = f"ParquetFiles/cleaned_{i:04d}.parquet" df.to_parquet(out_name, index=False) diff --git a/dataset_creation/eyeAlt.py b/dataset_creation/eyeAlt.py deleted file 
mode 100644 index fef68ad..0000000 --- a/dataset_creation/eyeAlt.py +++ /dev/null @@ -1,323 +0,0 @@ -import numpy as np -import pandas as pd -import h5py -import yaml -import os -from sklearn.preprocessing import MinMaxScaler -from scipy.signal import welch -from pygazeanalyser.detectors import fixation_detection, saccade_detection - - -############################################################################## -# 1. HELFERFUNKTIONEN -############################################################################## -def clean_eye_df(df): - """ - Entfernt alle Zeilen, die keine echten Eyetracking-Daten enthalten. - Löst das Problem, dass das Haupt-DataFrame NaN-Zeilen für andere Sensoren enthält. - """ - eye_cols = [c for c in df.columns if ("LEFT_" in c or "RIGHT_" in c)] - df_eye = df[eye_cols] - - # INF → NaN - df_eye = df_eye.replace([np.inf, -np.inf], np.nan) - - # Nur Zeilen behalten, wo es echte Eyetracking-Daten gibt - df_eye = df_eye.dropna(subset=eye_cols, how="all") - - print("Eyetracking-Zeilen vorher:", len(df)) - print("Eyetracking-Zeilen nachher:", len(df_eye)) - - #Index zurücksetzen - return df_eye.reset_index(drop=True) - - -def extract_gaze_signal(df): - """ - Extrahiert 2D-Gaze-Positionen auf dem Display, - maskiert ungültige Samples und interpoliert Lücken. 
- """ - - print("→ extract_gaze_signal(): Eingabegröße:", df.shape) - - # Gaze-Spalten - gx_L = df["LEFT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy() - gy_L = df["LEFT_GAZE_POINT_ON_DISPLAY_AREA_Y"].astype(float).copy() - gx_R = df["RIGHT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy() - gy_R = df["RIGHT_GAZE_POINT_ON_DISPLAY_AREA_Y"].astype(float).copy() - - - # Validity-Spalten (1 = gültig) - val_L = (df["LEFT_GAZE_POINT_VALIDITY"] == 1) - val_R = (df["RIGHT_GAZE_POINT_VALIDITY"] == 1) - - # Inf ersetzen mit NaN (kommt bei Tobii bei Blinks vor) - gx_L.replace([np.inf, -np.inf], np.nan, inplace=True) - gy_L.replace([np.inf, -np.inf], np.nan, inplace=True) - gx_R.replace([np.inf, -np.inf], np.nan, inplace=True) - gy_R.replace([np.inf, -np.inf], np.nan, inplace=True) - - # Ungültige Werte maskieren - gx_L[~val_L] = np.nan - gy_L[~val_L] = np.nan - gx_R[~val_R] = np.nan - gy_R[~val_R] = np.nan - - # Mittelwert der beiden Augen pro Sample (nanmean ist robust) - gx = np.mean(np.column_stack([gx_L, gx_R]), axis=1) - gy = np.mean(np.column_stack([gy_L, gy_R]), axis=1) - - # Interpolation (wichtig für PyGaze!) 
- gx = pd.Series(gx).interpolate(limit=50, limit_direction="both").bfill().ffill() - gy = pd.Series(gy).interpolate(limit=50, limit_direction="both").bfill().ffill() - - # xscaler = MinMaxScaler() - # gxscale = xscaler.fit_transform(gx.values.reshape(-1, 1)) - - # yscaler = MinMaxScaler() - # gyscale = yscaler.fit_transform(gx.values.reshape(-1, 1)) - - #print("xmax ymax", gxscale.max(), gyscale.max()) - - #out = np.column_stack((gxscale, gyscale)) - out = np.column_stack((gx, gy)) - - print("→ extract_gaze_signal(): Ausgabegröße:", out.shape) - - return out - - -def extract_pupil(df): - """Extrahiert Pupillengröße (beide Augen gemittelt).""" - - pl = df["LEFT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan) - pr = df["RIGHT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan) - - vl = df.get("LEFT_PUPIL_VALIDITY") - vr = df.get("RIGHT_PUPIL_VALIDITY") - - if vl is None or vr is None: - # Falls Validity-Spalten nicht vorhanden sind, versuchen wir grobe Heuristik: - # gültig, wenn Pupillendurchmesser nicht NaN. - validity = (~pl.isna() | ~pr.isna()).astype(int).to_numpy() - else: - # Falls vorhanden: 1 wenn mindestens eines der Augen gültig ist - validity = ( (vl == 1) | (vr == 1) ).astype(int).to_numpy() - - # Mittelwert der verfügbaren Pupillen - p = np.mean(np.column_stack([pl, pr]), axis=1) - - # INF/NaN reparieren - p = pd.Series(p).interpolate(limit=50, limit_direction="both").bfill().ffill() - p = p.to_numpy() - - print("→ extract_pupil(): Pupillensignal Länge:", len(p)) - return p, validity - - -def detect_blinks(pupil_validity, min_duration=5): - """Erkennt Blinks: Validity=0 → Blink.""" - blinks = [] - start = None - - for i, v in enumerate(pupil_validity): - if v == 0 and start is None: - start = i - elif v == 1 and start is not None: - if i - start >= min_duration: - blinks.append([start, i]) - start = None - - return blinks - - -def compute_IPA(pupil, fs=250): - """ - IPA = Index of Pupillary Activity (nach Duchowski 2018). 
- Hochfrequenzanteile der Pupillenzeitreihe. - """ - f, Pxx = welch(pupil, fs=fs, nperseg=int(fs*2)) # 2 Sekunden Fenster - - hf_band = (f >= 0.6) & (f <= 2.0) - ipa = np.sum(Pxx[hf_band]) - - return ipa - - -############################################################################## -# 2. FEATURE-EXTRAKTION (HAUPTFUNKTION) -############################################################################## - -def extract_eye_features(df, window_length_sec=50, fs=250): - """ - df = Tobii DataFrame - window_length_sec = Fenstergröße (z.B. W=1s) - """ - - print("→ extract_eye_features(): Starte Feature-Berechnung...") - print(" Fensterlänge W =", window_length_sec, "s") - - W = int(window_length_sec * fs) # Window größe in Samples - - # Gaze - gaze = extract_gaze_signal(df) - gx, gy = gaze[:, 0], gaze[:, 1] - print("Gültige Werte (gx):", np.sum(~np.isnan(gx)), "von", len(gx)) - print("Range:", np.nanmin(gx), np.nanmax(gx)) - print("Gültige Werte (gy):", np.sum(~np.isnan(gy)), "von", len(gy)) - print("Range:", np.nanmin(gy), np.nanmax(gy)) - - # Pupille - pupil, pupil_validity = extract_pupil(df) - - features = [] - - # Sliding windows - for start in range(0, len(df), W): - end = start + W - if end > len(df): - break #das letzte Fenster wird ignoriert - - - w_gaze = gaze[start:end] - w_pupil = pupil[start:end] - w_valid = pupil_validity[start:end] - - # ---------------------------- - # FIXATIONS (PyGaze) - # ---------------------------- - time_ms = np.arange(W) * 1000.0 / fs - - # print("gx im Fenster:", w_gaze[:,0][:20]) - # print("gy im Fenster:", w_gaze[:,1][:20]) - # print("gx diff:", np.mean(np.abs(np.diff(w_gaze[:,0])))) - - # print("Werte X im Fenster:", w_gaze[:,0]) - # print("Werte Y im Fenster:", w_gaze[:,1]) - # print("X-Stats: min/max/diff", np.nanmin(w_gaze[:,0]), np.nanmax(w_gaze[:,0]), np.nanmean(np.abs(np.diff(w_gaze[:,0])))) - # print("Y-Stats: min/max/diff", np.nanmin(w_gaze[:,1]), np.nanmax(w_gaze[:,1]), np.nanmean(np.abs(np.diff(w_gaze[:,1])))) - 
print("time_ms:", time_ms) - - fix, efix = fixation_detection( - x=w_gaze[:, 0], y=w_gaze[:, 1], time=time_ms, - missing=0.0, maxdist=0.003, mindur=10 # mindur=100ms - ) - - #print("Raw Fixation Output:", efix[0]) - - if start == 0: - print("DEBUG fix raw:", fix[:10]) - - # Robust fixations: PyGaze may return malformed entries - fixation_durations = [] - for f in efix: - print("Efix:", f[2]) - # start_t = f[1] # in ms - # end_t = f[2] # in ms - # duration = (end_t - start_t) / 1000.0 # in Sekunden - - #duration = f[2] / 1000.0 - if np.isfinite(f[2]) and f[2] > 0: - fixation_durations.append(f[2]) - - # Kategorien laut Paper - F_short = sum(66 <= d <= 150 for d in fixation_durations) - F_medium = sum(300 <= d <= 500 for d in fixation_durations) - F_long = sum(d >= 1000 for d in fixation_durations) - F_hundred = sum(d > 100 for d in fixation_durations) - F_Cancel = sum(66 < d for d in fixation_durations) - - # ---------------------------- - # SACCADES - # ---------------------------- - sac, esac = saccade_detection( - x=w_gaze[:, 0], y=w_gaze[:, 1], time=time_ms, missing=0, minlen=12, maxvel=0.2, maxacc=1 - ) - - sac_durations = [s[2] for s in esac] - sac_amplitudes = [((s[5]-s[3])**2 + (s[6]-s[4])**2)**0.5 for s in esac] - - # ---------------------------- - # BLINKS - # ---------------------------- - blinks = detect_blinks(w_valid) - blink_durations = [(b[1] - b[0]) / fs for b in blinks] - - # ---------------------------- - # PUPIL - # ---------------------------- - if np.all(np.isnan(w_pupil)): - mean_pupil = np.nan - ipa = np.nan - else: - mean_pupil = np.nanmean(w_pupil) - ipa = compute_IPA(w_pupil, fs=fs) - - # ---------------------------- - # FEATURE-TABELLE FÜLLEN - # ---------------------------- - features.append({ - "Fix_count_short_66_150": F_short, - "Fix_count_medium_300_500": F_medium, - "Fix_count_long_gt_1000": F_long, - "Fix_count_100": F_hundred, - "Fix_cancel": F_Cancel, - "Fix_mean_duration": np.mean(fixation_durations) if fixation_durations else 
0, - "Fix_median_duration": np.median(fixation_durations) if fixation_durations else 0, - - "Sac_count": len(sac), - "Sac_mean_amp": np.mean(sac_amplitudes) if sac_amplitudes else 0, - "Sac_mean_dur": np.mean(sac_durations) if sac_durations else 0, - "Sac_median_dur": np.median(sac_durations) if sac_durations else 0, - - "Blink_count": len(blinks), - "Blink_mean_dur": np.mean(blink_durations) if blink_durations else 0, - "Blink_median_dur": np.median(blink_durations) if blink_durations else 0, - - "Pupil_mean": mean_pupil, - "Pupil_IPA": ipa - }) - - - result = pd.DataFrame(features) - print("→ extract_eye_features(): Fertig! Ergebnisgröße:", result.shape) - - return result - -############################################################################## -# 3. MAIN FUNKTION -############################################################################## - -def main(): - print("### STARTE FEATURE-EXTRAKTION ###") - print("Aktueller Arbeitsordner:", os.getcwd()) - - #df = pd.read_hdf("tmp22.h5", "SIGNALS", mode="r") - df = pd.read_parquet("cleaned_0001.parquet") - print("DataFrame geladen:", df.shape) - - # Nur Eye-Tracking auswählen - #eye_cols = [c for c in df.columns if "EYE_" in c] - #df_eye = df[eye_cols] - - #print("Eye-Tracking-Spalten:", len(eye_cols)) - #print("→", eye_cols[:10], " ...") - - print("Reinige Eyetracking-Daten ...") - df_eye = clean_eye_df(df) - - # Feature Extraction - features = extract_eye_features(df_eye, window_length_sec=50, fs=250) - - print("\n### FEATURE-MATRIX (HEAD) ###") - print(features.head()) - - print("\nSpeichere Output in features.csv ...") - features.to_csv("features4.csv", index=False) - - print("FERTIG!") - - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/dataset_creation/eye_batch_processor.py b/dataset_creation/eye_batch_processor.py deleted file mode 100644 index 8192147..0000000 --- a/dataset_creation/eye_batch_processor.py +++ /dev/null @@ -1,441 +0,0 @@ -import numpy as np -import 
pandas as pd -import h5py -import yaml -import os -from pathlib import Path -from sklearn.preprocessing import MinMaxScaler -from scipy.signal import welch -from pygazeanalyser.detectors import fixation_detection, saccade_detection - - -############################################################################## -# KONFIGURATION - HIER ANPASSEN! -############################################################################## -INPUT_DIR = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/new_ET_Parquet_files/") -OUTPUT_FILE = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/Eye_dataset_old/new_eye_dataset.parquet") - -WINDOW_SIZE_SAMPLES = 12500 # Anzahl Samples pro Window (z.B. 1250 = 50s bei 25Hz, oder 5s bei 250Hz) -STEP_SIZE_SAMPLES = 1250 # Schrittweite (z.B. 125 = 5s bei 25Hz, oder 0.5s bei 250Hz) -SAMPLING_RATE = 250 # Hz - - -############################################################################## -# 1. HELFERFUNKTIONEN -############################################################################## -def clean_eye_df(df): - """ - Entfernt alle Zeilen, die keine echten Eyetracking-Daten enthalten. - Löst das Problem, dass das Haupt-DataFrame NaN-Zeilen für andere Sensoren enthält. - """ - eye_cols = [c for c in df.columns if c.startswith("EYE_")] - df_eye = df[eye_cols] - - # INF → NaN - df_eye = df_eye.replace([np.inf, -np.inf], np.nan) - - # Nur Zeilen behalten, wo es echte Eyetracking-Daten gibt - df_eye = df_eye.dropna(subset=eye_cols, how="all") - - print(f" Eyetracking-Zeilen: {len(df)} → {len(df_eye)}") - - return df_eye.reset_index(drop=True) - - -def extract_gaze_signal(df): - """ - Extrahiert 2D-Gaze-Positionen auf dem Display, - maskiert ungültige Samples und interpoliert Lücken. 
- """ - # Gaze-Spalten - gx_L = df["EYE_LEFT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy() - gy_L = df["EYE_LEFT_GAZE_POINT_ON_DISPLAY_AREA_Y"].astype(float).copy() - gx_R = df["EYE_RIGHT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy() - gy_R = df["EYE_RIGHT_GAZE_POINT_ON_DISPLAY_AREA_Y"].astype(float).copy() - - # Validity-Spalten (1 = gültig) - val_L = (df["EYE_LEFT_GAZE_POINT_VALIDITY"] == 1) - val_R = (df["EYE_RIGHT_GAZE_POINT_VALIDITY"] == 1) - - # Inf ersetzen mit NaN (kommt bei Tobii bei Blinks vor) - gx_L.replace([np.inf, -np.inf], np.nan, inplace=True) - gy_L.replace([np.inf, -np.inf], np.nan, inplace=True) - gx_R.replace([np.inf, -np.inf], np.nan, inplace=True) - gy_R.replace([np.inf, -np.inf], np.nan, inplace=True) - - # Ungültige Werte maskieren - gx_L[~val_L] = np.nan - gy_L[~val_L] = np.nan - gx_R[~val_R] = np.nan - gy_R[~val_R] = np.nan - - # Mittelwert der beiden Augen pro Sample (nanmean ist robust) - gx = np.mean(np.column_stack([gx_L, gx_R]), axis=1) - gy = np.mean(np.column_stack([gy_L, gy_R]), axis=1) - - # Interpolation (wichtig für PyGaze!) 
- gx = pd.Series(gx).interpolate(limit=50, limit_direction="both").bfill().ffill() - gy = pd.Series(gy).interpolate(limit=50, limit_direction="both").bfill().ffill() - - xscaler = MinMaxScaler() - gxscale = xscaler.fit_transform(gx.values.reshape(-1, 1)) - - yscaler = MinMaxScaler() - gyscale = yscaler.fit_transform(gy.values.reshape(-1, 1)) - - out = np.column_stack((gxscale, gyscale)) - return out - - -def extract_pupil(df): - """Extrahiert Pupillengröße (beide Augen gemittelt).""" - pl = df["EYE_LEFT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan) - pr = df["EYE_RIGHT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan) - - vl = df.get("EYE_LEFT_PUPIL_VALIDITY") - vr = df.get("EYE_RIGHT_PUPIL_VALIDITY") - - if vl is None or vr is None: - validity = (~pl.isna() | ~pr.isna()).astype(int).to_numpy() - else: - validity = ((vl == 1) | (vr == 1)).astype(int).to_numpy() - - # Mittelwert der verfügbaren Pupillen - p = np.mean(np.column_stack([pl, pr]), axis=1) - - # INF/NaN reparieren - p = pd.Series(p).interpolate(limit=50, limit_direction="both").bfill().ffill() - p = p.to_numpy() - - return p, validity - - -def detect_blinks(pupil_validity, min_duration=5): - """Erkennt Blinks: Validity=0 → Blink.""" - blinks = [] - start = None - - for i, v in enumerate(pupil_validity): - if v == 0 and start is None: - start = i - elif v == 1 and start is not None: - if i - start >= min_duration: - blinks.append([start, i]) - start = None - - return blinks - - -def compute_IPA(pupil, fs=250): - """ - IPA = Index of Pupillary Activity (nach Duchowski 2018). - Hochfrequenzanteile der Pupillenzeitreihe. - """ - f, Pxx = welch(pupil, fs=fs, nperseg=int(fs*2)) # 2 Sekunden Fenster - - hf_band = (f >= 0.6) & (f <= 2.0) - ipa = np.sum(Pxx[hf_band]) - - return ipa - - -############################################################################## -# 2. 
FEATURE-EXTRAKTION MIT SLIDING WINDOW -############################################################################## - -def extract_eye_features_sliding(df_eye, df_meta, window_size, step_size, fs=250): - """ - Extrahiert Features mit Sliding Window aus einem einzelnen Level/Phase. - - Parameters: - ----------- - df_eye : DataFrame - Eye-Tracking Daten (bereits gereinigt) - df_meta : DataFrame - Metadaten (subjectID, rowID, STUDY, LEVEL, PHASE) - window_size : int - Anzahl Samples pro Window - step_size : int - Schrittweite in Samples - fs : int - Sampling Rate in Hz - """ - # Gaze - gaze = extract_gaze_signal(df_eye) - - # Pupille - pupil, pupil_validity = extract_pupil(df_eye) - - features = [] - num_windows = (len(df_eye) - window_size) // step_size + 1 - - if num_windows <= 0: - return pd.DataFrame() - - for i in range(num_windows): - start_idx = i * step_size - end_idx = start_idx + window_size - - w_gaze = gaze[start_idx:end_idx] - w_pupil = pupil[start_idx:end_idx] - w_valid = pupil_validity[start_idx:end_idx] - - # Metadaten für dieses Window - meta_row = df_meta.iloc[start_idx] - - # ---------------------------- - # FIXATIONS (PyGaze) - # ---------------------------- - time_ms = np.arange(window_size) * 1000.0 / fs - - fix, efix = fixation_detection( - x=w_gaze[:, 0], y=w_gaze[:, 1], time=time_ms, - missing=0.0, maxdist=0.003, mindur=10 - ) - - fixation_durations = [] - for f in efix: - if np.isfinite(f[2]) and f[2] > 0: - fixation_durations.append(f[2]) - - # Kategorien laut Paper - F_short = sum(66 <= d <= 150 for d in fixation_durations) - F_medium = sum(300 <= d <= 500 for d in fixation_durations) - F_long = sum(d >= 1000 for d in fixation_durations) - F_hundred = sum(d > 100 for d in fixation_durations) - # F_Cancel = sum(66 < d for d in fixation_durations) - - # ---------------------------- - # SACCADES - # ---------------------------- - sac, esac = saccade_detection( - x=w_gaze[:, 0], y=w_gaze[:, 1], time=time_ms, - missing=0, minlen=12, 
maxvel=0.2, maxacc=1 - ) - - sac_durations = [s[2] for s in esac] - sac_amplitudes = [((s[5]-s[3])**2 + (s[6]-s[4])**2)**0.5 for s in esac] - - # ---------------------------- - # BLINKS - # ---------------------------- - blinks = detect_blinks(w_valid) - blink_durations = [(b[1] - b[0]) / fs for b in blinks] - - # ---------------------------- - # PUPIL - # ---------------------------- - if np.all(np.isnan(w_pupil)): - mean_pupil = np.nan - ipa = np.nan - else: - mean_pupil = np.nanmean(w_pupil) - ipa = compute_IPA(w_pupil, fs=fs) - - # ---------------------------- - # FEATURE-DICTIONARY - # ---------------------------- - features.append({ - # Metadaten - 'subjectID': meta_row['subjectID'], - 'start_time': meta_row['rowID'], - 'STUDY': meta_row.get('STUDY', np.nan), - 'LEVEL': meta_row.get('LEVEL', np.nan), - 'PHASE': meta_row.get('PHASE', np.nan), - - # Fixation Features - "Fix_count_short_66_150": F_short, - "Fix_count_medium_300_500": F_medium, - "Fix_count_long_gt_1000": F_long, - "Fix_count_100": F_hundred, - # "Fix_cancel": F_Cancel, - "Fix_mean_duration": np.mean(fixation_durations) if fixation_durations else 0, - "Fix_median_duration": np.median(fixation_durations) if fixation_durations else 0, - - # Saccade Features - "Sac_count": len(sac), - "Sac_mean_amp": np.mean(sac_amplitudes) if sac_amplitudes else 0, - "Sac_mean_dur": np.mean(sac_durations) if sac_durations else 0, - "Sac_median_dur": np.median(sac_durations) if sac_durations else 0, - - # Blink Features - "Blink_count": len(blinks), - "Blink_mean_dur": np.mean(blink_durations) if blink_durations else 0, - "Blink_median_dur": np.median(blink_durations) if blink_durations else 0, - - # Pupil Features - "Pupil_mean": mean_pupil, - "Pupil_IPA": ipa - }) - - return pd.DataFrame(features) - - -############################################################################## -# 3. 
BATCH-VERARBEITUNG -############################################################################## - -def process_parquet_directory(input_dir, output_file, window_size, step_size, fs=250): - """ - Verarbeitet alle Parquet-Dateien in einem Verzeichnis. - - Parameters: - ----------- - input_dir : str - Pfad zum Verzeichnis mit Parquet-Dateien - output_file : str - Pfad für die Ausgabe-Parquet-Datei - window_size : int - Window-Größe in Samples - step_size : int - Schrittweite in Samples - fs : int - Sampling Rate in Hz - """ - input_path = Path(input_dir) - parquet_files = sorted(input_path.glob("*.parquet")) - - if not parquet_files: - print(f"FEHLER: Keine Parquet-Dateien in {input_dir} gefunden!") - return - - print(f"\n{'='*70}") - print(f"STARTE BATCH-VERARBEITUNG") - print(f"{'='*70}") - print(f"Gefundene Dateien: {len(parquet_files)}") - print(f"Window Size: {window_size} Samples ({window_size/fs:.1f}s bei {fs}Hz)") - print(f"Step Size: {step_size} Samples ({step_size/fs:.1f}s bei {fs}Hz)") - print(f"{'='*70}\n") - - all_features = [] - - for file_idx, parquet_file in enumerate(parquet_files, 1): - print(f"\n[{file_idx}/{len(parquet_files)}] Verarbeite: {parquet_file.name}") - - try: - # Lade Parquet-Datei - df = pd.read_parquet(parquet_file) - print(f" Einträge geladen: {len(df)}") - - # Prüfe ob benötigte Spalten vorhanden sind - required_cols = ['subjectID', 'rowID'] - missing_cols = [col for col in required_cols if col not in df.columns] - if missing_cols: - print(f" WARNUNG: Fehlende Spalten: {missing_cols} - Überspringe Datei") - continue - - # Reinige Eye-Tracking-Daten - df_eye = clean_eye_df(df) - - if len(df_eye) == 0: - print(f" WARNUNG: Keine gültigen Eye-Tracking-Daten - Überspringe Datei") - continue - - # Metadaten extrahieren (aligned mit df_eye) - meta_cols = ['subjectID', 'rowID'] - if 'STUDY' in df.columns: - meta_cols.append('STUDY') - if 'LEVEL' in df.columns: - meta_cols.append('LEVEL') - if 'PHASE' in df.columns: - 
meta_cols.append('PHASE') - - df_meta = df[meta_cols].iloc[df_eye.index].reset_index(drop=True) - - # Gruppiere nach STUDY, LEVEL, PHASE (falls vorhanden) - group_cols = [col for col in ['STUDY', 'LEVEL', 'PHASE'] if col in df_meta.columns] - - if group_cols: - print(f" Gruppiere nach: {', '.join(group_cols)}") - for group_vals, group_df in df_meta.groupby(group_cols, sort=False): - group_eye = df_eye.iloc[group_df.index].reset_index(drop=True) - group_meta = group_df.reset_index(drop=True) - - print(f" Gruppe {group_vals}: {len(group_eye)} Samples", end=" → ") - - features_df = extract_eye_features_sliding( - group_eye, group_meta, window_size, step_size, fs - ) - - if not features_df.empty: - all_features.append(features_df) - print(f"{len(features_df)} Windows") - else: - print("Zu wenige Daten") - else: - # Keine Gruppierung - print(f" Keine Gruppierungsspalten gefunden") - features_df = extract_eye_features_sliding( - df_eye, df_meta, window_size, step_size, fs - ) - - if not features_df.empty: - all_features.append(features_df) - print(f" → {len(features_df)} Windows erstellt") - else: - print(f" → Zu wenige Daten") - - except Exception as e: - print(f" FEHLER bei Verarbeitung: {str(e)}") - import traceback - traceback.print_exc() - continue - - # Kombiniere alle Features - if not all_features: - print("\nKEINE FEATURES EXTRAHIERT!") - return None - - print(f"\n{'='*70}") - print(f"ZUSAMMENFASSUNG") - print(f"{'='*70}") - - final_df = pd.concat(all_features, ignore_index=True) - - print(f"Gesamt Windows: {len(final_df)}") - print(f"Spalten: {len(final_df.columns)}") - print(f"Subjects: {final_df['subjectID'].nunique()}") - - # Speichere Ergebnis - output_path = Path(output_file) - output_path.parent.mkdir(parents=True, exist_ok=True) - final_df.to_parquet(output_file, index=False) - - print(f"\n✓ Ergebnis gespeichert: {output_file}") - print(f"{'='*70}\n") - - return final_df - - -############################################################################## 
-# 4. MAIN -############################################################################## - -def main(): - print("\n" + "="*70) - print("EYE-TRACKING FEATURE EXTRAKTION - BATCH MODE") - print("="*70) - - result = process_parquet_directory( - input_dir=INPUT_DIR, - output_file=OUTPUT_FILE, - window_size=WINDOW_SIZE_SAMPLES, - step_size=STEP_SIZE_SAMPLES, - fs=SAMPLING_RATE - ) - - if result is not None: - print("\nErste 5 Zeilen des Ergebnisses:") - print(result.head()) - - print("\nSpalten-Übersicht:") - print(result.columns.tolist()) - - print("\nDatentypen:") - print(result.dtypes) - - print("\n✓ FERTIG!\n") - - -if __name__ == "__main__": - main() diff --git a/dataset_creation/eyetrackingFeatures.py b/dataset_creation/eyetrackingFeatures.py deleted file mode 100644 index 03d15c9..0000000 --- a/dataset_creation/eyetrackingFeatures.py +++ /dev/null @@ -1,323 +0,0 @@ -import numpy as np -import pandas as pd -import h5py -import yaml -import owncloud -import os -from sklearn.preprocessing import MinMaxScaler -from scipy.signal import welch -from pygazeanalyser.detectors import fixation_detection, saccade_detection - - -############################################################################## -# 1. HELFERFUNKTIONEN -############################################################################## -def clean_eye_df(df): - """ - Entfernt alle Zeilen, die keine echten Eyetracking-Daten enthalten. - Löst das Problem, dass das Haupt-DataFrame NaN-Zeilen für andere Sensoren enthält. 
- """ - eye_cols = [c for c in df.columns if "EYE_" in c] - df_eye = df[eye_cols] - - # INF → NaN - df_eye = df_eye.replace([np.inf, -np.inf], np.nan) - - # Nur Zeilen behalten, wo es echte Eyetracking-Daten gibt - df_eye = df_eye.dropna(subset=eye_cols, how="all") - - print("Eyetracking-Zeilen vorher:", len(df)) - print("Eyetracking-Zeilen nachher:", len(df_eye)) - - #Index zurücksetzen - return df_eye.reset_index(drop=True) - - -def extract_gaze_signal(df): - """ - Extrahiert 2D-Gaze-Positionen auf dem Display, - maskiert ungültige Samples und interpoliert Lücken. - """ - - print("→ extract_gaze_signal(): Eingabegröße:", df.shape) - - # Gaze-Spalten - gx_L = df["EYE_LEFT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy() - gy_L = df["EYE_LEFT_GAZE_POINT_ON_DISPLAY_AREA_Y"].astype(float).copy() - gx_R = df["EYE_RIGHT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy() - gy_R = df["EYE_RIGHT_GAZE_POINT_ON_DISPLAY_AREA_Y"].astype(float).copy() - - - # Validity-Spalten (1 = gültig) - val_L = (df["EYE_LEFT_GAZE_POINT_VALIDITY"] == 1) - val_R = (df["EYE_RIGHT_GAZE_POINT_VALIDITY"] == 1) - - # Inf ersetzen mit NaN (kommt bei Tobii bei Blinks vor) - gx_L.replace([np.inf, -np.inf], np.nan, inplace=True) - gy_L.replace([np.inf, -np.inf], np.nan, inplace=True) - gx_R.replace([np.inf, -np.inf], np.nan, inplace=True) - gy_R.replace([np.inf, -np.inf], np.nan, inplace=True) - - # Ungültige Werte maskieren - gx_L[~val_L] = np.nan - gy_L[~val_L] = np.nan - gx_R[~val_R] = np.nan - gy_R[~val_R] = np.nan - - # Mittelwert der beiden Augen pro Sample (nanmean ist robust) - gx = np.mean(np.column_stack([gx_L, gx_R]), axis=1) - gy = np.mean(np.column_stack([gy_L, gy_R]), axis=1) - - # Interpolation (wichtig für PyGaze!) 
- gx = pd.Series(gx).interpolate(limit=50, limit_direction="both").bfill().ffill() - gy = pd.Series(gy).interpolate(limit=50, limit_direction="both").bfill().ffill() - - xscaler = MinMaxScaler() - gxscale = xscaler.fit_transform(gx.values.reshape(-1, 1)) - - yscaler = MinMaxScaler() - gyscale = yscaler.fit_transform(gx.values.reshape(-1, 1)) - - print("xmax ymax", gxscale.max(), gyscale.max()) - - out = np.column_stack((gxscale, gyscale)) - - print("→ extract_gaze_signal(): Ausgabegröße:", out.shape) - - return out - - -def extract_pupil(df): - """Extrahiert Pupillengröße (beide Augen gemittelt).""" - - pl = df["EYE_LEFT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan) - pr = df["EYE_RIGHT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan) - - vl = df.get("EYE_LEFT_PUPIL_VALIDITY") - vr = df.get("EYE_RIGHT_PUPIL_VALIDITY") - - if vl is None or vr is None: - # Falls Validity-Spalten nicht vorhanden sind, versuchen wir grobe Heuristik: - # gültig, wenn Pupillendurchmesser nicht NaN. - validity = (~pl.isna() | ~pr.isna()).astype(int).to_numpy() - else: - # Falls vorhanden: 1 wenn mindestens eines der Augen gültig ist - validity = ( (vl == 1) | (vr == 1) ).astype(int).to_numpy() - - # Mittelwert der verfügbaren Pupillen - p = np.mean(np.column_stack([pl, pr]), axis=1) - - # INF/NaN reparieren - p = pd.Series(p).interpolate(limit=50, limit_direction="both").bfill().ffill() - p = p.to_numpy() - - print("→ extract_pupil(): Pupillensignal Länge:", len(p)) - return p, validity - - -def detect_blinks(pupil_validity, min_duration=5): - """Erkennt Blinks: Validity=0 → Blink.""" - blinks = [] - start = None - - for i, v in enumerate(pupil_validity): - if v == 0 and start is None: - start = i - elif v == 1 and start is not None: - if i - start >= min_duration: - blinks.append([start, i]) - start = None - - return blinks - - -def compute_IPA(pupil, fs=250): - """ - IPA = Index of Pupillary Activity (nach Duchowski 2018). - Hochfrequenzanteile der Pupillenzeitreihe. 
- """ - f, Pxx = welch(pupil, fs=fs, nperseg=int(fs*2)) # 2 Sekunden Fenster - - hf_band = (f >= 0.6) & (f <= 2.0) - ipa = np.sum(Pxx[hf_band]) - - return ipa - - -############################################################################## -# 2. FEATURE-EXTRAKTION (HAUPTFUNKTION) -############################################################################## - -def extract_eye_features(df, window_length_sec=50, fs=250): - """ - df = Tobii DataFrame - window_length_sec = Fenstergröße (z.B. W=1s) - """ - - print("→ extract_eye_features(): Starte Feature-Berechnung...") - print(" Fensterlänge W =", window_length_sec, "s") - - W = int(window_length_sec * fs) # Window größe in Samples - - # Gaze - gaze = extract_gaze_signal(df) - gx, gy = gaze[:, 0], gaze[:, 1] - print("Gültige Werte (gx):", np.sum(~np.isnan(gx)), "von", len(gx)) - print("Range:", np.nanmin(gx), np.nanmax(gx)) - print("Gültige Werte (gy):", np.sum(~np.isnan(gy)), "von", len(gy)) - print("Range:", np.nanmin(gy), np.nanmax(gy)) - - # Pupille - pupil, pupil_validity = extract_pupil(df) - - features = [] - - # Sliding windows - for start in range(0, len(df), W): - end = start + W - if end > len(df): - break #das letzte Fenster wird ignoriert - - - w_gaze = gaze[start:end] - w_pupil = pupil[start:end] - w_valid = pupil_validity[start:end] - - # ---------------------------- - # FIXATIONS (PyGaze) - # ---------------------------- - time_ms = np.arange(W) * 1000.0 / fs - - # print("gx im Fenster:", w_gaze[:,0][:20]) - # print("gy im Fenster:", w_gaze[:,1][:20]) - # print("gx diff:", np.mean(np.abs(np.diff(w_gaze[:,0])))) - - # print("Werte X im Fenster:", w_gaze[:,0]) - # print("Werte Y im Fenster:", w_gaze[:,1]) - # print("X-Stats: min/max/diff", np.nanmin(w_gaze[:,0]), np.nanmax(w_gaze[:,0]), np.nanmean(np.abs(np.diff(w_gaze[:,0])))) - # print("Y-Stats: min/max/diff", np.nanmin(w_gaze[:,1]), np.nanmax(w_gaze[:,1]), np.nanmean(np.abs(np.diff(w_gaze[:,1])))) - print("time_ms:", time_ms) - - fix, efix = 
fixation_detection( - x=w_gaze[:, 0], y=w_gaze[:, 1], time=time_ms, - missing=0.0, maxdist=0.001, mindur=65 # mindur=100ms - ) - - #print("Raw Fixation Output:", efix[0]) - - if start == 0: - print("DEBUG fix raw:", fix[:10]) - - # Robust fixations: PyGaze may return malformed entries - fixation_durations = [] - for f in efix: - print("Efix:", f[2]) - # start_t = f[1] # in ms - # end_t = f[2] # in ms - # duration = (end_t - start_t) / 1000.0 # in Sekunden - - #duration = f[2] / 1000.0 - if np.isfinite(f[2]) and f[2] > 0: - fixation_durations.append(f[2]) - - # Kategorien laut Paper - F_short = sum(66 <= d <= 150 for d in fixation_durations) - F_medium = sum(300 <= d <= 500 for d in fixation_durations) - F_long = sum(d >= 1000 for d in fixation_durations) - F_hundred = sum(d > 100 for d in fixation_durations) - F_Cancel = sum(66 < d for d in fixation_durations) - - # ---------------------------- - # SACCADES - # ---------------------------- - sac, esac = saccade_detection( - x=w_gaze[:, 0], y=w_gaze[:, 1], time=time_ms, missing=0, minlen=12, maxvel=0.2, maxacc=1 - ) - - sac_durations = [s[2] for s in esac] - sac_amplitudes = [((s[5]-s[3])**2 + (s[6]-s[4])**2)**0.5 for s in esac] - - # ---------------------------- - # BLINKS - # ---------------------------- - blinks = detect_blinks(w_valid) - blink_durations = [(b[1] - b[0]) / fs for b in blinks] - - # ---------------------------- - # PUPIL - # ---------------------------- - if np.all(np.isnan(w_pupil)): - mean_pupil = np.nan - ipa = np.nan - else: - mean_pupil = np.nanmean(w_pupil) - ipa = compute_IPA(w_pupil, fs=fs) - - # ---------------------------- - # FEATURE-TABELLE FÜLLEN - # ---------------------------- - features.append({ - "Fix_count_short_66_150": F_short, - "Fix_count_medium_300_500": F_medium, - "Fix_count_long_gt_1000": F_long, - "Fix_count_100": F_hundred, - "Fix_cancel": F_Cancel, - "Fix_mean_duration": np.mean(fixation_durations) if fixation_durations else 0, - "Fix_median_duration": 
np.median(fixation_durations) if fixation_durations else 0, - - "Sac_count": len(sac), - "Sac_mean_amp": np.mean(sac_amplitudes) if sac_amplitudes else 0, - "Sac_mean_dur": np.mean(sac_durations) if sac_durations else 0, - "Sac_median_dur": np.median(sac_durations) if sac_durations else 0, - - "Blink_count": len(blinks), - "Blink_mean_dur": np.mean(blink_durations) if blink_durations else 0, - "Blink_median_dur": np.median(blink_durations) if blink_durations else 0, - - "Pupil_mean": mean_pupil, - "Pupil_IPA": ipa - }) - - - result = pd.DataFrame(features) - print("→ extract_eye_features(): Fertig! Ergebnisgröße:", result.shape) - - return result - -############################################################################## -# 3. MAIN FUNKTION -############################################################################## - -def main(): - print("### STARTE FEATURE-EXTRAKTION ###") - print("Aktueller Arbeitsordner:", os.getcwd()) - - df = pd.read_hdf("tmp22.h5", "SIGNALS", mode="r") - #df = pd.read_parquet("cleaned_0001.parquet") - print("DataFrame geladen:", df.shape) - - # Nur Eye-Tracking auswählen - #eye_cols = [c for c in df.columns if "EYE_" in c] - #df_eye = df[eye_cols] - - #print("Eye-Tracking-Spalten:", len(eye_cols)) - #print("→", eye_cols[:10], " ...") - - print("Reinige Eyetracking-Daten ...") - df_eye = clean_eye_df(df) - - # Feature Extraction - features = extract_eye_features(df_eye, window_length_sec=50, fs=250) - - print("\n### FEATURE-MATRIX (HEAD) ###") - print(features.head()) - - print("\nSpeichere Output in features.csv ...") - features.to_csv("features2.csv", index=False) - - print("FERTIG!") - - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/dataset_creation/maxDist.py b/dataset_creation/maxDist.py index 8242101..2c506da 100644 --- a/dataset_creation/maxDist.py +++ b/dataset_creation/maxDist.py @@ -1,72 +1,79 @@ import math -def fixation_radius_normalized(theta_deg: float, - distance_cm: float, - 
screen_width_cm: float, - screen_height_cm: float, - resolution_x: int, - resolution_y: int, - method: str = "max"): + +def fixation_radius_normalized( + theta_deg: float, + distance_cm: float, + screen_width_cm: float, + screen_height_cm: float, + resolution_x: int, + resolution_y: int, + method: str = "max", +): """ - Berechnet den PyGaze-Fixationsradius für normierte Gaze-Daten in [0,1]. + Compute the PyGaze fixation radius for normalized gaze data in [0, 1]. """ - # Schritt 1: visueller Winkel → physische Distanz (cm) + # Visual angle to physical distance (cm) delta_cm = 2 * distance_cm * math.tan(math.radians(theta_deg) / 2) - # Schritt 2: physische Distanz → Pixel + # Physical distance to pixels delta_px_x = delta_cm * (resolution_x / screen_width_cm) delta_px_y = delta_cm * (resolution_y / screen_height_cm) - # Pixelradius + # Pixel radius if method == "max": r_px = max(delta_px_x, delta_px_y) else: r_px = math.sqrt(delta_px_x**2 + delta_px_y**2) - # Schritt 3: Pixelradius → normierter Radius + # Pixel radius to normalized radius r_norm_x = r_px / resolution_x r_norm_y = r_px / resolution_y if method == "max": return max(r_norm_x, r_norm_y) - else: - return math.sqrt(r_norm_x**2 + r_norm_y**2) + return math.sqrt(r_norm_x**2 + r_norm_y**2) +def run_example(): + # Example: 55" 4k monitor + screen_width_cm = 3 * 121.8 + screen_height_cm = 68.5 + resolution_x = 3 * 3840 + resolution_y = 2160 + distance_to_screen_cm = 120 + max_angle = 1.0 + + maxdist_px = fixation_radius_normalized( + theta_deg=max_angle, + distance_cm=distance_to_screen_cm, + screen_width_cm=screen_width_cm, + screen_height_cm=screen_height_cm, + resolution_x=resolution_x, + resolution_y=resolution_y, + method="max", + ) + print("PyGaze max_dist (max):", maxdist_px) + + maxdist_px = fixation_radius_normalized( + theta_deg=max_angle, + distance_cm=distance_to_screen_cm, + screen_width_cm=screen_width_cm, + screen_height_cm=screen_height_cm, + resolution_x=resolution_x, + 
resolution_y=resolution_y, + method="euclid", + ) + print("PyGaze max_dist (euclid):", maxdist_px) +def main(): + run_example() -# Beispiel: 55" 4k Monitor -screen_width_cm = 3*121.8 -screen_height_cm = 68.5 -resolution_x = 3*3840 -resolution_y = 2160 -distance_to_screen_cm = 120 -method = 'max' -max_angle= 1.0 +if __name__ == "__main__": + main() -maxdist_px = fixation_radius_normalized(theta_deg=max_angle, - distance_cm=distance_to_screen_cm, - screen_width_cm=screen_width_cm, - screen_height_cm=screen_height_cm, - resolution_x=resolution_x, - resolution_y=resolution_y, - method=method) - -print("PyGaze max_dist (max):", maxdist_px) - -method = 'euclid' -maxdist_px = fixation_radius_normalized(theta_deg=max_angle, - distance_cm=distance_to_screen_cm, - screen_width_cm=screen_width_cm, - screen_height_cm=screen_height_cm, - resolution_x=resolution_x, - resolution_y=resolution_y, - method=method) - -print("PyGaze max_dist (euclid):", maxdist_px) - -# Passt noch nicht zu der Breite -# https://osdoc.cogsci.nl/4.0/de/visualangle/ -# https://reference.org/facts/Visual_angle/LUw29zy7 \ No newline at end of file +# Reference +# https://osdoc.cogsci.nl/4.0/de/visualangle/ +# https://reference.org/facts/Visual_angle/LUw29zy7 diff --git a/dataset_creation/open_parquet_test.ipynb b/dataset_creation/open_parquet_test.ipynb deleted file mode 100644 index b72c1f7..0000000 --- a/dataset_creation/open_parquet_test.ipynb +++ /dev/null @@ -1,155 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "2b3fface", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "74f1f5ec", - "metadata": {}, - "outputs": [], - "source": [ - "df= pd.read_parquet(r\" \")\n", - "print(df.shape)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "05775454", - "metadata": {}, - "outputs": [], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "id": "99e17328", - "metadata": {}, - "outputs": [], - "source": [ - "df.tail()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "69e53731", - "metadata": {}, - "outputs": [], - "source": [ - "df.info()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3754c664", - "metadata": {}, - "outputs": [], - "source": [ - "# Zeigt alle Kombinationen mit Häufigkeit\n", - "df[['STUDY', 'PHASE', 'LEVEL']].value_counts(ascending=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f83b595c", - "metadata": {}, - "outputs": [], - "source": [ - "high_nback = df[\n", - " (df[\"STUDY\"]==\"n-back\") &\n", - " (df[\"LEVEL\"].isin([2, 3, 5, 6])) &\n", - " (df[\"PHASE\"].isin([\"train\", \"test\"]))\n", - "]\n", - "high_nback.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c0940343", - "metadata": {}, - "outputs": [], - "source": [ - "low_all = df[\n", - " ((df[\"PHASE\"] == \"baseline\") |\n", - " ((df[\"STUDY\"] == \"n-back\") & (df[\"PHASE\"] != \"baseline\") & (df[\"LEVEL\"].isin([1,4]))))\n", - "]\n", - "print(low_all.shape)\n", - "high_kdrive = df[\n", - " (df[\"STUDY\"] == \"k-drive\") & (df[\"PHASE\"] != \"baseline\")\n", - "]\n", - "print(high_kdrive.shape)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f7ce38d3", - "metadata": {}, - "outputs": [], - "source": [ - "print((df.shape[0]==(high_kdrive.shape[0]+high_nback.shape[0]+low_all.shape[0])))\n", - "print(df.shape[0])\n", - "print((high_kdrive.shape[0]+high_nback.shape[0]+low_all.shape[0]))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "48ba0379", - "metadata": {}, - "outputs": [], - "source": [ - "high_all = pd.concat([high_nback, high_kdrive])\n", - "high_all.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "77dda26c", - "metadata": {}, - "outputs": [], - "source": [ - "print(f\"Gesamt: 
{df.shape[0]}=={low_all.shape[0]+high_all.shape[0]}\")\n", - "print(f\"Anzahl an low load Samples: {low_all.shape[0]}\")\n", - "print(f\"Anzahl an high load Samples: {high_all.shape[0]}\")\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "base", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.5" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/dataset_creation/CPFC_both.py b/dataset_creation/parquet_file_creation.py similarity index 92% rename from dataset_creation/CPFC_both.py rename to dataset_creation/parquet_file_creation.py index 06e5a6c..ebf064b 100644 --- a/dataset_creation/CPFC_both.py +++ b/dataset_creation/parquet_file_creation.py @@ -1,8 +1,10 @@ import os import pandas as pd from pathlib import Path +# TODO: Set paths correctly +data_dir = Path("") # path to the directory with all .h5 files +base_dir = Path(r"") # directory to store the parquet files in -data_dir = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/raw_data") # Get all .h5 files and sort them matching_files = sorted(data_dir.glob("*.h5")) @@ -11,8 +13,8 @@ matching_files = sorted(data_dir.glob("*.h5")) CHUNK_SIZE = 50_000 for i, file_path in enumerate(matching_files): - print(f"Subject {i} gestartet") - print(f"{file_path} geoeffnet") + print(f"Starting with subject {i}") + print(f"Opened: {file_path}") # Step 1: Get total number of rows and column names with pd.HDFStore(file_path, mode="r") as store: @@ -81,7 +83,7 @@ for i, file_path in enumerate(matching_files): print(f"Final dataframe shape: {df_final.shape}") # Save to parquet - base_dir = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/both_mod_parquet_files") + os.makedirs(base_dir, exist_ok=True) out_name = base_dir / f"both_mod_{i:04d}.parquet"