From de0084dc09acbed3c3a2ecfe8e284bcb3026d844 Mon Sep 17 00:00:00 2001 From: Michael Date: Wed, 4 Mar 2026 15:09:23 +0100 Subject: [PATCH] getting rid of redundant files in dataset creation --- ...wise_parquet_file_creation_EYE_TRACKING.py | 91 ---- ...chunkwise_parquet_file_creation_FACE_AU.py | 91 ---- dataset_creation/combined_feature_creation.py | 88 ++-- dataset_creation/create_feature_table.py | 113 ----- .../create_multimodal_dataset_by_merge.py | 56 --- ... => create_parquet_files_from_owncloud.py} | 16 +- dataset_creation/eyeAlt.py | 323 ------------- dataset_creation/eye_batch_processor.py | 441 ------------------ dataset_creation/eyetrackingFeatures.py | 323 ------------- dataset_creation/maxDist.py | 99 ++-- dataset_creation/open_parquet_test.ipynb | 155 ------ ...{CPFC_both.py => parquet_file_creation.py} | 10 +- 12 files changed, 106 insertions(+), 1700 deletions(-) delete mode 100644 dataset_creation/chunkwise_parquet_file_creation_EYE_TRACKING.py delete mode 100644 dataset_creation/chunkwise_parquet_file_creation_FACE_AU.py delete mode 100644 dataset_creation/create_feature_table.py delete mode 100644 dataset_creation/create_multimodal_dataset_by_merge.py rename dataset_creation/{create_parquet_files.py => create_parquet_files_from_owncloud.py} (82%) delete mode 100644 dataset_creation/eyeAlt.py delete mode 100644 dataset_creation/eye_batch_processor.py delete mode 100644 dataset_creation/eyetrackingFeatures.py delete mode 100644 dataset_creation/open_parquet_test.ipynb rename dataset_creation/{CPFC_both.py => parquet_file_creation.py} (92%) diff --git a/dataset_creation/chunkwise_parquet_file_creation_EYE_TRACKING.py b/dataset_creation/chunkwise_parquet_file_creation_EYE_TRACKING.py deleted file mode 100644 index 64b1ae6..0000000 --- a/dataset_creation/chunkwise_parquet_file_creation_EYE_TRACKING.py +++ /dev/null @@ -1,91 +0,0 @@ -import os -import pandas as pd -from pathlib import Path - -print(os.getcwd()) -num_files = 2 # number of files to 
process (min: 1, max: 30) - -print("connection aufgebaut") - -data_dir = Path("/home/jovyan/Fahrsimulator_MSY2526_AI/EDA") -# os.chdir(data_dir) -# Get all .h5 files and sort them -matching_files = sorted(data_dir.glob("*.h5")) - -# Chunk size for reading (adjust based on your RAM - 100k rows is ~50-100MB depending on columns) -CHUNK_SIZE = 100_000 - -for i, file_path in enumerate(matching_files): - print(f"Subject {i} gestartet") - print(f"{file_path} geoeffnet") - - # Step 1: Get total number of rows and column names - with pd.HDFStore(file_path, mode="r") as store: - cols = store.select("SIGNALS", start=0, stop=1).columns - nrows = store.get_storer("SIGNALS").nrows - print(f"Total columns: {len(cols)}, Total rows: {nrows}") - - # Step 2: Filter columns that start with "FACE_AU" - eye_cols = [c for c in cols if c.startswith("EYE_")] - print(f"eye-tracking columns found: {eye_cols}") - - if len(eye_cols) == 0: - print(f"keine eye-tracking-Signale in Subject {i}") - continue - - # Columns to read - columns_to_read = ["STUDY", "LEVEL", "PHASE"] + eye_cols - - # Step 3: Process file in chunks - chunks_to_save = [] - - for start_row in range(0, nrows, CHUNK_SIZE): - stop_row = min(start_row + CHUNK_SIZE, nrows) - print(f"Processing rows {start_row} to {stop_row} ({stop_row/nrows*100:.1f}%)") - - # Read chunk - df_chunk = pd.read_hdf( - file_path, - key="SIGNALS", - columns=columns_to_read, - start=start_row, - stop=stop_row - ) - - # Add metadata columns - df_chunk["subjectID"] = i - df_chunk["rowID"] = range(start_row, stop_row) - - # Clean data - df_chunk = df_chunk[df_chunk["LEVEL"] != 0] - df_chunk = df_chunk.dropna() - - # Only keep non-empty chunks - if len(df_chunk) > 0: - chunks_to_save.append(df_chunk) - - # Free memory - del df_chunk - - print("load and cleaning done") - - # Step 4: Combine all chunks and save - if chunks_to_save: - df_final = pd.concat(chunks_to_save, ignore_index=True) - print(f"Final dataframe shape: {df_final.shape}") - - # Save to 
parquet - base_dir = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/new_ET_Parquet_files") - os.makedirs(base_dir, exist_ok=True) - - out_name = base_dir / f"ET_signals_extracted_{i:04d}.parquet" - df_final.to_parquet(out_name, index=False) - print(f"Saved to {out_name}") - - # Free memory - del df_final - del chunks_to_save - else: - print(f"No valid data found for Subject {i}") - -print("All files processed!") \ No newline at end of file diff --git a/dataset_creation/chunkwise_parquet_file_creation_FACE_AU.py b/dataset_creation/chunkwise_parquet_file_creation_FACE_AU.py deleted file mode 100644 index 667de93..0000000 --- a/dataset_creation/chunkwise_parquet_file_creation_FACE_AU.py +++ /dev/null @@ -1,91 +0,0 @@ -import os -import pandas as pd -from pathlib import Path - -print(os.getcwd()) -num_files = 2 # number of files to process (min: 1, max: 30) - -print("connection aufgebaut") - -data_dir = Path(r"C:\Users\x\repo\UXKI\Fahrsimulator_MSY2526_AI\newTmp") - -# Get all .h5 files and sort them -matching_files = sorted(data_dir.glob("*.h5")) - -# Chunk size for reading (adjust based on your RAM - 100k rows is ~50-100MB depending on columns) -CHUNK_SIZE = 100_000 - -for i, file_path in enumerate(matching_files): - print(f"Subject {i} gestartet") - print(f"{file_path} geoeffnet") - - # Step 1: Get total number of rows and column names - with pd.HDFStore(file_path, mode="r") as store: - cols = store.select("SIGNALS", start=0, stop=1).columns - nrows = store.get_storer("SIGNALS").nrows - print(f"Total columns: {len(cols)}, Total rows: {nrows}") - - # Step 2: Filter columns that start with "FACE_AU" - eye_cols = [c for c in cols if c.startswith("FACE_AU")] - print(f"FACE_AU columns found: {eye_cols}") - - if len(eye_cols) == 0: - print(f"keine FACE_AU-Signale in Subject {i}") - continue - - # Columns to read - columns_to_read = ["STUDY", "LEVEL", "PHASE"] + eye_cols - - # Step 3: Process file in chunks - chunks_to_save = [] - - for start_row in range(0, nrows, 
CHUNK_SIZE): - stop_row = min(start_row + CHUNK_SIZE, nrows) - print(f"Processing rows {start_row} to {stop_row} ({stop_row/nrows*100:.1f}%)") - - # Read chunk - df_chunk = pd.read_hdf( - file_path, - key="SIGNALS", - columns=columns_to_read, - start=start_row, - stop=stop_row - ) - - # Add metadata columns - df_chunk["subjectID"] = i - df_chunk["rowID"] = range(start_row, stop_row) - - # Clean data - df_chunk = df_chunk[df_chunk["LEVEL"] != 0] - df_chunk = df_chunk.dropna() - - # Only keep non-empty chunks - if len(df_chunk) > 0: - chunks_to_save.append(df_chunk) - - # Free memory - del df_chunk - - print("load and cleaning done") - - # Step 4: Combine all chunks and save - if chunks_to_save: - df_final = pd.concat(chunks_to_save, ignore_index=True) - print(f"Final dataframe shape: {df_final.shape}") - - # Save to parquet - base_dir = Path(r"C:\new_AU_parquet_files") - os.makedirs(base_dir, exist_ok=True) - - out_name = base_dir / f"cleaned_{i:04d}.parquet" - df_final.to_parquet(out_name, index=False) - print(f"Saved to {out_name}") - - # Free memory - del df_final - del chunks_to_save - else: - print(f"No valid data found for Subject {i}") - -print("All files processed!") \ No newline at end of file diff --git a/dataset_creation/combined_feature_creation.py b/dataset_creation/combined_feature_creation.py index c525a6f..6fbfccd 100644 --- a/dataset_creation/combined_feature_creation.py +++ b/dataset_creation/combined_feature_creation.py @@ -4,27 +4,26 @@ import pandas as pd from pathlib import Path from sklearn.preprocessing import MinMaxScaler from scipy.signal import welch -from pygazeanalyser.detectors import fixation_detection, saccade_detection +from pygazeanalyser.detectors import fixation_detection, saccade_detection # not installed by default ############################################################################## -# KONFIGURATION +# CONFIGURATION ############################################################################## -INPUT_DIR = 
Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/both_mod_parquet_files") -OUTPUT_FILE = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/50s_25Hz_dataset.parquet") - -WINDOW_SIZE_SAMPLES = 25*50 # 50s bei 25Hz -STEP_SIZE_SAMPLES = 125 # 5s bei 25Hz +INPUT_DIR = Path(r"") # directory that stores the parquet files (one file per subject) +OUTPUT_FILE = Path(r"") # path for resulting dataset +WINDOW_SIZE_SAMPLES = 25*50 # 50s at 25Hz +STEP_SIZE_SAMPLES = 125 # 5s at 25Hz SAMPLING_RATE = 25 # Hz MIN_DUR_BLINKS = 2 # x * 40ms ############################################################################## -# EYE-TRACKING FUNKTIONEN +# EYE-TRACKING FUNCTIONS ############################################################################## def clean_eye_df(df): - """Extrahiert nur Eye-Tracking Spalten und entfernt leere Zeilen.""" + """Extracts Eye-Tracking columns only and removes empty rows.""" eye_cols = [c for c in df.columns if c.startswith("EYE_")] if not eye_cols: @@ -38,7 +37,7 @@ def clean_eye_df(df): def extract_gaze_signal(df): - """Extrahiert 2D-Gaze-Positionen, maskiert ungültige Samples und interpoliert.""" + """Extracts 2D gaze positions, masks invalid samples, and interpolates.""" gx_L = df["EYE_LEFT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy() gy_L = df["EYE_LEFT_GAZE_POINT_ON_DISPLAY_AREA_Y"].astype(float).copy() gx_R = df["EYE_RIGHT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy() @@ -51,14 +50,14 @@ def extract_gaze_signal(df): for arr in [gx_L, gy_L, gx_R, gy_R]: arr.replace([np.inf, -np.inf], np.nan, inplace=True) - # Ungültige maskieren + # Mask invalids gx_L[~val_L] = np.nan gy_L[~val_L] = np.nan gx_R[~val_R] = np.nan gy_R[~val_R] = np.nan - # Mittelwert beider Augen + # Mean of both eyes gx = np.mean(np.column_stack([gx_L, gx_R]), axis=1) gy = np.mean(np.column_stack([gy_L, gy_R]), axis=1) @@ -66,7 +65,7 @@ def extract_gaze_signal(df): gx = pd.Series(gx).interpolate(limit=None, limit_direction="both").bfill().ffill() gy = 
pd.Series(gy).interpolate(limit=None, limit_direction="both").bfill().ffill() - # MinMax Skalierung + # MinMax scaling xscaler = MinMaxScaler() gxscale = xscaler.fit_transform(gx.values.reshape(-1, 1)) @@ -77,7 +76,7 @@ def extract_gaze_signal(df): def extract_pupil(df): - """Extrahiert Pupillengröße (beide Augen gemittelt).""" + """Extract pupil size (average of both eyes).""" pl = df["EYE_LEFT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan) pr = df["EYE_RIGHT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan) @@ -96,7 +95,7 @@ def extract_pupil(df): def detect_blinks(pupil_validity, min_duration=5): - """Erkennt Blinks: Validity=0 → Blink.""" + """Detect blinks: Validity=0 → Blink.""" blinks = [] start = None @@ -120,13 +119,13 @@ def compute_IPA(pupil, fs=25): def extract_eye_features_window(df_eye_window, fs=25, min_dur_blinks=2): """ - Extrahiert Eye-Tracking Features für ein einzelnes Window. - Gibt Dictionary mit allen Eye-Features zurück. + Extracts eye tracking features for a single window. + Returns a dictionary containing all eye features. 
""" # Gaze gaze = extract_gaze_signal(df_eye_window) - # Pupille + # Pupil pupil, pupil_validity = extract_pupil(df_eye_window) window_size = len(df_eye_window) @@ -143,7 +142,6 @@ def extract_eye_features_window(df_eye_window, fs=25, min_dur_blinks=2): fixation_durations = [f[2] for f in efix if np.isfinite(f[2]) and f[2] > 0] - # Kategorien F_short = sum(66 <= d <= 150 for d in fixation_durations) F_medium = sum(300 <= d <= 500 for d in fixation_durations) F_long = sum(d >= 1000 for d in fixation_durations) @@ -197,27 +195,27 @@ def extract_eye_features_window(df_eye_window, fs=25, min_dur_blinks=2): ############################################################################## -# KOMBINIERTE FEATURE-EXTRAKTION +# Combined feature extraction ############################################################################## def process_combined_features(input_dir, output_file, window_size, step_size, fs=25,min_duration_blinks=2): """ - Verarbeitet Parquet-Dateien mit FACE_AU und EYE Spalten. - Extrahiert beide Feature-Sets und kombiniert sie. + Processes Parquet files with FACE_AU and EYE columns. + Extracts both feature sets and combines them. 
""" input_path = Path(input_dir) parquet_files = sorted(input_path.glob("*.parquet")) if not parquet_files: - print(f"FEHLER: Keine Parquet-Dateien in {input_dir} gefunden!") + print(f"Error: No parquet-files found in {input_dir}!") return None print(f"\n{'='*70}") - print(f"KOMBINIERTE FEATURE-EXTRAKTION") + print(f"Combined feature-extraction") print(f"{'='*70}") - print(f"Dateien: {len(parquet_files)}") - print(f"Window: {window_size} Samples ({window_size/fs:.1f}s bei {fs}Hz)") - print(f"Step: {step_size} Samples ({step_size/fs:.1f}s bei {fs}Hz)") + print(f"Files: {len(parquet_files)}") + print(f"Window: {window_size} Samples ({window_size/fs:.1f}s at {fs}Hz)") + print(f"Step: {step_size} Samples ({step_size/fs:.1f}s at {fs}Hz)") print(f"{'='*70}\n") all_windows = [] @@ -227,24 +225,22 @@ def process_combined_features(input_dir, output_file, window_size, step_size, fs try: df = pd.read_parquet(parquet_file) - print(f" Einträge: {len(df)}") + print(f" Entries: {len(df)}") - - # Identifiziere Spalten au_columns = [col for col in df.columns if col.startswith('FACE_AU')] eye_columns = [col for col in df.columns if col.startswith('EYE_')] - print(f" AU-Spalten: {len(au_columns)}") - print(f" Eye-Spalten: {len(eye_columns)}") + print(f" AU-columns: {len(au_columns)}") + print(f" Eye-columns: {len(eye_columns)}") has_au = len(au_columns) > 0 has_eye = len(eye_columns) > 0 if not has_au and not has_eye: - print(f" WARNUNG: Keine AU oder Eye Spalten gefunden!") + print(f" Warning: No AU or eye tracking columns found!") continue - # Gruppiere nach STUDY, LEVEL, PHASE + # Group by STUDY, LEVEL, PHASE group_cols = [col for col in ['STUDY', 'LEVEL', 'PHASE'] if col in df.columns] if group_cols: @@ -258,7 +254,7 @@ def process_combined_features(input_dir, output_file, window_size, step_size, fs group_df = group_df.reset_index(drop=True) - # Berechne Anzahl Windows + # calculate number of windows num_windows = (len(group_df) - window_size) // step_size + 1 if num_windows <= 
0: @@ -272,7 +268,7 @@ def process_combined_features(input_dir, output_file, window_size, step_size, fs window_df = group_df.iloc[start_idx:end_idx] - # Basis-Metadaten + # basic metadata result = { 'subjectID': window_df['subjectID'].iloc[0], 'start_time': window_df['rowID'].iloc[0], @@ -281,12 +277,12 @@ def process_combined_features(input_dir, output_file, window_size, step_size, fs 'PHASE': window_df['PHASE'].iloc[0] if 'PHASE' in window_df.columns else np.nan } - # FACE AU Features + # FACE AU features if has_au: for au_col in au_columns: result[f'{au_col}_mean'] = window_df[au_col].mean() - # Eye-Tracking Features + # Eye-tracking features if has_eye: try: # clean dataframe from all nan rows @@ -296,7 +292,7 @@ def process_combined_features(input_dir, output_file, window_size, step_size, fs result.update(eye_features) except Exception as e: print(f" WARNUNG: Eye-Features fehlgeschlagen: {str(e)}") - # Füge NaN-Werte für Eye-Features hinzu + # Add NaN-values for eye-features result.update({ "Fix_count_short_66_150": np.nan, "Fix_count_medium_300_500": np.nan, @@ -325,7 +321,7 @@ def process_combined_features(input_dir, output_file, window_size, step_size, fs traceback.print_exc() continue - # Kombiniere alle Windows + # Combine all windows if not all_windows: print("\nKEINE FEATURES EXTRAHIERT!") return None @@ -340,7 +336,7 @@ def process_combined_features(input_dir, output_file, window_size, step_size, fs print(f"Spalten: {len(result_df.columns)}") print(f"Subjects: {result_df['subjectID'].nunique()}") - # Speichern + # Save output_path = Path(output_file) output_path.parent.mkdir(parents=True, exist_ok=True) result_df.to_parquet(output_file, index=False) @@ -357,7 +353,7 @@ def process_combined_features(input_dir, output_file, window_size, step_size, fs def main(): print("\n" + "="*70) - print("KOMBINIERTE FEATURE-EXTRAKTION (AU + EYE)") + print("Combined extraction (AU + EYE)") print("="*70) result = process_combined_features( @@ -370,16 +366,16 @@ def 
main(): ) if result is not None: - print("\nErste 5 Zeilen:") + print("\nFirst 5 rows:") print(result.head()) - print("\nSpalten-Übersicht:") + print("\nColumns overview:") print(result.dtypes) - print("\nStatistik:") + print("\nStatistics:") print(result.describe()) - print("\n✓ FERTIG!\n") + print("\nDone!\n") if __name__ == "__main__": diff --git a/dataset_creation/create_feature_table.py b/dataset_creation/create_feature_table.py deleted file mode 100644 index 54e7892..0000000 --- a/dataset_creation/create_feature_table.py +++ /dev/null @@ -1,113 +0,0 @@ -import pandas as pd -import numpy as np -from pathlib import Path -def process_parquet_files(input_dir, output_file, window_size=1250, step_size=125): - """ - Verarbeitet Parquet-Dateien mit Sliding Window Aggregation. - - Parameters: - ----------- - input_dir : str - Verzeichnis mit Parquet-Dateien - output_file : str - Pfad für die Ausgabe-Parquet-Datei - window_size : int - Größe des Sliding Windows (default: 3000) - step_size : int - Schrittweite in Einträgen (default: 250 = 10 Sekunden bei 25 Hz) - """ - - input_path = Path(input_dir) - parquet_files = sorted(input_path.glob("*.parquet")) - - if not parquet_files: - print(f"Keine Parquet-Dateien in {input_dir} gefunden!") - return - - print(f"Gefundene Dateien: {len(parquet_files)}") - - all_windows = [] - - for file_idx, parquet_file in enumerate(parquet_files): - print(f"\nVerarbeite Datei {file_idx + 1}/{len(parquet_files)}: {parquet_file.name}") - - # Lade Parquet-Datei - df = pd.read_parquet(parquet_file) - print(f" Einträge: {len(df)}") - - # Identifiziere AU-Spalten - au_columns = [col for col in df.columns if col.startswith('FACE_AU')] - print(f" AU-Spalten: {len(au_columns)}") - - # Gruppiere nach STUDY, LEVEL, PHASE (um Übergänge zu vermeiden) - for (study_val, level_val, phase_val), level_df in df.groupby(['STUDY', 'LEVEL', 'PHASE'], sort=False): - print(f" STUDY {study_val}, LEVEL {level_val}, PHASE {phase_val}: {len(level_df)} Einträge") - 
# Reset index für korrekte Position-Berechnung - level_df = level_df.reset_index(drop=True) - - # Sliding Window über dieses Level - num_windows = (len(level_df) - window_size) // step_size + 1 - - if num_windows <= 0: - print(f" Zu wenige Einträge für Window (benötigt {window_size})") - continue - - for i in range(num_windows): - start_idx = i * step_size - end_idx = start_idx + window_size - - window_df = level_df.iloc[start_idx:end_idx] - - # Erstelle aggregiertes Ergebnis - result = { - 'subjectID': window_df['subjectID'].iloc[0], - 'start_time': window_df['rowID'].iloc[0], # rowID als start_time - 'STUDY': window_df['STUDY'].iloc[0], - 'LEVEL': window_df['LEVEL'].iloc[0], - 'PHASE': window_df['PHASE'].iloc[0] - } - - # Summiere alle AU-Spalten - for au_col in au_columns: - # result[f'{au_col}_sum'] = window_df[au_col].sum() - result[f'{au_col}_mean'] = window_df[au_col].mean() - - all_windows.append(result) - - print(f" Windows erstellt: {num_windows}") - - # Erstelle finalen DataFrame - result_df = pd.DataFrame(all_windows) - - print(f"\n{'='*60}") - print(f"Gesamt Windows erstellt: {len(result_df)}") - print(f"Spalten: {list(result_df.columns)}") - - # Speichere Ergebnis - result_df.to_parquet(output_file, index=False) - print(f"\nErgebnis gespeichert in: {output_file}") - - return result_df - - -# Beispiel-Verwendung -if __name__ == "__main__": - # Anpassen an deine Pfade - input_directory = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/new_AU_parquet_files") - output_file = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/new_AU_dataset_mean/AU_dataset_mean.parquet") - - - - result = process_parquet_files( - input_dir=input_directory, - output_file=output_file, - window_size=1250, - step_size=125 - ) - - # Zeige erste Zeilen - if result is not None: - print("\nErste 5 Zeilen des Ergebnisses:") - print(result.head()) \ No newline at end of file diff --git a/dataset_creation/create_multimodal_dataset_by_merge.py 
b/dataset_creation/create_multimodal_dataset_by_merge.py deleted file mode 100644 index a81a242..0000000 --- a/dataset_creation/create_multimodal_dataset_by_merge.py +++ /dev/null @@ -1,56 +0,0 @@ -from pathlib import Path -import pandas as pd - - -def main(): - """ - USER CONFIGURATION - ------------------ - Specify input files and output directory here. - """ - - # Input parquet files (single-modality datasets) - file_modality_1 = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/AU_dataset_mean.parquet") - file_modality_2 = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/new_eye_dataset.parquet") - - # Output directory and file name - output_dir = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/") - output_file = output_dir / "merged_dataset.parquet" - - # Column names (adjust only if your schema differs) - subject_col = "subjectID" - time_col = "start_time" - - # ------------------------------------------------------------------ - # Load datasets - # ------------------------------------------------------------------ - df1 = pd.read_parquet(file_modality_1) - df2 = pd.read_parquet(file_modality_2) - - # ------------------------------------------------------------------ - # Keep only subjects that appear in BOTH datasets - # ------------------------------------------------------------------ - common_subjects = set(df1[subject_col]).intersection(df2[subject_col]) - - df1 = df1[df1[subject_col].isin(common_subjects)] - df2 = df2[df2[subject_col].isin(common_subjects)] - - # ------------------------------------------------------------------ - # Inner join on subject ID AND start_time - # ------------------------------------------------------------------ - merged_df = pd.merge( - df1, - df2, - on=[subject_col, time_col], - how="inner", - ) - - # ------------------------------------------------------------------ - # Save merged dataset - # ------------------------------------------------------------------ - 
output_dir.mkdir(parents=True, exist_ok=True) - merged_df.to_parquet(output_file, index=False) - - -if __name__ == "__main__": - main() diff --git a/dataset_creation/create_parquet_files.py b/dataset_creation/create_parquet_files_from_owncloud.py similarity index 82% rename from dataset_creation/create_parquet_files.py rename to dataset_creation/create_parquet_files_from_owncloud.py index 1a2fb7f..7c3880f 100644 --- a/dataset_creation/create_parquet_files.py +++ b/dataset_creation/create_parquet_files_from_owncloud.py @@ -1,6 +1,5 @@ -# pip install pyocclient import yaml -import owncloud +import owncloud # pip install pyocclient import pandas as pd import h5py import os @@ -26,7 +25,7 @@ for i in range(num_files): # Download file from ownCloud oc.get_file(file_name, local_tmp) - print(f"{file_name} geoeffnet") + print(f"Opened: {file_name}") # Load into memory and extract needed columns # with h5py.File(local_tmp, "r") as f: # # Adjust this path depending on actual dataset layout inside .h5py file @@ -35,14 +34,9 @@ for i in range(num_files): with pd.HDFStore(local_tmp, mode="r") as store: cols = store.select("SIGNALS", start=0, stop=1).columns # get column names - # Step 2: Filter columns that start with "AU" - au_cols = [c for c in cols if c.startswith("AU")] - print(au_cols) - if len(au_cols)==0: - print(f"keine AU Signale in Subject {i}") - continue + # Step 3: Read only those columns (plus any others you want) - df = pd.read_hdf(local_tmp, key="SIGNALS", columns=["STUDY", "LEVEL", "PHASE"] + au_cols) + df = pd.read_hdf(local_tmp, key="SIGNALS", columns=["STUDY", "LEVEL", "PHASE"] + cols) print("load done") @@ -63,7 +57,7 @@ for i in range(num_files): # Save to parquet - os.makedirs("ParquetFiles", exist_ok=True) + os.makedirs("ParquetFiles", exist_ok=True) # TODO: change for custom directory out_name = f"ParquetFiles/cleaned_{i:04d}.parquet" df.to_parquet(out_name, index=False) diff --git a/dataset_creation/eyeAlt.py b/dataset_creation/eyeAlt.py deleted file 
mode 100644 index fef68ad..0000000 --- a/dataset_creation/eyeAlt.py +++ /dev/null @@ -1,323 +0,0 @@ -import numpy as np -import pandas as pd -import h5py -import yaml -import os -from sklearn.preprocessing import MinMaxScaler -from scipy.signal import welch -from pygazeanalyser.detectors import fixation_detection, saccade_detection - - -############################################################################## -# 1. HELFERFUNKTIONEN -############################################################################## -def clean_eye_df(df): - """ - Entfernt alle Zeilen, die keine echten Eyetracking-Daten enthalten. - Löst das Problem, dass das Haupt-DataFrame NaN-Zeilen für andere Sensoren enthält. - """ - eye_cols = [c for c in df.columns if ("LEFT_" in c or "RIGHT_" in c)] - df_eye = df[eye_cols] - - # INF → NaN - df_eye = df_eye.replace([np.inf, -np.inf], np.nan) - - # Nur Zeilen behalten, wo es echte Eyetracking-Daten gibt - df_eye = df_eye.dropna(subset=eye_cols, how="all") - - print("Eyetracking-Zeilen vorher:", len(df)) - print("Eyetracking-Zeilen nachher:", len(df_eye)) - - #Index zurücksetzen - return df_eye.reset_index(drop=True) - - -def extract_gaze_signal(df): - """ - Extrahiert 2D-Gaze-Positionen auf dem Display, - maskiert ungültige Samples und interpoliert Lücken. 
- """ - - print("→ extract_gaze_signal(): Eingabegröße:", df.shape) - - # Gaze-Spalten - gx_L = df["LEFT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy() - gy_L = df["LEFT_GAZE_POINT_ON_DISPLAY_AREA_Y"].astype(float).copy() - gx_R = df["RIGHT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy() - gy_R = df["RIGHT_GAZE_POINT_ON_DISPLAY_AREA_Y"].astype(float).copy() - - - # Validity-Spalten (1 = gültig) - val_L = (df["LEFT_GAZE_POINT_VALIDITY"] == 1) - val_R = (df["RIGHT_GAZE_POINT_VALIDITY"] == 1) - - # Inf ersetzen mit NaN (kommt bei Tobii bei Blinks vor) - gx_L.replace([np.inf, -np.inf], np.nan, inplace=True) - gy_L.replace([np.inf, -np.inf], np.nan, inplace=True) - gx_R.replace([np.inf, -np.inf], np.nan, inplace=True) - gy_R.replace([np.inf, -np.inf], np.nan, inplace=True) - - # Ungültige Werte maskieren - gx_L[~val_L] = np.nan - gy_L[~val_L] = np.nan - gx_R[~val_R] = np.nan - gy_R[~val_R] = np.nan - - # Mittelwert der beiden Augen pro Sample (nanmean ist robust) - gx = np.mean(np.column_stack([gx_L, gx_R]), axis=1) - gy = np.mean(np.column_stack([gy_L, gy_R]), axis=1) - - # Interpolation (wichtig für PyGaze!) 
- gx = pd.Series(gx).interpolate(limit=50, limit_direction="both").bfill().ffill() - gy = pd.Series(gy).interpolate(limit=50, limit_direction="both").bfill().ffill() - - # xscaler = MinMaxScaler() - # gxscale = xscaler.fit_transform(gx.values.reshape(-1, 1)) - - # yscaler = MinMaxScaler() - # gyscale = yscaler.fit_transform(gx.values.reshape(-1, 1)) - - #print("xmax ymax", gxscale.max(), gyscale.max()) - - #out = np.column_stack((gxscale, gyscale)) - out = np.column_stack((gx, gy)) - - print("→ extract_gaze_signal(): Ausgabegröße:", out.shape) - - return out - - -def extract_pupil(df): - """Extrahiert Pupillengröße (beide Augen gemittelt).""" - - pl = df["LEFT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan) - pr = df["RIGHT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan) - - vl = df.get("LEFT_PUPIL_VALIDITY") - vr = df.get("RIGHT_PUPIL_VALIDITY") - - if vl is None or vr is None: - # Falls Validity-Spalten nicht vorhanden sind, versuchen wir grobe Heuristik: - # gültig, wenn Pupillendurchmesser nicht NaN. - validity = (~pl.isna() | ~pr.isna()).astype(int).to_numpy() - else: - # Falls vorhanden: 1 wenn mindestens eines der Augen gültig ist - validity = ( (vl == 1) | (vr == 1) ).astype(int).to_numpy() - - # Mittelwert der verfügbaren Pupillen - p = np.mean(np.column_stack([pl, pr]), axis=1) - - # INF/NaN reparieren - p = pd.Series(p).interpolate(limit=50, limit_direction="both").bfill().ffill() - p = p.to_numpy() - - print("→ extract_pupil(): Pupillensignal Länge:", len(p)) - return p, validity - - -def detect_blinks(pupil_validity, min_duration=5): - """Erkennt Blinks: Validity=0 → Blink.""" - blinks = [] - start = None - - for i, v in enumerate(pupil_validity): - if v == 0 and start is None: - start = i - elif v == 1 and start is not None: - if i - start >= min_duration: - blinks.append([start, i]) - start = None - - return blinks - - -def compute_IPA(pupil, fs=250): - """ - IPA = Index of Pupillary Activity (nach Duchowski 2018). 
- Hochfrequenzanteile der Pupillenzeitreihe. - """ - f, Pxx = welch(pupil, fs=fs, nperseg=int(fs*2)) # 2 Sekunden Fenster - - hf_band = (f >= 0.6) & (f <= 2.0) - ipa = np.sum(Pxx[hf_band]) - - return ipa - - -############################################################################## -# 2. FEATURE-EXTRAKTION (HAUPTFUNKTION) -############################################################################## - -def extract_eye_features(df, window_length_sec=50, fs=250): - """ - df = Tobii DataFrame - window_length_sec = Fenstergröße (z.B. W=1s) - """ - - print("→ extract_eye_features(): Starte Feature-Berechnung...") - print(" Fensterlänge W =", window_length_sec, "s") - - W = int(window_length_sec * fs) # Window größe in Samples - - # Gaze - gaze = extract_gaze_signal(df) - gx, gy = gaze[:, 0], gaze[:, 1] - print("Gültige Werte (gx):", np.sum(~np.isnan(gx)), "von", len(gx)) - print("Range:", np.nanmin(gx), np.nanmax(gx)) - print("Gültige Werte (gy):", np.sum(~np.isnan(gy)), "von", len(gy)) - print("Range:", np.nanmin(gy), np.nanmax(gy)) - - # Pupille - pupil, pupil_validity = extract_pupil(df) - - features = [] - - # Sliding windows - for start in range(0, len(df), W): - end = start + W - if end > len(df): - break #das letzte Fenster wird ignoriert - - - w_gaze = gaze[start:end] - w_pupil = pupil[start:end] - w_valid = pupil_validity[start:end] - - # ---------------------------- - # FIXATIONS (PyGaze) - # ---------------------------- - time_ms = np.arange(W) * 1000.0 / fs - - # print("gx im Fenster:", w_gaze[:,0][:20]) - # print("gy im Fenster:", w_gaze[:,1][:20]) - # print("gx diff:", np.mean(np.abs(np.diff(w_gaze[:,0])))) - - # print("Werte X im Fenster:", w_gaze[:,0]) - # print("Werte Y im Fenster:", w_gaze[:,1]) - # print("X-Stats: min/max/diff", np.nanmin(w_gaze[:,0]), np.nanmax(w_gaze[:,0]), np.nanmean(np.abs(np.diff(w_gaze[:,0])))) - # print("Y-Stats: min/max/diff", np.nanmin(w_gaze[:,1]), np.nanmax(w_gaze[:,1]), np.nanmean(np.abs(np.diff(w_gaze[:,1])))) - 
print("time_ms:", time_ms) - - fix, efix = fixation_detection( - x=w_gaze[:, 0], y=w_gaze[:, 1], time=time_ms, - missing=0.0, maxdist=0.003, mindur=10 # mindur=100ms - ) - - #print("Raw Fixation Output:", efix[0]) - - if start == 0: - print("DEBUG fix raw:", fix[:10]) - - # Robust fixations: PyGaze may return malformed entries - fixation_durations = [] - for f in efix: - print("Efix:", f[2]) - # start_t = f[1] # in ms - # end_t = f[2] # in ms - # duration = (end_t - start_t) / 1000.0 # in Sekunden - - #duration = f[2] / 1000.0 - if np.isfinite(f[2]) and f[2] > 0: - fixation_durations.append(f[2]) - - # Kategorien laut Paper - F_short = sum(66 <= d <= 150 for d in fixation_durations) - F_medium = sum(300 <= d <= 500 for d in fixation_durations) - F_long = sum(d >= 1000 for d in fixation_durations) - F_hundred = sum(d > 100 for d in fixation_durations) - F_Cancel = sum(66 < d for d in fixation_durations) - - # ---------------------------- - # SACCADES - # ---------------------------- - sac, esac = saccade_detection( - x=w_gaze[:, 0], y=w_gaze[:, 1], time=time_ms, missing=0, minlen=12, maxvel=0.2, maxacc=1 - ) - - sac_durations = [s[2] for s in esac] - sac_amplitudes = [((s[5]-s[3])**2 + (s[6]-s[4])**2)**0.5 for s in esac] - - # ---------------------------- - # BLINKS - # ---------------------------- - blinks = detect_blinks(w_valid) - blink_durations = [(b[1] - b[0]) / fs for b in blinks] - - # ---------------------------- - # PUPIL - # ---------------------------- - if np.all(np.isnan(w_pupil)): - mean_pupil = np.nan - ipa = np.nan - else: - mean_pupil = np.nanmean(w_pupil) - ipa = compute_IPA(w_pupil, fs=fs) - - # ---------------------------- - # FEATURE-TABELLE FÜLLEN - # ---------------------------- - features.append({ - "Fix_count_short_66_150": F_short, - "Fix_count_medium_300_500": F_medium, - "Fix_count_long_gt_1000": F_long, - "Fix_count_100": F_hundred, - "Fix_cancel": F_Cancel, - "Fix_mean_duration": np.mean(fixation_durations) if fixation_durations else 
0, - "Fix_median_duration": np.median(fixation_durations) if fixation_durations else 0, - - "Sac_count": len(sac), - "Sac_mean_amp": np.mean(sac_amplitudes) if sac_amplitudes else 0, - "Sac_mean_dur": np.mean(sac_durations) if sac_durations else 0, - "Sac_median_dur": np.median(sac_durations) if sac_durations else 0, - - "Blink_count": len(blinks), - "Blink_mean_dur": np.mean(blink_durations) if blink_durations else 0, - "Blink_median_dur": np.median(blink_durations) if blink_durations else 0, - - "Pupil_mean": mean_pupil, - "Pupil_IPA": ipa - }) - - - result = pd.DataFrame(features) - print("→ extract_eye_features(): Fertig! Ergebnisgröße:", result.shape) - - return result - -############################################################################## -# 3. MAIN FUNKTION -############################################################################## - -def main(): - print("### STARTE FEATURE-EXTRAKTION ###") - print("Aktueller Arbeitsordner:", os.getcwd()) - - #df = pd.read_hdf("tmp22.h5", "SIGNALS", mode="r") - df = pd.read_parquet("cleaned_0001.parquet") - print("DataFrame geladen:", df.shape) - - # Nur Eye-Tracking auswählen - #eye_cols = [c for c in df.columns if "EYE_" in c] - #df_eye = df[eye_cols] - - #print("Eye-Tracking-Spalten:", len(eye_cols)) - #print("→", eye_cols[:10], " ...") - - print("Reinige Eyetracking-Daten ...") - df_eye = clean_eye_df(df) - - # Feature Extraction - features = extract_eye_features(df_eye, window_length_sec=50, fs=250) - - print("\n### FEATURE-MATRIX (HEAD) ###") - print(features.head()) - - print("\nSpeichere Output in features.csv ...") - features.to_csv("features4.csv", index=False) - - print("FERTIG!") - - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/dataset_creation/eye_batch_processor.py b/dataset_creation/eye_batch_processor.py deleted file mode 100644 index 8192147..0000000 --- a/dataset_creation/eye_batch_processor.py +++ /dev/null @@ -1,441 +0,0 @@ -import numpy as np -import 
pandas as pd -import h5py -import yaml -import os -from pathlib import Path -from sklearn.preprocessing import MinMaxScaler -from scipy.signal import welch -from pygazeanalyser.detectors import fixation_detection, saccade_detection - - -############################################################################## -# KONFIGURATION - HIER ANPASSEN! -############################################################################## -INPUT_DIR = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/new_ET_Parquet_files/") -OUTPUT_FILE = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/Eye_dataset_old/new_eye_dataset.parquet") - -WINDOW_SIZE_SAMPLES = 12500 # Anzahl Samples pro Window (z.B. 1250 = 50s bei 25Hz, oder 5s bei 250Hz) -STEP_SIZE_SAMPLES = 1250 # Schrittweite (z.B. 125 = 5s bei 25Hz, oder 0.5s bei 250Hz) -SAMPLING_RATE = 250 # Hz - - -############################################################################## -# 1. HELFERFUNKTIONEN -############################################################################## -def clean_eye_df(df): - """ - Entfernt alle Zeilen, die keine echten Eyetracking-Daten enthalten. - Löst das Problem, dass das Haupt-DataFrame NaN-Zeilen für andere Sensoren enthält. - """ - eye_cols = [c for c in df.columns if c.startswith("EYE_")] - df_eye = df[eye_cols] - - # INF → NaN - df_eye = df_eye.replace([np.inf, -np.inf], np.nan) - - # Nur Zeilen behalten, wo es echte Eyetracking-Daten gibt - df_eye = df_eye.dropna(subset=eye_cols, how="all") - - print(f" Eyetracking-Zeilen: {len(df)} → {len(df_eye)}") - - return df_eye.reset_index(drop=True) - - -def extract_gaze_signal(df): - """ - Extrahiert 2D-Gaze-Positionen auf dem Display, - maskiert ungültige Samples und interpoliert Lücken. 
- """ - # Gaze-Spalten - gx_L = df["EYE_LEFT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy() - gy_L = df["EYE_LEFT_GAZE_POINT_ON_DISPLAY_AREA_Y"].astype(float).copy() - gx_R = df["EYE_RIGHT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy() - gy_R = df["EYE_RIGHT_GAZE_POINT_ON_DISPLAY_AREA_Y"].astype(float).copy() - - # Validity-Spalten (1 = gültig) - val_L = (df["EYE_LEFT_GAZE_POINT_VALIDITY"] == 1) - val_R = (df["EYE_RIGHT_GAZE_POINT_VALIDITY"] == 1) - - # Inf ersetzen mit NaN (kommt bei Tobii bei Blinks vor) - gx_L.replace([np.inf, -np.inf], np.nan, inplace=True) - gy_L.replace([np.inf, -np.inf], np.nan, inplace=True) - gx_R.replace([np.inf, -np.inf], np.nan, inplace=True) - gy_R.replace([np.inf, -np.inf], np.nan, inplace=True) - - # Ungültige Werte maskieren - gx_L[~val_L] = np.nan - gy_L[~val_L] = np.nan - gx_R[~val_R] = np.nan - gy_R[~val_R] = np.nan - - # Mittelwert der beiden Augen pro Sample (nanmean ist robust) - gx = np.mean(np.column_stack([gx_L, gx_R]), axis=1) - gy = np.mean(np.column_stack([gy_L, gy_R]), axis=1) - - # Interpolation (wichtig für PyGaze!) 
- gx = pd.Series(gx).interpolate(limit=50, limit_direction="both").bfill().ffill() - gy = pd.Series(gy).interpolate(limit=50, limit_direction="both").bfill().ffill() - - xscaler = MinMaxScaler() - gxscale = xscaler.fit_transform(gx.values.reshape(-1, 1)) - - yscaler = MinMaxScaler() - gyscale = yscaler.fit_transform(gy.values.reshape(-1, 1)) - - out = np.column_stack((gxscale, gyscale)) - return out - - -def extract_pupil(df): - """Extrahiert Pupillengröße (beide Augen gemittelt).""" - pl = df["EYE_LEFT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan) - pr = df["EYE_RIGHT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan) - - vl = df.get("EYE_LEFT_PUPIL_VALIDITY") - vr = df.get("EYE_RIGHT_PUPIL_VALIDITY") - - if vl is None or vr is None: - validity = (~pl.isna() | ~pr.isna()).astype(int).to_numpy() - else: - validity = ((vl == 1) | (vr == 1)).astype(int).to_numpy() - - # Mittelwert der verfügbaren Pupillen - p = np.mean(np.column_stack([pl, pr]), axis=1) - - # INF/NaN reparieren - p = pd.Series(p).interpolate(limit=50, limit_direction="both").bfill().ffill() - p = p.to_numpy() - - return p, validity - - -def detect_blinks(pupil_validity, min_duration=5): - """Erkennt Blinks: Validity=0 → Blink.""" - blinks = [] - start = None - - for i, v in enumerate(pupil_validity): - if v == 0 and start is None: - start = i - elif v == 1 and start is not None: - if i - start >= min_duration: - blinks.append([start, i]) - start = None - - return blinks - - -def compute_IPA(pupil, fs=250): - """ - IPA = Index of Pupillary Activity (nach Duchowski 2018). - Hochfrequenzanteile der Pupillenzeitreihe. - """ - f, Pxx = welch(pupil, fs=fs, nperseg=int(fs*2)) # 2 Sekunden Fenster - - hf_band = (f >= 0.6) & (f <= 2.0) - ipa = np.sum(Pxx[hf_band]) - - return ipa - - -############################################################################## -# 2. 
FEATURE-EXTRAKTION MIT SLIDING WINDOW -############################################################################## - -def extract_eye_features_sliding(df_eye, df_meta, window_size, step_size, fs=250): - """ - Extrahiert Features mit Sliding Window aus einem einzelnen Level/Phase. - - Parameters: - ----------- - df_eye : DataFrame - Eye-Tracking Daten (bereits gereinigt) - df_meta : DataFrame - Metadaten (subjectID, rowID, STUDY, LEVEL, PHASE) - window_size : int - Anzahl Samples pro Window - step_size : int - Schrittweite in Samples - fs : int - Sampling Rate in Hz - """ - # Gaze - gaze = extract_gaze_signal(df_eye) - - # Pupille - pupil, pupil_validity = extract_pupil(df_eye) - - features = [] - num_windows = (len(df_eye) - window_size) // step_size + 1 - - if num_windows <= 0: - return pd.DataFrame() - - for i in range(num_windows): - start_idx = i * step_size - end_idx = start_idx + window_size - - w_gaze = gaze[start_idx:end_idx] - w_pupil = pupil[start_idx:end_idx] - w_valid = pupil_validity[start_idx:end_idx] - - # Metadaten für dieses Window - meta_row = df_meta.iloc[start_idx] - - # ---------------------------- - # FIXATIONS (PyGaze) - # ---------------------------- - time_ms = np.arange(window_size) * 1000.0 / fs - - fix, efix = fixation_detection( - x=w_gaze[:, 0], y=w_gaze[:, 1], time=time_ms, - missing=0.0, maxdist=0.003, mindur=10 - ) - - fixation_durations = [] - for f in efix: - if np.isfinite(f[2]) and f[2] > 0: - fixation_durations.append(f[2]) - - # Kategorien laut Paper - F_short = sum(66 <= d <= 150 for d in fixation_durations) - F_medium = sum(300 <= d <= 500 for d in fixation_durations) - F_long = sum(d >= 1000 for d in fixation_durations) - F_hundred = sum(d > 100 for d in fixation_durations) - # F_Cancel = sum(66 < d for d in fixation_durations) - - # ---------------------------- - # SACCADES - # ---------------------------- - sac, esac = saccade_detection( - x=w_gaze[:, 0], y=w_gaze[:, 1], time=time_ms, - missing=0, minlen=12, 
maxvel=0.2, maxacc=1 - ) - - sac_durations = [s[2] for s in esac] - sac_amplitudes = [((s[5]-s[3])**2 + (s[6]-s[4])**2)**0.5 for s in esac] - - # ---------------------------- - # BLINKS - # ---------------------------- - blinks = detect_blinks(w_valid) - blink_durations = [(b[1] - b[0]) / fs for b in blinks] - - # ---------------------------- - # PUPIL - # ---------------------------- - if np.all(np.isnan(w_pupil)): - mean_pupil = np.nan - ipa = np.nan - else: - mean_pupil = np.nanmean(w_pupil) - ipa = compute_IPA(w_pupil, fs=fs) - - # ---------------------------- - # FEATURE-DICTIONARY - # ---------------------------- - features.append({ - # Metadaten - 'subjectID': meta_row['subjectID'], - 'start_time': meta_row['rowID'], - 'STUDY': meta_row.get('STUDY', np.nan), - 'LEVEL': meta_row.get('LEVEL', np.nan), - 'PHASE': meta_row.get('PHASE', np.nan), - - # Fixation Features - "Fix_count_short_66_150": F_short, - "Fix_count_medium_300_500": F_medium, - "Fix_count_long_gt_1000": F_long, - "Fix_count_100": F_hundred, - # "Fix_cancel": F_Cancel, - "Fix_mean_duration": np.mean(fixation_durations) if fixation_durations else 0, - "Fix_median_duration": np.median(fixation_durations) if fixation_durations else 0, - - # Saccade Features - "Sac_count": len(sac), - "Sac_mean_amp": np.mean(sac_amplitudes) if sac_amplitudes else 0, - "Sac_mean_dur": np.mean(sac_durations) if sac_durations else 0, - "Sac_median_dur": np.median(sac_durations) if sac_durations else 0, - - # Blink Features - "Blink_count": len(blinks), - "Blink_mean_dur": np.mean(blink_durations) if blink_durations else 0, - "Blink_median_dur": np.median(blink_durations) if blink_durations else 0, - - # Pupil Features - "Pupil_mean": mean_pupil, - "Pupil_IPA": ipa - }) - - return pd.DataFrame(features) - - -############################################################################## -# 3. 
BATCH-VERARBEITUNG -############################################################################## - -def process_parquet_directory(input_dir, output_file, window_size, step_size, fs=250): - """ - Verarbeitet alle Parquet-Dateien in einem Verzeichnis. - - Parameters: - ----------- - input_dir : str - Pfad zum Verzeichnis mit Parquet-Dateien - output_file : str - Pfad für die Ausgabe-Parquet-Datei - window_size : int - Window-Größe in Samples - step_size : int - Schrittweite in Samples - fs : int - Sampling Rate in Hz - """ - input_path = Path(input_dir) - parquet_files = sorted(input_path.glob("*.parquet")) - - if not parquet_files: - print(f"FEHLER: Keine Parquet-Dateien in {input_dir} gefunden!") - return - - print(f"\n{'='*70}") - print(f"STARTE BATCH-VERARBEITUNG") - print(f"{'='*70}") - print(f"Gefundene Dateien: {len(parquet_files)}") - print(f"Window Size: {window_size} Samples ({window_size/fs:.1f}s bei {fs}Hz)") - print(f"Step Size: {step_size} Samples ({step_size/fs:.1f}s bei {fs}Hz)") - print(f"{'='*70}\n") - - all_features = [] - - for file_idx, parquet_file in enumerate(parquet_files, 1): - print(f"\n[{file_idx}/{len(parquet_files)}] Verarbeite: {parquet_file.name}") - - try: - # Lade Parquet-Datei - df = pd.read_parquet(parquet_file) - print(f" Einträge geladen: {len(df)}") - - # Prüfe ob benötigte Spalten vorhanden sind - required_cols = ['subjectID', 'rowID'] - missing_cols = [col for col in required_cols if col not in df.columns] - if missing_cols: - print(f" WARNUNG: Fehlende Spalten: {missing_cols} - Überspringe Datei") - continue - - # Reinige Eye-Tracking-Daten - df_eye = clean_eye_df(df) - - if len(df_eye) == 0: - print(f" WARNUNG: Keine gültigen Eye-Tracking-Daten - Überspringe Datei") - continue - - # Metadaten extrahieren (aligned mit df_eye) - meta_cols = ['subjectID', 'rowID'] - if 'STUDY' in df.columns: - meta_cols.append('STUDY') - if 'LEVEL' in df.columns: - meta_cols.append('LEVEL') - if 'PHASE' in df.columns: - 
meta_cols.append('PHASE') - - df_meta = df[meta_cols].iloc[df_eye.index].reset_index(drop=True) - - # Gruppiere nach STUDY, LEVEL, PHASE (falls vorhanden) - group_cols = [col for col in ['STUDY', 'LEVEL', 'PHASE'] if col in df_meta.columns] - - if group_cols: - print(f" Gruppiere nach: {', '.join(group_cols)}") - for group_vals, group_df in df_meta.groupby(group_cols, sort=False): - group_eye = df_eye.iloc[group_df.index].reset_index(drop=True) - group_meta = group_df.reset_index(drop=True) - - print(f" Gruppe {group_vals}: {len(group_eye)} Samples", end=" → ") - - features_df = extract_eye_features_sliding( - group_eye, group_meta, window_size, step_size, fs - ) - - if not features_df.empty: - all_features.append(features_df) - print(f"{len(features_df)} Windows") - else: - print("Zu wenige Daten") - else: - # Keine Gruppierung - print(f" Keine Gruppierungsspalten gefunden") - features_df = extract_eye_features_sliding( - df_eye, df_meta, window_size, step_size, fs - ) - - if not features_df.empty: - all_features.append(features_df) - print(f" → {len(features_df)} Windows erstellt") - else: - print(f" → Zu wenige Daten") - - except Exception as e: - print(f" FEHLER bei Verarbeitung: {str(e)}") - import traceback - traceback.print_exc() - continue - - # Kombiniere alle Features - if not all_features: - print("\nKEINE FEATURES EXTRAHIERT!") - return None - - print(f"\n{'='*70}") - print(f"ZUSAMMENFASSUNG") - print(f"{'='*70}") - - final_df = pd.concat(all_features, ignore_index=True) - - print(f"Gesamt Windows: {len(final_df)}") - print(f"Spalten: {len(final_df.columns)}") - print(f"Subjects: {final_df['subjectID'].nunique()}") - - # Speichere Ergebnis - output_path = Path(output_file) - output_path.parent.mkdir(parents=True, exist_ok=True) - final_df.to_parquet(output_file, index=False) - - print(f"\n✓ Ergebnis gespeichert: {output_file}") - print(f"{'='*70}\n") - - return final_df - - -############################################################################## 
-# 4. MAIN -############################################################################## - -def main(): - print("\n" + "="*70) - print("EYE-TRACKING FEATURE EXTRAKTION - BATCH MODE") - print("="*70) - - result = process_parquet_directory( - input_dir=INPUT_DIR, - output_file=OUTPUT_FILE, - window_size=WINDOW_SIZE_SAMPLES, - step_size=STEP_SIZE_SAMPLES, - fs=SAMPLING_RATE - ) - - if result is not None: - print("\nErste 5 Zeilen des Ergebnisses:") - print(result.head()) - - print("\nSpalten-Übersicht:") - print(result.columns.tolist()) - - print("\nDatentypen:") - print(result.dtypes) - - print("\n✓ FERTIG!\n") - - -if __name__ == "__main__": - main() diff --git a/dataset_creation/eyetrackingFeatures.py b/dataset_creation/eyetrackingFeatures.py deleted file mode 100644 index 03d15c9..0000000 --- a/dataset_creation/eyetrackingFeatures.py +++ /dev/null @@ -1,323 +0,0 @@ -import numpy as np -import pandas as pd -import h5py -import yaml -import owncloud -import os -from sklearn.preprocessing import MinMaxScaler -from scipy.signal import welch -from pygazeanalyser.detectors import fixation_detection, saccade_detection - - -############################################################################## -# 1. HELFERFUNKTIONEN -############################################################################## -def clean_eye_df(df): - """ - Entfernt alle Zeilen, die keine echten Eyetracking-Daten enthalten. - Löst das Problem, dass das Haupt-DataFrame NaN-Zeilen für andere Sensoren enthält. 
- """ - eye_cols = [c for c in df.columns if "EYE_" in c] - df_eye = df[eye_cols] - - # INF → NaN - df_eye = df_eye.replace([np.inf, -np.inf], np.nan) - - # Nur Zeilen behalten, wo es echte Eyetracking-Daten gibt - df_eye = df_eye.dropna(subset=eye_cols, how="all") - - print("Eyetracking-Zeilen vorher:", len(df)) - print("Eyetracking-Zeilen nachher:", len(df_eye)) - - #Index zurücksetzen - return df_eye.reset_index(drop=True) - - -def extract_gaze_signal(df): - """ - Extrahiert 2D-Gaze-Positionen auf dem Display, - maskiert ungültige Samples und interpoliert Lücken. - """ - - print("→ extract_gaze_signal(): Eingabegröße:", df.shape) - - # Gaze-Spalten - gx_L = df["EYE_LEFT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy() - gy_L = df["EYE_LEFT_GAZE_POINT_ON_DISPLAY_AREA_Y"].astype(float).copy() - gx_R = df["EYE_RIGHT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy() - gy_R = df["EYE_RIGHT_GAZE_POINT_ON_DISPLAY_AREA_Y"].astype(float).copy() - - - # Validity-Spalten (1 = gültig) - val_L = (df["EYE_LEFT_GAZE_POINT_VALIDITY"] == 1) - val_R = (df["EYE_RIGHT_GAZE_POINT_VALIDITY"] == 1) - - # Inf ersetzen mit NaN (kommt bei Tobii bei Blinks vor) - gx_L.replace([np.inf, -np.inf], np.nan, inplace=True) - gy_L.replace([np.inf, -np.inf], np.nan, inplace=True) - gx_R.replace([np.inf, -np.inf], np.nan, inplace=True) - gy_R.replace([np.inf, -np.inf], np.nan, inplace=True) - - # Ungültige Werte maskieren - gx_L[~val_L] = np.nan - gy_L[~val_L] = np.nan - gx_R[~val_R] = np.nan - gy_R[~val_R] = np.nan - - # Mittelwert der beiden Augen pro Sample (nanmean ist robust) - gx = np.mean(np.column_stack([gx_L, gx_R]), axis=1) - gy = np.mean(np.column_stack([gy_L, gy_R]), axis=1) - - # Interpolation (wichtig für PyGaze!) 
- gx = pd.Series(gx).interpolate(limit=50, limit_direction="both").bfill().ffill() - gy = pd.Series(gy).interpolate(limit=50, limit_direction="both").bfill().ffill() - - xscaler = MinMaxScaler() - gxscale = xscaler.fit_transform(gx.values.reshape(-1, 1)) - - yscaler = MinMaxScaler() - gyscale = yscaler.fit_transform(gx.values.reshape(-1, 1)) - - print("xmax ymax", gxscale.max(), gyscale.max()) - - out = np.column_stack((gxscale, gyscale)) - - print("→ extract_gaze_signal(): Ausgabegröße:", out.shape) - - return out - - -def extract_pupil(df): - """Extrahiert Pupillengröße (beide Augen gemittelt).""" - - pl = df["EYE_LEFT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan) - pr = df["EYE_RIGHT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan) - - vl = df.get("EYE_LEFT_PUPIL_VALIDITY") - vr = df.get("EYE_RIGHT_PUPIL_VALIDITY") - - if vl is None or vr is None: - # Falls Validity-Spalten nicht vorhanden sind, versuchen wir grobe Heuristik: - # gültig, wenn Pupillendurchmesser nicht NaN. - validity = (~pl.isna() | ~pr.isna()).astype(int).to_numpy() - else: - # Falls vorhanden: 1 wenn mindestens eines der Augen gültig ist - validity = ( (vl == 1) | (vr == 1) ).astype(int).to_numpy() - - # Mittelwert der verfügbaren Pupillen - p = np.mean(np.column_stack([pl, pr]), axis=1) - - # INF/NaN reparieren - p = pd.Series(p).interpolate(limit=50, limit_direction="both").bfill().ffill() - p = p.to_numpy() - - print("→ extract_pupil(): Pupillensignal Länge:", len(p)) - return p, validity - - -def detect_blinks(pupil_validity, min_duration=5): - """Erkennt Blinks: Validity=0 → Blink.""" - blinks = [] - start = None - - for i, v in enumerate(pupil_validity): - if v == 0 and start is None: - start = i - elif v == 1 and start is not None: - if i - start >= min_duration: - blinks.append([start, i]) - start = None - - return blinks - - -def compute_IPA(pupil, fs=250): - """ - IPA = Index of Pupillary Activity (nach Duchowski 2018). - Hochfrequenzanteile der Pupillenzeitreihe. 
- """ - f, Pxx = welch(pupil, fs=fs, nperseg=int(fs*2)) # 2 Sekunden Fenster - - hf_band = (f >= 0.6) & (f <= 2.0) - ipa = np.sum(Pxx[hf_band]) - - return ipa - - -############################################################################## -# 2. FEATURE-EXTRAKTION (HAUPTFUNKTION) -############################################################################## - -def extract_eye_features(df, window_length_sec=50, fs=250): - """ - df = Tobii DataFrame - window_length_sec = Fenstergröße (z.B. W=1s) - """ - - print("→ extract_eye_features(): Starte Feature-Berechnung...") - print(" Fensterlänge W =", window_length_sec, "s") - - W = int(window_length_sec * fs) # Window größe in Samples - - # Gaze - gaze = extract_gaze_signal(df) - gx, gy = gaze[:, 0], gaze[:, 1] - print("Gültige Werte (gx):", np.sum(~np.isnan(gx)), "von", len(gx)) - print("Range:", np.nanmin(gx), np.nanmax(gx)) - print("Gültige Werte (gy):", np.sum(~np.isnan(gy)), "von", len(gy)) - print("Range:", np.nanmin(gy), np.nanmax(gy)) - - # Pupille - pupil, pupil_validity = extract_pupil(df) - - features = [] - - # Sliding windows - for start in range(0, len(df), W): - end = start + W - if end > len(df): - break #das letzte Fenster wird ignoriert - - - w_gaze = gaze[start:end] - w_pupil = pupil[start:end] - w_valid = pupil_validity[start:end] - - # ---------------------------- - # FIXATIONS (PyGaze) - # ---------------------------- - time_ms = np.arange(W) * 1000.0 / fs - - # print("gx im Fenster:", w_gaze[:,0][:20]) - # print("gy im Fenster:", w_gaze[:,1][:20]) - # print("gx diff:", np.mean(np.abs(np.diff(w_gaze[:,0])))) - - # print("Werte X im Fenster:", w_gaze[:,0]) - # print("Werte Y im Fenster:", w_gaze[:,1]) - # print("X-Stats: min/max/diff", np.nanmin(w_gaze[:,0]), np.nanmax(w_gaze[:,0]), np.nanmean(np.abs(np.diff(w_gaze[:,0])))) - # print("Y-Stats: min/max/diff", np.nanmin(w_gaze[:,1]), np.nanmax(w_gaze[:,1]), np.nanmean(np.abs(np.diff(w_gaze[:,1])))) - print("time_ms:", time_ms) - - fix, efix = 
fixation_detection( - x=w_gaze[:, 0], y=w_gaze[:, 1], time=time_ms, - missing=0.0, maxdist=0.001, mindur=65 # mindur=100ms - ) - - #print("Raw Fixation Output:", efix[0]) - - if start == 0: - print("DEBUG fix raw:", fix[:10]) - - # Robust fixations: PyGaze may return malformed entries - fixation_durations = [] - for f in efix: - print("Efix:", f[2]) - # start_t = f[1] # in ms - # end_t = f[2] # in ms - # duration = (end_t - start_t) / 1000.0 # in Sekunden - - #duration = f[2] / 1000.0 - if np.isfinite(f[2]) and f[2] > 0: - fixation_durations.append(f[2]) - - # Kategorien laut Paper - F_short = sum(66 <= d <= 150 for d in fixation_durations) - F_medium = sum(300 <= d <= 500 for d in fixation_durations) - F_long = sum(d >= 1000 for d in fixation_durations) - F_hundred = sum(d > 100 for d in fixation_durations) - F_Cancel = sum(66 < d for d in fixation_durations) - - # ---------------------------- - # SACCADES - # ---------------------------- - sac, esac = saccade_detection( - x=w_gaze[:, 0], y=w_gaze[:, 1], time=time_ms, missing=0, minlen=12, maxvel=0.2, maxacc=1 - ) - - sac_durations = [s[2] for s in esac] - sac_amplitudes = [((s[5]-s[3])**2 + (s[6]-s[4])**2)**0.5 for s in esac] - - # ---------------------------- - # BLINKS - # ---------------------------- - blinks = detect_blinks(w_valid) - blink_durations = [(b[1] - b[0]) / fs for b in blinks] - - # ---------------------------- - # PUPIL - # ---------------------------- - if np.all(np.isnan(w_pupil)): - mean_pupil = np.nan - ipa = np.nan - else: - mean_pupil = np.nanmean(w_pupil) - ipa = compute_IPA(w_pupil, fs=fs) - - # ---------------------------- - # FEATURE-TABELLE FÜLLEN - # ---------------------------- - features.append({ - "Fix_count_short_66_150": F_short, - "Fix_count_medium_300_500": F_medium, - "Fix_count_long_gt_1000": F_long, - "Fix_count_100": F_hundred, - "Fix_cancel": F_Cancel, - "Fix_mean_duration": np.mean(fixation_durations) if fixation_durations else 0, - "Fix_median_duration": 
np.median(fixation_durations) if fixation_durations else 0, - - "Sac_count": len(sac), - "Sac_mean_amp": np.mean(sac_amplitudes) if sac_amplitudes else 0, - "Sac_mean_dur": np.mean(sac_durations) if sac_durations else 0, - "Sac_median_dur": np.median(sac_durations) if sac_durations else 0, - - "Blink_count": len(blinks), - "Blink_mean_dur": np.mean(blink_durations) if blink_durations else 0, - "Blink_median_dur": np.median(blink_durations) if blink_durations else 0, - - "Pupil_mean": mean_pupil, - "Pupil_IPA": ipa - }) - - - result = pd.DataFrame(features) - print("→ extract_eye_features(): Fertig! Ergebnisgröße:", result.shape) - - return result - -############################################################################## -# 3. MAIN FUNKTION -############################################################################## - -def main(): - print("### STARTE FEATURE-EXTRAKTION ###") - print("Aktueller Arbeitsordner:", os.getcwd()) - - df = pd.read_hdf("tmp22.h5", "SIGNALS", mode="r") - #df = pd.read_parquet("cleaned_0001.parquet") - print("DataFrame geladen:", df.shape) - - # Nur Eye-Tracking auswählen - #eye_cols = [c for c in df.columns if "EYE_" in c] - #df_eye = df[eye_cols] - - #print("Eye-Tracking-Spalten:", len(eye_cols)) - #print("→", eye_cols[:10], " ...") - - print("Reinige Eyetracking-Daten ...") - df_eye = clean_eye_df(df) - - # Feature Extraction - features = extract_eye_features(df_eye, window_length_sec=50, fs=250) - - print("\n### FEATURE-MATRIX (HEAD) ###") - print(features.head()) - - print("\nSpeichere Output in features.csv ...") - features.to_csv("features2.csv", index=False) - - print("FERTIG!") - - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/dataset_creation/maxDist.py b/dataset_creation/maxDist.py index 8242101..2c506da 100644 --- a/dataset_creation/maxDist.py +++ b/dataset_creation/maxDist.py @@ -1,72 +1,79 @@ import math -def fixation_radius_normalized(theta_deg: float, - distance_cm: float, - 
screen_width_cm: float, - screen_height_cm: float, - resolution_x: int, - resolution_y: int, - method: str = "max"): + +def fixation_radius_normalized( + theta_deg: float, + distance_cm: float, + screen_width_cm: float, + screen_height_cm: float, + resolution_x: int, + resolution_y: int, + method: str = "max", +): """ - Berechnet den PyGaze-Fixationsradius für normierte Gaze-Daten in [0,1]. + Compute the PyGaze fixation radius for normalized gaze data in [0, 1]. """ - # Schritt 1: visueller Winkel → physische Distanz (cm) + # Visual angle to physical distance (cm) delta_cm = 2 * distance_cm * math.tan(math.radians(theta_deg) / 2) - # Schritt 2: physische Distanz → Pixel + # Physical distance to pixels delta_px_x = delta_cm * (resolution_x / screen_width_cm) delta_px_y = delta_cm * (resolution_y / screen_height_cm) - # Pixelradius + # Pixel radius if method == "max": r_px = max(delta_px_x, delta_px_y) else: r_px = math.sqrt(delta_px_x**2 + delta_px_y**2) - # Schritt 3: Pixelradius → normierter Radius + # Pixel radius to normalized radius r_norm_x = r_px / resolution_x r_norm_y = r_px / resolution_y if method == "max": return max(r_norm_x, r_norm_y) - else: - return math.sqrt(r_norm_x**2 + r_norm_y**2) + return math.sqrt(r_norm_x**2 + r_norm_y**2) +def run_example(): + # Example: 55" 4k monitor + screen_width_cm = 3 * 121.8 + screen_height_cm = 68.5 + resolution_x = 3 * 3840 + resolution_y = 2160 + distance_to_screen_cm = 120 + max_angle = 1.0 + + maxdist_px = fixation_radius_normalized( + theta_deg=max_angle, + distance_cm=distance_to_screen_cm, + screen_width_cm=screen_width_cm, + screen_height_cm=screen_height_cm, + resolution_x=resolution_x, + resolution_y=resolution_y, + method="max", + ) + print("PyGaze max_dist (max):", maxdist_px) + + maxdist_px = fixation_radius_normalized( + theta_deg=max_angle, + distance_cm=distance_to_screen_cm, + screen_width_cm=screen_width_cm, + screen_height_cm=screen_height_cm, + resolution_x=resolution_x, + 
resolution_y=resolution_y, + method="euclid", + ) + print("PyGaze max_dist (euclid):", maxdist_px) +def main(): + run_example() -# Beispiel: 55" 4k Monitor -screen_width_cm = 3*121.8 -screen_height_cm = 68.5 -resolution_x = 3*3840 -resolution_y = 2160 -distance_to_screen_cm = 120 -method = 'max' -max_angle= 1.0 +if __name__ == "__main__": + main() -maxdist_px = fixation_radius_normalized(theta_deg=max_angle, - distance_cm=distance_to_screen_cm, - screen_width_cm=screen_width_cm, - screen_height_cm=screen_height_cm, - resolution_x=resolution_x, - resolution_y=resolution_y, - method=method) - -print("PyGaze max_dist (max):", maxdist_px) - -method = 'euclid' -maxdist_px = fixation_radius_normalized(theta_deg=max_angle, - distance_cm=distance_to_screen_cm, - screen_width_cm=screen_width_cm, - screen_height_cm=screen_height_cm, - resolution_x=resolution_x, - resolution_y=resolution_y, - method=method) - -print("PyGaze max_dist (euclid):", maxdist_px) - -# Passt noch nicht zu der Breite -# https://osdoc.cogsci.nl/4.0/de/visualangle/ -# https://reference.org/facts/Visual_angle/LUw29zy7 \ No newline at end of file +# Reference +# https://osdoc.cogsci.nl/4.0/de/visualangle/ +# https://reference.org/facts/Visual_angle/LUw29zy7 diff --git a/dataset_creation/open_parquet_test.ipynb b/dataset_creation/open_parquet_test.ipynb deleted file mode 100644 index b72c1f7..0000000 --- a/dataset_creation/open_parquet_test.ipynb +++ /dev/null @@ -1,155 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "2b3fface", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "74f1f5ec", - "metadata": {}, - "outputs": [], - "source": [ - "df= pd.read_parquet(r\" \")\n", - "print(df.shape)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "05775454", - "metadata": {}, - "outputs": [], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "id": "99e17328", - "metadata": {}, - "outputs": [], - "source": [ - "df.tail()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "69e53731", - "metadata": {}, - "outputs": [], - "source": [ - "df.info()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3754c664", - "metadata": {}, - "outputs": [], - "source": [ - "# Zeigt alle Kombinationen mit Häufigkeit\n", - "df[['STUDY', 'PHASE', 'LEVEL']].value_counts(ascending=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f83b595c", - "metadata": {}, - "outputs": [], - "source": [ - "high_nback = df[\n", - " (df[\"STUDY\"]==\"n-back\") &\n", - " (df[\"LEVEL\"].isin([2, 3, 5, 6])) &\n", - " (df[\"PHASE\"].isin([\"train\", \"test\"]))\n", - "]\n", - "high_nback.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c0940343", - "metadata": {}, - "outputs": [], - "source": [ - "low_all = df[\n", - " ((df[\"PHASE\"] == \"baseline\") |\n", - " ((df[\"STUDY\"] == \"n-back\") & (df[\"PHASE\"] != \"baseline\") & (df[\"LEVEL\"].isin([1,4]))))\n", - "]\n", - "print(low_all.shape)\n", - "high_kdrive = df[\n", - " (df[\"STUDY\"] == \"k-drive\") & (df[\"PHASE\"] != \"baseline\")\n", - "]\n", - "print(high_kdrive.shape)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f7ce38d3", - "metadata": {}, - "outputs": [], - "source": [ - "print((df.shape[0]==(high_kdrive.shape[0]+high_nback.shape[0]+low_all.shape[0])))\n", - "print(df.shape[0])\n", - "print((high_kdrive.shape[0]+high_nback.shape[0]+low_all.shape[0]))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "48ba0379", - "metadata": {}, - "outputs": [], - "source": [ - "high_all = pd.concat([high_nback, high_kdrive])\n", - "high_all.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "77dda26c", - "metadata": {}, - "outputs": [], - "source": [ - "print(f\"Gesamt: 
{df.shape[0]}=={low_all.shape[0]+high_all.shape[0]}\")\n", - "print(f\"Anzahl an low load Samples: {low_all.shape[0]}\")\n", - "print(f\"Anzahl an high load Samples: {high_all.shape[0]}\")\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "base", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.5" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/dataset_creation/CPFC_both.py b/dataset_creation/parquet_file_creation.py similarity index 92% rename from dataset_creation/CPFC_both.py rename to dataset_creation/parquet_file_creation.py index 06e5a6c..ebf064b 100644 --- a/dataset_creation/CPFC_both.py +++ b/dataset_creation/parquet_file_creation.py @@ -1,8 +1,10 @@ import os import pandas as pd from pathlib import Path +# TODO: Set paths correctly +data_dir = Path("") # path to the directory with all .h5 files +base_dir = Path(r"") # directory to store the parquet files in -data_dir = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/raw_data") # Get all .h5 files and sort them matching_files = sorted(data_dir.glob("*.h5")) @@ -11,8 +13,8 @@ matching_files = sorted(data_dir.glob("*.h5")) CHUNK_SIZE = 50_000 for i, file_path in enumerate(matching_files): - print(f"Subject {i} gestartet") - print(f"{file_path} geoeffnet") + print(f"Starting with subject {i}") + print(f"Opened: {file_path}") # Step 1: Get total number of rows and column names with pd.HDFStore(file_path, mode="r") as store: @@ -81,7 +83,7 @@ for i, file_path in enumerate(matching_files): print(f"Final dataframe shape: {df_final.shape}") # Save to parquet - base_dir = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/both_mod_parquet_files") + os.makedirs(base_dir, exist_ok=True) out_name = base_dir / f"both_mod_{i:04d}.parquet"