Compare commits

..

No commits in common. "de0084dc09acbed3c3a2ecfe8e284bcb3026d844" and "3d8c7c6639610cfb40de88fc2694abd248dd85c0" have entirely different histories.

14 changed files with 1740 additions and 195 deletions

View File

@ -1,13 +1,5 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "cc08936c",
"metadata": {},
"source": [
"## Insights into the dataset with histogramms and scatter plots"
]
},
{
"cell_type": "markdown",
"id": "1014c5e0",
@ -25,8 +17,7 @@
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from pathlib import Path"
"import matplotlib.pyplot as plt"
]
},
{
@ -36,7 +27,7 @@
"metadata": {},
"outputs": [],
"source": [
"path = Path(r\"/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/50s_25Hz_dataset.parquet\")\n",
"path =r\"C:\\Users\\micha\\FAUbox\\WS2526_Fahrsimulator_MSY (Celina Korzer)\\AU_dataset\\output_windowed.parquet\"\n",
"df = pd.read_parquet(path=path)"
]
},
@ -113,27 +104,21 @@
"metadata": {},
"outputs": [],
"source": [
"face_au_cols = [c for c in low_all.columns if c.startswith(\"FACE_AU\")]\n",
"eye_cols = ['Fix_count_short_66_150', 'Fix_count_medium_300_500',\n",
" 'Fix_count_long_gt_1000', 'Fix_count_100', 'Fix_mean_duration',\n",
" 'Fix_median_duration', 'Sac_count', 'Sac_mean_amp', 'Sac_mean_dur',\n",
" 'Sac_median_dur', 'Blink_count', 'Blink_mean_dur', 'Blink_median_dur',\n",
" 'Pupil_mean', 'Pupil_IPA']\n",
"\n",
"cols = face_au_cols+eye_cols\n",
"# Get all columns that start with 'AU'\n",
"au_columns = [col for col in low_all.columns if col.startswith('AU')]\n",
"\n",
"# Calculate number of rows and columns for subplots\n",
"n_cols = len(cols)\n",
"n_rows = 7\n",
"n_cols = len(au_columns)\n",
"n_rows = 4\n",
"n_cols_subplot = 5\n",
"\n",
"# Create figure with subplots\n",
"fig, axes = plt.subplots(n_rows, n_cols_subplot, figsize=(20, 16))\n",
"axes = axes.flatten()\n",
"fig.suptitle('Feature Distributions: Low vs High', fontsize=20, fontweight='bold', y=0.995)\n",
"fig.suptitle('Action Unit (AU) Distributions: Low vs High', fontsize=20, fontweight='bold', y=0.995)\n",
"\n",
"# Create histogram for each AU column\n",
"for idx, col in enumerate(cols):\n",
"for idx, col in enumerate(au_columns):\n",
" ax = axes[idx]\n",
" \n",
" # Plot overlapping histograms\n",
@ -148,48 +133,18 @@
" ax.grid(True, alpha=0.3)\n",
"\n",
"# Hide any unused subplots\n",
"for idx in range(len(cols), len(axes)):\n",
"for idx in range(len(au_columns), len(axes)):\n",
" axes[idx].set_visible(False)\n",
"\n",
"# Adjust layout\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6cd53cdb",
"metadata": {},
"outputs": [],
"source": [
"# Create figure with subplots\n",
"fig, axes = plt.subplots(n_rows, n_cols_subplot, figsize=(20, 16))\n",
"axes = axes.flatten()\n",
"fig.suptitle('Feature Scatter: Low vs High', fontsize=20, fontweight='bold', y=0.995)\n",
"\n",
"for idx, col in enumerate(cols):\n",
" ax = axes[idx]\n",
"\n",
" # Scatterplots\n",
" ax.scatter(range(len(low_all[col])), low_all[col], alpha=0.6, color='blue', label='low_all', s=10)\n",
" ax.scatter(range(len(high_all[col])), high_all[col], alpha=0.6, color='red', label='high_all', s=10)\n",
"\n",
" ax.set_title(col, fontsize=10, fontweight='bold')\n",
" ax.set_xlabel('Sample index', fontsize=8)\n",
" ax.set_ylabel('Value', fontsize=8)\n",
" ax.legend(fontsize=8)\n",
" ax.grid(True, alpha=0.3)\n",
"\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "base",
"language": "python",
"name": "python3"
},
@ -203,7 +158,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.10"
"version": "3.11.5"
}
},
"nbformat": 4,

View File

@ -1,10 +1,8 @@
import os
import pandas as pd
from pathlib import Path
# TODO: Set paths correctly
data_dir = Path("") # path to the directory with all .h5 files
base_dir = Path(r"") # directory to store the parquet files in
data_dir = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/raw_data")
# Get all .h5 files and sort them
matching_files = sorted(data_dir.glob("*.h5"))
@ -13,8 +11,8 @@ matching_files = sorted(data_dir.glob("*.h5"))
CHUNK_SIZE = 50_000
for i, file_path in enumerate(matching_files):
print(f"Starting with subject {i}")
print(f"Opened: {file_path}")
print(f"Subject {i} gestartet")
print(f"{file_path} geoeffnet")
# Step 1: Get total number of rows and column names
with pd.HDFStore(file_path, mode="r") as store:
@ -83,7 +81,7 @@ for i, file_path in enumerate(matching_files):
print(f"Final dataframe shape: {df_final.shape}")
# Save to parquet
base_dir = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/both_mod_parquet_files")
os.makedirs(base_dir, exist_ok=True)
out_name = base_dir / f"both_mod_{i:04d}.parquet"

View File

@ -0,0 +1,91 @@
import os
import pandas as pd
from pathlib import Path

# Extract eye-tracking (EYE_*) signals from per-subject .h5 recordings,
# clean them, and write one parquet file per subject.
print(os.getcwd())
print("connection aufgebaut")

data_dir = Path("/home/jovyan/Fahrsimulator_MSY2526_AI/EDA")
# os.chdir(data_dir)

# Get all .h5 files and sort them so the subject index is stable across runs
matching_files = sorted(data_dir.glob("*.h5"))

# Chunk size for reading (adjust based on your RAM - 100k rows is ~50-100MB depending on columns)
CHUNK_SIZE = 100_000

for i, file_path in enumerate(matching_files):
    print(f"Subject {i} gestartet")
    print(f"{file_path} geoeffnet")

    # Step 1: Get total number of rows and column names without loading the data
    with pd.HDFStore(file_path, mode="r") as store:
        cols = store.select("SIGNALS", start=0, stop=1).columns
        nrows = store.get_storer("SIGNALS").nrows
    print(f"Total columns: {len(cols)}, Total rows: {nrows}")

    # Step 2: Filter columns that start with "EYE_"
    # (previous comment wrongly said "FACE_AU" — this script extracts eye tracking)
    eye_cols = [c for c in cols if c.startswith("EYE_")]
    print(f"eye-tracking columns found: {eye_cols}")
    if len(eye_cols) == 0:
        print(f"keine eye-tracking-Signale in Subject {i}")
        continue

    # Columns to read: experiment metadata plus the eye-tracking signals
    columns_to_read = ["STUDY", "LEVEL", "PHASE"] + eye_cols

    # Step 3: Process file in chunks to bound memory usage
    chunks_to_save = []
    for start_row in range(0, nrows, CHUNK_SIZE):
        stop_row = min(start_row + CHUNK_SIZE, nrows)
        print(f"Processing rows {start_row} to {stop_row} ({stop_row/nrows*100:.1f}%)")

        # Read chunk
        df_chunk = pd.read_hdf(
            file_path,
            key="SIGNALS",
            columns=columns_to_read,
            start=start_row,
            stop=stop_row
        )

        # Add metadata columns (subject index and global row position)
        df_chunk["subjectID"] = i
        df_chunk["rowID"] = range(start_row, stop_row)

        # Clean data: drop LEVEL 0 rows and any rows with missing values
        df_chunk = df_chunk[df_chunk["LEVEL"] != 0]
        df_chunk = df_chunk.dropna()

        # Only keep non-empty chunks
        if len(df_chunk) > 0:
            chunks_to_save.append(df_chunk)

        # Free memory
        del df_chunk

    print("load and cleaning done")

    # Step 4: Combine all chunks and save
    if chunks_to_save:
        df_final = pd.concat(chunks_to_save, ignore_index=True)
        print(f"Final dataframe shape: {df_final.shape}")

        # Save to parquet
        base_dir = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/new_ET_Parquet_files")
        base_dir.mkdir(parents=True, exist_ok=True)
        out_name = base_dir / f"ET_signals_extracted_{i:04d}.parquet"
        df_final.to_parquet(out_name, index=False)
        print(f"Saved to {out_name}")

        # Free memory
        del df_final
        del chunks_to_save
    else:
        print(f"No valid data found for Subject {i}")

print("All files processed!")

View File

@ -0,0 +1,91 @@
import os
import pandas as pd
from pathlib import Path

# Extract facial action unit (FACE_AU*) signals from per-subject .h5
# recordings, clean them, and write one parquet file per subject.
print(os.getcwd())
print("connection aufgebaut")

data_dir = Path(r"C:\Users\x\repo\UXKI\Fahrsimulator_MSY2526_AI\newTmp")

# Get all .h5 files and sort them so the subject index is stable across runs
matching_files = sorted(data_dir.glob("*.h5"))

# Chunk size for reading (adjust based on your RAM - 100k rows is ~50-100MB depending on columns)
CHUNK_SIZE = 100_000

for i, file_path in enumerate(matching_files):
    print(f"Subject {i} gestartet")
    print(f"{file_path} geoeffnet")

    # Step 1: Get total number of rows and column names without loading the data
    with pd.HDFStore(file_path, mode="r") as store:
        cols = store.select("SIGNALS", start=0, stop=1).columns
        nrows = store.get_storer("SIGNALS").nrows
    print(f"Total columns: {len(cols)}, Total rows: {nrows}")

    # Step 2: Filter columns that start with "FACE_AU"
    # (variable renamed from the misleading `eye_cols` — these are AU columns)
    au_cols = [c for c in cols if c.startswith("FACE_AU")]
    print(f"FACE_AU columns found: {au_cols}")
    if len(au_cols) == 0:
        print(f"keine FACE_AU-Signale in Subject {i}")
        continue

    # Columns to read: experiment metadata plus the action-unit signals
    columns_to_read = ["STUDY", "LEVEL", "PHASE"] + au_cols

    # Step 3: Process file in chunks to bound memory usage
    chunks_to_save = []
    for start_row in range(0, nrows, CHUNK_SIZE):
        stop_row = min(start_row + CHUNK_SIZE, nrows)
        print(f"Processing rows {start_row} to {stop_row} ({stop_row/nrows*100:.1f}%)")

        # Read chunk
        df_chunk = pd.read_hdf(
            file_path,
            key="SIGNALS",
            columns=columns_to_read,
            start=start_row,
            stop=stop_row
        )

        # Add metadata columns (subject index and global row position)
        df_chunk["subjectID"] = i
        df_chunk["rowID"] = range(start_row, stop_row)

        # Clean data: drop LEVEL 0 rows and any rows with missing values
        df_chunk = df_chunk[df_chunk["LEVEL"] != 0]
        df_chunk = df_chunk.dropna()

        # Only keep non-empty chunks
        if len(df_chunk) > 0:
            chunks_to_save.append(df_chunk)

        # Free memory
        del df_chunk

    print("load and cleaning done")

    # Step 4: Combine all chunks and save
    if chunks_to_save:
        df_final = pd.concat(chunks_to_save, ignore_index=True)
        print(f"Final dataframe shape: {df_final.shape}")

        # Save to parquet
        base_dir = Path(r"C:\new_AU_parquet_files")
        base_dir.mkdir(parents=True, exist_ok=True)
        out_name = base_dir / f"cleaned_{i:04d}.parquet"
        df_final.to_parquet(out_name, index=False)
        print(f"Saved to {out_name}")

        # Free memory
        del df_final
        del chunks_to_save
    else:
        print(f"No valid data found for Subject {i}")

print("All files processed!")

View File

@ -4,26 +4,27 @@ import pandas as pd
from pathlib import Path
from sklearn.preprocessing import MinMaxScaler
from scipy.signal import welch
from pygazeanalyser.detectors import fixation_detection, saccade_detection # not installed by default
from pygazeanalyser.detectors import fixation_detection, saccade_detection
##############################################################################
# CONFIGURATION
# KONFIGURATION
##############################################################################
INPUT_DIR = Path(r"") # directory that stores the parquet files (one file per subject)
OUTPUT_FILE = Path(r"") # path for resulting dataset
WINDOW_SIZE_SAMPLES = 25*50 # 50s at 25Hz
STEP_SIZE_SAMPLES = 125 # 5s at 25Hz
INPUT_DIR = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/both_mod_parquet_files")
OUTPUT_FILE = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/50s_25Hz_dataset.parquet")
WINDOW_SIZE_SAMPLES = 25*50 # 50s bei 25Hz
STEP_SIZE_SAMPLES = 125 # 5s bei 25Hz
SAMPLING_RATE = 25 # Hz
MIN_DUR_BLINKS = 2 # x * 40ms
##############################################################################
# EYE-TRACKING FUNCTIONS
# EYE-TRACKING FUNKTIONEN
##############################################################################
def clean_eye_df(df):
"""Extracts Eye-Tracking columns only and removes empty rows."""
"""Extrahiert nur Eye-Tracking Spalten und entfernt leere Zeilen."""
eye_cols = [c for c in df.columns if c.startswith("EYE_")]
if not eye_cols:
@ -37,7 +38,7 @@ def clean_eye_df(df):
def extract_gaze_signal(df):
"""Extracts 2D gaze positions, masks invalid samples, and interpolates."""
"""Extrahiert 2D-Gaze-Positionen, maskiert ungültige Samples und interpoliert."""
gx_L = df["EYE_LEFT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy()
gy_L = df["EYE_LEFT_GAZE_POINT_ON_DISPLAY_AREA_Y"].astype(float).copy()
gx_R = df["EYE_RIGHT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy()
@ -50,14 +51,14 @@ def extract_gaze_signal(df):
for arr in [gx_L, gy_L, gx_R, gy_R]:
arr.replace([np.inf, -np.inf], np.nan, inplace=True)
# Mask invalids
# Ungültige maskieren
gx_L[~val_L] = np.nan
gy_L[~val_L] = np.nan
gx_R[~val_R] = np.nan
gy_R[~val_R] = np.nan
# Mean of both eyes
# Mittelwert beider Augen
gx = np.mean(np.column_stack([gx_L, gx_R]), axis=1)
gy = np.mean(np.column_stack([gy_L, gy_R]), axis=1)
@ -65,7 +66,7 @@ def extract_gaze_signal(df):
gx = pd.Series(gx).interpolate(limit=None, limit_direction="both").bfill().ffill()
gy = pd.Series(gy).interpolate(limit=None, limit_direction="both").bfill().ffill()
# MinMax scaling
# MinMax Skalierung
xscaler = MinMaxScaler()
gxscale = xscaler.fit_transform(gx.values.reshape(-1, 1))
@ -76,7 +77,7 @@ def extract_gaze_signal(df):
def extract_pupil(df):
"""Extract pupil size (average of both eyes)."""
"""Extrahiert Pupillengröße (beide Augen gemittelt)."""
pl = df["EYE_LEFT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan)
pr = df["EYE_RIGHT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan)
@ -95,7 +96,7 @@ def extract_pupil(df):
def detect_blinks(pupil_validity, min_duration=5):
"""Detect blinks: Validity=0 → Blink."""
"""Erkennt Blinks: Validity=0 → Blink."""
blinks = []
start = None
@ -119,13 +120,13 @@ def compute_IPA(pupil, fs=25):
def extract_eye_features_window(df_eye_window, fs=25, min_dur_blinks=2):
"""
Extracts eye tracking features for a single window.
Returns a dictionary containing all eye features.
Extrahiert Eye-Tracking Features für ein einzelnes Window.
Gibt Dictionary mit allen Eye-Features zurück.
"""
# Gaze
gaze = extract_gaze_signal(df_eye_window)
# Pupil
# Pupille
pupil, pupil_validity = extract_pupil(df_eye_window)
window_size = len(df_eye_window)
@ -142,6 +143,7 @@ def extract_eye_features_window(df_eye_window, fs=25, min_dur_blinks=2):
fixation_durations = [f[2] for f in efix if np.isfinite(f[2]) and f[2] > 0]
# Kategorien
F_short = sum(66 <= d <= 150 for d in fixation_durations)
F_medium = sum(300 <= d <= 500 for d in fixation_durations)
F_long = sum(d >= 1000 for d in fixation_durations)
@ -195,27 +197,27 @@ def extract_eye_features_window(df_eye_window, fs=25, min_dur_blinks=2):
##############################################################################
# Combined feature extraction
# KOMBINIERTE FEATURE-EXTRAKTION
##############################################################################
def process_combined_features(input_dir, output_file, window_size, step_size, fs=25,min_duration_blinks=2):
"""
Processes Parquet files with FACE_AU and EYE columns.
Extracts both feature sets and combines them.
Verarbeitet Parquet-Dateien mit FACE_AU und EYE Spalten.
Extrahiert beide Feature-Sets und kombiniert sie.
"""
input_path = Path(input_dir)
parquet_files = sorted(input_path.glob("*.parquet"))
if not parquet_files:
print(f"Error: No parquet-files found in {input_dir}!")
print(f"FEHLER: Keine Parquet-Dateien in {input_dir} gefunden!")
return None
print(f"\n{'='*70}")
print(f"Combined feature-extraction")
print(f"KOMBINIERTE FEATURE-EXTRAKTION")
print(f"{'='*70}")
print(f"Files: {len(parquet_files)}")
print(f"Window: {window_size} Samples ({window_size/fs:.1f}s at {fs}Hz)")
print(f"Step: {step_size} Samples ({step_size/fs:.1f}s at {fs}Hz)")
print(f"Dateien: {len(parquet_files)}")
print(f"Window: {window_size} Samples ({window_size/fs:.1f}s bei {fs}Hz)")
print(f"Step: {step_size} Samples ({step_size/fs:.1f}s bei {fs}Hz)")
print(f"{'='*70}\n")
all_windows = []
@ -225,22 +227,24 @@ def process_combined_features(input_dir, output_file, window_size, step_size, fs
try:
df = pd.read_parquet(parquet_file)
print(f" Entries: {len(df)}")
print(f" Einträge: {len(df)}")
# Identifiziere Spalten
au_columns = [col for col in df.columns if col.startswith('FACE_AU')]
eye_columns = [col for col in df.columns if col.startswith('EYE_')]
print(f" AU-columns: {len(au_columns)}")
print(f" Eye-columns: {len(eye_columns)}")
print(f" AU-Spalten: {len(au_columns)}")
print(f" Eye-Spalten: {len(eye_columns)}")
has_au = len(au_columns) > 0
has_eye = len(eye_columns) > 0
if not has_au and not has_eye:
print(f" Warning: No AU or eye tracking columns found!")
print(f" WARNUNG: Keine AU oder Eye Spalten gefunden!")
continue
# Group by STUDY, LEVEL, PHASE
# Gruppiere nach STUDY, LEVEL, PHASE
group_cols = [col for col in ['STUDY', 'LEVEL', 'PHASE'] if col in df.columns]
if group_cols:
@ -254,7 +258,7 @@ def process_combined_features(input_dir, output_file, window_size, step_size, fs
group_df = group_df.reset_index(drop=True)
# calculate number of windows
# Berechne Anzahl Windows
num_windows = (len(group_df) - window_size) // step_size + 1
if num_windows <= 0:
@ -268,7 +272,7 @@ def process_combined_features(input_dir, output_file, window_size, step_size, fs
window_df = group_df.iloc[start_idx:end_idx]
# basic metadata
# Basis-Metadaten
result = {
'subjectID': window_df['subjectID'].iloc[0],
'start_time': window_df['rowID'].iloc[0],
@ -277,12 +281,12 @@ def process_combined_features(input_dir, output_file, window_size, step_size, fs
'PHASE': window_df['PHASE'].iloc[0] if 'PHASE' in window_df.columns else np.nan
}
# FACE AU features
# FACE AU Features
if has_au:
for au_col in au_columns:
result[f'{au_col}_mean'] = window_df[au_col].mean()
# Eye-tracking features
# Eye-Tracking Features
if has_eye:
try:
# clean dataframe from all nan rows
@ -292,7 +296,7 @@ def process_combined_features(input_dir, output_file, window_size, step_size, fs
result.update(eye_features)
except Exception as e:
print(f" WARNUNG: Eye-Features fehlgeschlagen: {str(e)}")
# Add NaN-values for eye-features
# Füge NaN-Werte für Eye-Features hinzu
result.update({
"Fix_count_short_66_150": np.nan,
"Fix_count_medium_300_500": np.nan,
@ -321,7 +325,7 @@ def process_combined_features(input_dir, output_file, window_size, step_size, fs
traceback.print_exc()
continue
# Combine all windows
# Kombiniere alle Windows
if not all_windows:
print("\nKEINE FEATURES EXTRAHIERT!")
return None
@ -336,7 +340,7 @@ def process_combined_features(input_dir, output_file, window_size, step_size, fs
print(f"Spalten: {len(result_df.columns)}")
print(f"Subjects: {result_df['subjectID'].nunique()}")
# Save
# Speichern
output_path = Path(output_file)
output_path.parent.mkdir(parents=True, exist_ok=True)
result_df.to_parquet(output_file, index=False)
@ -353,7 +357,7 @@ def process_combined_features(input_dir, output_file, window_size, step_size, fs
def main():
print("\n" + "="*70)
print("Combined extraction (AU + EYE)")
print("KOMBINIERTE FEATURE-EXTRAKTION (AU + EYE)")
print("="*70)
result = process_combined_features(
@ -366,16 +370,16 @@ def main():
)
if result is not None:
print("\First 5 rows:")
print("\nErste 5 Zeilen:")
print(result.head())
print("\nColumns overview:")
print("\nSpalten-Übersicht:")
print(result.dtypes)
print("\Statistics:")
print("\nStatistik:")
print(result.describe())
print("\nDone!\n")
print("\n✓ FERTIG!\n")
if __name__ == "__main__":

View File

@ -0,0 +1,113 @@
import pandas as pd
import numpy as np
from pathlib import Path
def process_parquet_files(input_dir, output_file, window_size=1250, step_size=125):
    """
    Aggregate parquet files with a sliding-window mean over FACE_AU columns.

    Windows are computed per (STUDY, LEVEL, PHASE) group so that no window
    straddles a condition transition.

    Parameters
    ----------
    input_dir : str or Path
        Directory containing the per-subject parquet files.
    output_file : str or Path
        Path of the aggregated output parquet file.
    window_size : int
        Sliding-window length in samples (default: 1250 = 50 s at 25 Hz).
        NOTE: the previous docstring wrongly claimed a default of 3000.
    step_size : int
        Step between consecutive windows in samples
        (default: 125 = 5 s at 25 Hz; previously documented as 250).

    Returns
    -------
    pandas.DataFrame or None
        One row per window with metadata and per-AU means, or None when no
        parquet files were found.
    """
    input_path = Path(input_dir)
    parquet_files = sorted(input_path.glob("*.parquet"))

    if not parquet_files:
        print(f"Keine Parquet-Dateien in {input_dir} gefunden!")
        return

    print(f"Gefundene Dateien: {len(parquet_files)}")

    all_windows = []

    for file_idx, parquet_file in enumerate(parquet_files):
        print(f"\nVerarbeite Datei {file_idx + 1}/{len(parquet_files)}: {parquet_file.name}")

        # Load one subject file
        df = pd.read_parquet(parquet_file)
        print(f" Einträge: {len(df)}")

        # Identify action-unit columns
        au_columns = [col for col in df.columns if col.startswith('FACE_AU')]
        print(f" AU-Spalten: {len(au_columns)}")

        # Group by STUDY, LEVEL, PHASE to avoid windows crossing transitions
        for (study_val, level_val, phase_val), level_df in df.groupby(['STUDY', 'LEVEL', 'PHASE'], sort=False):
            print(f" STUDY {study_val}, LEVEL {level_val}, PHASE {phase_val}: {len(level_df)} Einträge")

            # Reset index so positional slicing starts at 0 within the group
            level_df = level_df.reset_index(drop=True)

            # Number of complete sliding windows that fit into this group
            num_windows = (len(level_df) - window_size) // step_size + 1

            if num_windows <= 0:
                print(f" Zu wenige Einträge für Window (benötigt {window_size})")
                continue

            for i in range(num_windows):
                start_idx = i * step_size
                end_idx = start_idx + window_size
                window_df = level_df.iloc[start_idx:end_idx]

                # Aggregated record: window metadata first
                result = {
                    'subjectID': window_df['subjectID'].iloc[0],
                    'start_time': window_df['rowID'].iloc[0],  # rowID serves as start_time
                    'STUDY': window_df['STUDY'].iloc[0],
                    'LEVEL': window_df['LEVEL'].iloc[0],
                    'PHASE': window_df['PHASE'].iloc[0]
                }

                # Mean of every AU column over the window
                for au_col in au_columns:
                    result[f'{au_col}_mean'] = window_df[au_col].mean()

                all_windows.append(result)

            print(f" Windows erstellt: {num_windows}")

    # Build the final DataFrame from all collected window records at once
    # (avoids quadratic repeated concatenation)
    result_df = pd.DataFrame(all_windows)

    print(f"\n{'='*60}")
    print(f"Gesamt Windows erstellt: {len(result_df)}")
    print(f"Spalten: {list(result_df.columns)}")

    # Persist the aggregated dataset
    result_df.to_parquet(output_file, index=False)
    print(f"\nErgebnis gespeichert in: {output_file}")

    return result_df
# Example usage
if __name__ == "__main__":
    # Input directory and aggregated output file — adjust for your environment
    au_parquet_dir = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/new_AU_parquet_files")
    aggregated_file = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/new_AU_dataset_mean/AU_dataset_mean.parquet")

    result = process_parquet_files(
        input_dir=au_parquet_dir,
        output_file=aggregated_file,
        window_size=1250,  # 50 s at 25 Hz
        step_size=125,     # 5 s at 25 Hz
    )

    # Preview the first rows of the aggregated dataset
    if result is not None:
        print("\nErste 5 Zeilen des Ergebnisses:")
        print(result.head())

View File

@ -0,0 +1,56 @@
from pathlib import Path
import pandas as pd
def main():
    """
    Merge two single-modality window datasets (AU + eye tracking) into one.

    User configuration lives at the top of this function: input files,
    output location, and the join-key column names. Only subjects present
    in BOTH inputs are kept; rows are then inner-joined on
    (subjectID, start_time) and written to the output parquet file.
    """
    # Input parquet files (single-modality datasets)
    au_file = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/AU_dataset_mean.parquet")
    eye_file = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/new_eye_dataset.parquet")

    # Output directory and file name
    out_dir = Path("/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/")
    out_path = out_dir / "merged_dataset.parquet"

    # Join-key column names (adjust only if your schema differs)
    key_subject = "subjectID"
    key_time = "start_time"

    # Load both modality datasets
    au_df = pd.read_parquet(au_file)
    eye_df = pd.read_parquet(eye_file)

    # Restrict both frames to subjects that appear in BOTH datasets
    shared_subjects = set(au_df[key_subject]) & set(eye_df[key_subject])
    au_df = au_df[au_df[key_subject].isin(shared_subjects)]
    eye_df = eye_df[eye_df[key_subject].isin(shared_subjects)]

    # Inner join on subject ID AND start_time
    merged = au_df.merge(eye_df, on=[key_subject, key_time], how="inner")

    # Persist the merged dataset
    out_dir.mkdir(parents=True, exist_ok=True)
    merged.to_parquet(out_path, index=False)


if __name__ == "__main__":
    main()

View File

@ -1,5 +1,6 @@
# pip install pyocclient
import yaml
import owncloud # pip install pyocclient
import owncloud
import pandas as pd
import h5py
import os
@ -25,7 +26,7 @@ for i in range(num_files):
# Download file from ownCloud
oc.get_file(file_name, local_tmp)
print(f"Opened: {file_name}")
print(f"{file_name} geoeffnet")
# Load into memory and extract needed columns
# with h5py.File(local_tmp, "r") as f:
# # Adjust this path depending on actual dataset layout inside .h5py file
@ -34,9 +35,14 @@ for i in range(num_files):
with pd.HDFStore(local_tmp, mode="r") as store:
cols = store.select("SIGNALS", start=0, stop=1).columns # get column names
# Step 2: Filter columns that start with "AU"
au_cols = [c for c in cols if c.startswith("AU")]
print(au_cols)
if len(au_cols)==0:
print(f"keine AU Signale in Subject {i}")
continue
# Step 3: Read only those columns (plus any others you want)
df = pd.read_hdf(local_tmp, key="SIGNALS", columns=["STUDY", "LEVEL", "PHASE"] + cols)
df = pd.read_hdf(local_tmp, key="SIGNALS", columns=["STUDY", "LEVEL", "PHASE"] + au_cols)
print("load done")
@ -57,7 +63,7 @@ for i in range(num_files):
# Save to parquet
os.makedirs("ParquetFiles", exist_ok=True) # TODO: change for custom directory
os.makedirs("ParquetFiles", exist_ok=True)
out_name = f"ParquetFiles/cleaned_{i:04d}.parquet"
df.to_parquet(out_name, index=False)

323
dataset_creation/eyeAlt.py Normal file
View File

@ -0,0 +1,323 @@
import numpy as np
import pandas as pd
import h5py
import yaml
import os
from sklearn.preprocessing import MinMaxScaler
from scipy.signal import welch
from pygazeanalyser.detectors import fixation_detection, saccade_detection
##############################################################################
# 1. HELFERFUNKTIONEN
##############################################################################
def clean_eye_df(df):
    """
    Keep only the eye-tracking columns of `df` and drop rows without any
    real eye-tracking data.

    Works around the main DataFrame containing NaN-only rows that belong
    to other sensors; ±inf values are treated as missing.
    """
    # Eye-tracking columns are the ones referring to the left or right eye
    tracked = [col for col in df.columns if "LEFT_" in col or "RIGHT_" in col]

    cleaned = (
        df[tracked]
        .replace([np.inf, -np.inf], np.nan)   # inf -> NaN
        .dropna(subset=tracked, how="all")    # drop rows with no eye data at all
    )

    print("Eyetracking-Zeilen vorher:", len(df))
    print("Eyetracking-Zeilen nachher:", len(cleaned))

    # Fresh positional index for downstream slicing
    return cleaned.reset_index(drop=True)
def extract_gaze_signal(df):
    """
    Extract the 2D gaze position on the display from a Tobii DataFrame.

    Invalid samples (validity != 1) and ±inf values (Tobii emits these
    during blinks) are masked as NaN per eye, the two eyes are averaged
    with a NaN-skipping mean, and remaining gaps are interpolated.

    Returns
    -------
    numpy.ndarray of shape (len(df), 2)
        Columns are (x, y) gaze coordinates.
    """
    print("→ extract_gaze_signal(): Eingabegröße:", df.shape)

    # Gaze columns (one x/y pair per eye)
    gx_L = df["LEFT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy()
    gy_L = df["LEFT_GAZE_POINT_ON_DISPLAY_AREA_Y"].astype(float).copy()
    gx_R = df["RIGHT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy()
    gy_R = df["RIGHT_GAZE_POINT_ON_DISPLAY_AREA_Y"].astype(float).copy()

    # Validity columns (1 = valid)
    val_L = (df["LEFT_GAZE_POINT_VALIDITY"] == 1)
    val_R = (df["RIGHT_GAZE_POINT_VALIDITY"] == 1)

    # Replace inf with NaN (occurs with Tobii during blinks)
    gx_L.replace([np.inf, -np.inf], np.nan, inplace=True)
    gy_L.replace([np.inf, -np.inf], np.nan, inplace=True)
    gx_R.replace([np.inf, -np.inf], np.nan, inplace=True)
    gy_R.replace([np.inf, -np.inf], np.nan, inplace=True)

    # Mask invalid samples
    gx_L[~val_L] = np.nan
    gy_L[~val_L] = np.nan
    gx_R[~val_R] = np.nan
    gy_R[~val_R] = np.nan

    # Per-sample mean of both eyes. BUGFIX: the old code used np.mean, which
    # propagates NaN — a sample where only one eye was valid was discarded
    # even though the comment claimed a robust (nanmean-like) average.
    # DataFrame.mean(axis=1) skips NaN, keeping the single valid eye.
    gx = pd.concat([gx_L, gx_R], axis=1).mean(axis=1)
    gy = pd.concat([gy_L, gy_R], axis=1).mean(axis=1)

    # Interpolate remaining gaps (important for PyGaze!)
    gx = gx.interpolate(limit=50, limit_direction="both").bfill().ffill()
    gy = gy.interpolate(limit=50, limit_direction="both").bfill().ffill()

    out = np.column_stack((gx, gy))
    print("→ extract_gaze_signal(): Ausgabegröße:", out.shape)
    return out
def extract_pupil(df):
    """
    Extract the pupil diameter as the per-sample mean of both eyes.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The interpolated pupil signal and a 0/1 validity array
        (1 = at least one eye delivered a valid pupil sample).
    """
    pl = df["LEFT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan)
    pr = df["RIGHT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan)

    vl = df.get("LEFT_PUPIL_VALIDITY")
    vr = df.get("RIGHT_PUPIL_VALIDITY")

    if vl is None or vr is None:
        # No validity columns available: rough heuristic — a sample counts
        # as valid when at least one pupil diameter is not NaN.
        validity = (~pl.isna() | ~pr.isna()).astype(int).to_numpy()
    else:
        # 1 when at least one of the two eyes is valid
        validity = ((vl == 1) | (vr == 1)).astype(int).to_numpy()

    # Per-sample mean of the AVAILABLE pupils. BUGFIX: the old np.mean
    # propagated NaN, so a sample with only one measured eye became NaN —
    # contradicting the "at least one eye valid" logic above.
    # DataFrame.mean(axis=1) skips NaN and keeps the single valid eye.
    p = pd.concat([pl, pr], axis=1).mean(axis=1)

    # Repair remaining gaps by interpolation
    p = p.interpolate(limit=50, limit_direction="both").bfill().ffill()
    p = p.to_numpy()

    print("→ extract_pupil(): Pupillensignal Länge:", len(p))
    return p, validity
def detect_blinks(pupil_validity, min_duration=5):
    """
    Detect blinks in a 0/1 pupil-validity sequence (validity == 0 → blink).

    A blink is a run of invalid samples that is terminated by a valid
    sample and spans at least `min_duration` samples. A run still open at
    the end of the sequence is NOT reported.

    Returns
    -------
    list[list[int]]
        [start_index, end_index] pairs; `end_index` is the first valid
        sample after the run.
    """
    detected = []
    run_start = None
    for idx, valid in enumerate(pupil_validity):
        if valid == 0:
            # A new invalid run begins (ignore samples inside an open run)
            if run_start is None:
                run_start = idx
        elif valid == 1 and run_start is not None:
            # Run ended: keep it only if it was long enough
            if idx - run_start >= min_duration:
                detected.append([run_start, idx])
            run_start = None
    return detected
def compute_IPA(pupil, fs=250):
"""
IPA = Index of Pupillary Activity (nach Duchowski 2018).
Hochfrequenzanteile der Pupillenzeitreihe.
"""
f, Pxx = welch(pupil, fs=fs, nperseg=int(fs*2)) # 2 Sekunden Fenster
hf_band = (f >= 0.6) & (f <= 2.0)
ipa = np.sum(Pxx[hf_band])
return ipa
##############################################################################
# 2. FEATURE-EXTRAKTION (HAUPTFUNKTION)
##############################################################################
def extract_eye_features(df, window_length_sec=50, fs=250):
    """Compute eye-tracking features over consecutive, non-overlapping windows.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned Tobii eye-tracking frame (see clean_eye_df()).
    window_length_sec : int
        Window size in seconds.
    fs : int
        Sampling rate in Hz.

    Returns
    -------
    pandas.DataFrame
        One row of fixation/saccade/blink/pupil features per complete window;
        a trailing partial window is discarded.
    """
    print("→ extract_eye_features(): Starte Feature-Berechnung...")
    print(" Fensterlänge W =", window_length_sec, "s")
    W = int(window_length_sec * fs)  # window size in samples
    # Normalized 2D gaze positions (invalid samples already interpolated upstream).
    gaze = extract_gaze_signal(df)
    gx, gy = gaze[:, 0], gaze[:, 1]
    print("Gültige Werte (gx):", np.sum(~np.isnan(gx)), "von", len(gx))
    print("Range:", np.nanmin(gx), np.nanmax(gx))
    print("Gültige Werte (gy):", np.sum(~np.isnan(gy)), "von", len(gy))
    print("Range:", np.nanmin(gy), np.nanmax(gy))
    # Pupil diameter and per-sample validity flags (1 = valid).
    pupil, pupil_validity = extract_pupil(df)
    features = []
    # Non-overlapping windows of W samples each.
    for start in range(0, len(df), W):
        end = start + W
        if end > len(df):
            break  # the last, incomplete window is ignored
        w_gaze = gaze[start:end]
        w_pupil = pupil[start:end]
        w_valid = pupil_validity[start:end]
        # ----------------------------
        # FIXATIONS (PyGaze)
        # ----------------------------
        # Synthetic timestamps in ms, restarting at 0 for every window.
        time_ms = np.arange(W) * 1000.0 / fs
        # print("gx im Fenster:", w_gaze[:,0][:20])
        # print("gy im Fenster:", w_gaze[:,1][:20])
        # print("gx diff:", np.mean(np.abs(np.diff(w_gaze[:,0]))))
        # print("Werte X im Fenster:", w_gaze[:,0])
        # print("Werte Y im Fenster:", w_gaze[:,1])
        # print("X-Stats: min/max/diff", np.nanmin(w_gaze[:,0]), np.nanmax(w_gaze[:,0]), np.nanmean(np.abs(np.diff(w_gaze[:,0]))))
        # print("Y-Stats: min/max/diff", np.nanmin(w_gaze[:,1]), np.nanmax(w_gaze[:,1]), np.nanmean(np.abs(np.diff(w_gaze[:,1]))))
        print("time_ms:", time_ms)
        # NOTE(review): maxdist is in normalized gaze units and mindur in ms;
        # mindur=10 contradicts the old "mindur=100ms" note — verify the
        # intended threshold against the screen-geometry helper.
        fix, efix = fixation_detection(
            x=w_gaze[:, 0], y=w_gaze[:, 1], time=time_ms,
            missing=0.0, maxdist=0.003, mindur=10  # NOTE(review): old comment claimed 100 ms
        )
        #print("Raw Fixation Output:", efix[0])
        if start == 0:
            print("DEBUG fix raw:", fix[:10])
        # Robust fixations: PyGaze may return malformed entries, so keep only
        # finite, positive durations (f[2] is the duration in ms).
        fixation_durations = []
        for f in efix:
            print("Efix:", f[2])
            # start_t = f[1]  # in ms
            # end_t = f[2]  # in ms
            # duration = (end_t - start_t) / 1000.0  # in seconds
            #duration = f[2] / 1000.0
            if np.isfinite(f[2]) and f[2] > 0:
                fixation_durations.append(f[2])
        # Duration categories as defined in the reference paper (all in ms).
        F_short = sum(66 <= d <= 150 for d in fixation_durations)
        F_medium = sum(300 <= d <= 500 for d in fixation_durations)
        F_long = sum(d >= 1000 for d in fixation_durations)
        F_hundred = sum(d > 100 for d in fixation_durations)
        F_Cancel = sum(66 < d for d in fixation_durations)
        # ----------------------------
        # SACCADES
        # ----------------------------
        sac, esac = saccade_detection(
            x=w_gaze[:, 0], y=w_gaze[:, 1], time=time_ms, missing=0, minlen=12, maxvel=0.2, maxacc=1
        )
        # s[2] is treated as duration and s[3..6] as start/end coordinates
        # (PyGaze Esac layout — confirm), so the amplitude below is the
        # start→end Euclidean distance.
        sac_durations = [s[2] for s in esac]
        sac_amplitudes = [((s[5]-s[3])**2 + (s[6]-s[4])**2)**0.5 for s in esac]
        # ----------------------------
        # BLINKS
        # ----------------------------
        # Blink = run of invalid pupil samples; durations converted to seconds.
        blinks = detect_blinks(w_valid)
        blink_durations = [(b[1] - b[0]) / fs for b in blinks]
        # ----------------------------
        # PUPIL
        # ----------------------------
        # Degenerate case: a fully missing pupil window yields NaN features.
        if np.all(np.isnan(w_pupil)):
            mean_pupil = np.nan
            ipa = np.nan
        else:
            mean_pupil = np.nanmean(w_pupil)
            ipa = compute_IPA(w_pupil, fs=fs)
        # ----------------------------
        # FILL FEATURE TABLE
        # ----------------------------
        features.append({
            "Fix_count_short_66_150": F_short,
            "Fix_count_medium_300_500": F_medium,
            "Fix_count_long_gt_1000": F_long,
            "Fix_count_100": F_hundred,
            "Fix_cancel": F_Cancel,
            "Fix_mean_duration": np.mean(fixation_durations) if fixation_durations else 0,
            "Fix_median_duration": np.median(fixation_durations) if fixation_durations else 0,
            "Sac_count": len(sac),
            "Sac_mean_amp": np.mean(sac_amplitudes) if sac_amplitudes else 0,
            "Sac_mean_dur": np.mean(sac_durations) if sac_durations else 0,
            "Sac_median_dur": np.median(sac_durations) if sac_durations else 0,
            "Blink_count": len(blinks),
            "Blink_mean_dur": np.mean(blink_durations) if blink_durations else 0,
            "Blink_median_dur": np.median(blink_durations) if blink_durations else 0,
            "Pupil_mean": mean_pupil,
            "Pupil_IPA": ipa
        })
    result = pd.DataFrame(features)
    print("→ extract_eye_features(): Fertig! Ergebnisgröße:", result.shape)
    return result
##############################################################################
# 3. MAIN FUNKTION
##############################################################################
def main():
    """Load a cleaned recording, extract eye-tracking features, save as CSV."""
    print("### STARTE FEATURE-EXTRAKTION ###")
    print("Aktueller Arbeitsordner:", os.getcwd())
    #df = pd.read_hdf("tmp22.h5", "SIGNALS", mode="r")
    df = pd.read_parquet("cleaned_0001.parquet")
    print("DataFrame geladen:", df.shape)
    print("Reinige Eyetracking-Daten ...")
    df_eye = clean_eye_df(df)
    # Feature extraction: 50 s non-overlapping windows at 250 Hz.
    features = extract_eye_features(df_eye, window_length_sec=50, fs=250)
    print("\n### FEATURE-MATRIX (HEAD) ###")
    print(features.head())
    # Bug fix: the log message claimed "features.csv" while the data was
    # actually written to "features4.csv" — keep filename and message in sync.
    output_csv = "features4.csv"
    print(f"\nSpeichere Output in {output_csv} ...")
    features.to_csv(output_csv, index=False)
    print("FERTIG!")


if __name__ == "__main__":
    main()

View File

@ -0,0 +1,441 @@
import numpy as np
import pandas as pd
import h5py
import yaml
import os
from pathlib import Path
from sklearn.preprocessing import MinMaxScaler
from scipy.signal import welch
from pygazeanalyser.detectors import fixation_detection, saccade_detection
##############################################################################
# KONFIGURATION - HIER ANPASSEN!
##############################################################################
# Directory scanned for per-recording *.parquet files.
INPUT_DIR = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/new_ET_Parquet_files/")
# Destination of the combined feature table.
OUTPUT_FILE = Path(r"/home/jovyan/data-paulusjafahrsimulator-gpu/Eye_dataset_old/new_eye_dataset.parquet")
WINDOW_SIZE_SAMPLES = 12500  # samples per window (12500 = 50 s at 250 Hz)
STEP_SIZE_SAMPLES = 1250  # hop between windows (1250 = 5 s at 250 Hz, i.e. 90% overlap)
SAMPLING_RATE = 250  # Hz
##############################################################################
# 1. HELFERFUNKTIONEN
##############################################################################
def clean_eye_df(df):
    """Drop rows that carry no eye-tracking data at all.

    Works around the main DataFrame containing NaN-only rows that belong to
    other sensors: keeps only the EYE_* columns, maps ±inf to NaN, removes
    rows where every eye column is missing, and resets the index.
    """
    eye_cols = [column for column in df.columns if column.startswith("EYE_")]
    cleaned = (
        df[eye_cols]
        .replace([np.inf, -np.inf], np.nan)
        .dropna(subset=eye_cols, how="all")
    )
    print(f" Eyetracking-Zeilen: {len(df)}{len(cleaned)}")
    return cleaned.reset_index(drop=True)
def extract_gaze_signal(df):
    """Return an (N, 2) array of min-max scaled 2D gaze positions on the display.

    Samples flagged invalid (validity != 1) and ±inf values are masked to NaN,
    gaps are linearly interpolated (up to 50 samples) and the edges are back-/
    forward-filled, then each axis is scaled to [0, 1] independently.
    Note: the per-sample eye average uses np.mean, so a sample becomes NaN
    (and is later interpolated) whenever either eye is missing.
    """
    def _masked(column_name, valid_mask):
        # One gaze axis of one eye: float copy, ±inf → NaN, invalid → NaN.
        axis = df[column_name].astype(float).copy()
        axis = axis.replace([np.inf, -np.inf], np.nan)
        axis[~valid_mask] = np.nan
        return axis

    # Validity flags (1 = valid sample).
    valid_left = (df["EYE_LEFT_GAZE_POINT_VALIDITY"] == 1)
    valid_right = (df["EYE_RIGHT_GAZE_POINT_VALIDITY"] == 1)

    x_left = _masked("EYE_LEFT_GAZE_POINT_ON_DISPLAY_AREA_X", valid_left)
    y_left = _masked("EYE_LEFT_GAZE_POINT_ON_DISPLAY_AREA_Y", valid_left)
    x_right = _masked("EYE_RIGHT_GAZE_POINT_ON_DISPLAY_AREA_X", valid_right)
    y_right = _masked("EYE_RIGHT_GAZE_POINT_ON_DISPLAY_AREA_Y", valid_right)

    # Per-sample average of both eyes (NaN if either eye is missing).
    mean_x = np.mean(np.column_stack([x_left, x_right]), axis=1)
    mean_y = np.mean(np.column_stack([y_left, y_right]), axis=1)

    # Gap interpolation is required for the PyGaze detectors downstream.
    filled_x = pd.Series(mean_x).interpolate(limit=50, limit_direction="both").bfill().ffill()
    filled_y = pd.Series(mean_y).interpolate(limit=50, limit_direction="both").bfill().ffill()

    scaled_x = MinMaxScaler().fit_transform(filled_x.values.reshape(-1, 1))
    scaled_y = MinMaxScaler().fit_transform(filled_y.values.reshape(-1, 1))
    return np.column_stack((scaled_x, scaled_y))
def extract_pupil(df):
    """Return (pupil_signal, validity) for the averaged left/right pupil.

    pupil_signal : ndarray — per-sample mean of both pupil diameters; NaN when
        either eye is missing, then gap-interpolated (up to 50 samples) and
        edge-filled.
    validity : ndarray of 0/1 — 1 when at least one eye is valid; when the
        validity columns are absent, falls back to "a diameter is present".
    """
    left = df["EYE_LEFT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan)
    right = df["EYE_RIGHT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan)

    left_valid = df.get("EYE_LEFT_PUPIL_VALIDITY")
    right_valid = df.get("EYE_RIGHT_PUPIL_VALIDITY")
    if left_valid is None or right_valid is None:
        # Heuristic fallback: valid wherever either diameter was recorded.
        validity = (left.notna() | right.notna()).astype(int).to_numpy()
    else:
        validity = ((left_valid == 1) | (right_valid == 1)).astype(int).to_numpy()

    averaged = np.mean(np.column_stack([left, right]), axis=1)
    repaired = (
        pd.Series(averaged)
        .interpolate(limit=50, limit_direction="both")
        .bfill()
        .ffill()
        .to_numpy()
    )
    return repaired, validity
def detect_blinks(pupil_validity, min_duration=5):
    """Detect blinks as runs of invalid pupil samples (validity == 0).

    Parameters
    ----------
    pupil_validity : sequence of 0/1 flags, one per sample.
    min_duration : int
        Minimum run length in samples for a run to count as a blink.

    Returns
    -------
    list of [start, end] sample indices (end exclusive).
    """
    blinks = []
    start = None
    for i, v in enumerate(pupil_validity):
        if v == 0 and start is None:
            start = i
        elif v == 1 and start is not None:
            if i - start >= min_duration:
                blinks.append([start, i])
            start = None
    # Bug fix: a blink still in progress at the end of the signal used to be
    # silently dropped; close it against the signal length instead.
    if start is not None and len(pupil_validity) - start >= min_duration:
        blinks.append([start, len(pupil_validity)])
    return blinks
def compute_IPA(pupil, fs=250):
    """Index of Pupillary Activity (after Duchowski 2018).

    Estimates the high-frequency power (0.6–2.0 Hz) of the pupil-diameter
    time series via Welch's method using 2-second segments.

    Parameters
    ----------
    pupil : 1-D array-like of pupil diameters (should be gap-free).
    fs : int
        Sampling rate in Hz.

    Returns
    -------
    float — summed PSD over the 0.6–2.0 Hz band.
    """
    pupil = np.asarray(pupil, dtype=float)
    # Robustness fix: scipy.signal.welch warns and silently truncates when
    # nperseg exceeds the signal length, so clamp it explicitly for short
    # windows.
    nperseg = min(int(fs * 2), len(pupil))  # 2-second segments
    f, Pxx = welch(pupil, fs=fs, nperseg=nperseg)
    hf_band = (f >= 0.6) & (f <= 2.0)
    return float(np.sum(Pxx[hf_band]))
##############################################################################
# 2. FEATURE-EXTRAKTION MIT SLIDING WINDOW
##############################################################################
def extract_eye_features_sliding(df_eye, df_meta, window_size, step_size, fs=250):
    """
    Extract eye-tracking features from a single level/phase with a sliding window.

    Parameters
    ----------
    df_eye : DataFrame
        Eye-tracking data (already cleaned, index reset to 0..n-1).
    df_meta : DataFrame
        Metadata (subjectID, rowID, STUDY, LEVEL, PHASE); assumed to be
        positionally aligned with df_eye — row i describes sample i.
    window_size : int
        Samples per window.
    step_size : int
        Hop between consecutive windows, in samples.
    fs : int
        Sampling rate in Hz.

    Returns
    -------
    DataFrame with one feature row per window; empty when the input is
    shorter than a single window.
    """
    # Normalized 2D gaze positions.
    gaze = extract_gaze_signal(df_eye)
    # Pupil diameter and per-sample validity flags (1 = valid).
    pupil, pupil_validity = extract_pupil(df_eye)
    features = []
    # Number of complete windows; a trailing partial window is dropped.
    num_windows = (len(df_eye) - window_size) // step_size + 1
    if num_windows <= 0:
        return pd.DataFrame()
    for i in range(num_windows):
        start_idx = i * step_size
        end_idx = start_idx + window_size
        w_gaze = gaze[start_idx:end_idx]
        w_pupil = pupil[start_idx:end_idx]
        w_valid = pupil_validity[start_idx:end_idx]
        # Metadata of the window's first sample represents the whole window.
        meta_row = df_meta.iloc[start_idx]
        # ----------------------------
        # FIXATIONS (PyGaze)
        # ----------------------------
        # Synthetic timestamps in ms, restarting at 0 for every window.
        time_ms = np.arange(window_size) * 1000.0 / fs
        # NOTE(review): maxdist is in normalized gaze units, mindur in ms —
        # hand-tuned values; confirm against the screen-geometry helper.
        fix, efix = fixation_detection(
            x=w_gaze[:, 0], y=w_gaze[:, 1], time=time_ms,
            missing=0.0, maxdist=0.003, mindur=10
        )
        # PyGaze can emit malformed entries, so keep only finite, positive
        # durations (f[2] is the fixation duration in ms).
        fixation_durations = []
        for f in efix:
            if np.isfinite(f[2]) and f[2] > 0:
                fixation_durations.append(f[2])
        # Duration categories as defined in the reference paper (all in ms).
        F_short = sum(66 <= d <= 150 for d in fixation_durations)
        F_medium = sum(300 <= d <= 500 for d in fixation_durations)
        F_long = sum(d >= 1000 for d in fixation_durations)
        F_hundred = sum(d > 100 for d in fixation_durations)
        # F_Cancel = sum(66 < d for d in fixation_durations)
        # ----------------------------
        # SACCADES
        # ----------------------------
        sac, esac = saccade_detection(
            x=w_gaze[:, 0], y=w_gaze[:, 1], time=time_ms,
            missing=0, minlen=12, maxvel=0.2, maxacc=1
        )
        # s[2] is treated as duration, s[3..6] as start/end coordinates
        # (PyGaze Esac layout — confirm); amplitude = start→end distance.
        sac_durations = [s[2] for s in esac]
        sac_amplitudes = [((s[5]-s[3])**2 + (s[6]-s[4])**2)**0.5 for s in esac]
        # ----------------------------
        # BLINKS
        # ----------------------------
        # Blink = run of invalid pupil samples; durations converted to seconds.
        blinks = detect_blinks(w_valid)
        blink_durations = [(b[1] - b[0]) / fs for b in blinks]
        # ----------------------------
        # PUPIL
        # ----------------------------
        # Degenerate case: a fully missing pupil window yields NaN features.
        if np.all(np.isnan(w_pupil)):
            mean_pupil = np.nan
            ipa = np.nan
        else:
            mean_pupil = np.nanmean(w_pupil)
            ipa = compute_IPA(w_pupil, fs=fs)
        # ----------------------------
        # FEATURE DICTIONARY
        # ----------------------------
        features.append({
            # Metadata
            'subjectID': meta_row['subjectID'],
            'start_time': meta_row['rowID'],
            'STUDY': meta_row.get('STUDY', np.nan),
            'LEVEL': meta_row.get('LEVEL', np.nan),
            'PHASE': meta_row.get('PHASE', np.nan),
            # Fixation features
            "Fix_count_short_66_150": F_short,
            "Fix_count_medium_300_500": F_medium,
            "Fix_count_long_gt_1000": F_long,
            "Fix_count_100": F_hundred,
            # "Fix_cancel": F_Cancel,
            "Fix_mean_duration": np.mean(fixation_durations) if fixation_durations else 0,
            "Fix_median_duration": np.median(fixation_durations) if fixation_durations else 0,
            # Saccade features
            "Sac_count": len(sac),
            "Sac_mean_amp": np.mean(sac_amplitudes) if sac_amplitudes else 0,
            "Sac_mean_dur": np.mean(sac_durations) if sac_durations else 0,
            "Sac_median_dur": np.median(sac_durations) if sac_durations else 0,
            # Blink features
            "Blink_count": len(blinks),
            "Blink_mean_dur": np.mean(blink_durations) if blink_durations else 0,
            "Blink_median_dur": np.median(blink_durations) if blink_durations else 0,
            # Pupil features
            "Pupil_mean": mean_pupil,
            "Pupil_IPA": ipa
        })
    return pd.DataFrame(features)
##############################################################################
# 3. BATCH-VERARBEITUNG
##############################################################################
def process_parquet_directory(input_dir, output_file, window_size, step_size, fs=250):
    """
    Process every Parquet file in a directory and build one combined feature table.

    Parameters
    ----------
    input_dir : str or Path
        Directory containing the per-recording Parquet files.
    output_file : str or Path
        Destination path for the combined feature Parquet file.
    window_size : int
        Window size in samples.
    step_size : int
        Hop between consecutive windows in samples.
    fs : int
        Sampling rate in Hz.

    Returns
    -------
    pandas.DataFrame or None
        The combined feature table, or None when nothing could be extracted.
    """
    input_path = Path(input_dir)
    parquet_files = sorted(input_path.glob("*.parquet"))
    if not parquet_files:
        print(f"FEHLER: Keine Parquet-Dateien in {input_dir} gefunden!")
        return
    print(f"\n{'='*70}")
    print(f"STARTE BATCH-VERARBEITUNG")
    print(f"{'='*70}")
    print(f"Gefundene Dateien: {len(parquet_files)}")
    print(f"Window Size: {window_size} Samples ({window_size/fs:.1f}s bei {fs}Hz)")
    print(f"Step Size: {step_size} Samples ({step_size/fs:.1f}s bei {fs}Hz)")
    print(f"{'='*70}\n")
    all_features = []
    for file_idx, parquet_file in enumerate(parquet_files, 1):
        print(f"\n[{file_idx}/{len(parquet_files)}] Verarbeite: {parquet_file.name}")
        try:
            df = pd.read_parquet(parquet_file)
            print(f" Einträge geladen: {len(df)}")
            # Skip files that lack the mandatory metadata columns.
            required_cols = ['subjectID', 'rowID']
            missing_cols = [col for col in required_cols if col not in df.columns]
            if missing_cols:
                print(f" WARNUNG: Fehlende Spalten: {missing_cols} - Überspringe Datei")
                continue
            # Clean the eye-tracking signal columns.
            df_eye = clean_eye_df(df)
            if len(df_eye) == 0:
                print(f" WARNUNG: Keine gültigen Eye-Tracking-Daten - Überspringe Datei")
                continue
            # Collect the metadata columns that exist in this file.
            meta_cols = ['subjectID', 'rowID']
            for optional_col in ('STUDY', 'LEVEL', 'PHASE'):
                if optional_col in df.columns:
                    meta_cols.append(optional_col)
            # Bug fix: clean_eye_df() resets the index, so df_eye.index is just
            # 0..n-1 and the old `df[meta_cols].iloc[df_eye.index]` grabbed the
            # FIRST n rows of df instead of the rows that survived cleaning,
            # misaligning metadata whenever dropped rows were not all at the
            # end. Recompute the surviving-row mask (same criterion as
            # clean_eye_df) to keep metadata positionally aligned with df_eye.
            eye_cols = [c for c in df.columns if c.startswith("EYE_")]
            has_eye_data = (
                df[eye_cols].replace([np.inf, -np.inf], np.nan).notna().any(axis=1)
            )
            df_meta = df.loc[has_eye_data, meta_cols].reset_index(drop=True)
            assert len(df_meta) == len(df_eye), "metadata/eye-data misalignment"
            # Group by STUDY, LEVEL, PHASE where those columns exist, so
            # windows never straddle a condition boundary.
            group_cols = [col for col in ['STUDY', 'LEVEL', 'PHASE'] if col in df_meta.columns]
            if group_cols:
                print(f" Gruppiere nach: {', '.join(group_cols)}")
                for group_vals, group_df in df_meta.groupby(group_cols, sort=False):
                    # group_df.index holds positions in the reset df_meta,
                    # which are positionally aligned with df_eye.
                    group_eye = df_eye.iloc[group_df.index].reset_index(drop=True)
                    group_meta = group_df.reset_index(drop=True)
                    print(f" Gruppe {group_vals}: {len(group_eye)} Samples", end="")
                    features_df = extract_eye_features_sliding(
                        group_eye, group_meta, window_size, step_size, fs
                    )
                    if not features_df.empty:
                        all_features.append(features_df)
                        print(f"{len(features_df)} Windows")
                    else:
                        print("Zu wenige Daten")
            else:
                # No grouping columns: process the whole recording at once.
                print(f" Keine Gruppierungsspalten gefunden")
                features_df = extract_eye_features_sliding(
                    df_eye, df_meta, window_size, step_size, fs
                )
                if not features_df.empty:
                    all_features.append(features_df)
                    print(f"{len(features_df)} Windows erstellt")
                else:
                    print(f" → Zu wenige Daten")
        except Exception as e:
            # Keep the batch alive: report the failure and move on.
            print(f" FEHLER bei Verarbeitung: {str(e)}")
            import traceback
            traceback.print_exc()
            continue
    # Combine all per-file/per-group feature frames.
    if not all_features:
        print("\nKEINE FEATURES EXTRAHIERT!")
        return None
    print(f"\n{'='*70}")
    print(f"ZUSAMMENFASSUNG")
    print(f"{'='*70}")
    final_df = pd.concat(all_features, ignore_index=True)
    print(f"Gesamt Windows: {len(final_df)}")
    print(f"Spalten: {len(final_df.columns)}")
    print(f"Subjects: {final_df['subjectID'].nunique()}")
    # Persist the combined result.
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    final_df.to_parquet(output_file, index=False)
    print(f"\n✓ Ergebnis gespeichert: {output_file}")
    print(f"{'='*70}\n")
    return final_df
##############################################################################
# 4. MAIN
##############################################################################
def main():
    """Entry point: run the batch feature extraction with the module-level config."""
    banner = "=" * 70
    print("\n" + banner)
    print("EYE-TRACKING FEATURE EXTRAKTION - BATCH MODE")
    print(banner)
    result = process_parquet_directory(
        input_dir=INPUT_DIR,
        output_file=OUTPUT_FILE,
        window_size=WINDOW_SIZE_SAMPLES,
        step_size=STEP_SIZE_SAMPLES,
        fs=SAMPLING_RATE,
    )
    if result is not None:
        print("\nErste 5 Zeilen des Ergebnisses:")
        print(result.head())
        print("\nSpalten-Übersicht:")
        print(result.columns.tolist())
        print("\nDatentypen:")
        print(result.dtypes)
        print("\n✓ FERTIG!\n")


if __name__ == "__main__":
    main()

View File

@ -0,0 +1,323 @@
import numpy as np
import pandas as pd
import h5py
import yaml
import owncloud
import os
from sklearn.preprocessing import MinMaxScaler
from scipy.signal import welch
from pygazeanalyser.detectors import fixation_detection, saccade_detection
##############################################################################
# 1. HELFERFUNKTIONEN
##############################################################################
def clean_eye_df(df):
    """Remove rows that contain no real eye-tracking samples.

    The main DataFrame interleaves NaN-only rows that belong to other sensors.
    This keeps only columns whose name contains "EYE_", treats ±inf as missing,
    drops rows where all those columns are NaN, and resets the index.
    """
    eye_cols = [name for name in df.columns if "EYE_" in name]
    kept = (
        df[eye_cols]
        .replace([np.inf, -np.inf], np.nan)
        .dropna(subset=eye_cols, how="all")
    )
    print("Eyetracking-Zeilen vorher:", len(df))
    print("Eyetracking-Zeilen nachher:", len(kept))
    return kept.reset_index(drop=True)
def extract_gaze_signal(df):
    """Return an (N, 2) array of min-max scaled 2D gaze positions on the display.

    Samples flagged invalid (validity != 1) and ±inf values are masked to NaN,
    gaps are interpolated (up to 50 samples) and edge-filled, then each axis
    is scaled to [0, 1] independently.
    """
    print("→ extract_gaze_signal(): Eingabegröße:", df.shape)
    # Gaze columns (float copies so masking does not touch the input frame).
    gx_L = df["EYE_LEFT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy()
    gy_L = df["EYE_LEFT_GAZE_POINT_ON_DISPLAY_AREA_Y"].astype(float).copy()
    gx_R = df["EYE_RIGHT_GAZE_POINT_ON_DISPLAY_AREA_X"].astype(float).copy()
    gy_R = df["EYE_RIGHT_GAZE_POINT_ON_DISPLAY_AREA_Y"].astype(float).copy()
    # Validity columns (1 = valid)
    val_L = (df["EYE_LEFT_GAZE_POINT_VALIDITY"] == 1)
    val_R = (df["EYE_RIGHT_GAZE_POINT_VALIDITY"] == 1)
    # Tobii reports ±inf during blinks — treat as missing.
    gx_L.replace([np.inf, -np.inf], np.nan, inplace=True)
    gy_L.replace([np.inf, -np.inf], np.nan, inplace=True)
    gx_R.replace([np.inf, -np.inf], np.nan, inplace=True)
    gy_R.replace([np.inf, -np.inf], np.nan, inplace=True)
    # Mask invalid samples.
    gx_L[~val_L] = np.nan
    gy_L[~val_L] = np.nan
    gx_R[~val_R] = np.nan
    gy_R[~val_R] = np.nan
    # Per-sample average of both eyes (np.mean → NaN if either eye is missing,
    # which the interpolation below then fills).
    gx = np.mean(np.column_stack([gx_L, gx_R]), axis=1)
    gy = np.mean(np.column_stack([gy_L, gy_R]), axis=1)
    # Interpolation (required for the PyGaze detectors downstream).
    gx = pd.Series(gx).interpolate(limit=50, limit_direction="both").bfill().ffill()
    gy = pd.Series(gy).interpolate(limit=50, limit_direction="both").bfill().ffill()
    xscaler = MinMaxScaler()
    gxscale = xscaler.fit_transform(gx.values.reshape(-1, 1))
    yscaler = MinMaxScaler()
    # Bug fix: the y axis was scaled from gx, so the output's second column
    # duplicated x and all real y information was lost. Scale gy instead
    # (matches the batch-pipeline version of this function).
    gyscale = yscaler.fit_transform(gy.values.reshape(-1, 1))
    print("xmax ymax", gxscale.max(), gyscale.max())
    out = np.column_stack((gxscale, gyscale))
    print("→ extract_gaze_signal(): Ausgabegröße:", out.shape)
    return out
def extract_pupil(df):
    """Return (pupil_signal, validity) for the averaged left/right pupil diameter.

    validity is 1 where at least one eye is valid; when the validity columns
    are missing, a sample counts as valid if either diameter is present.
    The averaged signal is NaN when either eye is missing and is then
    gap-interpolated (up to 50 samples) and edge-filled.
    """
    left = df["EYE_LEFT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan)
    right = df["EYE_RIGHT_PUPIL_DIAMETER"].replace([np.inf, -np.inf], np.nan)

    left_valid = df.get("EYE_LEFT_PUPIL_VALIDITY")
    right_valid = df.get("EYE_RIGHT_PUPIL_VALIDITY")
    if left_valid is None or right_valid is None:
        # Heuristic fallback when no validity columns exist:
        # valid wherever either diameter was recorded.
        validity = (left.notna() | right.notna()).astype(int).to_numpy()
    else:
        # 1 when at least one of the two eyes is flagged valid.
        validity = ((left_valid == 1) | (right_valid == 1)).astype(int).to_numpy()

    averaged = np.mean(np.column_stack([left, right]), axis=1)
    repaired = (
        pd.Series(averaged)
        .interpolate(limit=50, limit_direction="both")
        .bfill()
        .ffill()
        .to_numpy()
    )
    print("→ extract_pupil(): Pupillensignal Länge:", len(repaired))
    return repaired, validity
def detect_blinks(pupil_validity, min_duration=5):
    """Detect blinks as runs of invalid pupil samples (validity == 0).

    Parameters
    ----------
    pupil_validity : sequence of 0/1 flags, one per sample.
    min_duration : int
        Minimum run length in samples for a run to count as a blink.

    Returns
    -------
    list of [start, end] sample indices (end exclusive).
    """
    blinks = []
    start = None
    for i, v in enumerate(pupil_validity):
        if v == 0 and start is None:
            start = i
        elif v == 1 and start is not None:
            if i - start >= min_duration:
                blinks.append([start, i])
            start = None
    # Bug fix: a blink still in progress at the end of the signal used to be
    # silently dropped; close it against the signal length instead.
    if start is not None and len(pupil_validity) - start >= min_duration:
        blinks.append([start, len(pupil_validity)])
    return blinks
def compute_IPA(pupil, fs=250):
    """Index of Pupillary Activity (after Duchowski 2018).

    Estimates the high-frequency power (0.6–2.0 Hz) of the pupil-diameter
    time series via Welch's method using 2-second segments.

    Parameters
    ----------
    pupil : 1-D array-like of pupil diameters (should be gap-free).
    fs : int
        Sampling rate in Hz.

    Returns
    -------
    float — summed PSD over the 0.6–2.0 Hz band.
    """
    pupil = np.asarray(pupil, dtype=float)
    # Robustness fix: scipy.signal.welch warns and silently truncates when
    # nperseg exceeds the signal length, so clamp it explicitly for short
    # windows.
    nperseg = min(int(fs * 2), len(pupil))  # 2-second segments
    f, Pxx = welch(pupil, fs=fs, nperseg=nperseg)
    hf_band = (f >= 0.6) & (f <= 2.0)
    return float(np.sum(Pxx[hf_band]))
##############################################################################
# 2. FEATURE-EXTRAKTION (HAUPTFUNKTION)
##############################################################################
def extract_eye_features(df, window_length_sec=50, fs=250):
    """Compute eye-tracking features over consecutive, non-overlapping windows.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned Tobii eye-tracking frame (see clean_eye_df()).
    window_length_sec : int
        Window size in seconds.
    fs : int
        Sampling rate in Hz.

    Returns
    -------
    pandas.DataFrame
        One row of fixation/saccade/blink/pupil features per complete window;
        a trailing partial window is discarded.
    """
    print("→ extract_eye_features(): Starte Feature-Berechnung...")
    print(" Fensterlänge W =", window_length_sec, "s")
    W = int(window_length_sec * fs)  # window size in samples
    # Normalized 2D gaze positions (invalid samples already interpolated upstream).
    gaze = extract_gaze_signal(df)
    gx, gy = gaze[:, 0], gaze[:, 1]
    print("Gültige Werte (gx):", np.sum(~np.isnan(gx)), "von", len(gx))
    print("Range:", np.nanmin(gx), np.nanmax(gx))
    print("Gültige Werte (gy):", np.sum(~np.isnan(gy)), "von", len(gy))
    print("Range:", np.nanmin(gy), np.nanmax(gy))
    # Pupil diameter and per-sample validity flags (1 = valid).
    pupil, pupil_validity = extract_pupil(df)
    features = []
    # Non-overlapping windows of W samples each.
    for start in range(0, len(df), W):
        end = start + W
        if end > len(df):
            break  # the last, incomplete window is ignored
        w_gaze = gaze[start:end]
        w_pupil = pupil[start:end]
        w_valid = pupil_validity[start:end]
        # ----------------------------
        # FIXATIONS (PyGaze)
        # ----------------------------
        # Synthetic timestamps in ms, restarting at 0 for every window.
        time_ms = np.arange(W) * 1000.0 / fs
        # print("gx im Fenster:", w_gaze[:,0][:20])
        # print("gy im Fenster:", w_gaze[:,1][:20])
        # print("gx diff:", np.mean(np.abs(np.diff(w_gaze[:,0]))))
        # print("Werte X im Fenster:", w_gaze[:,0])
        # print("Werte Y im Fenster:", w_gaze[:,1])
        # print("X-Stats: min/max/diff", np.nanmin(w_gaze[:,0]), np.nanmax(w_gaze[:,0]), np.nanmean(np.abs(np.diff(w_gaze[:,0]))))
        # print("Y-Stats: min/max/diff", np.nanmin(w_gaze[:,1]), np.nanmax(w_gaze[:,1]), np.nanmean(np.abs(np.diff(w_gaze[:,1]))))
        print("time_ms:", time_ms)
        # NOTE(review): maxdist is in normalized gaze units and mindur in ms;
        # mindur=65 contradicts the old "mindur=100ms" note — verify the
        # intended threshold against the screen-geometry helper.
        fix, efix = fixation_detection(
            x=w_gaze[:, 0], y=w_gaze[:, 1], time=time_ms,
            missing=0.0, maxdist=0.001, mindur=65  # NOTE(review): old comment claimed 100 ms
        )
        #print("Raw Fixation Output:", efix[0])
        if start == 0:
            print("DEBUG fix raw:", fix[:10])
        # Robust fixations: PyGaze may return malformed entries, so keep only
        # finite, positive durations (f[2] is the duration in ms).
        fixation_durations = []
        for f in efix:
            print("Efix:", f[2])
            # start_t = f[1]  # in ms
            # end_t = f[2]  # in ms
            # duration = (end_t - start_t) / 1000.0  # in seconds
            #duration = f[2] / 1000.0
            if np.isfinite(f[2]) and f[2] > 0:
                fixation_durations.append(f[2])
        # Duration categories as defined in the reference paper (all in ms).
        F_short = sum(66 <= d <= 150 for d in fixation_durations)
        F_medium = sum(300 <= d <= 500 for d in fixation_durations)
        F_long = sum(d >= 1000 for d in fixation_durations)
        F_hundred = sum(d > 100 for d in fixation_durations)
        F_Cancel = sum(66 < d for d in fixation_durations)
        # ----------------------------
        # SACCADES
        # ----------------------------
        sac, esac = saccade_detection(
            x=w_gaze[:, 0], y=w_gaze[:, 1], time=time_ms, missing=0, minlen=12, maxvel=0.2, maxacc=1
        )
        # s[2] is treated as duration and s[3..6] as start/end coordinates
        # (PyGaze Esac layout — confirm); amplitude = start→end distance.
        sac_durations = [s[2] for s in esac]
        sac_amplitudes = [((s[5]-s[3])**2 + (s[6]-s[4])**2)**0.5 for s in esac]
        # ----------------------------
        # BLINKS
        # ----------------------------
        # Blink = run of invalid pupil samples; durations converted to seconds.
        blinks = detect_blinks(w_valid)
        blink_durations = [(b[1] - b[0]) / fs for b in blinks]
        # ----------------------------
        # PUPIL
        # ----------------------------
        # Degenerate case: a fully missing pupil window yields NaN features.
        if np.all(np.isnan(w_pupil)):
            mean_pupil = np.nan
            ipa = np.nan
        else:
            mean_pupil = np.nanmean(w_pupil)
            ipa = compute_IPA(w_pupil, fs=fs)
        # ----------------------------
        # FILL FEATURE TABLE
        # ----------------------------
        features.append({
            "Fix_count_short_66_150": F_short,
            "Fix_count_medium_300_500": F_medium,
            "Fix_count_long_gt_1000": F_long,
            "Fix_count_100": F_hundred,
            "Fix_cancel": F_Cancel,
            "Fix_mean_duration": np.mean(fixation_durations) if fixation_durations else 0,
            "Fix_median_duration": np.median(fixation_durations) if fixation_durations else 0,
            "Sac_count": len(sac),
            "Sac_mean_amp": np.mean(sac_amplitudes) if sac_amplitudes else 0,
            "Sac_mean_dur": np.mean(sac_durations) if sac_durations else 0,
            "Sac_median_dur": np.median(sac_durations) if sac_durations else 0,
            "Blink_count": len(blinks),
            "Blink_mean_dur": np.mean(blink_durations) if blink_durations else 0,
            "Blink_median_dur": np.median(blink_durations) if blink_durations else 0,
            "Pupil_mean": mean_pupil,
            "Pupil_IPA": ipa
        })
    result = pd.DataFrame(features)
    print("→ extract_eye_features(): Fertig! Ergebnisgröße:", result.shape)
    return result
##############################################################################
# 3. MAIN FUNKTION
##############################################################################
def main():
    """Load a recording from HDF5, extract eye-tracking features, save as CSV."""
    print("### STARTE FEATURE-EXTRAKTION ###")
    print("Aktueller Arbeitsordner:", os.getcwd())
    df = pd.read_hdf("tmp22.h5", "SIGNALS", mode="r")
    #df = pd.read_parquet("cleaned_0001.parquet")
    print("DataFrame geladen:", df.shape)
    print("Reinige Eyetracking-Daten ...")
    df_eye = clean_eye_df(df)
    # Feature extraction: 50 s non-overlapping windows at 250 Hz.
    features = extract_eye_features(df_eye, window_length_sec=50, fs=250)
    print("\n### FEATURE-MATRIX (HEAD) ###")
    print(features.head())
    # Bug fix: the log message claimed "features.csv" while the data was
    # actually written to "features2.csv" — keep filename and message in sync.
    output_csv = "features2.csv"
    print(f"\nSpeichere Output in {output_csv} ...")
    features.to_csv(output_csv, index=False)
    print("FERTIG!")


if __name__ == "__main__":
    main()

View File

@ -1,22 +1,19 @@
import math
def fixation_radius_normalized(
theta_deg: float,
def fixation_radius_normalized(theta_deg: float,
distance_cm: float,
screen_width_cm: float,
screen_height_cm: float,
resolution_x: int,
resolution_y: int,
method: str = "max",
):
method: str = "max"):
"""
Compute the PyGaze fixation radius for normalized gaze data in [0, 1].
Berechnet den PyGaze-Fixationsradius für normierte Gaze-Daten in [0,1].
"""
# Visual angle to physical distance (cm)
# Schritt 1: visueller Winkel → physische Distanz (cm)
delta_cm = 2 * distance_cm * math.tan(math.radians(theta_deg) / 2)
# Physical distance to pixels
# Schritt 2: physische Distanz → Pixel
delta_px_x = delta_cm * (resolution_x / screen_width_cm)
delta_px_y = delta_cm * (resolution_y / screen_height_cm)
@ -26,54 +23,50 @@ def fixation_radius_normalized(
else:
r_px = math.sqrt(delta_px_x**2 + delta_px_y**2)
# Pixel radius to normalized radius
# Schritt 3: Pixelradius → normierter Radius
r_norm_x = r_px / resolution_x
r_norm_y = r_px / resolution_y
if method == "max":
return max(r_norm_x, r_norm_y)
else:
return math.sqrt(r_norm_x**2 + r_norm_y**2)
def run_example():
# Example: 55" 4k monitor
# Beispiel: 55" 4k Monitor
screen_width_cm = 3*121.8
screen_height_cm = 68.5
resolution_x = 3*3840
resolution_y = 2160
distance_to_screen_cm = 120
method = 'max'
max_angle= 1.0
maxdist_px = fixation_radius_normalized(
theta_deg=max_angle,
maxdist_px = fixation_radius_normalized(theta_deg=max_angle,
distance_cm=distance_to_screen_cm,
screen_width_cm=screen_width_cm,
screen_height_cm=screen_height_cm,
resolution_x=resolution_x,
resolution_y=resolution_y,
method="max",
)
method=method)
print("PyGaze max_dist (max):", maxdist_px)
maxdist_px = fixation_radius_normalized(
theta_deg=max_angle,
method = 'euclid'
maxdist_px = fixation_radius_normalized(theta_deg=max_angle,
distance_cm=distance_to_screen_cm,
screen_width_cm=screen_width_cm,
screen_height_cm=screen_height_cm,
resolution_x=resolution_x,
resolution_y=resolution_y,
method="euclid",
)
method=method)
print("PyGaze max_dist (euclid):", maxdist_px)
def main():
run_example()
if __name__ == "__main__":
main()
# Reference
# Passt noch nicht zu der Breite
# https://osdoc.cogsci.nl/4.0/de/visualangle/
# https://reference.org/facts/Visual_angle/LUw29zy7

View File

@ -0,0 +1,155 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "2b3fface",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "74f1f5ec",
"metadata": {},
"outputs": [],
"source": [
"df= pd.read_parquet(r\" \")\n",
"print(df.shape)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "05775454",
"metadata": {},
"outputs": [],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "99e17328",
"metadata": {},
"outputs": [],
"source": [
"df.tail()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "69e53731",
"metadata": {},
"outputs": [],
"source": [
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3754c664",
"metadata": {},
"outputs": [],
"source": [
"# Zeigt alle Kombinationen mit Häufigkeit\n",
"df[['STUDY', 'PHASE', 'LEVEL']].value_counts(ascending=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f83b595c",
"metadata": {},
"outputs": [],
"source": [
"high_nback = df[\n",
" (df[\"STUDY\"]==\"n-back\") &\n",
" (df[\"LEVEL\"].isin([2, 3, 5, 6])) &\n",
" (df[\"PHASE\"].isin([\"train\", \"test\"]))\n",
"]\n",
"high_nback.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c0940343",
"metadata": {},
"outputs": [],
"source": [
"low_all = df[\n",
" ((df[\"PHASE\"] == \"baseline\") |\n",
" ((df[\"STUDY\"] == \"n-back\") & (df[\"PHASE\"] != \"baseline\") & (df[\"LEVEL\"].isin([1,4]))))\n",
"]\n",
"print(low_all.shape)\n",
"high_kdrive = df[\n",
" (df[\"STUDY\"] == \"k-drive\") & (df[\"PHASE\"] != \"baseline\")\n",
"]\n",
"print(high_kdrive.shape)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f7ce38d3",
"metadata": {},
"outputs": [],
"source": [
"print((df.shape[0]==(high_kdrive.shape[0]+high_nback.shape[0]+low_all.shape[0])))\n",
"print(df.shape[0])\n",
"print((high_kdrive.shape[0]+high_nback.shape[0]+low_all.shape[0]))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "48ba0379",
"metadata": {},
"outputs": [],
"source": [
"high_all = pd.concat([high_nback, high_kdrive])\n",
"high_all.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "77dda26c",
"metadata": {},
"outputs": [],
"source": [
"print(f\"Gesamt: {df.shape[0]}=={low_all.shape[0]+high_all.shape[0]}\")\n",
"print(f\"Anzahl an low load Samples: {low_all.shape[0]}\")\n",
"print(f\"Anzahl an high load Samples: {high_all.shape[0]}\")\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -28,7 +28,7 @@
"sys.path.append(base_dir)\n",
"print(base_dir)\n",
"\n",
"from Fahrsimulator_MSY2526_AI.model_training.tools import evaluation_tools\n",
"from tools import evaluation_tools\n",
"from sklearn.preprocessing import StandardScaler, MinMaxScaler\n",
"from sklearn.ensemble import IsolationForest\n",
"from sklearn.model_selection import GridSearchCV, KFold\n",
@ -52,7 +52,7 @@
"metadata": {},
"outputs": [],
"source": [
"data_path = Path(r\"/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/50s_25Hz_dataset.parquet\")"
"data_path = Path(r\"C:\\Users\\micha\\FAUbox\\WS2526_Fahrsimulator_MSY (Celina Korzer)\\AU_dataset\\output_windowed.parquet\")"
]
},
{
@ -301,26 +301,20 @@
"metadata": {},
"outputs": [],
"source": [
"au_columns = [col for col in low_all.columns if \"face\" in col.lower()] \n",
"# Cell 2: Get AU columns and prepare datasets\n",
"# Get all column names that start with 'AU'\n",
"au_columns = [col for col in low_all.columns if col.startswith('AU')]\n",
"\n",
"eye_columns = [ \n",
" 'Fix_count_short_66_150','Fix_count_medium_300_500','Fix_count_long_gt_1000', \n",
" 'Fix_count_100','Fix_mean_duration','Fix_median_duration', \n",
" 'Sac_count','Sac_mean_amp','Sac_mean_dur','Sac_median_dur', \n",
" 'Blink_count','Blink_mean_dur','Blink_median_dur', \n",
" 'Pupil_mean','Pupil_IPA' \n",
"] \n",
"cols = au_columns +eye_columns\n",
"# Prepare training data (only normal/low data)\n",
"train_data = low_all[low_all['subjectID'].isin(train_subjects)][['subjectID'] + cols].copy()\n",
"train_data = low_all[low_all['subjectID'].isin(train_subjects)][['subjectID'] + au_columns].copy()\n",
"\n",
"# Prepare validation data (normal and anomaly)\n",
"val_normal_data = low_all[low_all['subjectID'].isin(val_subjects)][['subjectID'] + cols].copy()\n",
"val_high_data = high_all[high_all['subjectID'].isin(val_subjects)][['subjectID'] + cols].copy()\n",
"val_normal_data = low_all[low_all['subjectID'].isin(val_subjects)][['subjectID'] + au_columns].copy()\n",
"val_high_data = high_all[high_all['subjectID'].isin(val_subjects)][['subjectID'] + au_columns].copy()\n",
"\n",
"# Prepare test data (normal and anomaly)\n",
"test_normal_data = low_all[low_all['subjectID'].isin(test_subjects)][['subjectID'] + cols].copy()\n",
"test_high_data = high_all[high_all['subjectID'].isin(test_subjects)][['subjectID'] + cols].copy()\n",
"test_normal_data = low_all[low_all['subjectID'].isin(test_subjects)][['subjectID'] + au_columns].copy()\n",
"test_high_data = high_all[high_all['subjectID'].isin(test_subjects)][['subjectID'] + au_columns].copy()\n",
"\n",
"print(f\"Train samples: {len(train_data)}\")\n",
"print(f\"Val normal samples: {len(val_normal_data)}, Val high samples: {len(val_high_data)}\")\n",
@ -334,8 +328,8 @@
"metadata": {},
"outputs": [],
"source": [
"# Fit normalizer on training data\n",
"normalizer = fit_normalizer(train_data, cols, method='minmax', scope='global')\n",
"# Cell 3: Fit normalizer on training data\n",
"normalizer = fit_normalizer(train_data, au_columns, method='minmax', scope='global')\n",
"print(\"Normalizer fitted on training data\")"
]
},
@ -346,12 +340,12 @@
"metadata": {},
"outputs": [],
"source": [
"# Apply normalization to all datasets\n",
"train_normalized = apply_normalizer(train_data, cols, normalizer)\n",
"val_normal_normalized = apply_normalizer(val_normal_data, cols, normalizer)\n",
"val_high_normalized = apply_normalizer(val_high_data, cols, normalizer)\n",
"test_normal_normalized = apply_normalizer(test_normal_data, cols, normalizer)\n",
"test_high_normalized = apply_normalizer(test_high_data, cols, normalizer)\n",
"# Cell 4: Apply normalization to all datasets\n",
"train_normalized = apply_normalizer(train_data, au_columns, normalizer)\n",
"val_normal_normalized = apply_normalizer(val_normal_data, au_columns, normalizer)\n",
"val_high_normalized = apply_normalizer(val_high_data, au_columns, normalizer)\n",
"test_normal_normalized = apply_normalizer(test_normal_data, au_columns, normalizer)\n",
"test_high_normalized = apply_normalizer(test_high_data, au_columns, normalizer)\n",
"\n",
"print(\"Normalization applied to all datasets\")"
]
@ -363,9 +357,11 @@
"metadata": {},
"outputs": [],
"source": [
"X_train = train_normalized[cols].copy()\n",
"X_val_normal = val_normal_normalized[cols].copy()\n",
"X_val_high = val_high_normalized[cols].copy()\n",
"# Cell 5: Extract AU columns and create labels for grid search\n",
"# Extract only AU columns (drop subjectID)\n",
"X_train = train_normalized[au_columns].copy()\n",
"X_val_normal = val_normal_normalized[au_columns].copy()\n",
"X_val_high = val_high_normalized[au_columns].copy()\n",
"\n",
"# Combine train and validation sets for grid search\n",
"X_grid_search = pd.concat([X_train, X_val_normal, X_val_high], ignore_index=True)\n",
@ -420,7 +416,7 @@
"metadata": {},
"outputs": [],
"source": [
"# Train final model with best parameters on training data\n",
"# Cell 7: Train final model with best parameters on training data\n",
"final_model = IsolationForest(**best_params, random_state=42)\n",
"final_model.fit(X_train.values)\n",
"\n",
@ -434,9 +430,9 @@
"metadata": {},
"outputs": [],
"source": [
"# Prepare independent test set\n",
"X_test_normal = test_normal_normalized[cols].copy()\n",
"X_test_high = test_high_normalized[cols].copy()\n",
"# Cell 8: Prepare independent test set\n",
"X_test_normal = test_normal_normalized[au_columns].copy()\n",
"X_test_high = test_high_normalized[au_columns].copy()\n",
"\n",
"# Combine test sets\n",
"X_test = pd.concat([X_test_normal, X_test_high], ignore_index=True)\n",
@ -487,7 +483,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "base",
"language": "python",
"name": "python3"
},
@ -501,7 +497,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.10"
"version": "3.11.5"
}
},
"nbformat": 4,