diff --git a/EDA/researchOnSubjectPerformance.ipynb b/EDA/researchOnSubjectPerformance.ipynb index 5a53635..bfa4a14 100644 --- a/EDA/researchOnSubjectPerformance.ipynb +++ b/EDA/researchOnSubjectPerformance.ipynb @@ -15,6 +15,7 @@ "metadata": {}, "outputs": [], "source": [ + "%pip install pyocclient\n", "import yaml\n", "import owncloud\n", "import pandas as pd\n", @@ -36,101 +37,109 @@ "metadata": {}, "outputs": [], "source": [ - "# Load credentials\n", - "with open(\"../login.yaml\") as f:\n", + "# Load credentials from YAML\n", + "with open(\"login.yaml\", \"r\") as f:\n", " cfg = yaml.safe_load(f)\n", - " \n", - "url, password = cfg[0][\"url\"], cfg[1][\"password\"]\n", "\n", - "# Connect once\n", + "url = cfg[0][\"url\"]\n", + "password = cfg[1][\"password\"]\n", + "\n", + "# Connect once to the public OwnCloud link\n", "oc = owncloud.Client.from_public_link(url, folder_password=password)\n", - "# File pattern\n", - "# base = \"adabase-public-{num:04d}-v_0_0_2.h5py\"\n", - "base = \"{num:04d}-*.h5py\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "07c03d07", - "metadata": {}, - "outputs": [], - "source": [ - "num_files = 2 # number of files to process (min: 1, max: 30)\n", + "\n", + "num_files = 1 # number of subject IDs to process (min: 1, max: 30)\n", "performance_data = []\n", "\n", + "# Read remote file list once\n", + "remote_files = oc.list(\".\")\n", + "remote_names = [f.get_name() for f in remote_files]\n", + "\n", "for i in range(num_files):\n", - " file_pattern = f\"{i:04d}-*\"\n", - " \n", - " # Get list of files matching the pattern\n", - " files = oc.list('.')\n", - " matching_files = [f.get_name() for f in files if f.get_name().startswith(f\"{i:04d}-\")]\n", - " \n", - " if matching_files:\n", - " file_name = matching_files[0] # Take the first matching file\n", - " local_tmp = f\"tmp_{i:04d}.h5\"\n", - " \n", - " oc.get_file(file_name, local_tmp)\n", - " print(f\"{file_name} geöffnet\")\n", - " else:\n", - " print(f\"Keine Datei gefunden für Muster: {file_pattern}\")\n", - " # file_name = base.format(num=i)\n", - " # local_tmp = f\"tmp_{i:04d}.h5\"\n", + " prefix = f\"{i:04d}-\"\n", + " matching_files = [name for name in remote_names if name.startswith(prefix) and name.endswith(\".hdf5\")]\n", "\n", - " # oc.get_file(file_name, local_tmp)\n", - " # print(f\"{file_name} geöffnet\")\n", - "\n", - " # check SIGNALS table for AUs\n", - " with pd.HDFStore(local_tmp, mode=\"r\") as store:\n", - " cols = store.select(\"SIGNALS\", start=0, stop=1).columns\n", - " au_cols = [c for c in cols if c.startswith(\"AU\")]\n", - " if not au_cols:\n", - " print(f\"Subject {i} enthält keine AUs\")\n", + " if not matching_files:\n", + " print(f\"No file found for pattern: {prefix}*.hdf5\")\n", " continue\n", "\n", - " # load performance table\n", - " with pd.HDFStore(local_tmp, mode=\"r\") as store:\n", - " perf_df = store.select(\"PERFORMANCE\")\n", + " # Take the first matching file, e.g. 0000-AACA.hdf5\n", + " file_name = matching_files[0]\n", + " local_tmp = f\"tmp_{i:04d}.hdf5\"\n", + "\n", + " try:\n", + " # Download the file locally\n", + " oc.get_file(file_name, local_tmp)\n", + " print(f\"Downloaded and opened file: {file_name} -> {local_tmp}\")\n", + " except Exception as e:\n", + " print(f\"Failed to download file {file_name}: {e}\")\n", + " continue\n", + "\n", + " # Check SIGNALS table for AU columns\n", + " try:\n", + " with pd.HDFStore(local_tmp, mode=\"r\") as store:\n", + " cols = store.select(\"SIGNALS\", start=0, stop=1).columns\n", + " except Exception as e:\n", + " print(f\"Failed to read SIGNALS from {local_tmp}: {e}\")\n", + " continue\n", + "\n", + " au_cols = [c for c in cols if c.startswith(\"AU\")]\n", + " if not au_cols:\n", + " print(f\"Subject {i:04d} contains no AU columns\")\n", + " continue\n", + "\n", + " # Load PERFORMANCE table\n", + " try:\n", + " with pd.HDFStore(local_tmp, mode=\"r\") as store:\n", + " perf_df = store.select(\"PERFORMANCE\")\n", + " except Exception as e:\n", + " print(f\"Failed to read PERFORMANCE from {local_tmp}: {e}\")\n", + " continue\n", "\n", " f1_cols = [c for c in [\"AUDITIVE F1\", \"VISUAL F1\", \"F1\"] if c in perf_df.columns]\n", " if not f1_cols:\n", - " print(f\"Subject {i}: keine F1-Spalten gefunden\")\n", + " print(f\"Subject {i:04d}: no F1 columns found\")\n", " continue\n", "\n", " subject_entry = {\"subjectID\": i}\n", " valid_scores = []\n", "\n", - " # iterate rows: each (study, level, phase)\n", + " # Iterate through PERFORMANCE rows: each row is one (study, level, phase) combination\n", " for _, row in perf_df.iterrows():\n", - " study, level, phase = row[\"STUDY\"], row[\"LEVEL\"], row[\"PHASE\"]\n", + " study = row[\"STUDY\"]\n", + " level = row[\"LEVEL\"]\n", + " phase = row[\"PHASE\"]\n", " col_name = f\"STUDY_{study}_LEVEL_{level}_PHASE_{phase}\"\n", "\n", - " # collect valid F1 values among the three columns\n", + " # Collect non-NaN F1 values from the available F1 columns\n", " scores = [row[c] for c in f1_cols if pd.notna(row[c])]\n", " if scores:\n", " mean_score = float(np.mean(scores))\n", " subject_entry[col_name] = mean_score\n", " valid_scores.extend(scores)\n", "\n", - " # compute overall average across all valid combinations\n", + " # Compute overall average across all valid F1 values\n", " if valid_scores:\n", " subject_entry[\"overall_score\"] = float(np.mean(valid_scores))\n", " performance_data.append(subject_entry)\n", - " print(f\"Subject {i}: {len(valid_scores)} gültige Scores, Overall = {subject_entry['overall_score']:.3f}\")\n", + " print(\n", + " f\"Subject {i:04d}: {len(valid_scores)} valid scores, \"\n", + " f\"overall = {subject_entry['overall_score']:.3f}\"\n", + " )\n", " else:\n", - " print(f\"Subject {i}: keine gültigen F1-Scores\")\n", + " print(f\"Subject {i:04d}: no valid F1 scores found\")\n", "\n", - "# build dataframe\n", + "# Build final DataFrame and save CSV\n", "if performance_data:\n", " performance_df = pd.DataFrame(performance_data)\n", " combination_cols = sorted([c for c in performance_df.columns if c.startswith(\"STUDY_\")])\n", " final_cols = [\"subjectID\", \"overall_score\"] + combination_cols\n", " performance_df = performance_df[final_cols]\n", - " performance_df.to_csv(\"n_au_performance.csv\", index=False)\n", + " performance_df.to_csv(\"performance.csv\", index=False)\n", "\n", - " print(f\"\\nGesamt Subjects mit Action Units: {len(performance_df)}\")\n", + " print(f\"\\nTotal subjects with Action Units: {len(performance_df)}\")\n", + " print(\"Saved results to performance.csv\")\n", "else:\n", - " print(\"Keine gültigen Daten gefunden.\")" + " print(\"No valid data found.\")" ] }, { @@ -142,56 +151,11 @@ "source": [ "performance_df.head()" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "db95eea7", - "metadata": {}, - "outputs": [], - "source": [ - "with pd.HDFStore(\"tmp_0000.h5\", mode=\"r\") as store:\n", - " md = store.select(\"META\")\n", - "print(\"File 0:\")\n", - "print(md)\n", - "with pd.HDFStore(\"tmp_0001.h5\", mode=\"r\") as store:\n", - " md = store.select(\"META\")\n", - "print(\"File 1\")\n", - "print(md)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8067036b", - "metadata": {}, - "outputs": [], - "source": [ - "pd.set_option('display.max_columns', None)\n", - "pd.set_option('display.max_rows', None)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f18e7385", - "metadata": {}, - "outputs": [], - "source": [ - "with pd.HDFStore(\"tmp_0000.h5\", mode=\"r\") as store:\n", - " md = store.select(\"SIGNALS\", start=0, stop=1)\n", - "print(\"File 0:\")\n", - "md.head()\n", - "# with pd.HDFStore(\"tmp_0001.h5\", mode=\"r\",start=0, stop=1) as store:\n", - "# md = store.select(\"SIGNALS\")\n", - "# print(\"File 1\")\n", - "# print(md.columns)" - ] } ], "metadata": { "kernelspec": { - "display_name": "base", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -205,7 +169,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.5" + "version": "3.12.10" } }, "nbformat": 4,