diff --git a/EDA/EDA.ipynb b/EDA/EDA.ipynb index 76844a5..3e0045b 100644 --- a/EDA/EDA.ipynb +++ b/EDA/EDA.ipynb @@ -23,7 +23,7 @@ "metadata": {}, "outputs": [], "source": [ - "file_path = \"adabase-public-0020-v_0_0_2.h5py\"" + "file_path = \"YOUR_FILE_PATH.h5py\"" ] }, { @@ -87,7 +87,7 @@ "id": "a4731c56", "metadata": {}, "source": [ - "Actions units" + "Insights on action units" ] }, { @@ -167,7 +167,7 @@ "id": "332740a8", "metadata": {}, "source": [ - "Plots" + "Example plot of ECG curve" ] }, { @@ -177,7 +177,6 @@ "metadata": {}, "outputs": [], "source": [ - "# df_signals_ecg = pd.read_hdf(file_path, \"SIGNALS\", mode=\"r\", columns=[\"STUDY\",\"LEVEL\", \"PHASE\", 'RAW_ECG_I'])\n", "df_signals_ecg = df_signals[[\"STUDY\",\"LEVEL\", \"PHASE\", 'RAW_ECG_I']]\n", "df_signals_ecg.shape" ] diff --git a/EDA/distribution_plots.ipynb b/EDA/distribution_plots.ipynb index 84149c9..16a3c39 100644 --- a/EDA/distribution_plots.ipynb +++ b/EDA/distribution_plots.ipynb @@ -37,7 +37,7 @@ "metadata": {}, "outputs": [], "source": [ - "dataset_path = Path(r\"\")" + "dataset_path = Path(r\"\") # TODO: enter path to dataset" ] }, { diff --git a/EDA/histogramms.ipynb b/EDA/histogramms.ipynb index 35b9c04..1b29a78 100644 --- a/EDA/histogramms.ipynb +++ b/EDA/histogramms.ipynb @@ -36,7 +36,7 @@ "metadata": {}, "outputs": [], "source": [ - "path = Path(r\"/home/jovyan/data-paulusjafahrsimulator-gpu/new_datasets/50s_25Hz_dataset.parquet\")\n", + "path = Path(r\".parquet\") # TODO: enter path to dataset\n", "df = pd.read_parquet(path=path)" ] }, @@ -192,18 +192,6 @@ "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.10" } }, "nbformat": 4, diff --git a/EDA/researchOnSubjectPerformance.ipynb 
b/EDA/researchOnSubjectPerformance.ipynb index bfa4a14..d126d12 100644 --- a/EDA/researchOnSubjectPerformance.ipynb +++ b/EDA/researchOnSubjectPerformance.ipynb @@ -155,7 +155,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "310", "language": "python", "name": "python3" }, @@ -169,7 +169,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.10" + "version": "3.10.19" } }, "nbformat": 4, diff --git a/project_report.md b/project_report.md index 76533a1..278148a 100644 --- a/project_report.md +++ b/project_report.md @@ -68,21 +68,23 @@ Operational note: - `DB_PATH` and other paths are currently code-configured and must be adapted per deployment. ## 3) EDA -TO DO +The directory EDA provides several files to get insights into both the raw data from AdaBase and your own dataset. + +- `EDA.ipynb` - main EDA notebook: recreates the plot from AdaBase documentation, lists all experiments and in general serves as a playground for you to get to know the files. +- `distribution_plots.ipynb` - This notebook aims to visualize the data distributions for each experiment - the goal is to find out whether the split of experiments into high and low cognitive load is clearer if some experiments are dropped. +- `histogramms.ipynb` - Histogram analysis of low load vs high load per feature. Additionally, scatter plots per feature are available. +- `researchOnSubjectPerformance.ipynb` - This notebook aims to see how the performance values range for the 30 subjects. 
The code creates and saves a table in CSV format, which will later be used as the foundation of the performance-based split in ```model_training/tools/performance_based_split``` +- `owncloud_file_access.ipynb` - Get access to the files via ownCloud and save them as .h5 files, in correspondence to the parquet file creation script +- `login.yaml` - used to store URL and password to access files from ownCloud, used in the previous notebook +- `calculate_replacement_values.ipynb` - fallback / median computation notebook for deployment, creation of yaml syntax embedding + +General information: +- Due to their size, it is strongly recommended to download and save the dataset files once at the beginning +- For better data understanding, read the [AdaBase publication](https://www.mdpi.com/1424-8220/23/1/340) -- `EDA/EDA.ipynb` - main EDA notebook -- `EDA/distribution_plots.ipynb` - distribution visualization -- `EDA/histogramms.ipynb` - histogram analysis -- `EDA/researchOnSubjectPerformance.ipynb` - subject-level analysis -- `EDA/owncloud_file_access.ipynb` - ownCloud exploration/access notebook -- `EDA/calculate_replacement_values.ipynb` - fallback/median computation notebook -- `EDA/login.yaml` - local auth/config artifact for EDA workflows ## 4) Model Training -Location: -- `model_training/` (primarily notebook-driven) - Included model families: - CNN variants (different fusion strategies) - XGBoost