diff --git a/predict_pipeline/check_python_version.py b/predict_pipeline/check_python_version.py
new file mode 100644
index 0000000..bb2c7a6
--- /dev/null
+++ b/predict_pipeline/check_python_version.py
@@ -0,0 +1,12 @@
+"""Print the version of the Python interpreter that runs this script."""
+
+import sys
+
+
+def main() -> None:
+    """Write the interpreter's version string to standard output."""
+    print(sys.version)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/predict_pipeline/fill_db.ipynb b/predict_pipeline/fill_db.ipynb
new file mode 100644
index 0000000..8832eaf
--- /dev/null
+++ b/predict_pipeline/fill_db.ipynb
@@ -0,0 +1,229 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0d70a13f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "from pathlib import Path\n",
+    "\n",
+    "import pandas as pd\n",
+    "\n",
+    "# db_helpers is imported from the project's tools directory, which has to be\n",
+    "# on sys.path first.\n",
+    "TOOLS_DIR = '/home/edgekit/MSY_FS/fahrsimulator_msy2526_ai/tools'\n",
+    "if TOOLS_DIR not in sys.path:  # do not append again when the cell is re-run\n",
+    "    sys.path.append(TOOLS_DIR)\n",
+    "\n",
+    "import db_helpers"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ce696366",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Absolute locations of the target SQLite file and the source parquet file.\n",
+    "database_path = Path(r\"/home/edgekit/MSY_FS/databases/rawdata.sqlite\")\n",
+    "parquet_path = Path(r\"/home/edgekit/MSY_FS/fahrsimulator_msy2526_ai/files_for_testing/both_mod_0000.parquet\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b1aa9398",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset = pd.read_parquet(parquet_path)\n",
+    "dataset.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b183746e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset.dtypes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "24ed769d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "con, cursor = db_helpers.connect_db(database_path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e604ed30",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Number of leading rows copied into the database.\n",
+    "N_ROWS = 200\n",
+    "\n",
+    "# Drop the columns that are not stored, keep the first N_ROWS rows and add\n",
+    "# a 1-based '_Id' column in front; it is used as the primary key below.\n",
+    "df_clean = dataset.drop(columns=['subjectID', 'rowID', 'STUDY', 'LEVEL', 'PHASE'])\n",
+    "df_subset = df_clean.head(N_ROWS).reset_index(drop=True)\n",
+    "df_subset.insert(0, '_Id', df_subset.index + 1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e77a812e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def pandas_to_sqlite_dtype(dtype):\n",
+    "    \"\"\"Return the SQLite column type used to store a pandas dtype.\n",
+    "\n",
+    "    Integer and boolean dtypes map to INTEGER, float dtypes to REAL and\n",
+    "    everything else (datetimes, strings, objects, ...) to TEXT.\n",
+    "    \"\"\"\n",
+    "    types = pd.api.types\n",
+    "    if types.is_integer_dtype(dtype) or types.is_bool_dtype(dtype):\n",
+    "        return \"INTEGER\"\n",
+    "    if types.is_float_dtype(dtype):\n",
+    "        return \"REAL\"\n",
+    "    return \"TEXT\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0e8897b2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Column name -> SQLite type for every column of the subset.\n",
+    "columns = {\n",
+    "    col: pandas_to_sqlite_dtype(dtype)\n",
+    "    for col, dtype in df_subset.dtypes.items()\n",
+    "}\n",
+    "\n",
+    "constraints = {\n",
+    "    \"_Id\": [\"NOT NULL\"]\n",
+    "}\n",
+    "\n",
+    "# Presumably constraint name -> key columns; confirm against db_helpers.create_table.\n",
+    "primary_key = {\n",
+    "    \"pk_df_first_100\": [\"_Id\"]\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4ab57624",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sql = db_helpers.create_table(\n",
+    "    conn=con,\n",
+    "    cursor=cursor,\n",
+    "    table_name=\"rawdata\",\n",
+    "    columns=columns,\n",
+    "    constraints=constraints,\n",
+    "    primary_key=primary_key,\n",
+    "    commit=True\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "25096a7f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# One list of values per column.\n",
+    "columns_to_insert = {\n",
+    "    col: df_subset[col].tolist()\n",
+    "    for col in df_subset.columns\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7a5a3aa8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "db_helpers.insert_rows_into_table(\n",
+    "    conn=con,\n",
+    "    cursor=cursor,\n",
+    "    table_name=\"rawdata\",\n",
+    "    columns=columns_to_insert,\n",
+    "    commit=True\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b56beae2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read the table back to check what was written.\n",
+    "stored_rows = db_helpers.get_data_from_table(conn=con, table_name='rawdata', columns_list=['*'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a4a74a9d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "stored_rows.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "da0f8737",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "db_helpers.disconnect_db(con, cursor)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "MSY_FS_env",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}