From a619f5ad8e4ce4e9ee850edce5c421f33ec58bef Mon Sep 17 00:00:00 2001
From: Michael
Date: Thu, 30 Oct 2025 14:25:57 +0100
Subject: [PATCH] create cleaned parquet files

---
 create_parquet_files.py | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)
 create mode 100644 create_parquet_files.py

diff --git a/create_parquet_files.py b/create_parquet_files.py
new file mode 100644
index 0000000..6651bfd
--- /dev/null
+++ b/create_parquet_files.py
@@ -0,0 +1,53 @@
+# %pip install pyocclient
+import yaml
+import owncloud
+import pandas as pd
+
+
+# Load credentials (login.yaml is expected to hold the share URL and password)
+with open("login.yaml") as f:
+    cfg = yaml.safe_load(f)
+url, password = cfg[0]["url"], cfg[1]["password"]
+
+# Connect once to the public ownCloud share
+oc = owncloud.Client.from_public_link(url, folder_password=password)
+print("connection established")
+
+# File name pattern of the remote HDF5 files
+base = "adabase-public-{num:04d}-v_0_0_2.h5py"
+
+# Currently only subject 0000 is processed; widen the range for more subjects
+for i in range(1):
+    file_name = base.format(num=i)
+    local_tmp = f"tmp_{i:04d}.h5"
+
+    # Download the file from ownCloud to a local temporary copy
+    oc.get_file(file_name, local_tmp)
+    print(f"{file_name} downloaded")
+
+    # Read only the column names first (reading HDF5 via pandas requires PyTables)
+    with pd.HDFStore(local_tmp, mode="r") as store:
+        cols = store.select("SIGNALS", start=0, stop=1).columns
+
+    # Keep the action-unit columns (names starting with "AU")
+    au_cols = [c for c in cols if c.startswith("AU")]
+
+    # Read only those columns plus the study metadata columns
+    df = pd.read_hdf(local_tmp, key="SIGNALS", columns=["STUDY", "LEVEL", "PHASE"] + au_cols)
+    print("load done")
+
+    # Add metadata columns
+    df["subjectID"] = i
+    df["rowID"] = range(len(df))
+    print("extra columns done")
+
+    # Clean data: drop rows with LEVEL == 0 and rows with missing values
+    df = df[df["LEVEL"] != 0]
+    df = df.dropna()
+    print("data cleaning done")
+
+    # Save to Parquet (requires pyarrow or fastparquet)
+    out_name = f"cleaned_{i:04d}.parquet"
+    df.to_parquet(out_name, index=False)
+
+    print(f"Processed {file_name} -> {out_name}")
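
Usage note (not part of the patch): a minimal sketch of how the per-subject Parquet files written by this script could be read back and combined. The glob pattern and the name of the combined frame are illustrative assumptions, not something the patch defines.

import glob
import pandas as pd

# Collect the per-subject files produced by create_parquet_files.py
files = sorted(glob.glob("cleaned_*.parquet"))

# Read each file and stack them into one DataFrame; the subjectID and rowID
# columns added by the script keep every row attributable to its source file
frames = [pd.read_parquet(f) for f in files]
combined = pd.concat(frames, ignore_index=True)
print(combined.shape)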