create cleaned parquet files
parent 991ca1ebba
commit a619f5ad8e

create_parquet_files.py (new file, 64 lines added)
@@ -0,0 +1,64 @@
# %pip install pyocclient
import yaml
import owncloud
import pandas as pd
import h5py


# Load credentials
with open("login.yaml") as f:
    cfg = yaml.safe_load(f)

url, password = cfg[0]["url"], cfg[1]["password"]
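
# Note: the cfg[0]["url"] / cfg[1]["password"] indexing above implies that
# login.yaml is a two-item list of mappings. A minimal sketch of the assumed
# layout (the URL and password below are placeholders, not real values):
#
#   - url: https://example.com/index.php/s/SHARE_TOKEN
#   - password: FOLDER_PASSWORD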

# Connect once
oc = owncloud.Client.from_public_link(url, folder_password=password)
print("connection established")

# File pattern
base = "adabase-public-{num:04d}-v_0_0_2.h5py"

for i in range(1):
    file_name = base.format(num=i)
    local_tmp = f"tmp_{i:04d}.h5"

    # Download file from ownCloud
    oc.get_file(file_name, local_tmp)
    print(f"{file_name} downloaded")

    # Load into memory and extract needed columns
    # with h5py.File(local_tmp, "r") as f:
    #     # Adjust this path depending on actual dataset layout inside .h5py file
    #     df = pd.DataFrame({k: f[k][()] for k in f.keys()
    #                        if k in ["STUDY", "LEVEL", "PHASE"] or k.startswith("AU")})

    # Step 1: read a single row just to discover the column names,
    # so the full table never has to be loaded at once
    with pd.HDFStore(local_tmp, mode="r") as store:
        cols = store.select("SIGNALS", start=0, stop=1).columns

    # Step 2: filter columns that start with "AU"
    au_cols = [c for c in cols if c.startswith("AU")]
    print(au_cols)

    # Step 3: read only those columns (plus any others you want)
    df = pd.read_hdf(local_tmp, key="SIGNALS",
                     columns=["STUDY", "LEVEL", "PHASE"] + au_cols)
    print("load done")

    # Add metadata columns
    df["subjectID"] = i
    df["rowID"] = range(len(df))
    print("extra columns done")

    # Clean data: drop rows with LEVEL == 0, then drop rows with missing values
    print(df.columns)
    df = df[df["LEVEL"] != 0]
    df = df.dropna()
    print("data cleaning done")

    # Save to parquet (requires pyarrow or fastparquet)
    out_name = f"cleaned_{i:04d}.parquet"
    df.to_parquet(out_name, index=False)

    print(f"Processed {file_name} -> {out_name}")