# pip install pyocclient
import yaml
import owncloud
import pandas as pd
import h5py
import os

print(os.getcwd())

num_files = 30  # number of files to process (min: 1, max: 30)

# Load credentials
with open("login.yaml") as f:
    cfg = yaml.safe_load(f)
print("yaml loaded")

url, password = cfg[0]["url"], cfg[1]["password"]

# Connect once
oc = owncloud.Client.from_public_link(url, folder_password=password)
print("connection established")

# File pattern
base = "adabase-public-{num:04d}-v_0_0_2.h5py"

for i in range(num_files):
    print(f"Subject {i} started")
    file_name = base.format(num=i)
    local_tmp = f"tmp_{i:04d}.h5"

    # Download file from ownCloud
    oc.get_file(file_name, local_tmp)
    print(f"{file_name} downloaded")

    # Load into memory and extract needed columns
    # with h5py.File(local_tmp, "r") as f:
    #     # Adjust this path depending on actual dataset layout inside .h5py file
    #     df = pd.DataFrame({k: f[k][()] for k in f.keys() if k in ["STUDY", "LEVEL", "PHASE"] or k.startswith("AU")})

    # Step 1: peek at the stored table to get its column names
    with pd.HDFStore(local_tmp, mode="r") as store:
        cols = store.select("SIGNALS", start=0, stop=1).columns

    # Step 2: filter columns that start with "AU"
    au_cols = [c for c in cols if c.startswith("AU")]
    print(au_cols)
    if len(au_cols) == 0:
        print(f"no AU signals in Subject {i}")
        continue

    # Step 3: read only those columns (plus any others you want)
    df = pd.read_hdf(local_tmp, key="SIGNALS", columns=["STUDY", "LEVEL", "PHASE"] + au_cols)
    print("load done")

    # Add metadata columns
    df["subjectID"] = i
    df["rowID"] = range(len(df))
    print("extra columns done")

    # Clean data: drop rows where LEVEL == 0 and rows with missing values
    print(df.columns)
    df = df[df["LEVEL"] != 0]
    df = df.dropna()
    print("data cleaning done")

    # Save to parquet
    os.makedirs("ParquetFiles", exist_ok=True)
    out_name = f"ParquetFiles/cleaned_{i:04d}.parquet"
    df.to_parquet(out_name, index=False)
    print(f"Processed {file_name} -> {out_name}")