Fahrsimulator_MSY2526_AI/create_parquet_files.py

# %pip install pyocclient
import yaml
import owncloud
import pandas as pd
import h5py

num_files = 30  # number of files to process (min: 1, max: 30)
# Load credentials
with open("login.yaml") as f:
    cfg = yaml.safe_load(f)
    print("ahahahah")
url, password = cfg[0]["url"], cfg[1]["password"]

# Connect once
oc = owncloud.Client.from_public_link(url, folder_password=password)
print("connection aufgebaut")
# File pattern
base = "adabase-public-{num:04d}-v_0_0_2.h5py"

for i in range(num_files):
    file_name = base.format(num=i)
    local_tmp = f"tmp_{i:04d}.h5"

    # Download file from ownCloud
    oc.get_file(file_name, local_tmp)
    print(f"{file_name} geoeffnet")
    # Load into memory and extract needed columns
    # with h5py.File(local_tmp, "r") as f:
    #     # Adjust this path depending on actual dataset layout inside .h5py file
    #     df = pd.DataFrame({k: f[k][()] for k in f.keys() if k in ["STUDY", "LEVEL", "PHASE"] or k.startswith("AU")})

    with pd.HDFStore(local_tmp, mode="r") as store:
        cols = store.select("SIGNALS", start=0, stop=1).columns # get column names

    # Step 2: Filter columns that start with "AU"
    au_cols = [c for c in cols if c.startswith("AU")]
    print(au_cols)

    # Step 3: Read only those columns (plus any others you want)
    df = pd.read_hdf(local_tmp, key="SIGNALS", columns=["STUDY", "LEVEL", "PHASE"] + au_cols)


    print("load done")

    # Add metadata columns
    df["subjectID"] = i
    df["rowID"] = range(len(df))

    print("extra columns done")
    # Clean data
    # drop level = 0
    print(df.columns)
    df = df[df["LEVEL"] != 0]

    df = df.dropna()

    print("data cleaning done")


    # Save to parquet
    out_name = f"cleaned_{i:04d}.parquet"
    df.to_parquet(out_name, index=False)

    print(f"Processed {file_name} -> {out_name}")