Create cleaned Parquet files
This commit is contained in:
parent
991ca1ebba
commit
a619f5ad8e
64
create_parquet_files.py
Normal file
64
create_parquet_files.py
Normal file
@ -0,0 +1,64 @@
|
|||||||
|
"""Download ADABase HDF5 files from a password-protected ownCloud share,
extract the AU (action-unit) signal columns plus label columns, clean the
rows, and write one Parquet file per subject."""
# %pip install pyocclient
import yaml
import owncloud
import pandas as pd
import h5py  # kept: needed if the commented-out raw-h5py reading path below is revived
from pathlib import Path


# Load credentials — login.yaml is expected to be a list: [{"url": ...}, {"password": ...}]
# TODO(review): confirm this layout against the actual login.yaml.
with open("login.yaml") as f:
    cfg = yaml.safe_load(f)
print("ahahahah")  # leftover debug marker — confirms the YAML parsed
url, password = cfg[0]["url"], cfg[1]["password"]

# Connect once (outside the loop) to the public share
oc = owncloud.Client.from_public_link(url, folder_password=password)
print("connection aufgebaut")

# File-name pattern of the remote dataset files (zero-padded subject number)
base = "adabase-public-{num:04d}-v_0_0_2.h5py"

for i in range(1):  # NOTE(review): only subject 0 for now — widen the range for the full dataset
    file_name = base.format(num=i)
    local_tmp = f"tmp_{i:04d}.h5"

    # Download file from ownCloud to a local temp path
    oc.get_file(file_name, local_tmp)
    print(f"{file_name} geoeffnet")

    # Load into memory and extract needed columns
    # with h5py.File(local_tmp, "r") as f:
    #     # Adjust this path depending on actual dataset layout inside .h5py file
    #     # df = pd.DataFrame({k: f[k][()] for k in f.keys() if k in ["STUDY", "LEVEL", "PHASE"] or k.startswith("AU")})

    # Step 1: read a single row only to discover the column names cheaply
    with pd.HDFStore(local_tmp, mode="r") as store:
        cols = store.select("SIGNALS", start=0, stop=1).columns  # get column names

    # Step 2: Filter columns that start with "AU"
    au_cols = [c for c in cols if c.startswith("AU")]
    print(au_cols)

    # Step 3: Read only those columns (plus any others you want)
    df = pd.read_hdf(local_tmp, key="SIGNALS", columns=["STUDY", "LEVEL", "PHASE"] + au_cols)

    print("load done")

    # Add metadata columns so rows stay traceable after later concatenation
    df["subjectID"] = i
    df["rowID"] = range(len(df))

    print("extra columns done")

    # Clean data:
    # drop level = 0 rows, then any rows with missing values
    print(df.columns)
    df = df[df["LEVEL"] != 0]
    df = df.dropna()

    print("data cleaning done")

    # Save to parquet
    out_name = f"cleaned_{i:04d}.parquet"
    df.to_parquet(out_name, index=False)

    # Fix: delete the downloaded temp file so repeated runs don't leak disk space
    Path(local_tmp).unlink(missing_ok=True)

    print(f"Processed {file_name} -> {out_name}")
|
||||||
Loading…
x
Reference in New Issue
Block a user