Imports

In [None]:
%pip install pyocclient
import hashlib
import os

import numpy as np
import owncloud
import pandas as pd
import yaml

Connect to ownCloud

In [None]:
# Load the share URL and its password from the local config file.
# YAML is read explicitly as UTF-8 so non-ASCII characters in the credentials
# are decoded the same way on every platform instead of depending on the
# locale's default encoding.
with open("../login.yaml", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

# The config is indexed as a sequence of mappings: the first entry carries
# "url", the second carries "password".
url, password = cfg[0]["url"], cfg[1]["password"]

# Connect once; later cells reuse `oc` for every download.
oc = owncloud.Client.from_public_link(url, folder_password=password)


In [None]:
num_files = 30
performance_data = []
base = "adabase-public-{num:04d}-v_0_0_2.h5py"  # remote name pattern

F1_CANDIDATES = ["AUDITIVE F1", "VISUAL F1", "F1"]  # score columns averaged per row, when present
HASH_CHUNK_BYTES = 1 << 20  # read size used while checksumming (1 MiB)


def _ensure_local_copy(remote_name, local_path):
    """Fetch `remote_name` from the share into `local_path` unless it already exists.

    The download is written to a ``.part`` file and renamed afterwards, so an
    interrupted transfer is never mistaken for a complete local copy. The
    connection `oc` is only touched when a download is actually needed.

    Raises:
        RuntimeError: if the client reports that nothing was downloaded.
    """
    if os.path.exists(local_path):
        return
    partial_path = local_path + ".part"
    if not oc.get_file(remote_name, partial_path):
        raise RuntimeError(f"Download of {remote_name} failed")
    os.replace(partial_path, local_path)


def _file_sha1(path):
    """Return the SHA-1 hex digest of the file at `path`, read in fixed-size chunks."""
    digest = hashlib.sha1()
    with open(path, "rb") as fh:
        for chunk in iter(lambda: fh.read(HASH_CHUNK_BYTES), b""):
            digest.update(chunk)
    return digest.hexdigest()


def _read_performance_if_aus(path):
    """Return the PERFORMANCE table of `path`, or None if SIGNALS has no "AU*" column.

    Only the first SIGNALS row is selected because just the column labels are
    needed. The store is opened a single time for both lookups.
    """
    with pd.HDFStore(path, mode="r") as store:
        signal_cols = store.select("SIGNALS", start=0, stop=1).columns
        # str() keeps the prefix test from failing on non-string column labels.
        if not any(str(c).startswith("AU") for c in signal_cols):
            return None
        return store.select("PERFORMANCE")


def _score_subject(subject_id, perf_df):
    """Build one summary record for a subject from its PERFORMANCE table.

    For every row, the available F1 columns are averaged and stored under a
    ``STUDY_<study>_LEVEL_<level>_PHASE_<phase>`` key; ``overall_score`` is the
    mean of those per-row means (not of the flattened raw F1 values).

    Returns:
        dict with "subjectID", "overall_score" and one key per combination, or
        None (after printing the reason) when no usable score exists.
    """
    f1_cols = [c for c in F1_CANDIDATES if c in perf_df.columns]
    if not f1_cols:
        print(f"Subject {subject_id}: keine F1-Spalten gefunden")
        return None

    # drop rows that have all F1s NaN (no valid score for that combo)
    scored = perf_df.dropna(subset=f1_cols, how="all")
    if scored.empty:
        print(f"Subject {subject_id}: keine gültigen F1-Daten nach Filter")
        return None

    subject_entry = {"subjectID": subject_id}
    combo_means = []
    for _, row in scored.iterrows():
        col_name = f"STUDY_{row['STUDY']}_LEVEL_{row['LEVEL']}_PHASE_{row['PHASE']}"

        # mean of available F1 cols for this single combination
        vals = [float(row[c]) for c in f1_cols if pd.notna(row[c])]
        if not vals:
            continue
        mean_for_combo = float(np.mean(vals))
        # NOTE(review): a repeated (STUDY, LEVEL, PHASE) row overwrites this key
        # but still counts towards the overall mean - confirm rows are unique.
        subject_entry[col_name] = mean_for_combo
        combo_means.append(mean_for_combo)

    if not combo_means:
        print(f"Subject {subject_id}: keine gültigen Kombinationen")
        return None

    subject_entry["overall_score"] = float(np.mean(combo_means))
    print(f"Subject {subject_id}: combos={len(combo_means)} overall={subject_entry['overall_score']:.4f}")
    return subject_entry


for i in range(num_files):
    file_name = base.format(num=i)
    local_tmp = f"tmp_{i:04d}.h5"

    _ensure_local_copy(file_name, local_tmp)

    # quick checksum to detect identical downloads
    file_hash = _file_sha1(local_tmp)
    print(f"File {i}: {file_name}  checksum={file_hash}")

    # subjects without action-unit signals are skipped entirely
    perf_df = _read_performance_if_aus(local_tmp)
    if perf_df is None:
        print(f"Subject {i} enthält keine AUs")
        continue

    subject_entry = _score_subject(i, perf_df)
    if subject_entry is not None:
        performance_data.append(subject_entry)

# build dataframe
if performance_data:
    performance_df = pd.DataFrame(performance_data)
    combination_cols = sorted([c for c in performance_df.columns if c.startswith("STUDY_")])
    final_cols = ["subjectID", "overall_score"] + combination_cols
    performance_df = performance_df.reindex(columns=final_cols)  # keeps missing combo cols as NaN
    performance_df.to_csv("au_performance.csv", index=False)
    print(f"\nGesamt Subjects mit Action Units: {len(performance_df)}")
else:
    # Still define the frame so later cells do not fail with a NameError.
    performance_df = pd.DataFrame(columns=["subjectID", "overall_score"])
    print("Keine gültigen Daten gefunden.")


In [None]:
# Preview the first rows of the per-subject summary table.
performance_df.head()

In [None]:
# Show the raw PERFORMANCE table of whichever file `local_tmp` currently names.
# `local_tmp` is not assigned in this cell, so the file shown depends on the
# loop that was executed most recently.
with pd.HDFStore(local_tmp, mode="r") as hdf:
    performance = hdf.select("PERFORMANCE")
performance

In [None]:


def hash_perf(local_tmp):
    """Return a SHA-1 hex digest fingerprinting the PERFORMANCE table of an HDF5 file.

    The table is reduced to one pandas hash per row (computed with
    ``index=True``, so the index contributes as well as the cell values), and
    the resulting array of row hashes is fed to SHA-1.

    Args:
        local_tmp: path of the HDF5 file to open read-only.

    Returns:
        The 40-character hexadecimal SHA-1 digest.
    """
    with pd.HDFStore(local_tmp, mode="r") as store:
        performance_table = store.select("PERFORMANCE")
    row_hashes = pd.util.hash_pandas_object(performance_table, index=True)
    fingerprint = hashlib.sha1(row_hashes.to_numpy())
    return fingerprint.hexdigest()

# Fingerprint the PERFORMANCE table of the first five local files so identical
# tables can be spotted at a glance. A file that cannot be hashed contributes
# its error message instead of a digest, so the remaining files are still checked.
hashes = []
for i in range(5):
    local_tmp = f"tmp_{i:04d}.h5"
    try:
        outcome = hash_perf(local_tmp)
    except Exception as e:
        outcome = str(e)
    hashes.append((i, outcome))

print(hashes)
