"""
Compute clinical label stats of mimic-cxr and annotation.csv
"""

import pandas as pd
from pathlib import Path
from mimic_cxr_utils import CATEGORIES


def process_annotation(df):
    """Collect the numeric study ids of annotation rows with ``A1 == "yes"``.

    The file stem of each kept row's ``record_path`` must be a study id made
    of the letter ``s`` followed by eight digits (nine characters in total).

    Args:
        df: Annotation table providing at least the columns ``A1`` and
            ``record_path``.

    Returns:
        A list of ints, one per kept row in row order, holding the study id
        with its leading ``s`` removed.

    Raises:
        ValueError: If the stem of a kept ``record_path`` is not a
            well-formed study id.
    """
    # Local import: only ``Path`` is imported at file level.
    from pathlib import PureWindowsPath

    valid_ids = []
    for item in df.loc[df["A1"] == "yes", "record_path"]:
        # Windows path semantics accept both "\\" and "/" as separators, so
        # the stem is extracted identically on every platform.
        study_id = PureWindowsPath(item).stem
        digits = study_id[1:]
        well_formed = (
            study_id.startswith("s")
            and len(study_id) == 9
            and digits.isdecimal()
        )
        # Explicit raise instead of ``assert`` so the check survives ``-O``.
        if not well_formed:
            raise ValueError(
                f"unexpected study id {study_id!r} in record path {item!r}"
            )
        valid_ids.append(int(digits))
    return valid_ids


# Directory holding the three input CSV files. Machine-specific: adjust it to
# wherever the data lives locally.
DATA_DIR = Path("C:/Users/annonymous/Desktop")


def _print_label_stats(df, categories):
    """Print the row count of *df*, then one line per category.

    Each category line shows the category name, the number of rows whose
    value in that column equals 1, and that number as a percentage of all
    rows in *df*.

    Args:
        df: Label table containing one column per entry of *categories*.
        categories: Iterable of column names to report on.
    """
    total = len(df)
    print("total", total)
    for cat in categories:
        count = (df[cat] == 1).sum()
        # An empty frame has no meaningful rate: report NaN explicitly
        # instead of dividing by zero.
        percent = 100 * count / total if total else float("nan")
        print(cat, count, percent)


chexpert_df = pd.read_csv(DATA_DIR / "chexpert.csv")
annotation_df = pd.read_csv(DATA_DIR / "annotation_v2.csv")
split_df = pd.read_csv(DATA_DIR / "mimic-cxr-2.0.0-split.csv")

# Restrict the label table to studies assigned to the test split.
test_ids = split_df.loc[split_df["split"] == "test", "study_id"]
valid_ids = process_annotation(annotation_df)
chexpert_df = chexpert_df[chexpert_df["study_id"].isin(test_ids)]

print("MIMIC-CXR")
_print_label_stats(chexpert_df, CATEGORIES)

print("\nannotation")
print(len(valid_ids))
# Narrow the test-split rows further to the studies kept by the annotation.
chexpert_df = chexpert_df[chexpert_df["study_id"].isin(valid_ids)]
_print_label_stats(chexpert_df, CATEGORIES)

