import gzip
import json
import os

import pandas as pd
from tqdm import tqdm


# Convert per-assay ``<assay>.jsonl.gz`` files into parquet datasets, keeping
# only assays whose size and class balance fall within the bounds below.
INPUT_DIR = "train"
OUTPUT_DIR = "fsmol_chembl_datasets"
INPUT_SUFFIX = ".jsonl.gz"

# An assay is kept only if its record count and its fraction of positive
# labels both lie within these (inclusive) bounds.
MIN_RECORDS = 1000
MAX_RECORDS = 10000
MIN_POSITIVE_FRACTION = 0.05
MAX_POSITIVE_FRACTION = 0.95

# Create the output directory up front: to_parquet does not create missing
# parent directories, so without this the first write would fail.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Only consider assay archives (stray files or subdirectories would make
# gzip.open fail), and sort them so the processing order is reproducible.
assay_filenames = sorted(
    name for name in os.listdir(INPUT_DIR) if name.endswith(INPUT_SUFFIX)
)

for filename in tqdm(assay_filenames):
    assay = filename.removesuffix(INPUT_SUFFIX)
    data = []
    with gzip.open(
        os.path.join(INPUT_DIR, filename), "rt", encoding="utf-8"
    ) as file:
        for line in file:
            # Tolerate blank lines (e.g. a trailing newline at end of file),
            # which json.loads would otherwise reject.
            if not line.strip():
                continue
            record = json.loads(line)
            smiles = record["SMILES"]
            # "Property" may be encoded as a float or a numeric string
            # (e.g. "1.0"); go through float before truncating to int.
            label = int(float(record["Property"]))
            data.append({"SMILES": smiles, "label": label})

    # Skip assays that are too small or too large.
    if not MIN_RECORDS <= len(data) <= MAX_RECORDS:
        continue

    df = pd.DataFrame.from_records(data)

    # Fraction of records with label 1 (assuming labels are 0/1 -- TODO
    # confirm against the data); skip heavily imbalanced assays.
    pos_cls = df["label"].sum() / len(df)
    if not MIN_POSITIVE_FRACTION <= pos_cls <= MAX_POSITIVE_FRACTION:
        continue

    df.to_parquet(os.path.join(OUTPUT_DIR, f"{assay}.parquet"))
