from pathlib import Path
import pandas as pd
from datasets import load_dataset

# Destination of the exported single-column CSV.
out_csv = Path("data/raw/zinc250k.csv")

# Column names that are accepted as a stand-in for "smiles", in priority order.
_SMILES_ALIASES = ("SMILES", "mol", "smile")


def extract_smiles(df: pd.DataFrame) -> pd.DataFrame:
    """Return a one-column ``smiles`` frame with missing values removed.

    If *df* has no ``smiles`` column, the first column found among
    ``SMILES``, ``mol`` and ``smile`` is renamed to ``smiles``. The input
    frame is not modified.

    Args:
        df: Frame holding the molecule strings under ``smiles`` or one of
            the accepted alias column names.

    Returns:
        A new frame containing only the ``smiles`` column, without null rows.

    Raises:
        KeyError: If neither ``smiles`` nor any alias column is present.
    """
    if "smiles" not in df.columns:
        alias = next((name for name in _SMILES_ALIASES if name in df.columns), None)
        if alias is None:
            # Fail with the available columns listed instead of a bare
            # KeyError from the column selection below.
            raise KeyError(
                f"no SMILES column found; expected 'smiles' or one of "
                f"{list(_SMILES_ALIASES)}, got {list(df.columns)}"
            )
        df = df.rename(columns={alias: "smiles"})
    return df[["smiles"]].dropna()


def main() -> None:
    """Download the dataset, merge all of its splits and write the SMILES CSV."""
    out_csv.parent.mkdir(parents=True, exist_ok=True)

    ds = load_dataset("yairschiff/zinc250k")
    # Every split is concatenated into a single frame.
    df = pd.concat([split.to_pandas() for split in ds.values()], ignore_index=True)

    smiles = extract_smiles(df)
    smiles.to_csv(out_csv, index=False)
    # Report the rows actually written (after dropping nulls), not len(df).
    print("Wrote:", out_csv, "rows:", len(smiles))


if __name__ == "__main__":
    main()