import os, argparse, pickle, random
import numpy as np, pandas as pd

# FINCH is a hard requirement of this script; exit with an actionable message
# when it cannot be imported.
try:
    from finch import FINCH
except ImportError as e:
    # Only an import failure means "not installed". Any other exception raised
    # while importing finch is a genuine error and should surface with its own
    # traceback instead of being reported as a missing package.
    raise SystemExit("FINCH not found. Install `finchpy` or ensure a local `finch.py` with FINCH() exists.") from e

def load_features_pkl(path, expected_dim, sample_check=1000, out_dtype=np.float32):
    """Load a pickled list/tuple of feature vectors into a dense 2-D array.

    Args:
        path: Path to a pickle file whose top-level object is a list or tuple
            holding one vector per sample.
        expected_dim: Required length of every vector.
        sample_check: Number of randomly chosen rows whose length is checked
            up front, so an obviously wrong file fails before the output
            array is allocated. Values <= 0 skip this pre-check; every row is
            still validated while it is copied.
        out_dtype: dtype of the returned array.

    Returns:
        A ``numpy.ndarray`` of shape ``(N, expected_dim)`` and dtype
        ``out_dtype``, where ``N`` is the number of vectors in the pickle.

    Raises:
        SystemExit: If the pickle does not hold a list/tuple, or if any row
            is not a 1-D vector of exactly ``expected_dim`` elements.
    """
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file. Only point this at feature files you produced or otherwise trust.
    with open(path, 'rb') as f:
        obj = pickle.load(f)
    if not isinstance(obj, (list, tuple)):
        raise SystemExit('features pkl must contain list/tuple')

    n_rows = len(obj)

    # Cheap fail-fast pass over a random subset. Clamp at 0 so a non-positive
    # sample_check disables the pre-check instead of crashing random.sample.
    n_checked = max(0, min(sample_check, n_rows))
    for i in random.sample(range(n_rows), n_checked):
        if len(obj[i]) != expected_dim:
            raise SystemExit(f"vector length mismatch at {i}: {len(obj[i])} vs {expected_dim}")

    arr = np.empty((n_rows, expected_dim), dtype=out_dtype)
    for i, vec in enumerate(obj):
        row = np.asarray(vec, dtype=out_dtype)
        # Validate every row, not just the sampled ones: assigning a length-1
        # vector into arr[i] would otherwise be silently broadcast across the
        # whole row, and other mismatches would raise an unhelpful ValueError.
        if row.shape != (expected_dim,):
            raise SystemExit(f"vector length mismatch at {i}: {row.shape} vs {expected_dim}")
        arr[i] = row
    return arr

def main():
    """CLI entry point: cluster feature vectors with FINCH and save each level.

    Reads feature vectors from ``--features-pkl``, optionally joins them
    row-by-row with the ``Image``/``Caption`` columns of ``--meta-csv``, runs
    FINCH, and writes one ``clustered_captions_level_<k>.csv`` file per
    returned label column into ``--out-dir``.

    Raises:
        SystemExit: If the metadata CSV lacks the required columns or its row
            count differs from the number of feature vectors (in addition to
            the errors raised while loading the features).
    """
    import sys  # local import: only needed for the stderr warning below

    ap = argparse.ArgumentParser()
    ap.add_argument('--features-pkl', type=str, required=True)
    ap.add_argument('--expected-dim', type=int, default=256)
    ap.add_argument('--meta-csv', type=str, default=None, help='optional CSV with [Image, Caption]')
    ap.add_argument('--out-dir', type=str, default='outputs/clusters')
    args = ap.parse_args()

    X = load_features_pkl(args.features_pkl, args.expected_dim)

    meta = None
    if args.meta_csv:
        if os.path.exists(args.meta_csv):
            meta = pd.read_csv(args.meta_csv)
            missing = [col for col in ('Image', 'Caption') if col not in meta.columns]
            if missing:
                # Clean exit instead of a bare KeyError traceback.
                raise SystemExit(f"meta CSV is missing required column(s): {missing}")
            meta = meta[['Image', 'Caption']]
            if len(meta) != len(X):
                raise SystemExit('meta rows != features N')
        else:
            # Keep the original best-effort behaviour (continue without
            # metadata) but make it visible, so a mistyped path does not
            # silently yield CSVs without Image/Caption columns.
            print(f"[WARN] meta CSV not found, continuing without it: {args.meta_csv}", file=sys.stderr)

    # NOTE(review): use_ann_above_samples=0 presumably forces FINCH's
    # approximate-nearest-neighbour path for any input size - confirm against
    # the installed FINCH version.
    labels_per_level, _, _ = FINCH(X, use_ann_above_samples=0)
    os.makedirs(args.out_dir, exist_ok=True)

    # One label column per level; rows line up with the feature vectors.
    N, L = labels_per_level.shape
    for level in range(L):
        labels = labels_per_level[:, level]
        df = (meta.copy() if meta is not None else pd.DataFrame(index=range(N)))
        df['cluster'] = labels
        out_csv = os.path.join(args.out_dir, f"clustered_captions_level_{level+1}.csv")
        df.to_csv(out_csv, index=False)
        print(f"[SAVED] level {level+1} → {out_csv}")

# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
