
from datasets import load_dataset
from pathlib import Path
from tqdm.auto import tqdm
import argparse, json, sys

def dump_jsonl(prompts, outfile: Path):
    """Write prompts to *outfile* as JSONL, one ``{"prompt": ...}`` object per line.

    Args:
        prompts: Iterable of prompt strings. It is consumed exactly once, so a
            lazy generator is fine. ``None`` entries are skipped.
        outfile: Destination file. Missing parent directories are created and
            an existing file is overwritten.

    Each prompt is stripped of leading/trailing whitespace before being
    written. Non-ASCII characters are written as-is (UTF-8), not escaped.
    """
    outfile.parent.mkdir(parents=True, exist_ok=True)
    with outfile.open("w", encoding="utf-8") as f:
        for p in prompts:
            if p is None:
                # A row without prompt text arrives here as None; calling
                # .strip() on it would raise AttributeError and abort the
                # export half-way through the file.
                continue
            record = json.dumps({"prompt": p.strip()}, ensure_ascii=False)
            f.write(record + "\n")

def stream_prompt_column(ds_id: str, split: str = "train"):
    """
    Lazily yield the prompt text of each row of a dataset split.

    The dataset is opened with ``streaming=True`` and iterated row by row.
    For every row the value under ``"Prompt"`` is yielded when it is truthy;
    otherwise the value under lower-case ``"prompt"`` is yielded, which is
    ``None`` when that key is absent too.

    NOTE(review): no column selection is applied here, so this function by
    itself does not guarantee that non-text columns (e.g. image data) are
    skipped - confirm against the `datasets` streaming behaviour.
    """
    rows = load_dataset(ds_id, split=split, streaming=True)
    for record in rows:
        text = record.get("Prompt")
        if not text:
            # some tracks name the text field "prompt" (lower-case)
            text = record.get("prompt")
        yield text

def main(argv=None):
    """
    Command-line entry point: export PartiPrompts (P2) prompts to JSONL.

    Parses ``--out-dir`` and ``--opp-track`` from *argv* (argparse falls back
    to ``sys.argv`` when it is ``None``), streams the prompts of
    ``nateraw/parti-prompts`` and writes them to
    ``<out-dir>/partiprompts_p2.jsonl``. Progress goes to stderr, the final
    confirmation to stdout.
    """
    opp_help = (
        "Any dataset under the diffusers-parti-prompts/* organisation "
        "works. Defaults to the SDXL track."
    )

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--out-dir",
        type=Path,
        default=Path("./prompts"),
        help="Where the JSONL files will be stored",
    )
    parser.add_argument(
        "--opp-track",
        default="diffusers-parti-prompts/sdxl-1.0",
        help=opp_help,
    )
    options = parser.parse_args(argv)

    # 1. PartiPrompts (P2)
    print("Downloading PartiPrompts (P2)…", file=sys.stderr)
    p2_target = options.out_dir / "partiprompts_p2.jsonl"
    dump_jsonl(stream_prompt_column("nateraw/parti-prompts"), p2_target)

    # 2. Open Parti Prompts (OPP) - currently disabled.
    # NOTE(review): an earlier comment here states that OPP is the same as P2,
    # which is presumably why the download was switched off. --opp-track is
    # still parsed, but its value is not used by this function. Re-enabling
    # would mean streaming options.opp_track and dumping it to
    # "open_parti_prompts.jsonl" in the output directory.

    print("Done, JSONL files ready for generate.py")

# Run the export only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
