#!/usr/bin/env python3
"""
Option3 data curation: drop each ODS sample whose same-line SGS record has Rouge_L > alpha.

Defaults match notebook paths; outputs curated JSONL and stats file.
"""
import argparse
import json
from pathlib import Path


def parse_args(argv=None):
    """Parse the command-line options of the curation script.

    Args:
        argv: Optional list of argument strings. When ``None`` (the
            default) argparse reads ``sys.argv[1:]``, so existing callers
            that pass nothing behave exactly as before.

    Returns:
        An ``argparse.Namespace`` with ``ods``, ``sgs``, ``out`` and ``log``
        as ``Path`` objects and ``alpha`` as a ``float``.
    """
    p = argparse.ArgumentParser(
        description='Remove ODS samples whose SGS Rouge_L score exceeds alpha.',
    )
    p.add_argument(
        '--ods',
        type=Path,
        default=Path('/hy-tmp/dc/processed_data/dolly/full/gpt2/train.jsonl'),
        help='input JSONL whose records are filtered (default: %(default)s)',
    )
    p.add_argument(
        '--sgs',
        type=Path,
        default=Path('/hy-tmp/dc/processed_data/dolly/full/gpt2-base/answers_with_metrics_gpt2-base.new.jsonl'),
        help='JSONL with one scored record per ODS line (default: %(default)s)',
    )
    p.add_argument(
        '--out',
        type=Path,
        default=Path('/hy-tmp/dc/processed_data/dolly/full/gpt2_curated/train.jsonl'),
        help='path of the curated JSONL to write (default: %(default)s)',
    )
    p.add_argument(
        '--log',
        type=Path,
        default=Path('/hy-tmp/dc/processed_data/dolly/full/gpt2_curated/train_o3.stats.txt'),
        help='path of the statistics text file to write (default: %(default)s)',
    )
    p.add_argument(
        '--alpha',
        type=float,
        default=0.1,
        help='records scoring strictly above this value are removed (default: %(default)s)',
    )
    return p.parse_args(argv)


def safe_float(x):
    """Convert *x* to ``float``, returning 0.0 when it cannot be converted.

    Only the exceptions ``float()`` itself raises are handled: ``TypeError``
    (e.g. ``None`` or a list), ``ValueError`` (a non-numeric string) and
    ``OverflowError`` (an int too large for a float).  Any other exception
    signals a real bug and is allowed to propagate.
    """
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        return 0.0


def stream_jsonl(path: Path):
    """Lazily yield one decoded JSON value per non-blank line of *path*.

    The file is read as UTF-8.  Each line is stripped of surrounding
    whitespace; lines that are empty after stripping are skipped.
    """
    with path.open('r', encoding='utf-8') as handle:
        # filter(None, ...) drops the empty strings left by blank lines.
        non_blank = filter(None, map(str.strip, handle))
        yield from map(json.loads, non_blank)


def main():
    """Filter the ODS JSONL against per-line SGS scores and write the result.

    Records from ``--ods`` and ``--sgs`` are paired by position.  For each
    pair the score is read from the SGS record's ``Rouge_L`` key (falling
    back to ``rougeL``, then to 0.0).  The ODS record is written to
    ``--out`` unless that score is strictly greater than ``--alpha``.  The
    counts are written to ``--log`` and echoed on stdout.

    If one input runs out before the other, pairing stops at the shorter
    input and a warning is printed on stderr so the length mismatch does
    not pass silently.
    """
    # Local imports keep this function self-contained as the script entry point.
    import sys
    from itertools import zip_longest

    args = parse_args()
    ods_iter = stream_jsonl(args.ods)
    sgs_iter = stream_jsonl(args.sgs)

    args.out.parent.mkdir(parents=True, exist_ok=True)
    kept = 0
    deleted_sgs = 0
    total = 0

    # Unique sentinel: any decoded JSON value (including None) is a real record.
    missing = object()
    exhausted_first = None

    try:
        with args.out.open('w', encoding='utf-8') as out_f:
            # zip_longest (unlike zip) exposes a length mismatch instead of
            # silently dropping the tail of the longer input.
            for o, s in zip_longest(ods_iter, sgs_iter, fillvalue=missing):
                if o is missing or s is missing:
                    exhausted_first = 'ODS' if o is missing else 'SGS'
                    break
                total += 1
                s_val = safe_float(s.get('Rouge_L', s.get('rougeL', 0.0)))
                if s_val > args.alpha:
                    deleted_sgs += 1
                    continue
                out_f.write(json.dumps(o, ensure_ascii=False) + '\n')
                kept += 1
    finally:
        # Close the generators so their input files are released right away,
        # even when the loop stops early or raises.
        ods_iter.close()
        sgs_iter.close()

    args.log.parent.mkdir(parents=True, exist_ok=True)
    with args.log.open('w', encoding='utf-8') as lf:
        lf.write(f'total_input: {total}\n')
        lf.write(f'kept: {kept}\n')
        lf.write(f'deleted_by_sgs: {deleted_sgs}\n')

    if exhausted_first is not None:
        print(
            f'warning: {exhausted_first} input ended first after {total} paired '
            'records; the remaining records of the other input were ignored',
            file=sys.stderr,
        )

    print('done', 'total', total, 'kept', kept, 'deleted_by_sgs', deleted_sgs)


# Run the curation only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()


