#!/usr/bin/env python3
"""
Generate a BugBench-style dataset by applying human-generated bug patterns to model-generated buggy solutions.

This script:
1. Loads buggy_solution from `anonymous/bugbench_v2` (model-generated to mimic human style bugs)
2. Loads a human-generated bug from the corresponding task in `anonymous/bugbench`
3. Loads the ground truth
4. Prompts the model to apply the human-generated bug pattern to the buggy_solution from bugbench_v2
5. Validates and pushes the results to HuggingFace

Example:
  python examples/bugs/data_processing/generate_bugbench_v2.py \
    --bugbench-v2-repo anonymous/bugbench_v2 --bugbench-v2-split train \
    --bugbench-repo anonymous/bugbench --bugbench-split train \
    --output-repo yourname/bugbench_v2_human_applied --output-split train \
    --model gpt-5.2 --private
"""

from __future__ import annotations

import argparse
import ast
import asyncio
import os
import re
import textwrap
from io import StringIO
import tokenize
from typing import Any, Dict, Iterable, List, Optional, Tuple


# Matches a Markdown code fence (optionally tagged "python", case-insensitive)
# and captures the text between the opening and closing fence lines.
_CODE_BLOCK_RE = re.compile(r"```(?:python)?\s*\n(.*?)\n```", re.DOTALL | re.IGNORECASE)
# Maximum number of generate-then-validate attempts made for a single example.
_MAX_TRIES_PER_EXAMPLE = 5


def _env_first(*names: str) -> str:
    """Return the first non-blank environment variable among `names`.

    Variables are inspected in the order given. Unset or whitespace-only
    values are skipped; the winning value is returned stripped. An empty
    string is returned when no variable qualifies.
    """
    values = (os.getenv(name, "") for name in names)
    return next(
        (value.strip() for value in values if isinstance(value, str) and value.strip()),
        "",
    )


def _chunked(xs: List[int], n: int) -> Iterable[List[int]]:
    """Yield consecutive slices of `xs` holding at most `n` items each.

    The final slice may be shorter than `n`. Because this is a generator,
    the ValueError for a non-positive `n` is raised on first iteration.
    """
    if n <= 0:
        raise ValueError("chunk size must be > 0")
    total = len(xs)
    offset = 0
    while offset < total:
        yield xs[offset : offset + n]
        offset += n


def _extract_first_python_code_block(text: str) -> str:
    """Return the contents of the first fenced code block found in `text`.

    When no fence matches, the whole (stripped) text is returned instead.
    A None/empty input yields an empty string.
    """
    cleaned = (text or "").strip()
    match = _CODE_BLOCK_RE.search(cleaned)
    return match.group(1).strip() if match else cleaned


def _strip_python_comments(code: str) -> str:
    """Drop `# ...` comments from Python source, leaving string literals intact.

    The source is tokenized so that `#` characters inside strings/docstrings
    are preserved. If tokenization fails (e.g. the snippet is not valid
    Python), only lines consisting solely of a comment are removed.
    Trailing whitespace on every line and surrounding blank space are trimmed.
    """
    source = (code or "").strip()
    if not source:
        return ""
    try:
        non_comment_tokens = [
            tok
            for tok in tokenize.generate_tokens(StringIO(source).readline)
            if tok.type != tokenize.COMMENT
        ]
        stripped = tokenize.untokenize(non_comment_tokens)
    except Exception:
        # Best-effort fallback: remove full-line comments only.
        stripped = "\n".join(
            line for line in source.splitlines() if not line.lstrip().startswith("#")
        )
    return "\n".join(line.rstrip() for line in stripped.splitlines()).strip()


def _get_uid(example: Dict[str, Any]) -> str:
    """Return the example's identifier as a string.

    The "uid" field is preferred, then "task_id"; an empty string is
    returned when neither holds a truthy value.
    """
    for key in ("uid", "task_id"):
        identifier = example.get(key)
        if identifier:
            return str(identifier)
    return ""


def _extract_code_from_fenced(code: str) -> str:
    """Return the inside of the first code fence in `code`, if there is one.

    Falsy input yields an empty string; input without a matching fence is
    returned stripped but otherwise unchanged.
    """
    if not code:
        return ""
    stripped = code.strip()
    fenced = _CODE_BLOCK_RE.search(stripped)
    return stripped if fenced is None else fenced.group(1).strip()


def _extract_code_safely(code: str) -> str:
    """Extract code via rllm's `extract_code_from_model`, else via the local regex.

    When the rllm helper is importable, its result is used unless it is
    falsy, in which case the input is returned unchanged. An ImportError
    switches to the fence-based extractor defined in this module.
    """
    try:
        from rllm.rewards.code_reward import extract_code_from_model

        return extract_code_from_model(code) or code
    except ImportError:
        return _extract_code_from_fenced(code)


def _strip_code_fences(s: str) -> str:
    """Remove a surrounding Markdown code fence from `s`, if it starts with one.

    Text that does not begin with three backticks is returned stripped.
    Otherwise the backticks on both ends are removed, and a first line that
    is exactly "python" or "py" (case-insensitive) is dropped as well.
    """
    stripped = (s or "").strip()
    if not stripped.startswith("```"):
        return stripped

    # Handle ```python ...``` or ``` ...```
    inner = stripped.strip("`")
    header, newline, remainder = inner.partition("\n")
    if newline and header.strip().lower() in ("python", "py"):
        # The first line was only the language tag; keep what follows it.
        inner = remainder
    return inner.strip().rstrip("`").strip()


def _reindent_as_function_body(text: str) -> str:
    """Dedent `text`, then indent every non-blank line by exactly four spaces."""
    dedented = textwrap.dedent(text).rstrip()
    return "\n".join(("    " + ln if ln.strip() else ln) for ln in dedented.splitlines()).rstrip()


def _extract_function_body_from_code(code: str, func_name: str) -> Optional[str]:
    """
    Return the body of the function named `func_name` found in `code`.

    The body is returned as text with a uniform 4-space indentation (the
    BugBench convention). Lookup strategy:

    1. Parse `code` with `ast` and look for a *top-level* (async) function
       definition with exactly that name; its body is sliced out by line
       numbers.
    2. If parsing fails or no such top-level function exists, fall back to a
       textual search for `def <func_name>(` and return everything after that
       line. The name is matched exactly, so `foo` does not match
       `def foobar(...)`.

    Returns None when `code`/`func_name` is empty or the function cannot be
    located, and "" when the function node has no body.
    """
    code = (code or "").strip("\n")
    if not code or not func_name:
        return None

    try:
        tree = ast.parse(code)
    except Exception:
        # Model output is frequently not valid Python; use the text fallback.
        tree = None

    if tree is not None:
        for node in tree.body:
            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) and node.name == func_name:
                if not getattr(node, "body", None):
                    return ""
                first_stmt, last_stmt = node.body[0], node.body[-1]
                # ast line numbers are 1-based; slice bounds are 0-based/exclusive.
                start = int(first_stmt.lineno) - 1
                end = int(
                    getattr(node, "end_lineno", None)
                    or getattr(last_stmt, "end_lineno", None)
                    or last_stmt.lineno
                )
                body_lines = code.splitlines()[start:end]
                # Re-indent to match BugBench convention: body-only with 4-space indent.
                return _reindent_as_function_body("\n".join(body_lines))

    # Fallback: naive text slicing after the "def func_name(" line. The name is
    # escaped and must be followed by "(" so a prefix of a longer identifier
    # (e.g. "foo" vs "foobar") is not mistaken for the requested function.
    match = re.search(rf"\bdef\s+{re.escape(func_name)}\s*\(", code)
    if match is None:
        return None
    after = code[match.start():]
    # Find first newline after def line
    nl = after.find("\n")
    if nl == -1:
        return None
    return _reindent_as_function_body(after[nl + 1 :].rstrip())


def _normalize_body_like_bugbench(text: str, entry_point: str) -> str:
    """
    Convert a code snippet to BugBench's `buggy`/`canonical_solution` shape:
    only the function body, indented by four spaces.

    Code fences are removed first. If `entry_point` is given and its function
    can be located in the snippet, that function's body is returned; otherwise
    the snippet is assumed to already be a body and is merely re-indented.
    """
    snippet = _strip_code_fences(text).strip("\n")
    if not snippet:
        return ""

    # Full source containing the target function: return just its body.
    if entry_point:
        extracted = _extract_function_body_from_code(snippet, entry_point)
        if extracted is not None:
            return extracted

    # Already a bare body: normalize indentation only.
    body_lines = textwrap.dedent(snippet).rstrip().splitlines()
    return "\n".join(("    " + line) if line.strip() else line for line in body_lines).rstrip()


# Columns of a BugBench-format row. These are the keys returned by
# `_convert_to_bugbench_format`, and `_run` uses this list to declare the
# (all-string) features of the converted dataset.
BUGBENCH_COLUMNS = [
    "task_id",
    "instruct_prompt",
    "buggy",
    "canonical_solution",
    "test",
    "complete_prompt",
    "code_prompt",
    "entry_point",
    "doc_struct",
    "libs",
]


def _convert_to_bugbench_format(example: Dict[str, Any], output_column: str = "buggy_solution") -> Dict[str, Any]:
    """
    Map a bugbench_v2-style example onto the BugBench column layout.

    The buggy code is read from `output_column` (falling back to
    "buggy_solution") and the reference code from "reference_solution"; both
    are normalized to body-only, 4-space-indented text. All remaining fields
    are copied as strings, using the first truthy value among their known
    aliases.
    """

    def first_text(*keys: str, default: str = "") -> str:
        # str() of the first truthy value among `keys`, else `default`.
        for key in keys:
            value = example.get(key, "")
            if value:
                return str(value)
        return default

    entry_point = first_text("entry_point")

    # Newly generated bug from output_column, else the pre-existing buggy_solution.
    buggy_source = example.get(output_column) or example.get("buggy_solution", "")
    reference_source = example.get("reference_solution", "")

    return {
        "task_id": first_text("uid", "task_id"),
        "instruct_prompt": first_text("instruct_prompt"),
        "buggy": _normalize_body_like_bugbench(buggy_source, entry_point),
        "canonical_solution": _normalize_body_like_bugbench(reference_source, entry_point),
        "test": first_text("ground_truth", "test"),
        "complete_prompt": first_text("complete_prompt", "question"),
        "code_prompt": first_text("code_prompt", "starter_code"),
        "entry_point": entry_point,
        # Optional fields fall back to empty defaults.
        "doc_struct": first_text("doc_struct"),
        "libs": first_text("libs", default="[]"),
    }


def _build_human_likeness_comparison_prompt(
    problem: str,
    original_buggy_solution: str,
    new_buggy_solution: str,
    ground_truth: str,
    human_reference_bug: str,
) -> str:
    """
    Build the judge prompt that asks which of two buggy solutions is more human-like.

    In the prompt, "Bug A" is `original_buggy_solution` (the original
    model-generated bug) and "Bug B" is `new_buggy_solution` (the bug produced
    by applying the human pattern). `ground_truth` and `human_reference_bug`
    are included as reference material. Every code argument is passed through
    `_extract_code_safely` before being embedded in a ```python fence.

    The prompt instructs the model to answer with a single JSON object holding
    the keys "more_human_like" ("A" | "B" | "tie"), "score_A", "score_B"
    (0-10) and "reasoning".
    """
    original_code = _extract_code_safely(original_buggy_solution)
    new_code = _extract_code_safely(new_buggy_solution)
    gt_code = _extract_code_safely(ground_truth)
    human_code = _extract_code_safely(human_reference_bug)

    # Doubled braces in the schema line below are f-string escapes for literal { }.
    return f"""You are an expert code reviewer evaluating which of two buggy solutions is more human-like.

PROBLEM:
{problem}

GROUND TRUTH (correct solution for reference):
```python
{gt_code}
```

HUMAN-GENERATED BUG (reference for what human bugs look like):
```python
{human_code}
```

BUG A:
```python
{original_code}
```

BUG B:
```python
{new_code}
```

TASK:
Compare Bug A and Bug B to determine which one is more human-like. Consider:

1. **Naturalness**: Which bug looks more like something a human would naturally write (vs. something an LLM would generate)?
2. **Bug characteristics**: Which bug has characteristics more typical of human mistakes (e.g., subtle off-by-ones, natural oversights, realistic edge case misses)?
3. **Code style**: Which bug's code style and structure feels more human-written?
4. **Bug subtlety**: Which bug is more subtly wrong in a way humans typically make mistakes?

OUTPUT (JSON ONLY):
Return ONLY a JSON object (no markdown, no code fences, no extra text) in this exact schema:

{{"more_human_like": "A" | "B" | "tie", "score_A": <number 0-10>, "score_B": <number 0-10>, "reasoning": "<brief explanation>"}}

Where:
- "more_human_like": "A" if Bug A is more human-like, "B" if Bug B is more human-like, "tie" if they're equally human-like
- "score_A": Human-likeness score for Bug A (0-10, where 10 is most human-like)
- "score_B": Human-likeness score for Bug B (0-10, where 10 is most human-like)
- "reasoning": Brief explanation of your judgment"""


def _build_apply_human_bug_prompt(
    problem: str,
    model_buggy_solution: str,
    human_buggy_solution: str,
    ground_truth: str,
    starter_code: Optional[str] = None,
) -> str:
    """
    Build the generation prompt that asks the model to transplant a human bug pattern.

    The prompt shows the problem statement, the ground truth, the
    model-generated buggy solution (the code to modify) and the human-written
    buggy solution (the style reference). The three code arguments are passed
    through `_extract_code_safely` first. When `starter_code` is non-blank, a
    "STARTER CODE" section is inserted after the problem statement; otherwise
    that section is omitted.

    The model is told to reply with only the modified code in a single
    ```python block.
    """
    model_code = _extract_code_safely(model_buggy_solution)
    human_code = _extract_code_safely(human_buggy_solution)
    gt_code = _extract_code_safely(ground_truth)

    # Optional section; stays empty when no usable starter code was supplied.
    starter_section = ""
    if starter_code and starter_code.strip():
        starter_section = f"""
STARTER CODE (keep the same signature / entry point):
```python
{starter_code.strip()}
```"""

    return f"""You are a code modification assistant. Your task is to apply a human-generated bug pattern to a model-generated buggy solution.

PROBLEM:
{problem}
{starter_section}

GROUND TRUTH (correct solution for reference):
```python
{gt_code}
```

MODEL-GENERATED BUGGY SOLUTION (to be modified):
```python
{model_code}
```

HUMAN-GENERATED BUGGY SOLUTION (use this as a reference for the bug pattern/style):
```python
{human_code}
```

TASK:
Analyze the human-generated buggy solution to understand:
1. What type of bug was introduced (e.g., off-by-one, wrong operator, missing edge case, boundary error)
2. Where the bug was introduced (e.g., loop bounds, conditionals, indexing, return statements)
3. How subtle the bug is (e.g., passes most tests but fails edge cases)

Then apply a similar bug pattern to the model-generated buggy solution. The resulting code should:
- Be syntactically valid Python
- Fail at least one unit test (but should compile and run)
- Preserve the overall structure and approach of the model-generated solution
- Not drastically rewrite the code
- Keep the same function signature, imports, and I/O format

Return ONLY the modified Python code inside a single ```python``` block."""


async def _generate_one_modified_buggy(
    *,
    engine,
    bugbench_v2_task: Dict[str, Any],
    bugbench_task: Dict[str, Any],
    idx: int,
) -> Tuple[int, str]:
    """Ask the model to re-create the human bug pattern inside the model-generated bug.

    Reads the problem text, starter code, model bug and ground truth from
    `bugbench_v2_task` (ground truth falls back to `bugbench_task`) and the
    human bug from `bugbench_task`. The model reply is reduced to its first
    code block, stripped of comments, and returned re-wrapped in a ```python
    fence together with `idx`.

    Raises:
        ValueError: if either buggy solution or the ground truth is missing.
        RuntimeError: if the model reply contains no code after cleaning.
    """
    v2 = bugbench_v2_task

    # Problem statement and optional starter code, first non-empty alias wins.
    problem_text = (
        v2.get("instruct_prompt") or v2.get("truncated_instruct_prompt") or v2.get("question") or ""
    ).strip()
    starter = (v2.get("starter_code") or v2.get("code_prompt") or "").strip() or None

    model_bug = v2.get("buggy_solution", "")
    human_bug = bugbench_task.get("buggy_solution", "")
    reference = v2.get("ground_truth") or bugbench_task.get("ground_truth") or ""

    # All three pieces of code are mandatory; report the first one missing.
    required_inputs = (
        (model_bug, f"Missing buggy_solution in bugbench_v2 task (idx={idx})"),
        (human_bug, f"Missing buggy_solution in bugbench task (idx={idx})"),
        (reference, f"Missing ground_truth (idx={idx})"),
    )
    for value, message in required_inputs:
        if not value:
            raise ValueError(message)

    user_prompt = _build_apply_human_bug_prompt(
        problem=problem_text,
        model_buggy_solution=model_bug,
        human_buggy_solution=human_bug,
        ground_truth=reference,
        starter_code=starter,
    )
    conversation = [
        {"role": "system", "content": "You are a careful Python programmer who can analyze and apply bug patterns."},
        {"role": "user", "content": user_prompt},
    ]

    response = await engine.get_model_response(conversation)
    reply = (getattr(response, "content", None) or getattr(response, "text", "") or "").strip()
    cleaned = _strip_python_comments(_extract_first_python_code_block(reply))
    if not cleaned:
        raise RuntimeError(f"Empty model output for idx={idx}")

    # Keep output in the dataset as a python fence, matching existing BugBench style.
    return idx, f"```python\n{cleaned}\n```"


async def _compare_human_likeness(
    *,
    engine,
    problem: str,
    original_buggy: str,
    new_buggy: str,
    ground_truth: str,
    human_reference_bug: str,
    idx: int,
) -> Optional[Dict[str, Any]]:
    """Ask the model which bug (original "A" vs new "B") is more human-like.

    Returns a dict with:
      - "more_human_like": always exactly "A", "B" or "tie". The model's
        answer is trimmed and case-normalized; anything unrecognized becomes
        "tie", so callers can compare against the canonical values directly.
      - "score_A" / "score_B": floats (0 when the key is absent).
      - "reasoning": the model's explanation as a string.
      - "raw_response": the JSON text that was parsed.

    This is a best-effort step: any failure (engine error, unparsable reply,
    non-numeric score) is logged and None is returned instead of raising.
    """
    import json

    prompt = _build_human_likeness_comparison_prompt(
        problem=problem,
        original_buggy_solution=original_buggy,
        new_buggy_solution=new_buggy,
        ground_truth=ground_truth,
        human_reference_bug=human_reference_bug,
    )

    messages = [
        {"role": "system", "content": "You are an expert code reviewer who can evaluate human-likeness of code bugs."},
        {"role": "user", "content": prompt},
    ]

    try:
        out = await engine.get_model_response(messages)
        text = (getattr(out, "content", None) or getattr(out, "text", "") or "").strip()

        # Prefer a flat JSON object mentioning the verdict key (tolerates
        # surrounding prose or code fences).
        json_match = re.search(r'\{[^{}]*"more_human_like"[^{}]*\}', text, re.DOTALL)
        if json_match:
            text = json_match.group(0)
        else:
            # The flat pattern cannot match when a string value (typically the
            # reasoning) itself contains braces; fall back to the outermost
            # brace pair so such replies are still parsed.
            first, last = text.find("{"), text.rfind("}")
            if 0 <= first < last:
                text = text[first : last + 1]

        result = json.loads(text)
        if not isinstance(result, dict):
            raise ValueError("expected a JSON object")

        # Normalize the verdict: accept e.g. "a" or " B ", map the rest to "tie".
        verdict = str(result.get("more_human_like", "tie")).strip().upper()
        if verdict not in ("A", "B"):
            verdict = "tie"

        return {
            "more_human_like": verdict,
            "score_A": float(result.get("score_A", 0)),
            "score_B": float(result.get("score_B", 0)),
            "reasoning": str(result.get("reasoning", "")),
            "raw_response": text,
        }
    except Exception as e:
        print(f"[idx={idx}] Failed to parse human-likeness comparison: {e}")
        return None


def _normalize_task_info_for_reward(task: Dict[str, Any]) -> Dict[str, Any]:
    """Build the `task_info` dict handed to RewardCodeFn (keeps this script torch-free).

    The result starts as a shallow copy of `task["extra_info"]` when that is a
    dict, otherwise of `task` itself. Three keys are then filled in if they
    are missing/empty:
      - "ground_truth": from the task's "ground_truth"/"test", then from the
        same keys of extra_info.
      - "data_source": from the task, then extra_info, else "bugbench".
      - "entry_point": from the task, then from "func_name"/"entry_point" of
        the metadata dict (task first, then extra_info).
    """
    extra = task.get("extra_info")
    has_extra = isinstance(extra, dict)
    source = extra if has_extra else task
    info: Dict[str, Any] = dict(source) if isinstance(source, dict) else {}

    if info.get("ground_truth") in (None, ""):
        tests = task.get("ground_truth") or task.get("test")
        if has_extra and tests in (None, ""):
            tests = extra.get("ground_truth") or extra.get("test")
        info["ground_truth"] = tests

    if not info.get("data_source"):
        from_extra = extra.get("data_source") if has_extra else None
        info["data_source"] = task.get("data_source") or from_extra or "bugbench"

    if not info.get("entry_point"):
        extra_metadata = extra.get("metadata") if has_extra else None
        metadata = task.get("metadata") or extra_metadata or {}
        entry_point = task.get("entry_point")
        if isinstance(metadata, dict):
            entry_point = entry_point or metadata.get("func_name") or metadata.get("entry_point")
        info["entry_point"] = entry_point

    return info


def _has_compile_or_runtime_error(meta: Dict[str, Any]) -> bool:
    """
    Heuristically decide whether the test harness output shows a crash.

    `meta["output"]` is stringified and searched case-insensitively. Output
    mentioning an AssertionError is never treated as a crash (a failing
    assertion is the desired "bug" signal); otherwise any of the known
    syntax/import/runtime error markers marks the run as crashed. Empty or
    missing output counts as no crash.
    """
    lowered = str(meta.get("output", "") or "").lower()

    # No output, or an assertion failure: runnable code that merely fails tests.
    if not lowered or "assertionerror" in lowered:
        return False

    crash_markers = (
        "syntaxerror",
        "indentationerror",
        "invalid syntax",
        "was never closed",
        "unterminated",
        "eof while scanning",
        "importerror",
        "module not found",
        "modulenotfounderror",
        "nameerror",
        "typeerror",
        "attributeerror",
        "valueerror",
        "indexerror",
        "keyerror",
        "zerodivisionerror",
        "overflowerror",
        "recursionerror",
        "runtimeerror",
        "exception",
        "traceback",
        "error during testing:",
    )
    for marker in crash_markers:
        if marker in lowered:
            return True
    return False


def _validate_bug_is_nontrivial_and_runnable(task: Dict[str, Any], buggy_fenced: str) -> Tuple[bool, bool, Dict[str, Any]]:
    """
    Run `buggy_fenced` through the reward/test harness and classify the result.

    A bug counts as valid when the harness reports `is_correct` as exactly
    False (at least one test failed) and the output shows no compile/runtime
    error according to `_has_compile_or_runtime_error`.

    Returns:
        (bug_valid, has_compile_error, metadata) where `metadata` is the
        harness metadata dict (empty dict if the harness returned none).
    """
    from rllm.rewards.code_reward import RewardCodeFn
    from rllm.rewards.reward_types import RewardConfig

    reward_input = _normalize_task_info_for_reward(task)
    reward_fn = RewardCodeFn(RewardConfig())
    verdict = reward_fn(task_info=reward_input, action=buggy_fenced)

    metadata = verdict.metadata or {}
    crashed = _has_compile_or_runtime_error(metadata)
    failed_tests = verdict.is_correct is False
    return (failed_tests and not crashed), crashed, metadata


def _load_hf_dataset(source: str, split: str):
    """Load `split` of the Hugging Face dataset `source`, tolerating a wrong split name.

    If loading the requested split raises ValueError, the dataset is loaded
    without a split. When the result exposes exactly one split, that split is
    returned with a warning; with several splits a ValueError listing them is
    raised. If the result has no `keys()`, the original error is re-raised.
    """
    from datasets import load_dataset  # type: ignore

    try:
        return load_dataset(source, split=split)
    except ValueError as err:
        everything = load_dataset(source)
        if not hasattr(everything, "keys"):
            raise
        available = list(everything.keys())
        if len(available) != 1:
            raise ValueError(f'Split "{split}" not found. Available splits: {available}') from err
        # The requested split doesn't exist but there is exactly one: use it.
        only = available[0]
        print(f'Warning: split "{split}" not found; using only available split "{only}".')
        return everything[only]


async def _run(args: argparse.Namespace) -> None:
    """Run the whole pipeline described in the module docstring.

    Steps:
      1. Load bugbench_v2 (model bugs) and bugbench (human bugs) and match
         examples by uid/task_id.
      2. Apply --start/--end/--limit to the matched examples.
      3. For each selected example, generate a new bug (up to
         `_MAX_TRIES_PER_EXAMPLE` attempts), validate it with the unit-test
         harness and optionally let the model pick the more human-like of the
         original and the new bug. If no attempt yields a valid bug, the
         original model-generated bug is kept.
      4. Keep only the examples that were actually processed (examples that
         were skipped because generation/validation kept raising are dropped,
         regardless of whether the output column already held a value),
         convert them to BugBench format and add a `test_output` column.
      5. Optionally save a local parquet file and/or push to the HF Hub.

    Raises:
        ValueError: no matching examples, an empty slice, a missing OpenAI
            key for the official endpoint, or a missing HF token when pushing.
    """
    from rllm.engine.rollout import OpenAIEngine

    # Load both datasets
    print(f"Loading bugbench_v2: {args.bugbench_v2_repo} (split={args.bugbench_v2_split})...")
    ds_v2 = _load_hf_dataset(args.bugbench_v2_repo, args.bugbench_v2_split)
    n_v2 = len(ds_v2)
    print(f"Loaded {n_v2} examples from bugbench_v2")

    print(f"Loading bugbench: {args.bugbench_repo} (split={args.bugbench_split})...")
    ds_bugbench = _load_hf_dataset(args.bugbench_repo, args.bugbench_split)
    n_bugbench = len(ds_bugbench)
    print(f"Loaded {n_bugbench} examples from bugbench")

    # Build index by uid for bugbench (a later duplicate uid overrides an earlier one).
    bugbench_by_uid: Dict[str, Dict[str, Any]] = {}
    for i in range(n_bugbench):
        example = ds_bugbench[i]
        uid = _get_uid(example)
        if uid:
            bugbench_by_uid[uid] = example

    print(f"Indexed {len(bugbench_by_uid)} examples from bugbench by uid")

    # Filter bugbench_v2 to only include examples that have a match in bugbench
    matched_indices: List[int] = []
    for i in range(n_v2):
        example = ds_v2[i]
        uid = _get_uid(example)
        if uid and uid in bugbench_by_uid:
            matched_indices.append(i)

    print(f"Found {len(matched_indices)} matching examples")

    if not matched_indices:
        raise ValueError("No matching examples found between bugbench_v2 and bugbench. Check that uids/task_ids match.")

    # Apply start/end/limit filters (positions refer to the matched list, not to ds_v2).
    start = max(0, int(args.start))
    end = len(matched_indices) if args.end is None else min(len(matched_indices), int(args.end))
    if end <= start:
        raise ValueError(f"Invalid slice: start={start} end={end} (matched examples={len(matched_indices)})")

    filtered_indices = matched_indices[start:end]
    if args.limit is not None:
        filtered_indices = filtered_indices[:int(args.limit)]

    print(f"Will process {len(filtered_indices)} examples (slice {start}:{end}, limit={args.limit})")

    output_col = str(args.output_column)

    # Initialize output values (keep existing if present and we're not overwriting).
    existing_vals: List[Optional[str]]
    if output_col in ds_v2.column_names:
        existing_vals = [ds_v2[i].get(output_col) for i in range(n_v2)]
    else:
        existing_vals = [None for _ in range(n_v2)]

    # Store comparison results if enabled
    comparison_results: List[Optional[Dict[str, Any]]] = [None for _ in range(n_v2)]
    # Store test outputs
    test_outputs: List[str] = ["" for _ in range(n_v2)]
    # ds_v2 indices for which a result was produced. Tracked explicitly because
    # existing_vals may be pre-filled from the source column (the default output
    # column is the input column), so "value is not None" cannot tell a
    # processed example from a skipped one.
    processed_indices = set()

    if not filtered_indices:
        print("Nothing to do.")
    else:
        api_key = args.api_key or _env_first("OPENAI_API_KEY")
        if "api.openai.com" in str(args.base_url) and not api_key:
            raise ValueError("OPENAI_API_KEY is missing/empty (or pass --api-key).")

        sampling_params: Dict[str, Any] = {
            "reasoning_effort": args.reasoning_effort,
        }
        # Let the engine handle gpt-5's `max_completion_tokens`.
        if args.max_completion_tokens is not None:
            sampling_params["max_completion_tokens"] = int(args.max_completion_tokens)

        engine = OpenAIEngine(
            model=args.model,
            base_url=args.base_url,
            api_key=api_key or "EMPTY",
            max_response_length=int(args.max_completion_tokens or 2048),
            api_retries=int(args.api_retries),
            sampling_params=sampling_params,
            verbose=bool(args.verbose),
        )

        # Caps the number of examples being generated/validated at the same time.
        sem = asyncio.Semaphore(int(args.concurrency))

        def _format_test_output(meta: Dict[str, Any]) -> str:
            """Render the harness "output" entry as text (dicts as pretty JSON)."""
            out = meta.get("output", "")
            if isinstance(out, dict):
                try:
                    import json
                    return json.dumps(out, indent=2, ensure_ascii=False)
                except Exception:
                    return str(out)
            return str(out)

        async def _bounded(i: int) -> Optional[Tuple[int, str, Optional[Dict[str, Any]], str]]:
            """Process ds_v2[i]; return (index, bug, comparison, test_output) or None if skipped."""
            async with sem:
                bugbench_v2_task = ds_v2[i]
                uid = _get_uid(bugbench_v2_task)
                bugbench_task = bugbench_by_uid[uid]

                last_meta: Dict[str, Any] = {}
                for attempt in range(1, _MAX_TRIES_PER_EXAMPLE + 1):
                    try:
                        _, buggy = await _generate_one_modified_buggy(
                            engine=engine,
                            bugbench_v2_task=bugbench_v2_task,
                            bugbench_task=bugbench_task,
                            idx=i,
                        )
                    except Exception as e:
                        print(f"[idx={i} uid={uid}] Generation failed on attempt {attempt}/{_MAX_TRIES_PER_EXAMPLE}: {e}")
                        if attempt == _MAX_TRIES_PER_EXAMPLE:
                            print(f"[idx={i} uid={uid}] Skipping example after {_MAX_TRIES_PER_EXAMPLE} failed generation attempts")
                            return None
                        continue

                    try:
                        # The harness is synchronous; run it off the event loop.
                        bug_valid, has_compile_error, meta = await asyncio.to_thread(
                            _validate_bug_is_nontrivial_and_runnable, bugbench_v2_task, buggy
                        )
                    except Exception as e:
                        print(f"[idx={i} uid={uid}] Validation failed on attempt {attempt}/{_MAX_TRIES_PER_EXAMPLE}: {e}")
                        if attempt == _MAX_TRIES_PER_EXAMPLE:
                            print(f"[idx={i} uid={uid}] Skipping example after {_MAX_TRIES_PER_EXAMPLE} failed validation attempts")
                            return None
                        continue

                    last_meta = meta or {}
                    if bug_valid and not has_compile_error:
                        # Format and store test output
                        test_output = _format_test_output(last_meta)
                        if args.verbose:
                            print(f"\n[idx={i} uid={uid}] unit test output:\n{test_output}\n")

                        # Optionally compare human-likeness and choose the more human-like bug
                        comparison_result = None
                        final_buggy = buggy  # Default to new bug
                        if args.compare_human_likeness:
                            problem = (
                                bugbench_v2_task.get("instruct_prompt")
                                or bugbench_v2_task.get("truncated_instruct_prompt")
                                or bugbench_v2_task.get("question")
                                or ""
                            ).strip()
                            original_buggy = bugbench_v2_task.get("buggy_solution", "")
                            ground_truth = bugbench_v2_task.get("ground_truth") or bugbench_task.get("ground_truth") or ""
                            human_reference_bug = bugbench_task.get("buggy_solution", "")

                            try:
                                comparison_result = await _compare_human_likeness(
                                    engine=engine,
                                    problem=problem,
                                    original_buggy=original_buggy,
                                    new_buggy=buggy,
                                    ground_truth=ground_truth,
                                    human_reference_bug=human_reference_bug,
                                    idx=i,
                                )
                                if comparison_result:
                                    winner = comparison_result.get("more_human_like", "tie")
                                    score_a = comparison_result.get("score_A", 0)
                                    score_b = comparison_result.get("score_B", 0)
                                    print(f"[idx={i} uid={uid}] Human-likeness comparison: {winner} (A={score_a:.1f}, B={score_b:.1f})")

                                    # Choose the more human-like bug
                                    if winner == "A":
                                        # Original model-generated bug is more human-like
                                        final_buggy = original_buggy
                                        print(f"[idx={i} uid={uid}] Using original bug (A) as it's more human-like")
                                        # Re-validate the original bug to get its test output
                                        try:
                                            _, _, original_meta = await asyncio.to_thread(
                                                _validate_bug_is_nontrivial_and_runnable, bugbench_v2_task, original_buggy
                                            )
                                            test_output = _format_test_output(original_meta or {})
                                        except Exception as e:
                                            print(f"[idx={i} uid={uid}] Warning: Failed to re-validate original bug: {e}")
                                            # Don't pair bug A with the output recorded for bug B.
                                            test_output = ""
                                    elif winner == "B":
                                        # New bug is more human-like (already using it)
                                        final_buggy = buggy
                                        print(f"[idx={i} uid={uid}] Using new bug (B) as it's more human-like")
                                    else:
                                        # Tie - default to new bug
                                        final_buggy = buggy
                                        print(f"[idx={i} uid={uid}] Tie - using new bug (B) by default")
                            except Exception as e:
                                print(f"[idx={i} uid={uid}] Human-likeness comparison failed: {e}")

                        return i, final_buggy, comparison_result, test_output

                    why = "compile/runtime error" if has_compile_error else "passed all tests"
                    print(f"[idx={i} uid={uid}] retry {attempt}/{_MAX_TRIES_PER_EXAMPLE}: invalid bug ({why})")

                # If we get here, all attempts were invalid - use the original model-generated bug
                original_buggy = bugbench_v2_task.get("buggy_solution", "")
                if not original_buggy:
                    print(f"[idx={i} uid={uid}] No original buggy solution found, skipping")
                    return None

                print(f"[idx={i} uid={uid}] Using original model-generated bug after {_MAX_TRIES_PER_EXAMPLE} failed attempts")

                # Get test output for the original bug
                try:
                    _, _, original_meta = await asyncio.to_thread(
                        _validate_bug_is_nontrivial_and_runnable, bugbench_v2_task, original_buggy
                    )
                    test_output = _format_test_output(original_meta or {})
                except Exception as e:
                    print(f"[idx={i} uid={uid}] Warning: Failed to validate original bug: {e}")
                    test_output = ""

                # No comparison needed since we're using the original
                return i, original_buggy, None, test_output

        completed = 0
        skipped = 0
        for batch in _chunked(filtered_indices, max(int(args.batch_size), 1)):
            results = await asyncio.gather(*[_bounded(i) for i in batch], return_exceptions=True)
            for r in results:
                if isinstance(r, BaseException):
                    # Unexpected failures (including cancellation) abort the run.
                    raise r
                if r is None:
                    # Skipped example
                    skipped += 1
                    continue
                i, buggy, comparison, test_output = r
                existing_vals[i] = buggy
                if comparison is not None:
                    comparison_results[i] = comparison
                test_outputs[i] = test_output
                processed_indices.add(i)
                completed += 1
                if completed % max(int(args.log_every), 1) == 0:
                    print(f"Generated {completed}/{len(filtered_indices)} (skipped: {skipped})")

        if skipped > 0:
            print(f"\nSkipped {skipped} examples that failed after {_MAX_TRIES_PER_EXAMPLE} attempts")

    # Materialize updated dataset.
    if output_col in ds_v2.column_names:
        ds_v2 = ds_v2.remove_columns([output_col])
    ds_v2 = ds_v2.add_column(output_col, existing_vals)

    # Filter to only include successfully processed examples (skipped ones are dropped).
    successful_indices = [i for i in filtered_indices if i in processed_indices]
    if len(successful_indices) < len(filtered_indices):
        print(f"Filtering dataset: {len(successful_indices)} successful out of {len(filtered_indices)} attempted")
    processed_ds = ds_v2.select(successful_indices)

    # Convert to bugbench format
    print("Converting to bugbench format...")
    from datasets import Features, Value

    def convert_example(ex, idx):
        """Convert one selected row; `idx` is its position within `processed_ds`."""
        # Try to get doc_struct and libs from bugbench dataset if available
        uid = _get_uid(ex)
        bugbench_ex = bugbench_by_uid.get(uid, {})
        ex_with_metadata = dict(ex)
        ex_with_metadata["doc_struct"] = bugbench_ex.get("doc_struct", ex.get("doc_struct", ""))
        ex_with_metadata["libs"] = bugbench_ex.get("libs", ex.get("libs", "[]"))

        result = _convert_to_bugbench_format(ex_with_metadata, output_column=output_col)

        # Add test output: position idx in processed_ds maps back to the ds_v2
        # index through successful_indices.
        if idx < len(successful_indices):
            result["test_output"] = test_outputs[successful_indices[idx]]
        else:
            result["test_output"] = ""

        return result

    # Always add test_output column
    feature_columns = list(BUGBENCH_COLUMNS)
    feature_columns.append("test_output")

    bugbench_features = Features({k: Value("string") for k in feature_columns})
    processed_ds = processed_ds.map(
        convert_example,
        with_indices=True,
        remove_columns=processed_ds.column_names,
        features=bugbench_features,
    )

    print(f"Converted to bugbench format with columns: {processed_ds.column_names}")

    if args.save_local:
        os.makedirs(os.path.dirname(args.save_local) or ".", exist_ok=True)
        processed_ds.to_parquet(args.save_local)
        print(f"Saved local parquet: {args.save_local}")

    if args.no_push:
        print("Skipping push (--no-push). Done.")
        return

    hf_token = args.hf_token or _env_first("HUGGINGFACE_HUB_TOKEN", "HF_TOKEN")
    if not hf_token:
        raise ValueError("Missing Hugging Face token. Set HF_TOKEN/HUGGINGFACE_HUB_TOKEN or pass --hf-token.")

    print(f"Pushing to Hugging Face: {args.output_repo} (split={args.output_split}, private={bool(args.private)})")
    processed_ds.push_to_hub(
        args.output_repo,
        split=args.output_split,
        private=bool(args.private),
        token=hf_token,
    )
    print(f"Done. Dataset: https://huggingface.co/datasets/{args.output_repo}")


def _build_arg_parser() -> argparse.ArgumentParser:
    p = argparse.ArgumentParser(description=__doc__)

    # Source datasets
    p.add_argument("--bugbench-v2-repo", type=str, default="anonymous/bugbench_v2", 
                   help="HuggingFace dataset repo for bugbench_v2 (model-generated bugs)")
    p.add_argument("--bugbench-v2-split", type=str, default="test_all", 
                   help="Split to load from bugbench_v2")
    p.add_argument("--bugbench-repo", type=str, default="anonymous/bugbench", 
                   help="HuggingFace dataset repo for bugbench (human-generated bugs)")
    p.add_argument("--bugbench-split", type=str, default="test_all", 
                   help="Split to load from bugbench")

    # Output dataset
    p.add_argument("--output-repo", type=str, required=True, 
                   help="HF dataset repo id, e.g. anonymous/bugbench_v2_human_applied")
    p.add_argument("--output-split", type=str, default="test_all", 
                   help="Split name to push (default: same as --bugbench-v2-split)")
    p.add_argument("--private", action="store_true", help="Push as a private dataset")
    p.add_argument("--hf-token", type=str, default=None, 
                   help="HF token (or set HF_TOKEN/HUGGINGFACE_HUB_TOKEN)")

    # Column handling
    p.add_argument("--output-column", type=str, default="buggy_solution", 
                   help="Column to write generated buggy solutions into")
    p.add_argument("--uid-column", type=str, default="uid", 
                   help="Column used for matching and logging metadata")

    # Slice/limit
    p.add_argument("--start", type=int, default=0, help="Start index (inclusive)")
    p.add_argument("--end", type=int, default=None, help="End index (exclusive)")
    p.add_argument("--limit", type=int, default=None, help="Max number of generations to run")

    # LLM config
    p.add_argument("--model", type=str, default="gpt-5.2", help="Model name (default: gpt-5.2)")
    p.add_argument("--base-url", type=str, default="https://api.openai.com/v1", 
                   help="OpenAI-compatible base URL")
    p.add_argument("--api-key", type=str, default=None, help="API key (or set OPENAI_API_KEY)")
    p.add_argument("--reasoning-effort", type=str, default="medium", 
                   help="GPT-5 reasoning_effort (none|low|medium|high)")
    p.add_argument("--max-completion-tokens", type=int, default=2048, 
                   help="Max completion tokens (GPT-5 uses max_completion_tokens)")
    p.add_argument("--api-retries", type=int, default=5, help="API retries")
    p.add_argument("--concurrency", type=int, default=8, help="Concurrent requests")
    p.add_argument("--batch-size", type=int, default=32, 
                   help="How many tasks to schedule per gather() batch")
    p.add_argument("--log-every", type=int, default=25, help="Log progress every N completions")
    p.add_argument("--verbose", action="store_true", help="Print model outputs (engine verbose)")
    p.add_argument("--compare-human-likeness", action="store_true",
                   help="Compare original model-generated bug vs new bug for human-likeness using LLM judge")

    # Persistence
    p.add_argument("--save-local", type=str, default=None, 
                   help="Optional: save updated dataset to local parquet path")
    p.add_argument("--no-push", action="store_true", 
                   help="Generate/save locally but do not push to HF")

    return p


def main() -> None:
    """Parse CLI arguments and run the async pipeline to completion.

    When ``output_split`` is ``None`` after parsing, it is replaced by the
    value of ``bugbench_v2_split`` before the pipeline starts.
    """
    parser = _build_arg_parser()
    cli_args = parser.parse_args()

    # Resolve the output split: keep an explicit value, otherwise mirror the input split.
    chosen_split = cli_args.output_split
    cli_args.output_split = cli_args.bugbench_v2_split if chosen_split is None else chosen_split

    pipeline = _run(cli_args)
    asyncio.run(pipeline)


# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()
