#!/usr/bin/env python3
"""
Convert meta.json to the llama-factory training format.

Usage example:
    python scripts/convert_to_llama_factory.py \
        --input data/datasets/1/meta.json \
        --output data/datasets/1/train.jsonl
"""

import argparse
import json
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional

def _clean_text(value: Any, field: str) -> str:
    """Return a text field of a meta record as a stripped string.

    ``None`` (a JSON ``null``) is treated as an empty string and plain
    numbers are converted with ``str``; any other non-string type is
    rejected so that malformed records are reported instead of being
    serialized with an arbitrary ``repr``.

    Raises:
        TypeError: If ``value`` is not ``None``, a string or a number.
    """
    if value is None:
        return ""
    if isinstance(value, str):
        return value.strip()
    # bool is a subclass of int; exclude it so True/False are not silently
    # turned into the strings "True"/"False".
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return str(value)
    raise TypeError(
        f"Field '{field}' must be a string or a number, got {type(value).__name__}"
    )


def convert_to_llama_factory_format(meta_data: Dict[str, Any], image_dir: Optional[Path] = None) -> Dict[str, Any]:
    """Convert one meta record into an instruction-style training sample.

    Args:
        meta_data: Record read from meta.json. The keys ``question``,
            ``cot``, ``answer``, ``image_path`` and ``code`` are used; all
            of them are optional.
        image_dir: Optional base directory. When given, a relative
            ``image_path`` is joined onto it; absolute paths are kept.

    Returns:
        A dict with the keys ``instruction`` (the user prompt), ``input``
        (always an empty string), ``output`` (the ``<code>``, ``<think>``
        and ``<answer>`` sections that are present, one per line) and
        ``images`` (a list with the image path, or an empty list).

    Raises:
        ValueError: If both the question and the answer are empty.
        TypeError: If a text field is neither a string, a number nor null.
    """
    question = _clean_text(meta_data.get("question"), "question")
    cot = _clean_text(meta_data.get("cot"), "cot")
    answer = _clean_text(meta_data.get("answer"), "answer")
    image_path = meta_data.get("image_path", "")
    code = meta_data.get("code", {})

    if not question and not answer:
        raise ValueError("Empty question and answer")

    # Resolve the image path: only relative paths are re-rooted under
    # image_dir, absolute paths are passed through untouched.
    final_image_path = image_path
    if image_path and image_dir:
        img_p = Path(image_path)
        if not img_p.is_absolute():
            final_image_path = str(Path(image_dir) / img_p)

    # User prompt layout:
    #   <image>                      (only when the record has an image)
    #   Solve the following problem.
    #   Provide your solution in the following format:
    #   <code>...</code>
    #   <think>...</think>
    #   <answer>...</answer>
    #   Problem:
    #   {question}
    user_prompt_parts = ["<image>"] if image_path else []
    user_prompt_parts.extend([
        "Solve the following problem.",
        "Provide your solution in the following format:",
        "<code>...</code>",
        "<think>...</think>",
        "<answer>...</answer>",
        "Problem:",
        question,
    ])
    instruction = "\n".join(user_prompt_parts)

    # Target output keeps the same order as the prompt template:
    # <code>, then <think>, then <answer>. Empty sections are omitted.
    output_parts = []
    if code:
        # Single-line JSON so the whole <code> section stays on one line.
        code_json = json.dumps(code, ensure_ascii=False)
        output_parts.append(f"<code>{code_json}</code>")
    if cot:
        output_parts.append(f"<think>{cot}</think>")
    if answer:
        output_parts.append(f"<answer>{answer}</answer>")
    output = "\n".join(output_parts)

    return {
        "instruction": instruction,
        "input": "",
        "output": output,
        "images": [final_image_path] if image_path else [],
    }

def main():
    """Command-line entry point: convert a meta.json list to a JSONL file.

    Reads the JSON array given by ``--input``, converts every record with
    ``convert_to_llama_factory_format`` and writes one JSON object per line
    to ``--output``. Records that fail to convert are reported on stderr
    and skipped. With ``--format conversation`` (the default) each sample
    is wrapped in a system/human/gpt conversation; with
    ``--format instruction`` the converter's dict is written as-is.
    ``--image-dir`` optionally supplies a base directory for relative
    image paths.

    Exits with status 1 when the input is missing, unreadable, not a JSON
    list, or when the output file cannot be written.
    """
    parser = argparse.ArgumentParser(
        description="Convert meta.json to llama-factory training format"
    )
    parser.add_argument(
        "--input",
        type=str,
        required=True,
        help="Input meta.json file path",
    )
    parser.add_argument(
        "--output",
        type=str,
        required=True,
        help="Output JSONL file path",
    )
    parser.add_argument(
        "--format",
        type=str,
        default="conversation",
        choices=["conversation", "instruction"],
        help="Output format: 'conversation' (dialog format) or 'instruction' (instruction format), default is 'conversation'",
    )
    parser.add_argument(
        "--image-dir",
        type=str,
        default=None,
        help="Optional base directory joined with relative image paths; by default image paths are kept as written in meta.json",
    )
    args = parser.parse_args()

    input_path = Path(args.input)
    output_path = Path(args.output)
    # Without --image-dir the converter leaves image paths untouched.
    image_dir = Path(args.image_dir) if args.image_dir else None

    if not input_path.exists():
        print(f"[ERR] Input file does not exist: {input_path}", file=sys.stderr)
        sys.exit(1)

    # Ensure the output directory exists
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Read meta.json
    print(f"[INFO] Reading {input_path}...")
    try:
        with input_path.open("r", encoding="utf-8") as f:
            meta_list = json.load(f)
    except Exception as e:
        # Top-level boundary: report any read/parse failure and stop.
        print(f"[ERR] Failed to read meta.json: {e}", file=sys.stderr)
        sys.exit(1)

    if not isinstance(meta_list, list):
        print("[ERR] meta.json should be a list/array", file=sys.stderr)
        sys.exit(1)

    print(f"[INFO] Found {len(meta_list)} records")

    # Keep system prompt consistent with the eval script. It is the same
    # for every record, so build it once outside the loop.
    system_prompt = "You are a mathematical reasoning assistant.\nYour task is to solve the problem and give the correct answer."
    use_conversation = args.format == "conversation"

    # Convert format
    converted_data = []
    for i, meta_data in enumerate(meta_list):
        try:
            converted = convert_to_llama_factory_format(meta_data, image_dir)
        except Exception as e:
            # Best effort: a malformed record is reported and skipped so
            # that one bad entry does not abort the whole conversion.
            print(f"[WARN] Failed to convert record {i+1}: {e}", file=sys.stderr)
            continue

        if not use_conversation:
            # Instruction format: the converter's dict is already final.
            converted_data.append(converted)
            continue

        # Conversation format: system, human and gpt turns plus the images.
        converted_data.append({
            "conversations": [
                {
                    "from": "system",
                    "value": system_prompt,
                },
                {
                    "from": "human",
                    "value": converted["instruction"],
                },
                {
                    "from": "gpt",
                    "value": converted["output"],
                },
            ],
            "images": converted["images"],
        })

    # Write JSONL file
    print(f"[INFO] Writing to {output_path}...")
    try:
        with output_path.open("w", encoding="utf-8") as f:
            for item in converted_data:
                f.write(json.dumps(item, ensure_ascii=False) + "\n")
    except Exception as e:
        print(f"[ERR] Failed to write output file: {e}", file=sys.stderr)
        sys.exit(1)

    print(f"[DONE] Successfully converted {len(converted_data)} records")
    print(f"[DONE] Output written to {output_path}")

    # Show example
    if converted_data:
        print("\n[Example] First record:")
        print(json.dumps(converted_data[0], ensure_ascii=False, indent=2))


# Run the conversion only when this file is executed as a script, so that
# importing the module does not trigger argument parsing or file I/O.
if __name__ == "__main__":
    main()

