from benchmarks.BrowseComp.loader import BrowseCompDataset

import argparse
import json

from workflow.qwq.workflow_manager import WorkflowManager


# The number of questions to run is set with the --max_test flag (default: 10).

if __name__ == "__main__":
    # Command-line driver: run the QwQ workflow over BrowseComp questions and
    # dump one record per question to a JSON file in the current directory.
    #
    # Flags:
    #   --topic     "All" (default) loads the full set; any other value is
    #               passed to BrowseCompDataset.get_by_topic().
    #   --max_test  maximum number of questions to process (default 10).
    parser = argparse.ArgumentParser()
    parser.add_argument("--topic", type=str, default="All")
    parser.add_argument("--max_test", type=int, default=10)
    args = parser.parse_args()

    # load the dataset
    if args.topic == "All":
        dataset = BrowseCompDataset().get_full_set()
    else:
        dataset = BrowseCompDataset().get_by_topic(args.topic)

    # Each question gets up to this many workflow runs before it is recorded
    # as failed. Loop-invariant, so it is set once up front.
    max_attempts = 3
    results = []

    try:
        for test_count, example in enumerate(dataset, start=1):
            if test_count > args.max_test:
                break

            success = False
            last_error = None

            for attempt in range(1, max_attempts + 1):
                try:
                    print(f"Processing question {test_count} (attempt {attempt}/{max_attempts}): {example['problem'][:100]}...")
                    workflow_manager = WorkflowManager(example["problem"], base_workspace=f"qwq_browsecomp_workspace_{args.topic}")
                    result = workflow_manager.run()
                    # Prefer the final answer; fall back to the running
                    # summary, then to a fixed placeholder string.
                    answer = result.get('final_answer', result.get('current_summary', 'No answer found'))
                    # Success records store the 1-based attempt that worked
                    # under the key "attempt" (singular).
                    results.append({
                        "question": example["problem"],
                        "topic": example["problem_topic"],
                        "answer": answer,
                        "attempt": attempt
                    })
                    success = True
                    print(f"✅ Success on attempt {attempt}")
                    break
                except Exception as exc:
                    # Broad catch is deliberate: any failure of a single run
                    # should trigger a retry rather than abort the benchmark.
                    last_error = exc
                    print(f"❌ Attempt {attempt} failed: {exc}")
                    if attempt < max_attempts:
                        print(f"🔄 Retrying... ({attempt + 1}/{max_attempts})")
                    else:
                        print(f"💥 All {max_attempts} attempts failed for question: {example['problem'][:100]}")

            if not success:
                # Record the failed question. Note the key here is "attempts"
                # (plural, total tries), unlike the success record above.
                results.append({
                    "question": example["problem"],
                    "topic": example["problem_topic"],
                    "answer": "FAILED_ALL_ATTEMPTS",
                    "error": str(last_error),
                    "attempts": max_attempts
                })
    finally:
        # save the results to a json file
        # Done in a finally block so that an interrupt (Ctrl-C) or an
        # unexpected error part-way through the run still leaves the answers
        # gathered so far on disk; the exception then propagates as usual.
        with open(f"qwq_browsecomp_results_{args.topic}_without_memory.json", "w") as f:
            json.dump(results, f)