{
  "evaluation_metadata": {
    "timestamp": "2025-08-07 14:05:26",
    "domain": "retail",
    "model": "/checkpoints/jiateng-sandbox/saves/qwen3-32b/tau_bench_self_cot_cpt_sft",
    "total_tasks_available": 115,
    "tasks_evaluated": 115
  },
  "performance_metrics": {
    "success_rate_percentage": 25.22,
    "successful_tasks": 29,
    "failed_tasks": 86,
    "average_turns_per_task": 24.09
  },
  "status_breakdown": {
    "completed_successfully": 29,
    "completed_incorrectly": 70,
    "incomplete": 16
  },
  "tool_usage_statistics": {
    "FindUserIdByNameZip": 78,
    "GetOrderDetails": 276,
    "GetProductDetails": 76,
    "ExchangeDeliveredOrderItems": 40,
    "TransferToHumanAgents": 431,
    "ListAllProductTypes": 9,
    "GetUserDetails": 100,
    "ModifyPendingOrderItems": 34,
    "ReturnDeliveredOrderItems": 43,
    "CancelPendingOrder": 35,
    "FindUserIdByEmail": 31,
    "Think": 14,
    "Calculate": 3,
    "ModifyUserAddress": 10,
    "ModifyPendingOrderPayment": 1,
    "ModifyPendingOrderAddress": 22,
    "EndConversation": 3
  },
  "detailed_results_file": "tau_bench_retail__checkpoints_jiateng_sandbox_saves_qwen3_32b_tau_bench_self_cot_cpt_sft_20250807_074253.json"
}