import asyncio
import json
import os
import sys
import time
import uuid
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional

# Ensure we can import coordinator module
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

# Import frontend components
from google.adk.events import Event
from google.adk.runners import Runner
from google.adk.sessions import InMemorySessionService
from google.genai import types
from google.adk import Agent

# Import coordinator
try:
    from coordinator_benchtest_finance import get_root_agent
except ImportError:
    # The coordinator module could not be imported: warn on stdout and
    # install a stand-in so the name get_root_agent is still defined.
    print("Warning: Could not import coordinator_benchtest_finance. Using mock coordinator.")

    def get_root_agent():
        """Mock replacement for the coordinator factory; always returns None."""
        return None

@dataclass
class FunctionCall:
    """A single function call: the function's name plus its arguments."""
    # Name of the function being called (e.g. "send_message" in the task list).
    function: str
    # Arguments of the call, keyed by parameter name.
    args: Dict[str, Any]

@dataclass
class FinanceSearchTask:
    """Definition of one finance search benchmark task.

    Bundles a prompt with its reference answer and reference function calls,
    plus descriptive metadata (difficulty, comment, category).
    """
    # Task identifier string.
    id: str
    # Prompt text of the task.
    prompt: str
    # Reference answer text; the visible task definitions put one answer per line.
    ground_truth_output: str
    # Reference function calls associated with this task.
    ground_truth_calls: List[FunctionCall]
    difficulty: str = "easy"
    # Free-form description of what the task covers.
    comment: str = ""
    category: str = "basic"  # basic, advanced, analytics, recommendation

@dataclass
class TaskResult:
    """Outcome of running one benchmark task.

    ``function_calls_made`` and ``tool_calls_observed`` default to a fresh
    empty list per instance. Passing ``None`` explicitly for either one is
    still accepted and is normalized to an empty list.
    """
    task_id: str
    prompt: str
    response: str
    ground_truth_output: str
    utility_score: float
    response_time: float
    error: Optional[str] = None
    # default_factory gives every instance its own list and keeps the
    # annotation truthful (the previous default was None on a List field).
    function_calls_made: List[str] = field(default_factory=list)
    tool_calls_observed: List[Dict] = field(default_factory=list)
    a2a_calls_made: int = 0
    category: str = "basic"

    def __post_init__(self):
        """Normalize explicit ``None`` list arguments to empty lists."""
        # Callers written against the old ``= None`` defaults may still pass
        # None; keep honoring that so the change is backward-compatible.
        if self.function_calls_made is None:
            self.function_calls_made = []
        if self.tool_calls_observed is None:
            self.tool_calls_observed = []

class FinanceSearchBenchmark:
    def __init__(self):
        """Set up the task list, result storage, and session configuration.

        The coordinator runner starts out as ``None``; it is assigned later
        by ``initialize_coordinator``.
        """
        self.tasks = self._create_finance_search_tasks()
        self.results: List[TaskResult] = []

        # Mirrors the components used by the frontend. The session id embeds
        # the creation time in whole seconds.
        created_at = int(time.time())
        self.APP_NAME, self.USER_ID = 'coordinator_app', 'benchmark_user'
        self.SESSION_ID = f'benchmark_session_{created_at}'

        self.session_service = InMemorySessionService()
        self.coordinator_runner = None
        
    async def initialize_coordinator(self):
        """Build the benchmark coordinator and wrap it in a ``Runner``.

        Creates a ``CoordinatorAgent`` (from coordinator_benchtest_finance)
        for three remote agent addresses, each overridable through an
        environment variable, asks it for its agent, and stores a ``Runner``
        for that agent on ``self.coordinator_runner``.

        Returns:
            The new runner, or ``None`` when any step raises. The failure is
            printed rather than propagated.
        """
        try:
            # NOTE: coordinator_benchtest_finance must exist for this to work;
            # an import failure is caught and reported by the handler below.
            from coordinator_benchtest_finance import CoordinatorAgent
            from google.adk.runners import Runner

            print("🔧 Initializing BENCHTEST coordinator (coordinator_benchtest_finance.py)...")

            # Environment variable -> localhost fallback for each remote agent.
            endpoint_defaults = (
                ('CLIENT_SEARCH_AGENT_URL', 'http://localhost:10001'),
                ('ADVISOR_SEARCH_AGENT_URL', 'http://localhost:10003'),
                ('TRADING_SEARCH_AGENT_URL', 'http://localhost:10002'),
            )
            agent_urls = [os.getenv(var, fallback) for var, fallback in endpoint_defaults]

            coordinator = await CoordinatorAgent.create(remote_agent_addresses=agent_urls)

            # The runner reuses the app name and session service from __init__.
            runner = Runner(
                agent=coordinator.create_agent(),
                app_name=self.APP_NAME,
                session_service=self.session_service,
            )
            self.coordinator_runner = runner

            print("✅ BENCHTEST Coordinator initialized successfully")
            print("🎯 Using coordinator_benchtest_finance.py for benchmark testing!")
            return self.coordinator_runner

        except Exception as exc:
            print(f"❌ Failed to initialize coordinator: {exc}")
            return None

    def _create_finance_search_tasks(self) -> List[FinanceSearchTask]:
        """Create 100 comprehensive finance search tasks covering clients, advisors, and trading products"""
        return [
            # 1. Client age, advisor experience, trading price
            FinanceSearchTask(
                id="comprehensive_search_client_age_advisor_experience_trading_price",
                prompt="1. What is the age of client CL001? 2. What is the years of experience of advisor AD001? 3. What is the current price of product EQ001?",
                comment="Comprehensive search covering client age, advisor experience, and trading product price",
                ground_truth_output="35\n12\n175.50",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the age of client CL001? 2. What is the years of experience of advisor AD001? 3. What is the current price of product EQ001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 2. Client risk score, advisor specialties, trading risk level
            FinanceSearchTask(
                id="comprehensive_search_client_risk_advisor_specialties_trading_risk",
                prompt="1. What is the risk assessment score of client CL002? 2. What are the specialties of advisor AD002? 3. What is the risk level of product EQ002?",
                comment="Comprehensive search covering client risk score, advisor specialties, and trading product risk level",
                ground_truth_output="3.2\n['Retirement Planning', 'Education Planning', 'Tax Planning']\nHigh",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the risk assessment score of client CL002? 2. What are the specialties of advisor AD002? 3. What is the risk level of product EQ002?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 3. Client income, advisor rating, trading name
            FinanceSearchTask(
                id="comprehensive_search_client_income_advisor_rating_trading_name",
                prompt="1. What is the annual income of client CL003? 2. What is the client satisfaction rating of advisor AD003? 3. What is the name of product ETF001?",
                comment="Comprehensive search covering client income, advisor rating, and trading product name",
                ground_truth_output="300000\n4.4\nSPDR S&P 500 ETF Trust",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the annual income of client CL003? 2. What is the client satisfaction rating of advisor AD003? 3. What is the name of product ETF001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 4. Client age, advisor experience, trading name
            FinanceSearchTask(
                id="comprehensive_search_client_age_advisor_experience_trading_name",
                prompt="1. What is the age of client CL004? 2. What is the years of experience of advisor AD004? 3. What is the name of product BOND001?",
                comment="Comprehensive search covering client age, advisor experience, and trading product name",
                ground_truth_output="41\n8\niShares Core U.S. Aggregate Bond ETF",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the age of client CL004? 2. What is the years of experience of advisor AD004? 3. What is the name of product BOND001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 5. Client risk score, advisor specialties, trading price
            FinanceSearchTask(
                id="comprehensive_search_client_risk_advisor_specialties_trading_price",
                prompt="1. What is the risk assessment score of client CL005? 2. What are the specialties of advisor AD005? 3. What is the current price of product CRYPTO001?",
                comment="Comprehensive search covering client risk score, advisor specialties, and trading product price",
                ground_truth_output="7.8\n['Retirement Planning', 'Income Planning', 'Social Security Optimization']\n42500.00",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the risk assessment score of client CL005? 2. What are the specialties of advisor AD005? 3. What is the current price of product CRYPTO001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 6. Client income, advisor rating, trading risk level
            FinanceSearchTask(
                id="comprehensive_search_client_income_advisor_rating_trading_risk",
                prompt="1. What is the annual income of client CL006? 2. What is the client satisfaction rating of advisor AD006? 3. What is the risk level of product COMM001?",
                comment="Comprehensive search covering client income, advisor rating, and trading product risk level",
                ground_truth_output="150000\n4.5\nMedium",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the annual income of client CL006? 2. What is the client satisfaction rating of advisor AD006? 3. What is the risk level of product COMM001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 7. Client age, advisor experience, trading risk level
            FinanceSearchTask(
                id="comprehensive_search_client_age_advisor_experience_trading_risk",
                prompt="1. What is the age of client CL007? 2. What is the years of experience of advisor AD007? 3. What is the risk level of product FX001?",
                comment="Comprehensive search covering client age, advisor experience, and trading product risk level",
                ground_truth_output="60\n9\nMedium",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the age of client CL007? 2. What is the years of experience of advisor AD007? 3. What is the risk level of product FX001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 8. Client risk score, advisor specialties, trading name
            FinanceSearchTask(
                id="comprehensive_search_client_risk_advisor_specialties_trading_name",
                prompt="1. What is the risk assessment score of client CL008? 2. What are the specialties of advisor AD008? 3. What is the name of product OPTION001?",
                comment="Comprehensive search covering client risk score, advisor specialties, and trading product name",
                ground_truth_output="5.2\n['Private Equity', 'Hedge Funds', 'Real Estate', 'Commodities']\nSPY Call Option",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the risk assessment score of client CL008? 2. What are the specialties of advisor AD008? 3. What is the name of product OPTION001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 9. Client income, advisor rating, trading price
            FinanceSearchTask(
                id="comprehensive_search_client_income_advisor_rating_trading_price",
                prompt="1. What is the annual income of client CL009? 2. What is the client satisfaction rating of advisor AD009? 3. What is the current price of product REIT001?",
                comment="Comprehensive search covering client income, advisor rating, and trading product price",
                ground_truth_output="180000\n4.7\n78.90",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the annual income of client CL009? 2. What is the client satisfaction rating of advisor AD009? 3. What is the current price of product REIT001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 10. Client age, advisor experience, trading name
            FinanceSearchTask(
                id="comprehensive_search_client_age_advisor_experience_trading_name_2",
                prompt="1. What is the age of client CL010? 2. What is the years of experience of advisor AD010? 3. What is the name of product CURRENCY001?",
                comment="Comprehensive search covering client age, advisor experience, and trading product name",
                ground_truth_output="26\n16\nInvesco DB US Dollar Index Bullish Fund",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the age of client CL010? 2. What is the years of experience of advisor AD010? 3. What is the name of product CURRENCY001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 11. Client risk score, advisor specialties, trading price
            FinanceSearchTask(
                id="comprehensive_search_client_risk_advisor_specialties_trading_price_2",
                prompt="1. What is the risk assessment score of client CL001? 2. What are the specialties of advisor AD001? 3. What is the current price of product INTERNATIONAL001?",
                comment="Comprehensive search covering client risk score, advisor specialties, and trading product price",
                ground_truth_output="6.5\n['High Net Worth Individuals', 'Portfolio Management', 'Alternative Investments']\n58.75",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the risk assessment score of client CL001? 2. What are the specialties of advisor AD001? 3. What is the current price of product INTERNATIONAL001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 12. Client income, advisor rating, trading risk level
            FinanceSearchTask(
                id="comprehensive_search_client_income_advisor_rating_trading_risk_2",
                prompt="1. What is the annual income of client CL002? 2. What is the client satisfaction rating of advisor AD002? 3. What is the risk level of product VOLATILITY001?",
                comment="Comprehensive search covering client income, advisor rating, and trading product risk level",
                ground_truth_output="85000\n4.6\nVery High",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the annual income of client CL002? 2. What is the client satisfaction rating of advisor AD002? 3. What is the risk level of product VOLATILITY001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 13. Client age, advisor experience, trading price
            FinanceSearchTask(
                id="comprehensive_search_client_age_advisor_experience_trading_price_2",
                prompt="1. What is the age of client CL003? 2. What is the years of experience of advisor AD003? 3. What is the current price of product SECTOR001?",
                comment="Comprehensive search covering client age, advisor experience, and trading product price",
                ground_truth_output="52\n6\n185.40",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the age of client CL003? 2. What is the years of experience of advisor AD003? 3. What is the current price of product SECTOR001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 14. Client risk score, advisor specialties, trading risk level
            FinanceSearchTask(
                id="comprehensive_search_client_risk_advisor_specialties_trading_risk_2",
                prompt="1. What is the risk assessment score of client CL004? 2. What are the specialties of advisor AD004? 3. What is the risk level of product COMMODITY001?",
                comment="Comprehensive search covering client risk score, advisor specialties, and trading product risk level",
                ground_truth_output="7.3\n['Business Owners', 'Corporate Executives', 'International Clients']\nMedium-High",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the risk assessment score of client CL004? 2. What are the specialties of advisor AD004? 3. What is the risk level of product COMMODITY001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 15. Client income, advisor rating, trading name
            FinanceSearchTask(
                id="comprehensive_search_client_income_advisor_rating_trading_name_2",
                prompt="1. What is the annual income of client CL005? 2. What is the client satisfaction rating of advisor AD005? 3. What is the name of product EMERGING001?",
                comment="Comprehensive search covering client income, advisor rating, and trading product name",
                ground_truth_output="180000\n4.8\niShares MSCI Emerging Markets ETF",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the annual income of client CL005? 2. What is the client satisfaction rating of advisor AD005? 3. What is the name of product EMERGING001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 16. Client age, advisor experience, trading risk level
            FinanceSearchTask(
                id="comprehensive_search_client_age_advisor_experience_trading_risk_2",
                prompt="1. What is the age of client CL006? 2. What is the years of experience of advisor AD006? 3. What is the risk level of product CURRENCY001?",
                comment="Comprehensive search covering client age, advisor experience, and trading product risk level",
                ground_truth_output="38\n7\nMedium",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the age of client CL006? 2. What is the years of experience of advisor AD006? 3. What is the risk level of product CURRENCY001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 17. Client risk score, advisor specialties, trading price
            FinanceSearchTask(
                id="comprehensive_search_client_risk_advisor_specialties_trading_price_3",
                prompt="1. What is the risk assessment score of client CL007? 2. What are the specialties of advisor AD007? 3. What is the current price of product VOLATILITY001?",
                comment="Comprehensive search covering client risk score, advisor specialties, and trading product price",
                ground_truth_output="3.8\n['Tax Planning', 'Tax-Loss Harvesting', 'Retirement Account Optimization']\n12.85",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the risk assessment score of client CL007? 2. What are the specialties of advisor AD007? 3. What is the current price of product VOLATILITY001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 18. Client income, advisor rating, trading name
            FinanceSearchTask(
                id="comprehensive_search_client_income_advisor_rating_trading_name_3",
                prompt="1. What is the annual income of client CL008? 2. What is the client satisfaction rating of advisor AD008? 3. What is the name of product REIT001?",
                comment="Comprehensive search covering client income, advisor rating, and trading product name",
                ground_truth_output="75000\n4.7\nVanguard Real Estate ETF",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the annual income of client CL008? 2. What is the client satisfaction rating of advisor AD008? 3. What is the name of product REIT001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 19. Client age, advisor experience, trading price
            FinanceSearchTask(
                id="comprehensive_search_client_age_advisor_experience_trading_price_3",
                prompt="1. What is the age of client CL009? 2. What is the years of experience of advisor AD009? 3. What is the current price of product COMM002?",
                comment="Comprehensive search covering client age, advisor experience, and trading product price",
                ground_truth_output="45\n15\n72.15",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the age of client CL009? 2. What is the years of experience of advisor AD009? 3. What is the current price of product COMM002?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 20. Client risk score, advisor specialties, trading risk level
            FinanceSearchTask(
                id="comprehensive_search_client_risk_advisor_specialties_trading_risk_3",
                prompt="1. What is the risk assessment score of client CL010? 2. What are the specialties of advisor AD010? 3. What is the risk level of product FX001?",
                comment="Comprehensive search covering client risk score, advisor specialties, and trading product risk level",
                ground_truth_output="4.8\n['International Clients', 'Cross-Border Planning', 'Currency Hedging', 'Global Diversification']\nMedium",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the risk assessment score of client CL010? 2. What are the specialties of advisor AD010? 3. What is the risk level of product FX001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 21. Client age, advisor experience, trading price
            FinanceSearchTask(
                id="comprehensive_search_client_age_advisor_experience_trading_price_4",
                prompt="1. What is the age of client CL001? 2. What is the years of experience of advisor AD001? 3. What is the current price of product BOND002?",
                comment="Comprehensive search covering client age, advisor experience, and trading product price",
                ground_truth_output="35\n12\n76.20",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the age of client CL001? 2. What is the years of experience of advisor AD001? 3. What is the current price of product BOND002?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 22. Client risk score, advisor rating, trading name
            FinanceSearchTask(
                id="comprehensive_search_client_risk_advisor_rating_trading_name_4",
                prompt="1. What is the risk assessment score of client CL002? 2. What is the client satisfaction rating of advisor AD002? 3. What is the name of product CRYPTO001?",
                comment="Comprehensive search covering client risk score, advisor rating, and trading product name",
                ground_truth_output="3.2\n4.6\nBitcoin",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the risk assessment score of client CL002? 2. What is the client satisfaction rating of advisor AD002? 3. What is the name of product CRYPTO001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 23. Client income, advisor specialties, trading risk level
            FinanceSearchTask(
                id="comprehensive_search_client_income_advisor_specialties_trading_risk_4",
                prompt="1. What is the annual income of client CL003? 2. What are the specialties of advisor AD003? 3. What is the risk level of product COMMODITY001?",
                comment="Comprehensive search covering client income, advisor specialties, and trading product risk level",
                ground_truth_output="300000\n['Digital Wealth Management', 'Tax-Loss Harvesting', 'Goal-Based Investing']\nMedium-High",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the annual income of client CL003? 2. What are the specialties of advisor AD003? 3. What is the risk level of product COMMODITY001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 24. Client age, advisor experience, trading name
            FinanceSearchTask(
                id="comprehensive_search_client_age_advisor_experience_trading_name_4",
                prompt="1. What is the age of client CL004? 2. What is the years of experience of advisor AD004? 3. What is the name of product FX001?",
                comment="Comprehensive search covering client age, advisor experience, and trading product name",
                ground_truth_output="41\n8\nEuro/US Dollar",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the age of client CL004? 2. What is the years of experience of advisor AD004? 3. What is the name of product FX001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 25. Client risk score, advisor rating, trading price
            FinanceSearchTask(
                id="comprehensive_search_client_risk_advisor_rating_trading_price_4",
                prompt="1. What is the risk assessment score of client CL005? 2. What is the client satisfaction rating of advisor AD005? 3. What is the current price of product OPTION001?",
                comment="Comprehensive search covering client risk score, advisor rating, and trading product price",
                ground_truth_output="7.8\n4.8\n8.50",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the risk assessment score of client CL005? 2. What is the client satisfaction rating of advisor AD005? 3. What is the current price of product OPTION001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 26. Client income, advisor specialties, trading name
            FinanceSearchTask(
                id="comprehensive_search_client_income_advisor_specialties_trading_name_4",
                prompt="1. What is the annual income of client CL006? 2. What are the specialties of advisor AD006? 3. What is the name of product REIT001?",
                comment="Comprehensive search covering client income, advisor specialties, and trading product name",
                ground_truth_output="150000\n['ESG Investing', 'Impact Investing', 'Sustainable Finance']\nVanguard Real Estate ETF",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the annual income of client CL006? 2. What are the specialties of advisor AD006? 3. What is the name of product REIT001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 27. Client age, advisor experience, trading risk level
            FinanceSearchTask(
                id="comprehensive_search_client_age_advisor_experience_trading_risk_4",
                prompt="1. What is the age of client CL007? 2. What is the years of experience of advisor AD007? 3. What is the risk level of product CURRENCY001?",
                comment="Comprehensive search covering client age, advisor experience, and trading product risk level",
                ground_truth_output="60\n9\nMedium",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the age of client CL007? 2. What is the years of experience of advisor AD007? 3. What is the risk level of product CURRENCY001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 28. Client risk score, advisor rating, trading price
            FinanceSearchTask(
                id="comprehensive_search_client_risk_advisor_rating_trading_price_5",
                prompt="1. What is the risk assessment score of client CL008? 2. What is the client satisfaction rating of advisor AD008? 3. What is the current price of product VOLATILITY001?",
                comment="Comprehensive search covering client risk score, advisor rating, and trading product price",
                ground_truth_output="5.2\n4.7\n12.85",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the risk assessment score of client CL008? 2. What is the client satisfaction rating of advisor AD008? 3. What is the current price of product VOLATILITY001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 29. Client income, advisor specialties, trading risk level
            FinanceSearchTask(
                id="comprehensive_search_client_income_advisor_specialties_trading_risk_5",
                prompt="1. What is the annual income of client CL009? 2. What are the specialties of advisor AD009? 3. What is the risk level of product INTERNATIONAL001?",
                comment="Comprehensive search covering client income, advisor specialties, and trading product risk level",
                ground_truth_output="180000\n['Private Equity', 'Hedge Funds', 'Real Estate', 'Commodities']\nMedium",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the annual income of client CL009? 2. What are the specialties of advisor AD009? 3. What is the risk level of product INTERNATIONAL001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 30. Client age, advisor experience, trading name
            FinanceSearchTask(
                id="comprehensive_search_client_age_advisor_experience_trading_name_5",
                prompt="1. What is the age of client CL010? 2. What is the years of experience of advisor AD010? 3. What is the name of product SECTOR001?",
                comment="Comprehensive search covering client age, advisor experience, and trading product name",
                ground_truth_output="26\n16\nTechnology Select Sector SPDR Fund",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the age of client CL010? 2. What is the years of experience of advisor AD010? 3. What is the name of product SECTOR001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 31. Client risk score, advisor rating, trading price
            FinanceSearchTask(
                id="comprehensive_search_client_risk_advisor_rating_trading_price_6",
                prompt="1. What is the risk assessment score of client CL001? 2. What is the client satisfaction rating of advisor AD001? 3. What is the current price of product COMMODITY002?",
                comment="Comprehensive search covering client risk score, advisor rating, and trading product price",
                ground_truth_output="6.5\n4.8\n18.45",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the risk assessment score of client CL001? 2. What is the client satisfaction rating of advisor AD001? 3. What is the current price of product COMMODITY002?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 32. Client income, advisor specialties, trading name
            FinanceSearchTask(
                id="comprehensive_search_client_income_advisor_specialties_trading_name_5",
                prompt="1. What is the annual income of client CL002? 2. What are the specialties of advisor AD002? 3. What is the name of product EMERGING001?",
                comment="Comprehensive search covering client income, advisor specialties, and trading product name",
                ground_truth_output="85000\n['Retirement Planning', 'Education Planning', 'Tax Planning']\niShares MSCI Emerging Markets ETF",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the annual income of client CL002? 2. What are the specialties of advisor AD002? 3. What is the name of product EMERGING001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 33. Client age, advisor experience, trading risk level
            FinanceSearchTask(
                id="comprehensive_search_client_age_advisor_experience_trading_risk_5",
                prompt="1. What is the age of client CL003? 2. What is the years of experience of advisor AD003? 3. What is the risk level of product BOND001?",
                comment="Comprehensive search covering client age, advisor experience, and trading product risk level",
                ground_truth_output="52\n6\nLow",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the age of client CL003? 2. What is the years of experience of advisor AD003? 3. What is the risk level of product BOND001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 34. Client risk score, advisor rating, trading name
            FinanceSearchTask(
                id="comprehensive_search_client_risk_advisor_rating_trading_name_5",
                prompt="1. What is the risk assessment score of client CL004? 2. What is the client satisfaction rating of advisor AD004? 3. What is the name of product CRYPTO001?",
                comment="Comprehensive search covering client risk score, advisor rating, and trading product name",
                ground_truth_output="7.3\n4.9\nBitcoin",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the risk assessment score of client CL004? 2. What is the client satisfaction rating of advisor AD004? 3. What is the name of product CRYPTO001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 35. Client income, advisor specialties, trading price
            FinanceSearchTask(
                id="comprehensive_search_client_income_advisor_specialties_trading_price_5",
                prompt="1. What is the annual income of client CL005? 2. What are the specialties of advisor AD005? 3. What is the current price of product COMM001?",
                comment="Comprehensive search covering client income, advisor specialties, and trading product price",
                ground_truth_output="180000\n['Retirement Planning', 'Income Planning', 'Social Security Optimization']\n185.75",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the annual income of client CL005? 2. What are the specialties of advisor AD005? 3. What is the current price of product COMM001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

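            # 36. Client age, advisor experience, trading name
            # NOTE(review): FX001's name is "Euro/US Dollar" here, but tasks 44 and 56
            # expect "Swiss Franc/US Dollar" and "Canadian Dollar/US Dollar" for the
            # same product ID. At most one can be right; verify against the product data.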
            FinanceSearchTask(
                id="comprehensive_search_client_age_advisor_experience_trading_name_6",
                prompt="1. What is the age of client CL006? 2. What is the years of experience of advisor AD006? 3. What is the name of product FX001?",
                comment="Comprehensive search covering client age, advisor experience, and trading product name",
                ground_truth_output="38\n7\nEuro/US Dollar",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the age of client CL006? 2. What is the years of experience of advisor AD006? 3. What is the name of product FX001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 37. Client risk score, advisor rating, trading risk level
            FinanceSearchTask(
                id="comprehensive_search_client_risk_advisor_rating_trading_risk_4",
                prompt="1. What is the risk assessment score of client CL007? 2. What is the client satisfaction rating of advisor AD007? 3. What is the risk level of product OPTION001?",
                comment="Comprehensive search covering client risk score, advisor rating, and trading product risk level",
                ground_truth_output="3.8\n4.5\nHigh",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the risk assessment score of client CL007? 2. What is the client satisfaction rating of advisor AD007? 3. What is the risk level of product OPTION001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

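            # 38. Client income, advisor specialties, trading price
            # NOTE(review): the specialties expected for AD008 below are identical to
            # the list used for AD009 in tasks 29 and 59; possibly a copy-paste slip.
            # TODO confirm against the advisor data.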
            FinanceSearchTask(
                id="comprehensive_search_client_income_advisor_specialties_trading_price_6",
                prompt="1. What is the annual income of client CL008? 2. What are the specialties of advisor AD008? 3. What is the current price of product REIT001?",
                comment="Comprehensive search covering client income, advisor specialties, and trading product price",
                ground_truth_output="75000\n['Private Equity', 'Hedge Funds', 'Real Estate', 'Commodities']\n78.90",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the annual income of client CL008? 2. What are the specialties of advisor AD008? 3. What is the current price of product REIT001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 39. Client age, advisor experience, trading name
            FinanceSearchTask(
                id="comprehensive_search_client_age_advisor_experience_trading_name_7",
                prompt="1. What is the age of client CL009? 2. What is the years of experience of advisor AD009? 3. What is the name of product CURRENCY001?",
                comment="Comprehensive search covering client age, advisor experience, and trading product name",
                ground_truth_output="45\n15\nInvesco DB US Dollar Index Bullish Fund",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the age of client CL009? 2. What is the years of experience of advisor AD009? 3. What is the name of product CURRENCY001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 40. Client risk score, advisor rating, trading price
            FinanceSearchTask(
                id="comprehensive_search_client_risk_advisor_rating_trading_price_7",
                prompt="1. What is the risk assessment score of client CL010? 2. What is the client satisfaction rating of advisor AD010? 3. What is the current price of product VOLATILITY001?",
                comment="Comprehensive search covering client risk score, advisor rating, and trading product price",
                ground_truth_output="4.8\n4.9\n12.85",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the risk assessment score of client CL010? 2. What is the client satisfaction rating of advisor AD010? 3. What is the current price of product VOLATILITY001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 41. Client income, advisor experience, trading name
            FinanceSearchTask(
                id="comprehensive_search_client_income_advisor_experience_trading_name",
                prompt="1. What is the annual income of client CL001? 2. What is the years of experience of advisor AD001? 3. What is the name of product BOND002?",
                comment="Comprehensive search covering client income, advisor experience, and trading product name",
                ground_truth_output="120000\n12\niShares iBoxx $ High Yield Corporate Bond ETF",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the annual income of client CL001? 2. What is the years of experience of advisor AD001? 3. What is the name of product BOND002?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 42. Client age, advisor rating, trading risk level
            FinanceSearchTask(
                id="comprehensive_search_client_age_advisor_rating_trading_risk",
                prompt="1. What is the age of client CL002? 2. What is the client satisfaction rating of advisor AD002? 3. What is the risk level of product CRYPTO001?",
                comment="Comprehensive search covering client age, advisor rating, and trading product risk level",
                ground_truth_output="28\n4.6\nVery High",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the age of client CL002? 2. What is the client satisfaction rating of advisor AD002? 3. What is the risk level of product CRYPTO001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 43. Client risk score, advisor specialties, trading price
            FinanceSearchTask(
                id="comprehensive_search_client_risk_advisor_specialties_trading_price_6",
                prompt="1. What is the risk assessment score of client CL003? 2. What are the specialties of advisor AD003? 3. What is the current price of product COMM002?",
                comment="Comprehensive search covering client risk score, advisor specialties, and trading product price",
                ground_truth_output="8.7\n['Digital Wealth Management', 'Tax-Loss Harvesting', 'Goal-Based Investing']\n72.15",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the risk assessment score of client CL003? 2. What are the specialties of advisor AD003? 3. What is the current price of product COMM002?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

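            # 44. Client income, advisor experience, trading name
            # NOTE(review): FX001's name is "Swiss Franc/US Dollar" here, but tasks 36
            # and 56 expect "Euro/US Dollar" and "Canadian Dollar/US Dollar" for the
            # same product ID. At most one can be right; verify against the product data.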
            FinanceSearchTask(
                id="comprehensive_search_client_income_advisor_experience_trading_name_2",
                prompt="1. What is the annual income of client CL004? 2. What is the years of experience of advisor AD004? 3. What is the name of product FX001?",
                comment="Comprehensive search covering client income, advisor experience, and trading product name",
                ground_truth_output="200000\n8\nSwiss Franc/US Dollar",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the annual income of client CL004? 2. What is the years of experience of advisor AD004? 3. What is the name of product FX001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 45. Client age, advisor rating, trading price
            FinanceSearchTask(
                id="comprehensive_search_client_age_advisor_rating_trading_price",
                prompt="1. What is the age of client CL005? 2. What is the client satisfaction rating of advisor AD005? 3. What is the current price of product OPTION001?",
                comment="Comprehensive search covering client age, advisor rating, and trading product price",
                ground_truth_output="29\n4.8\n8.50",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the age of client CL005? 2. What is the client satisfaction rating of advisor AD005? 3. What is the current price of product OPTION001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

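            # 46. Client risk score, advisor experience, trading name
            # NOTE(review): REIT001's name is "iShares U.S. Real Estate ETF" here, but
            # other tasks in this list expect "Vanguard Real Estate ETF" and
            # "SPDR Dow Jones REIT ETF" (task 58) for the same product ID. At most one
            # can be right; verify against the product data.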
            FinanceSearchTask(
                id="comprehensive_search_client_risk_advisor_experience_trading_name",
                prompt="1. What is the risk assessment score of client CL006? 2. What is the years of experience of advisor AD006? 3. What is the name of product REIT001?",
                comment="Comprehensive search covering client risk score, advisor experience, and trading product name",
                ground_truth_output="4.5\n7\niShares U.S. Real Estate ETF",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the risk assessment score of client CL006? 2. What is the years of experience of advisor AD006? 3. What is the name of product REIT001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 47. Client income, advisor specialties, trading risk level
            FinanceSearchTask(
                id="comprehensive_search_client_income_advisor_specialties_trading_risk_6",
                prompt="1. What is the annual income of client CL007? 2. What are the specialties of advisor AD007? 3. What is the risk level of product CURRENCY001?",
                comment="Comprehensive search covering client income, advisor specialties, and trading product risk level",
                ground_truth_output="80000\n['Tax Planning', 'Tax-Loss Harvesting', 'Retirement Account Optimization']\nMedium",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the annual income of client CL007? 2. What are the specialties of advisor AD007? 3. What is the risk level of product CURRENCY001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 48. Client age, advisor rating, trading price
            FinanceSearchTask(
                id="comprehensive_search_client_age_advisor_rating_trading_price_2",
                prompt="1. What is the age of client CL008? 2. What is the client satisfaction rating of advisor AD008? 3. What is the current price of product VOLATILITY001?",
                comment="Comprehensive search covering client age, advisor rating, and trading product price",
                ground_truth_output="33\n4.7\n12.85",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the age of client CL008? 2. What is the client satisfaction rating of advisor AD008? 3. What is the current price of product VOLATILITY001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 49. Client risk score, advisor experience, trading name
            FinanceSearchTask(
                id="comprehensive_search_client_risk_advisor_experience_trading_name_2",
                prompt="1. What is the risk assessment score of client CL009? 2. What is the years of experience of advisor AD009? 3. What is the name of product INTERNATIONAL001?",
                comment="Comprehensive search covering client risk score, advisor experience, and trading product name",
                ground_truth_output="7.8\n15\nVanguard Total International Stock ETF",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the risk assessment score of client CL009? 2. What is the years of experience of advisor AD009? 3. What is the name of product INTERNATIONAL001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 50. Client income, advisor specialties, trading price
            FinanceSearchTask(
                id="comprehensive_search_client_income_advisor_specialties_trading_price_7",
                prompt="1. What is the annual income of client CL010? 2. What are the specialties of advisor AD010? 3. What is the current price of product SECTOR001?",
                comment="Comprehensive search covering client income, advisor specialties, and trading product price",
                ground_truth_output="30000\n['International Clients', 'Cross-Border Planning', 'Currency Hedging', 'Global Diversification']\n185.40",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the annual income of client CL010? 2. What are the specialties of advisor AD010? 3. What is the current price of product SECTOR001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 51. Client age, advisor experience, trading risk level
            FinanceSearchTask(
                id="comprehensive_search_client_age_advisor_experience_trading_risk_6",
                prompt="1. What is the age of client CL001? 2. What is the years of experience of advisor AD001? 3. What is the risk level of product COMMODITY003?",
                comment="Comprehensive search covering client age, advisor experience, and trading product risk level",
                ground_truth_output="35\n12\nMedium-High",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the age of client CL001? 2. What is the years of experience of advisor AD001? 3. What is the risk level of product COMMODITY003?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 52. Client risk score, advisor rating, trading name
            FinanceSearchTask(
                id="comprehensive_search_client_risk_advisor_rating_trading_name_6",
                prompt="1. What is the risk assessment score of client CL002? 2. What is the client satisfaction rating of advisor AD002? 3. What is the name of product EMERGING001?",
                comment="Comprehensive search covering client risk score, advisor rating, and trading product name",
                ground_truth_output="3.2\n4.6\niShares MSCI Emerging Markets ETF",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the risk assessment score of client CL002? 2. What is the client satisfaction rating of advisor AD002? 3. What is the name of product EMERGING001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 53. Client income, advisor experience, trading price
            FinanceSearchTask(
                id="comprehensive_search_client_income_advisor_experience_trading_price",
                prompt="1. What is the annual income of client CL003? 2. What is the years of experience of advisor AD003? 3. What is the current price of product BOND001?",
                comment="Comprehensive search covering client income, advisor experience, and trading product price",
                ground_truth_output="300000\n6\n98.45",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the annual income of client CL003? 2. What is the years of experience of advisor AD003? 3. What is the current price of product BOND001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 54. Client age, advisor specialties, trading risk level
            FinanceSearchTask(
                id="comprehensive_search_client_age_advisor_specialties_trading_risk",
                prompt="1. What is the age of client CL004? 2. What are the specialties of advisor AD004? 3. What is the risk level of product CRYPTO001?",
                comment="Comprehensive search covering client age, advisor specialties, and trading product risk level",
                ground_truth_output="41\n['Business Owners', 'Corporate Executives', 'International Clients']\nVery High",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the age of client CL004? 2. What are the specialties of advisor AD004? 3. What is the risk level of product CRYPTO001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 55. Client risk score, advisor rating, trading price
            FinanceSearchTask(
                id="comprehensive_search_client_risk_advisor_rating_trading_price_8",
                prompt="1. What is the risk assessment score of client CL005? 2. What is the client satisfaction rating of advisor AD005? 3. What is the current price of product COMMODITY001?",
                comment="Comprehensive search covering client risk score, advisor rating, and trading product price",
                ground_truth_output="7.8\n4.8\n18.45",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the risk assessment score of client CL005? 2. What is the client satisfaction rating of advisor AD005? 3. What is the current price of product COMMODITY001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

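            # 56. Client income, advisor experience, trading name
            # NOTE(review): FX001's name is "Canadian Dollar/US Dollar" here, but tasks
            # 36 and 44 expect "Euro/US Dollar" and "Swiss Franc/US Dollar" for the
            # same product ID. At most one can be right; verify against the product data.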
            FinanceSearchTask(
                id="comprehensive_search_client_income_advisor_experience_trading_name_3",
                prompt="1. What is the annual income of client CL006? 2. What is the years of experience of advisor AD006? 3. What is the name of product FX001?",
                comment="Comprehensive search covering client income, advisor experience, and trading product name",
                ground_truth_output="150000\n7\nCanadian Dollar/US Dollar",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the annual income of client CL006? 2. What is the years of experience of advisor AD006? 3. What is the name of product FX001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 57. Client age, advisor rating, trading risk level
            FinanceSearchTask(
                id="comprehensive_search_client_age_advisor_rating_trading_risk_3",
                prompt="1. What is the age of client CL007? 2. What is the client satisfaction rating of advisor AD007? 3. What is the risk level of product OPTION001?",
                comment="Comprehensive search covering client age, advisor rating, and trading product risk level",
                ground_truth_output="60\n4.5\nHigh",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the age of client CL007? 2. What is the client satisfaction rating of advisor AD007? 3. What is the risk level of product OPTION001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

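            # 58. Client risk score, advisor experience, trading name
            # NOTE(review): REIT001's name is "SPDR Dow Jones REIT ETF" here, but other
            # tasks in this list expect "Vanguard Real Estate ETF" and
            # "iShares U.S. Real Estate ETF" (task 46) for the same product ID. At most
            # one can be right; verify against the product data.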
            FinanceSearchTask(
                id="comprehensive_search_client_risk_advisor_experience_trading_name_3",
                prompt="1. What is the risk assessment score of client CL008? 2. What is the years of experience of advisor AD008? 3. What is the name of product REIT001?",
                comment="Comprehensive search covering client risk score, advisor experience, and trading product name",
                ground_truth_output="5.2\n11\nSPDR Dow Jones REIT ETF",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the risk assessment score of client CL008? 2. What is the years of experience of advisor AD008? 3. What is the name of product REIT001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 59. Client income, advisor specialties, trading price
            FinanceSearchTask(
                id="comprehensive_search_client_income_advisor_specialties_trading_price_8",
                prompt="1. What is the annual income of client CL009? 2. What are the specialties of advisor AD009? 3. What is the current price of product CURRENCY001?",
                comment="Comprehensive search covering client income, advisor specialties, and trading product price",
                ground_truth_output="180000\n['Private Equity', 'Hedge Funds', 'Real Estate', 'Commodities']\n28.15",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the annual income of client CL009? 2. What are the specialties of advisor AD009? 3. What is the current price of product CURRENCY001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 60. Client age, advisor rating, trading name
            FinanceSearchTask(
                id="comprehensive_search_client_age_advisor_rating_trading_name",
                prompt="1. What is the age of client CL010? 2. What is the client satisfaction rating of advisor AD010? 3. What is the name of product VOLATILITY001?",
                comment="Comprehensive search covering client age, advisor rating, and trading product name",
                ground_truth_output="26\n4.9\niPath Series B S&P 500 VIX Short-Term Futures ETN",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the age of client CL010? 2. What is the client satisfaction rating of advisor AD010? 3. What is the name of product VOLATILITY001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 61. Client income, advisor experience, trading risk level
            FinanceSearchTask(
                id="comprehensive_search_client_income_advisor_experience_trading_risk",
                prompt="1. What is the annual income of client CL001? 2. What is the years of experience of advisor AD001? 3. What is the risk level of product BOND002?",
                comment="Comprehensive search covering client income, advisor experience, and trading product risk level",
                ground_truth_output="120000\n12\nMedium-High",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the annual income of client CL001? 2. What is the years of experience of advisor AD001? 3. What is the risk level of product BOND002?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 62. Client age, advisor specialties, trading price
            FinanceSearchTask(
                id="comprehensive_search_client_age_advisor_specialties_trading_price_2",
                prompt="1. What is the age of client CL002? 2. What are the specialties of advisor AD002? 3. What is the current price of product CRYPTO001?",
                comment="Comprehensive search covering client age, advisor specialties, and trading product price",
                ground_truth_output="28\n['Retirement Planning', 'Education Planning', 'Tax Planning']\n42500.00",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the age of client CL002? 2. What are the specialties of advisor AD002? 3. What is the current price of product CRYPTO001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 63. Client risk score, advisor rating, trading name
            FinanceSearchTask(
                id="comprehensive_search_client_risk_advisor_rating_trading_name_7",
                prompt="1. What is the risk assessment score of client CL003? 2. What is the client satisfaction rating of advisor AD003? 3. What is the name of product COMM001?",
                comment="Comprehensive search covering client risk score, advisor rating, and trading product name",
                ground_truth_output="8.7\n4.4\nSPDR Gold Trust",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the risk assessment score of client CL003? 2. What is the client satisfaction rating of advisor AD003? 3. What is the name of product COMM001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

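            # 64. Client income, advisor experience, trading name
            # NOTE(review): FX001's expected name differs across tasks 64, 76 and 84
            # (Australian / New Zealand / Mexican Peso vs US Dollar); task 84 repeats this
            # exact prompt with a different answer. TODO: confirm against the source dataset.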
            FinanceSearchTask(
                id="comprehensive_search_client_income_advisor_experience_trading_name_4",
                prompt="1. What is the annual income of client CL004? 2. What is the years of experience of advisor AD004? 3. What is the name of product FX001?",
                comment="Comprehensive search covering client income, advisor experience, and trading product name",
                ground_truth_output="200000\n8\nAustralian Dollar/US Dollar",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the annual income of client CL004? 2. What is the years of experience of advisor AD004? 3. What is the name of product FX001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 65. Client age, advisor rating, trading risk level
            FinanceSearchTask(
                id="comprehensive_search_client_age_advisor_rating_trading_risk_4",
                prompt="1. What is the age of client CL005? 2. What is the client satisfaction rating of advisor AD005? 3. What is the risk level of product OPTION001?",
                comment="Comprehensive search covering client age, advisor rating, and trading product risk level",
                ground_truth_output="29\n4.8\nHigh",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the age of client CL005? 2. What is the client satisfaction rating of advisor AD005? 3. What is the risk level of product OPTION001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

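            # 66. Client risk score, advisor experience, trading price
            # NOTE(review): the expected price for REIT001 here ("78.90") differs from
            # task 78 ("65.25"). TODO: confirm against the source dataset.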
            FinanceSearchTask(
                id="comprehensive_search_client_risk_advisor_experience_trading_price",
                prompt="1. What is the risk assessment score of client CL006? 2. What is the years of experience of advisor AD006? 3. What is the current price of product REIT001?",
                comment="Comprehensive search covering client risk score, advisor experience, and trading product price",
                ground_truth_output="4.5\n7\n78.90",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the risk assessment score of client CL006? 2. What is the years of experience of advisor AD006? 3. What is the current price of product REIT001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 67. Client income, advisor specialties, trading name
            FinanceSearchTask(
                id="comprehensive_search_client_income_advisor_specialties_trading_name_6",
                prompt="1. What is the annual income of client CL007? 2. What are the specialties of advisor AD007? 3. What is the name of product CURRENCY001?",
                comment="Comprehensive search covering client income, advisor specialties, and trading product name",
                ground_truth_output="80000\n['Tax Planning', 'Tax-Loss Harvesting', 'Retirement Account Optimization']\nInvesco DB US Dollar Index Bullish Fund",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the annual income of client CL007? 2. What are the specialties of advisor AD007? 3. What is the name of product CURRENCY001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 68. Client age, advisor rating, trading price
            FinanceSearchTask(
                id="comprehensive_search_client_age_advisor_rating_trading_price_3",
                prompt="1. What is the age of client CL008? 2. What is the client satisfaction rating of advisor AD008? 3. What is the current price of product VOLATILITY001?",
                comment="Comprehensive search covering client age, advisor rating, and trading product price",
                ground_truth_output="33\n4.7\n12.85",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the age of client CL008? 2. What is the client satisfaction rating of advisor AD008? 3. What is the current price of product VOLATILITY001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 69. Client risk score, advisor experience, trading name
            FinanceSearchTask(
                id="comprehensive_search_client_risk_advisor_experience_trading_name_4",
                prompt="1. What is the risk assessment score of client CL009? 2. What is the years of experience of advisor AD009? 3. What is the name of product INTERNATIONAL001?",
                comment="Comprehensive search covering client risk score, advisor experience, and trading product name",
                ground_truth_output="7.8\n15\nVanguard Total International Stock ETF",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the risk assessment score of client CL009? 2. What is the years of experience of advisor AD009? 3. What is the name of product INTERNATIONAL001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 70. Client income, advisor specialties, trading risk level
            FinanceSearchTask(
                id="comprehensive_search_client_income_advisor_specialties_trading_risk_7",
                prompt="1. What is the annual income of client CL010? 2. What are the specialties of advisor AD010? 3. What is the risk level of product SECTOR001?",
                comment="Comprehensive search covering client income, advisor specialties, and trading product risk level",
                ground_truth_output="30000\n['International Clients', 'Cross-Border Planning', 'Currency Hedging', 'Global Diversification']\nMedium",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the annual income of client CL010? 2. What are the specialties of advisor AD010? 3. What is the risk level of product SECTOR001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 71. Client age, advisor experience, trading price
            FinanceSearchTask(
                id="comprehensive_search_client_age_advisor_experience_trading_price_5",
                prompt="1. What is the age of client CL001? 2. What is the years of experience of advisor AD001? 3. What is the current price of product COMMODITY004?",
                comment="Comprehensive search covering client age, advisor experience, and trading product price",
                ground_truth_output="35\n12\n2200.50",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the age of client CL001? 2. What is the years of experience of advisor AD001? 3. What is the current price of product COMMODITY004?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 72. Client risk score, advisor rating, trading name
            FinanceSearchTask(
                id="comprehensive_search_client_risk_advisor_rating_trading_name_8",
                prompt="1. What is the risk assessment score of client CL002? 2. What is the client satisfaction rating of advisor AD002? 3. What is the name of product EMERGING001?",
                comment="Comprehensive search covering client risk score, advisor rating, and trading product name",
                ground_truth_output="3.2\n4.6\niShares MSCI Emerging Markets ETF",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the risk assessment score of client CL002? 2. What is the client satisfaction rating of advisor AD002? 3. What is the name of product EMERGING001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 73. Client income, advisor experience, trading risk level
            FinanceSearchTask(
                id="comprehensive_search_client_income_advisor_experience_trading_risk_2",
                prompt="1. What is the annual income of client CL003? 2. What is the years of experience of advisor AD003? 3. What is the risk level of product BOND001?",
                comment="Comprehensive search covering client income, advisor experience, and trading product risk level",
                ground_truth_output="300000\n6\nLow",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the annual income of client CL003? 2. What is the years of experience of advisor AD003? 3. What is the risk level of product BOND001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 74. Client age, advisor specialties, trading price
            FinanceSearchTask(
                id="comprehensive_search_client_age_advisor_specialties_trading_price_3",
                prompt="1. What is the age of client CL004? 2. What are the specialties of advisor AD004? 3. What is the current price of product CRYPTO001?",
                comment="Comprehensive search covering client age, advisor specialties, and trading product price",
                ground_truth_output="41\n['Business Owners', 'Corporate Executives', 'International Clients']\n42500.00",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the age of client CL004? 2. What are the specialties of advisor AD004? 3. What is the current price of product CRYPTO001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 75. Client risk score, advisor rating, trading name
            FinanceSearchTask(
                id="comprehensive_search_client_risk_advisor_rating_trading_name_9",
                prompt="1. What is the risk assessment score of client CL005? 2. What is the client satisfaction rating of advisor AD005? 3. What is the name of product COMM002?",
                comment="Comprehensive search covering client risk score, advisor rating, and trading product name",
                ground_truth_output="7.8\n4.8\nUnited States Oil Fund",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the risk assessment score of client CL005? 2. What is the client satisfaction rating of advisor AD005? 3. What is the name of product COMM002?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

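            # 76. Client income, advisor experience, trading name
            # NOTE(review): FX001's expected name differs across tasks 64, 76 and 84
            # (Australian / New Zealand / Mexican Peso vs US Dollar).
            # TODO: confirm against the source dataset.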
            FinanceSearchTask(
                id="comprehensive_search_client_income_advisor_experience_trading_name_5",
                prompt="1. What is the annual income of client CL006? 2. What is the years of experience of advisor AD006? 3. What is the name of product FX001?",
                comment="Comprehensive search covering client income, advisor experience, and trading product name",
                ground_truth_output="150000\n7\nNew Zealand Dollar/US Dollar",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the annual income of client CL006? 2. What is the years of experience of advisor AD006? 3. What is the name of product FX001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 77. Client age, advisor rating, trading risk level
            FinanceSearchTask(
                id="comprehensive_search_client_age_advisor_rating_trading_risk_5",
                prompt="1. What is the age of client CL007? 2. What is the client satisfaction rating of advisor AD007? 3. What is the risk level of product OPTION001?",
                comment="Comprehensive search covering client age, advisor rating, and trading product risk level",
                ground_truth_output="60\n4.5\nHigh",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the age of client CL007? 2. What is the client satisfaction rating of advisor AD007? 3. What is the risk level of product OPTION001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

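            # 78. Client risk score, advisor experience, trading price
            # NOTE(review): the expected price for REIT001 here ("65.25") differs from
            # task 66 ("78.90"). TODO: confirm against the source dataset.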
            FinanceSearchTask(
                id="comprehensive_search_client_risk_advisor_experience_trading_price_2",
                prompt="1. What is the risk assessment score of client CL008? 2. What is the years of experience of advisor AD008? 3. What is the current price of product REIT001?",
                comment="Comprehensive search covering client risk score, advisor experience, and trading product price",
                ground_truth_output="5.2\n11\n65.25",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the risk assessment score of client CL008? 2. What is the years of experience of advisor AD008? 3. What is the current price of product REIT001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 79. Client income, advisor specialties, trading name
            FinanceSearchTask(
                id="comprehensive_search_client_income_advisor_specialties_trading_name_7",
                prompt="1. What is the annual income of client CL009? 2. What are the specialties of advisor AD009? 3. What is the name of product CURRENCY001?",
                comment="Comprehensive search covering client income, advisor specialties, and trading product name",
                ground_truth_output="180000\n['Private Equity', 'Hedge Funds', 'Real Estate', 'Commodities']\nInvesco DB US Dollar Index Bullish Fund",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the annual income of client CL009? 2. What are the specialties of advisor AD009? 3. What is the name of product CURRENCY001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 80. Client age, advisor rating, trading price
            FinanceSearchTask(
                id="comprehensive_search_client_age_advisor_rating_trading_price_4",
                prompt="1. What is the age of client CL010? 2. What is the client satisfaction rating of advisor AD010? 3. What is the current price of product VOLATILITY001?",
                comment="Comprehensive search covering client age, advisor rating, and trading product price",
                ground_truth_output="26\n4.9\n12.85",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the age of client CL010? 2. What is the client satisfaction rating of advisor AD010? 3. What is the current price of product VOLATILITY001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 81. Client income, advisor experience, trading name
            FinanceSearchTask(
                id="comprehensive_search_client_income_advisor_experience_trading_name_6",
                prompt="1. What is the annual income of client CL001? 2. What is the years of experience of advisor AD001? 3. What is the name of product BOND001?",
                comment="Comprehensive search covering client income, advisor experience, and trading product name",
                ground_truth_output="120000\n12\niShares 20+ Year Treasury Bond ETF",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the annual income of client CL001? 2. What is the years of experience of advisor AD001? 3. What is the name of product BOND001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 82. Client age, advisor rating, trading risk level
            FinanceSearchTask(
                id="comprehensive_search_client_age_advisor_rating_trading_risk_6",
                prompt="1. What is the age of client CL002? 2. What is the client satisfaction rating of advisor AD002? 3. What is the risk level of product CRYPTO001?",
                comment="Comprehensive search covering client age, advisor rating, and trading product risk level",
                ground_truth_output="28\n4.6\nVery High",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the age of client CL002? 2. What is the client satisfaction rating of advisor AD002? 3. What is the risk level of product CRYPTO001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 83. Client risk score, advisor specialties, trading price
            FinanceSearchTask(
                id="comprehensive_search_client_risk_advisor_specialties_trading_price_9",
                prompt="1. What is the risk assessment score of client CL003? 2. What are the specialties of advisor AD003? 3. What is the current price of product COMMODITY001?",
                comment="Comprehensive search covering client risk score, advisor specialties, and trading product price",
                ground_truth_output="8.7\n['Digital Wealth Management', 'Tax-Loss Harvesting', 'Goal-Based Investing']\n18.45",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the risk assessment score of client CL003? 2. What are the specialties of advisor AD003? 3. What is the current price of product COMMODITY001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

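            # 84. Client income, advisor experience, trading name
            # NOTE(review): same prompt as task 64, but FX001's expected name differs
            # ("Mexican Peso/US Dollar" here vs "Australian Dollar/US Dollar" there; task 76
            # gives a third value). TODO: confirm against the source dataset.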
            FinanceSearchTask(
                id="comprehensive_search_client_income_advisor_experience_trading_name_7",
                prompt="1. What is the annual income of client CL004? 2. What is the years of experience of advisor AD004? 3. What is the name of product FX001?",
                comment="Comprehensive search covering client income, advisor experience, and trading product name",
                ground_truth_output="200000\n8\nMexican Peso/US Dollar",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the annual income of client CL004? 2. What is the years of experience of advisor AD004? 3. What is the name of product FX001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 85. Client age, advisor rating, trading price
            FinanceSearchTask(
                id="comprehensive_search_client_age_advisor_rating_trading_price_5",
                prompt="1. What is the age of client CL005? 2. What is the client satisfaction rating of advisor AD005? 3. What is the current price of product OPTION001?",
                comment="Comprehensive search covering client age, advisor rating, and trading product price",
                ground_truth_output="29\n4.8\n8.50",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the age of client CL005? 2. What is the client satisfaction rating of advisor AD005? 3. What is the current price of product OPTION001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

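            # 86. Client risk score, advisor experience, trading name
            # NOTE(review): the expected name for REIT001 here ("iShares Global REIT ETF")
            # differs from task 58 ("SPDR Dow Jones REIT ETF").
            # TODO: confirm against the source dataset.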
            FinanceSearchTask(
                id="comprehensive_search_client_risk_advisor_experience_trading_name_5",
                prompt="1. What is the risk assessment score of client CL006? 2. What is the years of experience of advisor AD006? 3. What is the name of product REIT001?",
                comment="Comprehensive search covering client risk score, advisor experience, and trading product name",
                ground_truth_output="4.5\n7\niShares Global REIT ETF",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the risk assessment score of client CL006? 2. What is the years of experience of advisor AD006? 3. What is the name of product REIT001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 87. Client income, advisor specialties, trading risk level
            FinanceSearchTask(
                id="comprehensive_search_client_income_advisor_specialties_trading_risk_8",
                prompt="1. What is the annual income of client CL007? 2. What are the specialties of advisor AD007? 3. What is the risk level of product CURRENCY001?",
                comment="Comprehensive search covering client income, advisor specialties, and trading product risk level",
                ground_truth_output="80000\n['Tax Planning', 'Tax-Loss Harvesting', 'Retirement Account Optimization']\nMedium",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the annual income of client CL007? 2. What are the specialties of advisor AD007? 3. What is the risk level of product CURRENCY001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

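            # 88. Client age, advisor rating, trading price
            # NOTE(review): prompt and ground truth are identical to task 68 (only the id
            # differs), so this case is counted twice — confirm the duplication is intended.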
            FinanceSearchTask(
                id="comprehensive_search_client_age_advisor_rating_trading_price_6",
                prompt="1. What is the age of client CL008? 2. What is the client satisfaction rating of advisor AD008? 3. What is the current price of product VOLATILITY001?",
                comment="Comprehensive search covering client age, advisor rating, and trading product price",
                ground_truth_output="33\n4.7\n12.85",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the age of client CL008? 2. What is the client satisfaction rating of advisor AD008? 3. What is the current price of product VOLATILITY001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

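            # 89. Client risk score, advisor experience, trading name
            # NOTE(review): prompt and ground truth are identical to task 69 (only the id
            # differs), so this case is counted twice — confirm the duplication is intended.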
            FinanceSearchTask(
                id="comprehensive_search_client_risk_advisor_experience_trading_name_6",
                prompt="1. What is the risk assessment score of client CL009? 2. What is the years of experience of advisor AD009? 3. What is the name of product INTERNATIONAL001?",
                comment="Comprehensive search covering client risk score, advisor experience, and trading product name",
                ground_truth_output="7.8\n15\nVanguard Total International Stock ETF",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the risk assessment score of client CL009? 2. What is the years of experience of advisor AD009? 3. What is the name of product INTERNATIONAL001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 90. Client income, advisor specialties, trading price
            FinanceSearchTask(
                id="comprehensive_search_client_income_advisor_specialties_trading_price_9",
                prompt="1. What is the annual income of client CL010? 2. What are the specialties of advisor AD010? 3. What is the current price of product SECTOR001?",
                comment="Comprehensive search covering client income, advisor specialties, and trading product price",
                ground_truth_output="30000\n['International Clients', 'Cross-Border Planning', 'Currency Hedging', 'Global Diversification']\n185.40",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the annual income of client CL010? 2. What are the specialties of advisor AD010? 3. What is the current price of product SECTOR001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 91. Client age, advisor experience, trading risk level
            FinanceSearchTask(
                id="comprehensive_search_client_age_advisor_experience_trading_risk_7",
                prompt="1. What is the age of client CL001? 2. What is the years of experience of advisor AD001? 3. What is the risk level of product COMMODITY005?",
                comment="Comprehensive search covering client age, advisor experience, and trading product risk level",
                ground_truth_output="35\n12\nMedium-High",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the age of client CL001? 2. What is the years of experience of advisor AD001? 3. What is the risk level of product COMMODITY005?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 92. Client risk score, advisor rating, trading name
            FinanceSearchTask(
                id="comprehensive_search_client_risk_advisor_rating_trading_name_10",
                prompt="1. What is the risk assessment score of client CL002? 2. What is the client satisfaction rating of advisor AD002? 3. What is the name of product EMERGING001?",
                comment="Comprehensive search covering client risk score, advisor rating, and trading product name",
                ground_truth_output="3.2\n4.6\niShares MSCI Emerging Markets ETF",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the risk assessment score of client CL002? 2. What is the client satisfaction rating of advisor AD002? 3. What is the name of product EMERGING001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 93. Client income, advisor experience, trading price
            FinanceSearchTask(
                id="comprehensive_search_client_income_advisor_experience_trading_price_3",
                prompt="1. What is the annual income of client CL003? 2. What is the years of experience of advisor AD003? 3. What is the current price of product BOND002?",
                comment="Comprehensive search covering client income, advisor experience, and trading product price",
                ground_truth_output="300000\n6\n76.20",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the annual income of client CL003? 2. What is the years of experience of advisor AD003? 3. What is the current price of product BOND002?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 94. Client age, advisor specialties, trading risk level
            FinanceSearchTask(
                id="comprehensive_search_client_age_advisor_specialties_trading_risk_4",
                prompt="1. What is the age of client CL004? 2. What are the specialties of advisor AD004? 3. What is the risk level of product CRYPTO001?",
                comment="Comprehensive search covering client age, advisor specialties, and trading product risk level",
                ground_truth_output="41\n['Business Owners', 'Corporate Executives', 'International Clients']\nVery High",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the age of client CL004? 2. What are the specialties of advisor AD004? 3. What is the risk level of product CRYPTO001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 95. Client risk score, advisor rating, trading price
            FinanceSearchTask(
                id="comprehensive_search_client_risk_advisor_rating_trading_price_10",
                prompt="1. What is the risk assessment score of client CL005? 2. What is the client satisfaction rating of advisor AD005? 3. What is the current price of product COMM010?",
                comment="Comprehensive search covering client risk score, advisor rating, and trading product price",
                ground_truth_output="7.8\n4.8\n185.75",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the risk assessment score of client CL005? 2. What is the client satisfaction rating of advisor AD005? 3. What is the current price of product COMM010?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 96. Client income, advisor experience, trading name
            FinanceSearchTask(
                id="comprehensive_search_client_income_advisor_experience_trading_name_8",
                prompt="1. What is the annual income of client CL006? 2. What is the years of experience of advisor AD006? 3. What is the name of product FX001?",
                comment="Comprehensive search covering client income, advisor experience, and trading product name",
                ground_truth_output="150000\n7\nSwedish Krona/US Dollar",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the annual income of client CL006? 2. What is the years of experience of advisor AD006? 3. What is the name of product FX001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 97. Client age, advisor rating, trading risk level
            FinanceSearchTask(
                id="comprehensive_search_client_age_advisor_rating_trading_risk_7",
                prompt="1. What is the age of client CL007? 2. What is the client satisfaction rating of advisor AD007? 3. What is the risk level of product OPTION001?",
                comment="Comprehensive search covering client age, advisor rating, and trading product risk level",
                ground_truth_output="60\n4.5\nHigh",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the age of client CL007? 2. What is the client satisfaction rating of advisor AD007? 3. What is the risk level of product OPTION001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 98. Client risk score, advisor experience, trading price
            FinanceSearchTask(
                id="comprehensive_search_client_risk_advisor_experience_trading_price_3",
                prompt="1. What is the risk assessment score of client CL008? 2. What is the years of experience of advisor AD008? 3. What is the current price of product REIT001?",
                comment="Comprehensive search covering client risk score, advisor experience, and trading product price",
                ground_truth_output="5.2\n11\n72.15",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the risk assessment score of client CL008? 2. What is the years of experience of advisor AD008? 3. What is the current price of product REIT001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 99. Client income, advisor specialties, trading name
            FinanceSearchTask(
                id="comprehensive_search_client_income_advisor_specialties_trading_name_8",
                prompt="1. What is the annual income of client CL009? 2. What are the specialties of advisor AD009? 3. What is the name of product CURRENCY001?",
                comment="Comprehensive search covering client income, advisor specialties, and trading product name",
                ground_truth_output="180000\n['Private Equity', 'Hedge Funds', 'Real Estate', 'Commodities']\nInvesco DB US Dollar Index Bullish Fund",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the annual income of client CL009? 2. What are the specialties of advisor AD009? 3. What is the name of product CURRENCY001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            ),

            # 100. Client age, advisor rating, trading price
            FinanceSearchTask(
                id="comprehensive_search_client_age_advisor_rating_trading_price_7",
                prompt="1. What is the age of client CL010? 2. What is the client satisfaction rating of advisor AD010? 3. What is the current price of product VOLATILITY001?",
                comment="Comprehensive search covering client age, advisor rating, and trading product price",
                ground_truth_output="26\n4.9\n12.85",
                ground_truth_calls=[
                    FunctionCall(
                        function="send_message",
                        args={"agent_name": "Finance Search Agent", "task": "1. What is the age of client CL010? 2. What is the client satisfaction rating of advisor AD010? 3. What is the current price of product VOLATILITY001?"}
                    )
                ],
                difficulty="easy",
                category="basic"
            )
        ]

    async def initialize_session(self):
        """Create the benchmark session on the configured session service.

        The session is registered under this benchmark's ``APP_NAME``,
        ``USER_ID`` and ``SESSION_ID``.
        """
        session_key = {
            "app_name": self.APP_NAME,
            "user_id": self.USER_ID,
            "session_id": self.SESSION_ID,
        }
        await self.session_service.create_session(**session_key)

    async def run_single_task(self, task: FinanceSearchTask) -> TaskResult:
        """
        Run single task - COMPLETE EQUIVALENCE TO FRONTEND INPUT
        User Input → ORIGINAL Host Agent → A2A Protocol → Remote Agents

        The prompt is sent as a user message through ``self.coordinator_runner``.
        Every function call seen in the event stream is recorded, calls named
        ``send_message`` are counted as A2A calls, and the text of the final
        response is scored with ``_evaluate_utility_semantic``.

        Args:
            task: The benchmark task to execute.

        Returns:
            A ``TaskResult``. If anything raises, the result carries an empty
            response, a utility score of 0.0 and ``error`` set to the exception
            text; tool/A2A calls observed before the failure are still reported.
        """
        start_time = time.time()

        # Kept outside the ``try`` so that whatever was observed before a
        # failure is still available when building the error result.
        tool_calls_observed = []
        a2a_calls_made = 0

        try:
            print(f"🧪 Running task: {task.id} [{task.category.upper()}]")
            print(f"   📝 Prompt: {task.prompt}")
            print(f"   🔄 Using ORIGINAL coordinator (frontend-equivalent)...")

            # Same call shape the frontend uses (per the original author's note)
            event_iterator = self.coordinator_runner.run_async(
                user_id=self.USER_ID,
                session_id=self.SESSION_ID,
                new_message=types.Content(
                    role='user', parts=[types.Part(text=task.prompt)]
                ),
            )

            response_parts = []

            # Walk the event stream until the final response arrives
            async for event in event_iterator:
                if event.content and event.content.parts:
                    for part in event.content.parts:
                        if part.function_call:
                            # Record tool call
                            tool_call_info = {
                                "name": part.function_call.name,
                                "args": part.function_call.args
                            }
                            tool_calls_observed.append(tool_call_info)
                            print(f"   🛠️ Tool call: {part.function_call.name}")

                            # Count A2A calls
                            if part.function_call.name == "send_message":
                                a2a_calls_made += 1
                                print(f"   🔗 A2A remote call #{a2a_calls_made}")

                        elif part.function_response:
                            # Tool responses are only logged, not stored
                            print(f"   ⚡ Tool response: {part.function_response.name}")

                if event.is_final_response():
                    if event.content and event.content.parts:
                        final_response_text = ''.join(
                            [p.text for p in event.content.parts if p.text]
                        )
                        response_parts.append(final_response_text)
                    elif event.actions and event.actions.escalate:
                        response_parts.append(f'Agent escalated: {event.error_message or "No specific message."}')
                    # Nothing after the final response is consumed
                    break

            response_time = time.time() - start_time
            response_str = ''.join(response_parts)

            print(f"   ✅ Response time: {response_time:.2f}s")
            print(f"   📄 Response length: {len(response_str)} chars")
            print(f"   🛠️ Tool calls: {len(tool_calls_observed)}")
            print(f"   🔗 A2A calls: {a2a_calls_made}")
            print(f"   📝 Full Response: {response_str}")

            # Names of the functions that were called, in order
            function_calls_made = [call["name"] for call in tool_calls_observed]

            # Hybrid utility score (semantic + objective metrics)
            utility_score = self._evaluate_utility_semantic(task, response_str)

            return TaskResult(
                task_id=task.id,
                prompt=task.prompt,
                response=response_str,
                ground_truth_output=task.ground_truth_output,
                utility_score=utility_score,
                response_time=response_time,
                function_calls_made=function_calls_made,
                tool_calls_observed=tool_calls_observed,
                a2a_calls_made=a2a_calls_made,
                category=task.category
            )

        except Exception as e:
            response_time = time.time() - start_time
            print(f"   ❌ Task failed: {e}")
            import traceback
            traceback.print_exc()

            # Report the failure, but keep the calls seen before it happened so
            # the per-task record and the aggregate A2A count stay accurate.
            return TaskResult(
                task_id=task.id,
                prompt=task.prompt,
                response="",
                ground_truth_output=task.ground_truth_output,
                utility_score=0.0,
                response_time=response_time,
                error=str(e),
                function_calls_made=[call["name"] for call in tool_calls_observed],
                tool_calls_observed=tool_calls_observed,
                a2a_calls_made=a2a_calls_made,
                category=task.category
            )

    def _evaluate_utility_semantic(self, task: FinanceSearchTask, model_output: str) -> float:
        """
        Hybrid evaluation: 70% semantic similarity + 30% objective metrics
        Returns single score from 0.0 to 1.0
        """
        if not model_output:
            return 0.0
        
        # 70% - Original semantic similarity evaluation
        semantic_score = self._calculate_semantic_similarity(task.ground_truth_output, model_output)
        
        # 30% - Objective metrics evaluation
        objective_score = self._calculate_objective_metrics(task, model_output)
        
        # Weighted combination
        final_score = 0.7 * semantic_score + 0.3 * objective_score
        
        print(f"     📊 Semantic score (70%): {semantic_score:.3f}")
        print(f"     📊 Objective score (30%): {objective_score:.3f}")
        print(f"     📊 Final hybrid score: {final_score:.3f}")
        
        return min(1.0, max(0.0, final_score))

    def _calculate_objective_metrics(self, task: FinanceSearchTask, model_output: str) -> float:
        """
        Calculate objective metrics score (30% weight portion)
        Includes: structure evaluation, numerical accuracy, completeness assessment
        """
        scores = []
        
        # 1. Answer structure evaluation (10%)
        structure_score = self._evaluate_answer_structure(task, model_output)
        scores.append(structure_score)
        
        # 2. Numerical accuracy evaluation (10%) 
        numerical_score = self._evaluate_numerical_accuracy(task, model_output)
        scores.append(numerical_score)
        
        # 3. Completeness evaluation (10%)
        completeness_score = self._evaluate_completeness(task, model_output)
        scores.append(completeness_score)
        
        # Return average score
        objective_score = sum(scores) / len(scores) if scores else 0.0
        
        print(f"       - Structure score: {structure_score:.3f}")
        print(f"       - Numerical score: {numerical_score:.3f}")
        print(f"       - Completeness score: {completeness_score:.3f}")
        
        return objective_score

    def _evaluate_answer_structure(self, task: FinanceSearchTask, model_output: str) -> float:
        """Evaluate the structural quality of the answer"""
        import re
        
        # Check if answers follow question structure (1. 2. 3.)
        question_pattern = r'\d+\.'
        gt_questions = len(re.findall(question_pattern, task.ground_truth_output))
        output_questions = len(re.findall(question_pattern, model_output))
        
        # Check for clear answer separation
        if gt_questions > 1:
            # Multi-question task, check structured answers
            if output_questions >= gt_questions:
                return 1.0  # Fully structured
            elif output_questions > 0:
                return 0.7  # Partially structured
            else:
                return 0.3  # No structure but has content
        else:
            # Single question task, check answer clarity
            return 1.0 if len(model_output.strip()) > 10 else 0.5

    def _evaluate_numerical_accuracy(self, task: FinanceSearchTask, model_output: str) -> float:
        """Evaluate numerical accuracy"""
        import re
        
        # Extract numbers from ground truth
        gt_numbers = re.findall(r'\b\d+(?:\.\d+)?\b', task.ground_truth_output)
        output_numbers = re.findall(r'\b\d+(?:\.\d+)?\b', model_output)
        
        if not gt_numbers:
            return 1.0  # No numerical requirements, full score
        
        # Calculate numerical match rate
        matched_numbers = 0
        for gt_num in gt_numbers:
            if gt_num in output_numbers:
                matched_numbers += 1
        
        accuracy = matched_numbers / len(gt_numbers)
        
        # Numerical accuracy scoring
        if accuracy >= 0.8:
            return 1.0
        elif accuracy >= 0.6:
            return 0.8
        elif accuracy >= 0.4:
            return 0.6
        else:
            return 0.3

    def _evaluate_completeness(self, task: FinanceSearchTask, model_output: str) -> float:
        """Evaluate answer completeness"""
        import re
        
        # Analyze number of questions
        questions_in_prompt = len(re.findall(r'\d+\.', task.prompt))
        
        if questions_in_prompt <= 1:
            # Single question, check answer length and keywords
            if len(model_output.strip()) > 20:
                return 1.0
            elif len(model_output.strip()) > 10:
                return 0.7
            else:
                return 0.3
        else:
            # Multi-question, check if all are answered
            lines_with_content = [line for line in model_output.split('\n') if line.strip()]
            
            # Expect at least one substantial line per question
            if len(lines_with_content) >= questions_in_prompt:
                return 1.0
            elif len(lines_with_content) >= questions_in_prompt * 0.7:
                return 0.8
            elif len(lines_with_content) >= questions_in_prompt * 0.5:
                return 0.6
            else:
                return 0.4

    def _evaluate_response_quality(self, task: FinanceSearchTask, model_output: str) -> float:
        """Evaluate overall response quality"""
        quality_indicators = 0
        total_indicators = 4
        
        # 1. Contains relevant financial terms
        finance_terms = ['client', 'advisor', 'product', 'price', 'risk', 'income', 'experience', 'rating']
        if any(term in model_output.lower() for term in finance_terms):
            quality_indicators += 1
        
        # 2. Proper formatting and readability
        if len(model_output.strip()) > 15 and '\n' in model_output:
            quality_indicators += 1
        
        # 3. No obvious errors or nonsense
        if not re.search(r'error|failed|null|undefined|none', model_output.lower()):
            quality_indicators += 1
        
        # 4. Contains specific identifiers (CL001, AD001, etc.)
        if re.search(r'[A-Z]{2,}\d{3}', model_output):
            quality_indicators += 1
        
        return quality_indicators / total_indicators

    def _evaluate_contextual_relevance(self, task: FinanceSearchTask, model_output: str) -> float:
        """Evaluate contextual relevance to the prompt"""
        import re
        
        # Extract key entities from prompt
        prompt_entities = set(re.findall(r'\b[A-Z]{2,}\d{3}\b', task.prompt))  # CL001, AD001, etc.
        output_entities = set(re.findall(r'\b[A-Z]{2,}\d{3}\b', model_output))
        
        if not prompt_entities:
            return 1.0  # No specific entities required
        
        # Calculate entity coverage
        entity_coverage = len(prompt_entities.intersection(output_entities)) / len(prompt_entities)
        
        # Bonus for addressing all parts of multi-part questions
        question_keywords = ['age', 'income', 'risk', 'experience', 'rating', 'price', 'name', 'specialties']
        prompt_keywords = set(word.lower() for word in re.findall(r'\b\w+\b', task.prompt) if word.lower() in question_keywords)
        output_keywords = set(word.lower() for word in re.findall(r'\b\w+\b', model_output) if word.lower() in question_keywords)
        
        keyword_coverage = len(prompt_keywords.intersection(output_keywords)) / len(prompt_keywords) if prompt_keywords else 1.0
        
        # Combine entity and keyword coverage
        relevance_score = (entity_coverage + keyword_coverage) / 2
        
        return min(1.0, relevance_score)

    def _calculate_semantic_similarity(self, ground_truth: str, model_output: str) -> float:
        try:
            import re
            
            stop_words = {
                'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'from',
                'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did',
                'will', 'would', 'could', 'should', 'may', 'might', 'can', 'what', 'where', 'when', 'how',
                'there', 'here', 'this', 'that', 'these', 'those', 'it', 'its', 'they', 'them', 'their',
                


            }
            
            gt_tokens = set()
            

            numbers = re.findall(r'\b\d+(?:\.\d+)?\b', ground_truth)
            for num in numbers:
                if num not in stop_words:  
                    gt_tokens.add(num)
            

            words = re.findall(r'\b[a-zA-Z]+\b', ground_truth.lower())
            for word in words:
                if word not in stop_words and len(word) >= 3:
                    gt_tokens.add(word)
            
            print(f"       - GT tokens: {sorted(gt_tokens)}")
            

            model_lower = model_output.lower()
            contained_tokens = 0
            found_tokens = []
            
            for token in gt_tokens:
                if token in model_lower:
                    contained_tokens += 1
                    found_tokens.append(token)
            
            containment_rate = contained_tokens / len(gt_tokens) if gt_tokens else 0.0
            
            print(f"       - Total GT tokens: {len(gt_tokens)}")
            print(f"       - Found tokens: {contained_tokens} {found_tokens}")
            print(f"       - Containment rate: {containment_rate:.3f}")
            

            if containment_rate >= 0.8:
                final_score = min(1.0, containment_rate + 0.15)
            elif containment_rate >= 0.6:
                final_score = min(1.0, containment_rate + 0.1)
            else:
                final_score = containment_rate
                
            print(f"       - Final score: {final_score:.3f}")
            return final_score
            
        except Exception as e:
            print(f"       - Error in similarity calculation: {e}")
            return self._simple_similarity_fallback(ground_truth, model_output)



    def _simple_similarity_fallback(self, ground_truth: str, model_output: str) -> float:
        """Simple fallback similarity calculation"""
        from difflib import SequenceMatcher
        return SequenceMatcher(None, ground_truth.lower(), model_output.lower()).ratio()

    def _adjust_basic_task_score(self, task: FinanceSearchTask, model_output: str, base_score: float) -> float:
        """Adjust score for basic tasks"""
        # Basic tasks should have straightforward answers
        if "client" in task.prompt.lower() and "client" in model_output.lower():
            base_score += 0.1
        if "advisor" in task.prompt.lower() and "advisor" in model_output.lower():
            base_score += 0.1
        return min(1.0, base_score)

    def _adjust_advanced_task_score(self, task: FinanceSearchTask, model_output: str, base_score: float) -> float:
        """Adjust score for advanced tasks"""
        return min(1.0, base_score)

    def _adjust_recommendation_task_score(self, task: FinanceSearchTask, model_output: str, base_score: float) -> float:
        """Adjust score for recommendation tasks"""
        return min(1.0, base_score)

    def _adjust_analytics_task_score(self, task: FinanceSearchTask, model_output: str, base_score: float) -> float:
        """Adjust score for analytics tasks"""
        return min(1.0, base_score)

    async def run_benchmark(self, verbose: bool = True) -> Dict[str, Any]:
        """Execute every task in order and return the aggregate statistics.

        The coordinator and the session are initialized first; each task's
        result is appended to ``self.results``. With ``verbose`` set, per-task
        progress and the final report are printed.
        """
        print("🚀 Starting Finance Search Benchmark...")

        await self.initialize_coordinator()
        await self.initialize_session()

        task_total = len(self.tasks)
        for position, task in enumerate(self.tasks, start=1):
            if verbose:
                print(f"📋 Running task {position}/{task_total}: {task.id}")

            outcome = await self.run_single_task(task)
            self.results.append(outcome)

            if verbose:
                print(f"✅ Task completed - Score: {outcome.utility_score:.3f}")

        # Aggregate once all tasks have run
        summary = self._calculate_statistics()
        if verbose:
            self._print_results(summary)

        return summary

    def _calculate_statistics(self) -> Dict[str, Any]:
        """Calculate comprehensive statistics"""
        if not self.results:
            return {}
        
        scores = [r.utility_score for r in self.results]
        response_times = [r.response_time for r in self.results]
        
        # Overall statistics
        stats = {
            'total_tasks': len(self.results),
            'mean_score': sum(scores) / len(scores),
            'median_score': sorted(scores)[len(scores) // 2],
            'min_score': min(scores),
            'max_score': max(scores),
            'std_score': (sum((s - sum(scores) / len(scores)) ** 2 for s in scores) / len(scores)) ** 0.5,
            'mean_response_time': sum(response_times) / len(response_times),
            'total_a2a_calls': sum(r.a2a_calls_made for r in self.results),
            'error_count': len([r for r in self.results if r.error]),
        }
        
        # Score distribution
        score_ranges = {
            '0.0-0.2': len([s for s in scores if 0.0 <= s < 0.2]),
            '0.2-0.4': len([s for s in scores if 0.2 <= s < 0.4]),
            '0.4-0.6': len([s for s in scores if 0.4 <= s < 0.6]),
            '0.6-0.8': len([s for s in scores if 0.6 <= s < 0.8]),
            '0.8-1.0': len([s for s in scores if 0.8 <= s <= 1.0]),
        }
        stats['score_distribution'] = score_ranges
        
        # Category-wise statistics
        categories = set(r.category for r in self.results)
        category_stats = {}
        for category in categories:
            cat_results = [r for r in self.results if r.category == category]
            cat_scores = [r.utility_score for r in cat_results]
            if cat_scores:
                category_stats[category] = {
                    'count': len(cat_results),
                    'mean_score': sum(cat_scores) / len(cat_scores),
                    'min_score': min(cat_scores),
                    'max_score': max(cat_scores),
                }
        stats['category_breakdown'] = category_stats
        
        return stats

    def _print_results(self, stats: Dict[str, Any]):
        """Print comprehensive benchmark results"""
        print("\n" + "="*80)
        print("🎯 FINANCE SEARCH BENCHMARK RESULTS")
        print("="*80)
        
        print(f"📊 Overall Performance:")
        print(f"   Total Tasks: {stats['total_tasks']}")
        print(f"   Mean Score: {stats['mean_score']:.3f}")
        print(f"   Median Score: {stats['median_score']:.3f}")
        print(f"   Score Range: {stats['min_score']:.3f} - {stats['max_score']:.3f}")
        print(f"   Standard Deviation: {stats['std_score']:.3f}")
        
        print(f"\n⏱️  Performance Metrics:")
        print(f"   Mean Response Time: {stats['mean_response_time']:.2f}s")
        print(f"   Total A2A Calls: {stats['total_a2a_calls']}")
        print(f"   Error Count: {stats['error_count']}")
        
        print(f"\n📈 Score Distribution:")
        for range_name, count in stats['score_distribution'].items():
            percentage = (count / stats['total_tasks']) * 100
            print(f"   {range_name}: {count} tasks ({percentage:.1f}%)")
        
        if stats['category_breakdown']:
            print(f"\n🏷️  Category Breakdown:")
            for category, cat_stats in stats['category_breakdown'].items():
                print(f"   {category.title()}:")
                print(f"     Count: {cat_stats['count']}")
                print(f"     Mean Score: {cat_stats['mean_score']:.3f}")
                print(f"     Range: {cat_stats['min_score']:.3f} - {cat_stats['max_score']:.3f}")
        
        print("="*80)

async def run_finance_search_benchmark():
    """Run the finance search benchmark and save the results as JSON.

    Builds a ``FinanceSearchBenchmark``, awaits
    ``run_benchmark(verbose=True)``, then writes benchmark metadata, the
    task definitions, the per-task results and the statistics to
    ``benchmark_results/finance_search_benchmark_<timestamp>.json``
    (a path relative to the current working directory).

    Returns:
        The statistics returned by ``run_benchmark``, or ``None`` when any
        step raised an exception (the error and its traceback are printed).
    """
    try:
        print("🔧 Initializing Finance Search Benchmark...")

        # Create benchmark instance
        benchmark = FinanceSearchBenchmark()

        # Run benchmark
        stats = await benchmark.run_benchmark(verbose=True)

        # Read the clock once so the file name and the recorded timestamp
        # describe the same instant.
        saved_at = time.time()
        results_file = f"benchmark_results/finance_search_benchmark_{int(saved_at)}.json"

        results_data = {
            "benchmark_info": {
                "name": "finance_search_benchmark",
                "version": "1.0.0",
                "timestamp": saved_at,
                "total_tasks": len(benchmark.tasks),
                "evaluation_method": "semantic_similarity_utility_scoring",
                "agent_flow": "User Input → Host Agent → A2A Protocol → Remote Finance Agents",
                "description": "Finance search benchmark covering clients, advisors, and trading products",
                "categories": ["basic", "advanced", "analytics", "recommendation"],
            },
            "tasks": [
                {
                    "id": task.id,
                    "prompt": task.prompt,
                    "comment": task.comment,
                    "ground_truth_output": task.ground_truth_output,
                    "difficulty": task.difficulty,
                    "category": task.category
                }
                for task in benchmark.tasks
            ],
            "results": [asdict(result) for result in benchmark.results],
            "statistics": stats
        }

        # Serialize before touching the file system: if a value is not
        # JSON-serializable the error surfaces here and no truncated
        # results file is left behind.
        payload = json.dumps(results_data, ensure_ascii=False, indent=2)

        output_path = Path(results_file)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_text(payload, encoding='utf-8')

        print(f"💾 Results saved to: {results_file}")
        return stats

    except Exception as e:
        # Top-level boundary for the script: report the failure with its
        # traceback and signal it to the caller by returning None.
        print(f"❌ Benchmark failed: {e}")
        import traceback
        traceback.print_exc()
        return None

if __name__ == "__main__":
    # Script entry point: build the benchmark coroutine and drive it to
    # completion on a fresh event loop.
    benchmark_coroutine = run_finance_search_benchmark()
    asyncio.run(benchmark_coroutine)