
import argparse
import asyncio
from flex_attention_vllm.launcher import SystemLauncher
from flex_attention_vllm.vllm_simulator.api_server import start_simulation

def _str2bool(value):
    """Convert a command-line token such as ``"true"`` or ``"0"`` to a bool.

    ``type=bool`` must not be used with argparse: ``bool("False")`` is
    ``True`` because every non-empty string is truthy. This converter
    interprets the text of the token instead.

    Args:
        value: The raw token from the command line (or an actual bool).

    Returns:
        The boolean the token spells out.

    Raises:
        argparse.ArgumentTypeError: If the token is not a recognised
            boolean spelling; argparse reports this as a usage error.
    """
    if isinstance(value, bool):
        return value
    token = value.strip().lower()
    if token in ("true", "t", "yes", "y", "1", "on"):
        return True
    # The empty string was the only falsy spelling under plain bool(), so it
    # stays accepted for command lines written against the old behaviour.
    if token in ("false", "f", "no", "n", "0", "off", ""):
        return False
    raise argparse.ArgumentTypeError(
        f"expected a boolean value, got {value!r}"
    )


def parse_args(argv=None):
    """Build the command-line parser and parse the arguments.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse reads ``sys.argv[1:]``, as it did before this parameter
            existed.

    Returns:
        argparse.Namespace: The parsed options. ``--replica-urls``,
        ``--dataset-type`` and ``--request-dataset-file`` are required; all
        other options fall back to the defaults declared below.
    """
    parser = argparse.ArgumentParser(description="LLM Serving System")

    # Replica / model options.
    parser.add_argument("--replica-urls", type=str, required=True)
    parser.add_argument("--model", type=str, default="Qwen/Qwen1.5-7B")
    parser.add_argument("--model-name", type=str, default="qwen1.5-7B")
    parser.add_argument("--replica-num", type=int, default=4)
    parser.add_argument("--replica-dram", type=int, default=128)

    # Request generation options.
    parser.add_argument("--request-generate-qps", type=float, default=0.5)
    parser.add_argument("--request-num", type=int, default=1000)
    parser.add_argument("--request-share-ratio", type=float, default=0.6)
    parser.add_argument("--request-first-prefill-len", type=int, default=640)
    parser.add_argument("--request-decode-len", type=int, default=16)
    parser.add_argument("--request-round", type=int, default=3)
    parser.add_argument("--request-active-timeout", type=int, default=5)
    parser.add_argument("--dataset-type", type=str, required=True)
    parser.add_argument("--request-dataset-file", type=str, required=True)

    # Scheduling / balancing options.
    parser.add_argument(
        "--global-scheduler-type", type=str, default="round_robin"
    )
    # Parsed with _str2bool rather than bool: bool("False") is True, which
    # previously made this option default to True and ignore "False".
    parser.add_argument(
        "--update-replica-info", type=_str2bool, default=False
    )
    parser.add_argument("--window-duration", type=int, default=30)
    parser.add_argument("--balance-type", type=str, default="no_balance")
    parser.add_argument("--ttft-slo", type=int, default=5)

    return parser.parse_args(argv)

async def main():
    """Parse the CLI options, start the simulator, and run the launcher.

    The simulator coroutine is scheduled as a task before the launcher is
    initialised and run; once the launcher returns, the simulator task is
    awaited so this coroutine does not finish before it does.
    """
    cli_args = parse_args()

    # Schedule vllm_simulator.api_server as a task so it can make progress
    # while the launcher is awaited below.
    simulator = asyncio.create_task(start_simulation())

    system = SystemLauncher(cli_args)
    await system.initialize(cli_args)
    await system.run()

    # Wait until the simulator task has finished as well.
    await simulator

# Script entry point: run the async main() only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    asyncio.run(main())