from vllm import LLM, SamplingParams

# Model location handed to vLLM as the `model` argument when the engine is
# constructed below.
# NOTE(review): this looks like a relative local checkpoint directory —
# confirm it resolves from the directory the script is launched in.
MODEL_PATH = "qwen3_8b_vllm/Qwen/Qwen3-8B"

if __name__ == "__main__":
    # Script entry point: build a vLLM engine for MODEL_PATH, then run one
    # batched generation over a small fixed set of prompts.

    # The batch of prompts submitted in a single generate() call.
    prompts = [
        "请你介绍一下什么是 Multi-Modal LLM？",
        "什么是大模型的蒸馏？",
        "机器学习系统的监控指标有哪些？",
    ]

    # Engine configuration, kept in one mapping so the settings are easy to
    # scan; it is unpacked as keyword arguments to the LLM constructor.
    engine_kwargs = {
        "model": MODEL_PATH,
        "tensor_parallel_size": 2,
        "gpu_memory_utilization": 0.9,
        "dtype": "float16",
        "max_model_len": 30000,
    }
    llm = LLM(**engine_kwargs)

    # Sampling settings shared by every prompt in the batch. max_tokens=8
    # keeps each completion very short.
    sampling = SamplingParams(temperature=0.7, top_k=50, max_tokens=8)

    outputs = llm.generate(prompts, sampling_params=sampling)

    # The results are intentionally not printed; uncomment to inspect the
    # first completion returned for each request.
    # for i, output in enumerate(outputs):
    #     print(f"request {i} =>", output.outputs[0].text)
