import requests
import json

# Base URL of the text-generation service (scheme, host, and port, without a
# trailing slash -- endpoint paths such as "/generate" are appended to it).
BASE_URL = "http://localhost:8008"

def generate_text(input_messages, max_tokens=128, temperature=0.7, timeout=300):
    """Request completions for a batch of prompts from the generation service.

    Sends a single POST request to ``{BASE_URL}/generate`` with a JSON body
    holding the prompts and sampling settings, and returns the decoded JSON
    reply.

    Args:
        input_messages: The prompts to complete; forwarded unchanged under the
            ``"input_messages"`` key of the request body.
        max_tokens: Forwarded unchanged under the ``"max_tokens"`` key.
        temperature: Forwarded unchanged under the ``"temperature"`` key.
        timeout: Timeout in seconds handed to ``requests.post`` (a float, or a
            ``(connect, read)`` tuple). Pass ``None`` to wait indefinitely.

    Returns:
        The response body parsed from JSON.

    Raises:
        requests.HTTPError: If the server replies with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json",
    }

    payload = {
        "input_messages": input_messages,
        "temperature": temperature,
        "max_tokens": max_tokens,
    }

    # Without an explicit timeout, requests waits forever on a stalled server.
    response = requests.post(
        f"{BASE_URL}/generate",
        headers=headers,
        json=payload,
        timeout=timeout,
    )
    # Surface HTTP error statuses as exceptions instead of parsing an error body.
    response.raise_for_status()
    return response.json()

if __name__ == "__main__":
    # Demo: render one chat conversation with the model's chat template and
    # send it to the service as a batch of three identical prompts.
    # Only the tokenizer is used here, so the model class is not imported.
    from transformers import AutoTokenizer

    model_name = "Qwen3-8B-Instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Who won the world series in 2020?"},
        {
            "role": "assistant",
            "content": "The Los Angeles Dodgers won the World Series in 2020.",
        },
        {"role": "user", "content": "Where was it played?"},
    ]
    # The conversation is the same for every batch entry, so render it once
    # and repeat the resulting string rather than re-running the template.
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,  # return the rendered prompt text, not token ids
        add_generation_prompt=True,
        enable_thinking=False,  # True is the default value for enable_thinking
    )
    input_messages = [prompt] * 3

    results = generate_text(input_messages)
    # The service reply is expected to carry its outputs under "generated_texts".
    print(results["generated_texts"])
