from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# 模型本地路径（改成你自己保存的路径）
model_path = "/path/to/qwen2.5-7b-instruct"
# /group/40059/yuujiefeng/Backbones/Qwen2.5-3B-Instruct_ours_stage1_withbox_2

# 加载 tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# 加载模型
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",        # 自动分配到 GPU / CPU
    torch_dtype=torch.float16, # 用半精度节省显存
    trust_remote_code=True
)
model.eval()

# 构造输入
prompt = "请解释一下量子计算和经典计算的区别。"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# 生成
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        temperature=0.7,
        top_p=0.9
    )

# 解码
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)
