def infer_hf():
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from peft import PeftModel
    from modelscope import snapshot_download
    model_dir = snapshot_download('Qwen/Qwen2.5-7B-Instruct')
    adapter_dir = snapshot_download('swift/test_lora')
    model = AutoModelForCausalLM.from_pretrained(
        model_dir, torch_dtype='auto', device_map='auto', trust_remote_code=True)
    model = PeftModel.from_pretrained(model, adapter_dir)

    tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)

    messages = [{
        'role': 'system',
        'content': 'You are a helpful assistant.'
    }, {
        'role': 'user',
        'content': 'who are you?'
    }]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    model_inputs = tokenizer([text], return_tensors='pt', add_special_tokens=False).to(model.device)

    generated_ids = model.generate(**model_inputs, max_new_tokens=512, do_sample=False)
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print(f'response: {response}')
    return response


def infer_swift():
    from swift.llm import get_model_tokenizer, get_template, InferRequest, RequestConfig, PtEngine
    from modelscope import snapshot_download
    from swift.tuners import Swift
    model_dir = snapshot_download('Qwen/Qwen2.5-7B-Instruct')
    adapter_dir = snapshot_download('swift/test_lora')
    model, tokenizer = get_model_tokenizer(model_dir, device_map='auto')
    model = Swift.from_pretrained(model, adapter_dir)
    template = get_template(model.model_meta.template, tokenizer)
    engine = PtEngine.from_model_template(model, template)

    messages = [{
        'role': 'system',
        'content': 'You are a helpful assistant.'
    }, {
        'role': 'user',
        'content': 'who are you?'
    }]
    request_config = RequestConfig(max_tokens=512, temperature=0)
    resp_list = engine.infer([InferRequest(messages=messages)], request_config=request_config)
    response = resp_list[0].choices[0].message.content
    print(f'response: {response}')
    return response


if __name__ == '__main__':
    response = infer_hf()
    response2 = infer_swift()
    assert response == response2
