import os

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from vllm import LLM

def load_models(args):
    """Load the tokenizer and the vLLM engine for the configured model.

    Prints the CUDA device count and availability reported by torch, loads
    the tokenizer with ``trust_remote_code=True``, and then builds a vLLM
    ``LLM`` in bfloat16.

    Args:
        args: Object exposing ``model_name_or_path`` (passed to both the
            tokenizer and ``LLM``) and ``multi_gpu`` (truthy selects the
            tensor-parallel configuration). Presumably an argparse
            namespace -- confirm against the caller.

    Returns:
        A ``(tokenizer, llm)`` tuple.
    """
    print('cuda device_count:', torch.cuda.device_count())
    print('cuda is_available:', torch.cuda.is_available())

    checkpoint = args.model_name_or_path

    tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
    # Tokenizers that ship without a pad token reuse the EOS token instead.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Only the branch-specific engine options differ; model and dtype are shared.
    if args.multi_gpu:
        engine_options = {
            # One tensor-parallel shard per CUDA device reported by torch.
            "tensor_parallel_size": torch.cuda.device_count(),
            "distributed_executor_backend": "mp",
        }
    else:
        engine_options = {"gpu_memory_utilization": 0.98}

    engine = LLM(model=checkpoint, dtype=torch.bfloat16, **engine_options)
    return tokenizer, engine

