import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers import MllamaForConditionalGeneration, AutoModelForImageTextToText





def init_model(args):
    """Load the Qwen2-VL-72B-Instruct model and its processor.

    Both are fetched from the Hugging Face hub id
    ``Qwen/Qwen2-VL-72B-Instruct`` and cached under ``./huggingfaceModel``.

    Args:
        args: Caller-supplied configuration object. It is currently not
            read by this function; the model id and cache directory are
            fixed below.

    Returns:
        A tuple ``(activated_models, model_dict)`` where
        ``activated_models`` is ``['qwen2_vl_72b']`` and ``model_dict``
        maps ``'qwen2_vl_72b'`` to a dict with keys ``'model'`` and
        ``'processor'``.
    """
    model_id = "Qwen/Qwen2-VL-72B-Instruct"
    cache_dir = './huggingfaceModel'

    print('init qwen2 vl 72b model')

    # The processor only prepares inputs (no weights to place), so it is
    # loaded without a device_map.
    qwen2_vl_72b_processor = AutoProcessor.from_pretrained(
        model_id,
        cache_dir=cache_dir,
        force_download=False,
    )

    # Load the weights in bfloat16 instead of relying on the library's
    # default dtype: on transformers releases that default to float32, a
    # 72B-parameter checkpoint would need roughly twice the memory.
    qwen2_vl_72b_model = AutoModelForImageTextToText.from_pretrained(
        model_id,
        cache_dir=cache_dir,
        force_download=False,
        device_map="auto",
        torch_dtype=torch.bfloat16,
    )

    activated_models = ['qwen2_vl_72b']
    model_dict = {
        'qwen2_vl_72b': {
            'model': qwen2_vl_72b_model,
            'processor': qwen2_vl_72b_processor,
        },
    }

    return activated_models, model_dict

