import os
os.environ["CUDA_VISIBLE_DEVICES"] = "6,7"
import torch
import torch.nn as nn
import torch.optim as optim
from merge_method.cal_mixtral_layer import dump_mixtral_moe_metrics
from eval.minipile import get_calib_dataloder
import numpy as np

from transformers import MixtralForCausalLM, AutoTokenizer

# Checkpoint directory shared by the model and tokenizer loaders below.
MODEL_PATH = "/Path/Mixtral-8x7B-v0.1"

# Load the Mixtral weights in bfloat16. device_map="auto" delegates device
# placement to the library; the visible GPUs are restricted by the
# CUDA_VISIBLE_DEVICES assignment at the top of this file.
model = MixtralForCausalLM.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)

# Build the dataloader from the "wikitext" dataset using the tokenizer above.
# NOTE(review): presumably this yields calibration batches -- confirm against
# eval.minipile.get_calib_dataloder.
dataloader = get_calib_dataloder(dataset="wikitext", tokenizer=tokenizer)

# Run the metric dump over the model with the dataloader built above.
# NOTE(review): presumably results land in save_dir/save_name and
# override=True replaces an existing file -- confirm in
# merge_method.cal_mixtral_layer.
dump_mixtral_moe_metrics(
    model,
    metric=["all"],
    dataloader=dataloader,
    save_dir="./results/metrics",
    save_name="mixtral_moe_metrics.pt",
    override=True,
)
