from data_utils import load_dataset
from utils import construct_prompt, random_sampling, construct_prompt_without_test, construct_prompt_instance_prompt_text, construct_prompt_without_test_emptyanswer
import numpy as np
import torch
import transformers
from peft import PeftModel
from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer, LlamaForCompressionCausalLM, AutoConfig
import argparse
from typing import Dict, Optional, Sequence
import itertools
import copy
import json
import random
from openpyxl import Workbook
# import deepcopy
def smart_tokenizer_and_embedding_resize(
    special_tokens_dict: Dict,
    tokenizer: transformers.PreTrainedTokenizer,
):
    """Register missing special tokens on ``tokenizer`` and report how many were added.

    Despite the name, this variant takes no model argument and therefore
    does not resize any embedding matrix; it only forwards
    ``special_tokens_dict`` to ``tokenizer.add_special_tokens``.

    Args:
        special_tokens_dict: Mapping passed unchanged to
            ``tokenizer.add_special_tokens``.
        tokenizer: Object exposing ``add_special_tokens``.

    Returns:
        The value returned by ``tokenizer.add_special_tokens`` (also printed
        to stdout).
    """
    added_count = tokenizer.add_special_tokens(special_tokens_dict)
    print("num_new_tokens = ", added_count)
    return added_count

def load_model_lora(base_model, device, lora_weights, compression, compression_length, use_partial_mask):
    """Load a LLaMA base model and wrap it with LoRA weights via ``PeftModel``.

    Args:
        base_model: Identifier/path handed to the ``from_pretrained`` calls.
        device: ``"cuda"``, ``"mps"``, or any other value (handled by the
            fallback branch, which pins everything to ``device``).
        lora_weights: Identifier/path of the LoRA adapter passed to
            ``PeftModel.from_pretrained``.
        compression: When truthy, ``LlamaForCompressionCausalLM`` is loaded
            with a config carrying ``compression_size`` and
            ``use_partial_mask``; otherwise plain ``LlamaForCausalLM`` is
            loaded and the config is not passed to it.
        compression_length: Value stored in ``config.compression_size``.
        use_partial_mask: Value stored in ``config.use_partial_mask``.

    Returns:
        The ``PeftModel`` wrapping the loaded base model.
    """
    config = AutoConfig.from_pretrained(base_model, cache_dir='.cache')
    if compression:
        config.compression_size = compression_length
        config.use_partial_mask = use_partial_mask
        print('use_partial_mask = ', use_partial_mask)

    # Keyword arguments differ per device; collect them once so the actual
    # loading below is written a single time.
    if device == "cuda":
        backbone_kwargs = dict(
            load_in_8bit=False,
            torch_dtype=torch.float16,
            device_map="auto",
        )
        adapter_kwargs = dict(torch_dtype=torch.float16)
    elif device == "mps":
        backbone_kwargs = dict(
            device_map={"": device},
            torch_dtype=torch.float16,
        )
        adapter_kwargs = dict(
            device_map={"": device},
            torch_dtype=torch.float16,
        )
    else:
        backbone_kwargs = dict(
            device_map={"": device},
            low_cpu_mem_usage=True,
        )
        adapter_kwargs = dict(device_map={"": device})

    if compression:
        # Only the compression model receives the customised config.
        backbone = LlamaForCompressionCausalLM.from_pretrained(
            base_model,
            config=config,
            **backbone_kwargs,
        )
    else:
        backbone = LlamaForCausalLM.from_pretrained(base_model, **backbone_kwargs)

    return PeftModel.from_pretrained(backbone, lora_weights, **adapter_kwargs)

def main(model, lora_weight, dataset, num_seeds, all_shots, subsample_test_set, compression, compression_length, use_partial_mask, compression_without_input, with_prompt_text, compression_without_prompt_text, with_sequence_order, compression_token_initialization, add_final_article, include_answer, load_in_8bit, load_in_4bit, only_answer, include_punctuation, include_content, include_template, include_colon):
    test_inference = True

    print("lora_weight = ", lora_weight)
    print("model = ", model)
    print("dataset = ", dataset)
    print("with_prompt_text = ", with_prompt_text)
    print("compression_without_prompt_text = ", compression_without_prompt_text)




    if test_inference:
        if torch.cuda.is_available():
            device = "cuda"
        else:
            device = "cpu"
        base_model = 'decapoda-research/llama-7b-hf' if lora_weight is not None else model
        # base_model = model

        

        # model = LlamaForCausalLM.from_pretrained(
        #     base_model, device_map={"": device}, low_cpu_mem_usage=True, cache_dir='.cache',
        # )
        # model = PeftModel.from_pretrained(
        #     model,
        #     lora_weights,
        #     device_map={"": device},
        # )
        if model == 'decapoda-research/llama-7b-hf':
            tokenizer = LlamaTokenizer.from_pretrained('decapoda-research/llama-7b-hf')
        elif model == 'openlm-research/open_llama_3b':
            tokenizer = LlamaTokenizer.from_pretrained('openlm-research/open_llama_3b')
        else:
            tokenizer = LlamaTokenizer.from_pretrained(base_model)
        # if lora_weight is None:

        IGNORE_INDEX = -100
        DEFAULT_PAD_TOKEN = "[PAD]"
        DEFAULT_EOS_TOKEN = "</s>"
        DEFAULT_BOS_TOKEN = "<s>"
        DEFAULT_UNK_TOKEN = "<unk>"
        special_tokens_dict = dict()
        if tokenizer.pad_token is None:
            special_tokens_dict["pad_token"] = DEFAULT_PAD_TOKEN
        if tokenizer.eos_token is None:
            special_tokens_dict["eos_token"] = DEFAULT_EOS_TOKEN
        if tokenizer.bos_token is None:
            special_tokens_dict["bos_token"] = DEFAULT_BOS_TOKEN
        if tokenizer.unk_token is None:
            special_tokens_dict["unk_token"] = DEFAULT_UNK_TOKEN

        
        num_new_tokens = smart_tokenizer_and_embedding_resize(
            special_tokens_dict=special_tokens_dict,
            tokenizer=tokenizer,
            # model=model,
        )

        if lora_weight is not None:
            model = load_model_lora(base_model, device, lora_weight, compression, compression_length, use_partial_mask)
                        # unwind broken decapoda-research config
            # model.config.pad_token_id = tokenizer.pad_token_id = 0  # unk
            # model.config.bos_token_id = 1
            # model.config.eos_token_id = 2

            print("model = ", model)
        else:
            config = transformers.AutoConfig.from_pretrained(
                base_model,
                cache_dir='.cache',
            )
            # if model == 'decapoda-research/llama-7b-hf':
            #     config = transformers.AutoConfig.from_pretrained(
            #         'decapoda-research/llama-7b-hf',
            #         cache_dir='.cache',
            #     )
            # else:
            #     config = transformers.AutoConfig.from_pretrained(
            #         'openlm-research/open_llama_3b',
            #         cache_dir='.cache',
            #     )
            config.compression_token_initialization = compression_token_initialization
            if compression_token_initialization:
                config.initialize_ids = tokenizer("Article: N/A \n\n Answer: N/A \n\n", return_tensors="pt")['input_ids'].tolist()
            if compression: 
                
                config.vocab_size += num_new_tokens
                config.compression_size = compression_length
                config.use_partial_mask = use_partial_mask
                print("use_partial_mask = ", use_partial_mask)

                # if '13b' in base_model:
                model = LlamaForCompressionCausalLM.from_pretrained(
                    base_model,
                    config=config,
                    load_in_8bit=load_in_8bit,
                    trust_remote_code=True,
                    # tie_weights=True,
                    torch_dtype=torch.float16,
                    device_map="auto",
                    cache_dir='.cache',
                )   
                # else:
                #     model = LlamaForCompressionCausalLM.from_pretrained(
                #         base_model,
                #         config=config,
                #         # load_in_8bit=load_8bit,
                #         trust_remote_code=True,
                #         # tie_weights=True,
                #         torch_dtype=torch.float16,
                #         device_map="auto",
                #         cache_dir='.cache',
                #     )                
            else:
                model = LlamaForCausalLM.from_pretrained(
                    base_model,
                    config=config,
                    # load_in_8bit=load_8bit,
                    # tie_weights=True,
                    torch_dtype=torch.float16,
                    device_map="auto",
                    cache_dir='.cache',
                )
            
        # num_new_tokens = smart_tokenizer_and_embedding_resize(
        #     special_tokens_dict=special_tokens_dict,
        #     tokenizer=tokenizer,
        #     # model=model,
        # )
        temperature=0.8
        top_p=0.75
        top_k=40
        num_beams=4
        max_new_tokens=2 if compression else 1
        generation_config = GenerationConfig(
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            num_beams=num_beams,
            # use_cache=True,
        )

    # dataset_name = ['agnews', 'atis', 'cb', 'dbpedia', 'lama', 'rte', 'slot-movies', 'sst2', 'trec']

     


    # if dataset == 'atis':
    model.eval()


    
    # Different ids got different training set, so we can not just mix them up.
    if dataset == "lama":
        all_lamas = [1001,101,103,106,108,127,1303,131,136,1376,138,140,1412,159,17,176,178,19,190,20,264,27,276,279,30,31,36,361,364,37,39,407,413,449,463,47,495,527,530,740,937]
        all_params = []
        for which_lama in all_lamas:
            # p = deepcopy(default_params)
            p = {}
            p['dataset'] = f"lama_{which_lama}"
            all_params.append(p)
        
        correct = 0
        test_samples_num = 0
        samples_num = 0
        for param_index, params in enumerate(all_params):
            orig_train_sentences, orig_train_labels, orig_test_sentences, orig_test_labels = load_dataset(params)
            
            np.random.seed(num_seeds)
            # AgNews 7600
            # samples_num = subsample_test_set
            few_shot = all_shots
            if test_samples_num == 0:
                test_sentences, test_labels = orig_test_sentences, orig_test_labels
                samples_num += len(orig_test_labels)

            else:
                test_sentences, test_labels = random_sampling(orig_test_sentences, orig_test_labels, samples_num)

            train_sentences, train_labels = random_sampling(orig_train_sentences, orig_train_labels, few_shot)
            # print("----------Train-----------")
            # print(orig_train_sentences[:5], orig_train_labels[:5])
            # print("----------Test-----------")
            # print(test_sentences[0], test_labels[0])



            # print('----------Prompt--------------')

            for i, each_test in enumerate(test_sentences):
                # print("i = ", i, end='\r')
                prompt = construct_prompt(params, train_sentences, train_labels, each_test)
                # print("constructed prompt = ", prompt)


                if test_inference:
                    # prompt = 'The highest mountain in the world is '
                    inputs = tokenizer(prompt, return_tensors="pt")
                    input_ids = inputs["input_ids"].to(device)
                    # generate_params = {
                    #     "input_ids": input_ids,
                    #     "generation_config": generation_config,
                    #     "return_dict_in_generate": True,
                    #     "output_scores": True,
                    #     "max_new_tokens": max_new_tokens,
                    # }
                    # print("yesyesyes???")
                    with torch.no_grad():
                        if not compression:
                            generation_output = model.generate(
                                input_ids=input_ids,
                                generation_config=generation_config,
                                return_dict_in_generate=True,
                                output_scores=True,
                                max_new_tokens=max_new_tokens,
                            )
                        else:
                            generation_output = model.compression_generate(
                                input_ids=input_ids,
                                generation_config=generation_config,
                                return_dict_in_generate=True,
                                output_scores=True,
                                max_new_tokens=max_new_tokens,
                            )

                    s = generation_output.sequences[0]
                    output = tokenizer.decode(s)

                    # answer = output.split()[-1]
                    answer = output.split(':')[-1].strip()
                    # print("------------Model Output------------")
                    
                    # print("label = ", params['label_dict'][test_labels[i]])
                    # print("label = ", test_labels[i])
                    # print("generation_output = ", generation_output)
                    # print("output = ", output)
                    # if i > 20: 
                    #     exit()
                    if dataset in ['cb', 'rte', 'dbpedia', 'sst2', 'trec']:
                        if answer in params['inv_label_dict'].keys() and test_labels[i] == params['inv_label_dict'][answer]:
                            correct += 1
                    else:
                        if answer.startswith(test_labels[i]) or test_labels[i].startswith(answer):
                            correct += 1
        print("model = ", model)
        print("dataset = ", dataset)
        print("num_seeds = ", num_seeds)
        print("all_shots = ", all_shots)
        print("ACC = ", correct / samples_num)

    # if dataset == "slot-movies":
    else:
        params = {
            'dataset': dataset,
        }
        orig_train_sentences, orig_train_labels, orig_test_sentences, orig_test_labels = load_dataset(params)

    # train_sentences = orig_train_sentences[:3]
    # train_labels = orig_train_labels[:3]
    # test_sentences = orig_test_sentences[0]
    # test_labels = orig_test_labels[0]


        np.random.seed(num_seeds)
        # AgNews 7600
        few_shot = all_shots
        if subsample_test_set == 0:
            test_sentences, test_labels = orig_test_sentences, orig_test_labels
            samples_num = len(orig_test_labels)
        else:
            samples_num = min(subsample_test_set, len(orig_test_labels))
            test_sentences, test_labels = random_sampling(orig_test_sentences, orig_test_labels, samples_num)


        train_sentences, train_labels = random_sampling(orig_train_sentences, orig_train_labels, few_shot)
        print("----------Train-----------")
        print(train_sentences[:8],train_labels[:8])

 
        # easier_train_sentences, easier_train_labels, easier_idx = construct_easier_order(train_sentences, train_labels)
        # harder_train_sentences, harder_train_labels, harder_idx = construct_harder_order(train_sentences, train_labels)
        # adjusted_train_sentences = [easier_train_sentences, train_sentences, harder_train_sentences]
        # adjusted_train_labels = [easier_train_labels, train_labels, harder_train_labels]

        def generate_permutations(n):
            """Return every ordering of the integers 1..n as a list of tuples."""
            return list(itertools.permutations(range(1, n + 1)))

        all_rank = generate_permutations(few_shot)
        adjusted_train_sentences = [[train_sentences[i - 1] for i in each_rank] for each_rank in all_rank]
        adjusted_train_labels = [[train_labels[i - 1] for i in each_rank] for each_rank in all_rank]


        # answer_tokens = params['inv_label_dict'] 
        

        # count = 3
        # while count > 0:
        count = 0
        # best_res = 0
        # best_rank = None
        # res_list = []
        final_res = {}
        position_res = {}
        final_attention_mask = {}
        final_anti_attention_mask = {}



        print("inlcude_punctuation = ", include_punctuation)
        print("inlcude_template = ", include_template)
        print("inlcude_answer = ", include_answer)
        print("only_answer = ", only_answer)
        print("include_content = ", include_content)
        print("add_final_article = ", add_final_article)
        if include_punctuation:
            tmp_params = {}
            for each in params:
                tmp_params[each] = copy.deepcopy(params[each])
            if include_colon:
                tmp_params["prompt_prefix"] += '\n\n : '
            # print("punctuation prompt = ", tmp_params["prompt_prefix"] + '\n\n') 
            punctuation_prompt = tokenizer(tmp_params["prompt_prefix"] + '\n\n', return_tensors="pt")['input_ids'].to(device)        
        if include_template:
            tmp_params = {}
            for each in params:
                tmp_params[each] = copy.deepcopy(params[each])
            tmp_params["prompt_prefix"] = ''
            if include_answer:
                if isinstance(adjusted_train_sentences[0][0], dict):
                    empty_sentences = [{'hypothesis':'', 'premise': ''} for each in adjusted_train_labels[0]]
                else:
                    empty_sentences = ['' for each in adjusted_train_labels[0]]
                    if only_answer:
                        tmp_params["q_prefix"] = ': ' 
                        tmp_params["a_prefix"] = ': ' 
                template_prompt, _ = construct_prompt_without_test(tmp_params, empty_sentences, adjusted_train_labels[0], '')
                # print("template_prompt = ", template_prompt)
                template_prompt = tokenizer(template_prompt, return_tensors="pt")['input_ids'].to(device)
            else:
                empty_sentences = ['' for each in adjusted_train_labels[0]]
                template_prompt, _ = construct_prompt_without_test_emptyanswer(tmp_params, empty_sentences, empty_sentences, '')
                # print("template_prompt = ", template_prompt)
                template_prompt = tokenizer(template_prompt, return_tensors="pt")['input_ids'].to(device)           
        if include_content:
            tmp_params = {}
            for each in params:
                tmp_params[each] = copy.deepcopy(params[each])
            # content_prompt = tokenizer(' '.join(train) + '\n\n', return_tensors="pt")['input_ids'].to(device)   
            tmp_params["prompt_prefix"] = ''
            if isinstance(adjusted_train_sentences[0][0], dict):
                empty_sentences = [{'hypothesis':'', 'premise': ''} for each in adjusted_train_labels[0]]
                tmp_params["q_prefix"] = ''
            else:
                empty_sentences = ['' for each in adjusted_train_labels[0]]
            # here use the original params to exclude the instruction tokens.
            # no it should not be excluded since the following code do that.
            # also this setting should include the answer
            # if include_answer, then the anti mask should not include the answer

            # use this to include : into content words
            if include_colon:
                tmp_params["q_prefix"] = tmp_params["q_prefix"].split(':')[0] + ' '
                tmp_params["a_prefix"] = tmp_params["a_prefix"].split(':')[0] + ' '
            if include_answer:
                new_template_prompt, _ = construct_prompt_without_test_emptyanswer(tmp_params, empty_sentences, ['' for each in adjusted_train_labels[0]], '')
            else:
                new_template_prompt, _ = construct_prompt_without_test(tmp_params, empty_sentences, adjusted_train_labels[0], '')
            # print("anti_content_prompt = ", new_template_prompt)
            new_template_prompt = tokenizer(new_template_prompt, return_tensors="pt")['input_ids'].to(device)
            anti_content_prompt = new_template_prompt





        # elif include_punctuation
        for tmp_train_sentences, tmp_train_labels in zip(adjusted_train_sentences, adjusted_train_labels):
            correct = 0
            prompt, _ = construct_prompt_without_test(params, tmp_train_sentences, tmp_train_labels, '')
            if add_final_article:
                prompt += 'Article: '
            # print("prompt = ", prompt)



            if test_inference:
                inputs = tokenizer(prompt, return_tensors="pt")
                input_ids = inputs["input_ids"].to(device)

                # template tokens mask
                tmp_mask = torch.zeros_like(input_ids).to(device)
                # if include_answer:
                    # print("template_prompt = ", template_prompt)
                    # exit()
                if include_template:
                    for each in template_prompt[0]:
                        tmp_mask = tmp_mask | input_ids.eq(each)
                if include_punctuation:
                    for each in punctuation_prompt[0]:
                        tmp_mask = tmp_mask | input_ids.eq(each)
                if include_content:
                    new_tmp_mask = torch.zeros_like(input_ids).to(device).bool()
                    for each in anti_content_prompt[0]:
                        new_tmp_mask = new_tmp_mask | input_ids.eq(each).bool()
                    tmp_mask = tmp_mask | (~new_tmp_mask).bool()
                # else:
                #     tmp_mask = input_ids.eq(13)|input_ids.eq(30985)|input_ids.eq(31871)|input_ids.eq(5092)|input_ids.eq(2055)|input_ids.eq(265)



                # tmp_anti_mask = ~tmp_mask


                position_ids = torch.ones(input_ids.size()).to(input_ids.device)
                position_ids = position_ids.long().cumsum(-1) - 1

                with torch.no_grad():
                    # generation_output = model.generate(
                    #     input_ids=input_ids,
                    #     generation_config=generation_config,
                    #     return_dict_in_generate=True,
                    #     output_scores=True,
                    #     max_new_tokens=max_new_tokens,
                    # )
                    output = model(input_ids, output_hidden_states=True)
                    hidden_states = output.hidden_states

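                # Take out each sample's representation at every layer in turn,
                # then store them in a file; converting to a list and dumping as JSON is enough.
                # For now all representations are inspected; when doing augmentation later, the instruction's representations must be added as well.
                # When writing the code, the q/k/v results are still needed, hmm.
                # Better to write a representation-to-kv conversion instead; that would be easier, and at least avoids many changes inside generation_utils?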
                all_mask = input_ids.eq(13).long()
                all_mask = torch.cumsum(all_mask, dim=-1)
                # # print("all_mask = ", all_mask)
                # # Correct the positions (shift right by one)
                all_mask =torch.cat((all_mask[:, :1], all_mask[:, :-1]), dim=-1)
                # # print("all_mask2 = ", all_mask)
                # # print("all_mask.gt = ", all_mask.ge(2))
                # # print("all_mask.le = ", all_mask.lt(5))



                # mask_list = []
                # for i in range(few_shot):
                #     tmp_mask = all_mask.ge(2 + (i * 3)) & all_mask.lt(2 + (i + 1) * 3)
                #     mask_list.append(tmp_mask)

                if count == 0:
                    instruction_mask = all_mask.lt(2)
                #     instruction_rep = [torch.masked_select(hidden_states[i], instruction_mask.unsqueeze(-1).repeat(1, 1, hidden_states[i].size(-1))).view(1, -1, hidden_states[i].size(-1)) for i in range(len(hidden_states))]
                #     instruction_pos = torch.masked_select(position_ids, instruction_mask).view(1, -1)
                #     # 这里要管instruction_rep
                #     # print("instruction_rep = ", instruction_rep.size())
                #     # print(instruction_rep)


                # result_list = [[] for i in range(len(mask_list))]
                # position_list = [None for i in range(len(mask_list))]
                
                # for demo, demo_mask in enumerate(mask_list):
                #     # real_demo = all_rank[count][demo]
                #     # print("real_demo = ", real_demo)
                #     for layer, hidden_state in enumerate(hidden_states):
                #         sentence_rep =  torch.masked_select(hidden_state, demo_mask.unsqueeze(-1).repeat(1, 1, hidden_state.size(-1))).view(1, -1,hidden_state.size(-1))
                #         # 这里记录的是每个句子的。
                #         # result_list[demo].append(sentence_rep.mean(dim=-2).tolist())
                #         result_list[demo].append(sentence_rep)
                #         # print("sentence_rep = ", sentence_rep.size())
                #         # print(sentence_rep)
                #         # exit()
                #     position_list[demo] = torch.masked_select(position_ids, demo_mask).view(1, -1)
                # print("prompt = ", prompt)
                # print("input_ids = ", input_ids)
                # print("tmp_mask = ", tmp_mask.masked_fill(instruction_mask, True))
                # print("anti_tmp_mask = ",  (~tmp_mask).masked_fill(instruction_mask, True))
                # exit()

                position_res[str(all_rank[count])] = position_ids
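                # Save this to a file, then write a separate .py script to process that file and compute a similarity matrix.
                # When reading it back, convert every permutation to its string form; that should be sufficient.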
                final_res[str(all_rank[count])] = list(hidden_states)
                final_attention_mask[str(all_rank[count])] = tmp_mask.bool().masked_fill(instruction_mask, True)
                final_anti_attention_mask[str(all_rank[count])] = (~(tmp_mask.bool())).masked_fill(instruction_mask, True)
                if count == 0:
                    print("final_unmasked_tokens = ", tokenizer.decode((input_ids * final_attention_mask[str(all_rank[count])].long())[0]))
                    exit()
                count += 1





        # print('final_res = ', final_res)

        # origin_prefix = copy.deepcopy(params['prompt_prefix'])
        
        count = 0
        for tmp_train_sentences, tmp_train_labels in zip(adjusted_train_sentences, adjusted_train_labels):
            # only_single_sentence = tmp_train_sentences[-1:]
            # only_single_label = tmp_train_labels[-1:]
            current_rank = all_rank[count]
            # # print("---------------------------------------")
            # # print("current_rank = ", current_rank)


            # correct_single = 0
            # # print("prompt = ", prompt)
            # # for single sentence
            # for i, each_test in enumerate(test_sentences):
            #     params['prompt_prefix'] = origin_prefix
            #     # print("origin_prefix = ", or)
            #     prompt = construct_prompt(params, only_single_sentence, only_single_label, each_test)
            #     if test_inference:
            #         inputs = tokenizer(prompt, return_tensors="pt")
            #         input_ids = inputs["input_ids"].to(device)

            #         with torch.no_grad():
            #             generation_output = model.generate(
            #                 input_ids=input_ids,
            #                 generation_config=generation_config,
            #                 return_dict_in_generate=True,
            #                 output_scores=True,
            #                 max_new_tokens=max_new_tokens,
            #             )

            #         s = generation_output.sequences[0]
            #         s = s.masked_fill(s.eq(-99), 0)
            #         output = tokenizer.decode(s)
            #         answer = output.split(':')[-1].replace("</s>", '').strip()
                    
            #         if dataset in ['cb', 'rte', 'dbpedia', 'sst2', 'trec', 'agnews']:
            #             if answer in params['inv_label_dict'].keys() and test_labels[i] == params['inv_label_dict'][answer]:
            #                 correct_single += 1
            #         else:
            #             if answer.startswith(test_labels[i]) or test_labels[i].startswith(answer):
            #                 correct_single += 1
            #     # pass


            # # print("model = ", model)
            # # print("dataset = ", dataset)
            # # print("num_seeds = ", num_seeds)
            # # print("all_shots = ", all_shots)
            # # print("single_sentence = ", only_single_sentence)
            # print("tmp_rank = ", all_rank[count])
            # print("ACC_single = ", correct_single / samples_num)
            # # count += 1
        

            correct_rep = 0
            correct_anti_rep = 0
            # print("prompt = ", prompt)
            # for single sentence
        
            for i, each_test in enumerate(test_sentences):
                # prev_prefix = params['prompt_prefix']
                params['prompt_prefix'] = ""
                if add_final_article:
                    params["q_prefix"] = ""

                prompt = construct_prompt(params, [] , [], each_test)
                # prompt = each_test

                input_reps1 = final_res[str(current_rank)][:]
                input_reps2 = final_res[str(current_rank)][:]
                cur_attention_mask = final_attention_mask[str(current_rank)]
                anti_attention_mask = final_anti_attention_mask[str(current_rank)]
                # layer_num * seqlen * dim
                # print("instruction_rep = ", instruction_rep.size())
                # print(instruction_rep)
                # print("input_rep = ")
                # print(input_rep)
                # layers
                # print("input_rep = ", len(input_reps))
                # print(input_rep.size())
                # 1 * seqlen * dim
                # print(input_reps[0].size())
                # input_reps = [torch.concat((instruction_rep[i], each_rep), dim=-2) for i, each_rep in enumerate(input_reps)]
                # print("new input_rep = ", input_reps[0])
                # exit()
                if test_inference:
                    inputs = tokenizer(prompt, return_tensors="pt")
                    input_ids = inputs["input_ids"].to(device)

                    # attention
                    prompt_pos = torch.ones(input_ids.size()).to(input_ids.device)
                    prompt_pos = prompt_pos.long().cumsum(-1) + position_res[str(current_rank)][0][-1]



                    attention_mask = torch.ones(prompt_pos.size()).to(input_ids.device)

                    anti_attention_mask = torch.cat((anti_attention_mask, attention_mask), dim=-1)
                    attention_mask = torch.cat((cur_attention_mask, attention_mask), dim=-1)



                    position_ids = torch.cat((position_res[str(current_rank)], prompt_pos), dim=-1)
                    # print("position_ids = ", position_ids.size())
                    # print(position_ids)


                    with torch.no_grad():
                        # print("??")
                        generation_output = model.generate(
                            input_ids=input_ids,
                            input_reps=input_reps1,
                            position_ids=position_ids,
                            attention_mask=attention_mask,
                            generation_config=generation_config,
                            return_dict_in_generate=True,
                            output_scores=True,
                            max_new_tokens=max_new_tokens,
                        )
                        # print("generation_output = ", generation_output)
                    # exit()

                    s = generation_output.sequences[0]
                    s = s.masked_fill(s.eq(-99), 0)
                    output = tokenizer.decode(s)
                    answer = output.split(':')[-1].replace("</s>", '').strip()
                    
                    # print('output = ', output)
                    # print('answer = ', answer)
                    # print("label[i] = ", params['label_dict'][test_labels[i]])
                    # if i >= 20:
                    #     exit()

                    # exit()
                    # 
                    if dataset in ['cb', 'rte', 'dbpedia', 'sst2', 'trec', 'agnews']:
                        if answer in params['inv_label_dict'].keys() and test_labels[i] == params['inv_label_dict'][answer]:
                            correct_rep += 1
                    else:
                        if answer.startswith(test_labels[i]) or test_labels[i].startswith(answer):
                            correct_rep += 1
                        



                    with torch.no_grad():
                        # print("??")
                        generation_output = model.generate(
                            input_ids=input_ids,
                            input_reps=input_reps2,
                            position_ids=position_ids,
                            attention_mask=anti_attention_mask,
                            generation_config=generation_config,
                            return_dict_in_generate=True,
                            output_scores=True,
                            max_new_tokens=max_new_tokens,
                        )
                        # print("generation_output = ", generation_output)
                    # exit()

                    s = generation_output.sequences[0]
                    s = s.masked_fill(s.eq(-99), 0)
                    output = tokenizer.decode(s)
                    answer = output.split(':')[-1].replace("</s>", '').strip()
                    
                    # print('output = ', output)
                    # print('answer = ', answer)
                    # print("label[i] = ", params['label_dict'][test_labels[i]])
                    # if i >= 20:
                    #     exit()

                    # exit()
                    # 
                    if dataset in ['cb', 'rte', 'dbpedia', 'sst2', 'trec', 'agnews']:
                        if answer in params['inv_label_dict'].keys() and test_labels[i] == params['inv_label_dict'][answer]:
                            correct_anti_rep += 1
                    else:
                        if answer.startswith(test_labels[i]) or test_labels[i].startswith(answer):
                            correct_anti_rep += 1
                # pass



            # print("dataset = ", dataset)
            # print("num_seeds = ", num_seeds)
            # print("all_shots = ", all_shots)
            # print("single_sentence = ", only_single_sentence)
            print("tmp_rank = ", all_rank[count])
            print("ACC_rep = ", correct_rep / samples_num)
            print("Anti ACC_rep = ", correct_anti_rep / samples_num)
        


        

            count += 1
        # file_prefix = 'rep_save/'
        # file_name = file_prefix + base_model.split('/')[-1] + "_"+ str(num_seeds) + '.json'
        # res_name = file_prefix + base_model.split('/')[-1] + "_"+ str(num_seeds) + '.res'

        # json_data = json.dumps(final_res, indent=4)
        # with open(file_name, 'w') as file:
        #     file.write(json_data)
        # with open(res_name, 'w') as file:
        #     file.write(' ')
             
        # exit()



                

                # s = generation_output.sequences[0]
                # s = s.masked_fill(s.eq(-99), 0)
                # exit()
                # output = tokenizer.decode(s)

                # answer = output.split()[-1]
                # answer = output.split(':')[-1].replace("</s>", '').strip()
                # if compression:
                #     print("------------Model Output------------")
                    
                #     print("s = ", s)
                #     print("output = ", output)
                #     print("answer = ", answer)
                #     print("label = ", params['label_dict'][test_labels[i]])
                #     print("label = ", test_labels[i])
                #     if i > 20: 
                #         exit()
                    # print("output = ", output)
                    # print("generation_output = ", generation_output)
                    # exit()
                # exit()

                    
            #         if dataset in ['cb', 'rte', 'dbpedia', 'sst2', 'trec', 'agnews']:
            #             if answer in params['inv_label_dict'].keys() and test_labels[i] == params['inv_label_dict'][answer]:
            #                 correct += 1
            #         else:
            #             if answer.startswith(test_labels[i]) or test_labels[i].startswith(answer):
            #                 correct += 1

        # print final results
        # harder_rank, easier_rank, full_rank

        # print("rank_number = ", len(all_rank))
        # print("all_permutation = ", all_rank)


        # sorted_zipped = sorted(zip(res_list, all_rank), key=lambda x: x[0], reverse=True)

        # new_res_list, new_rank = zip(*sorted_zipped)
        # harder_idx = tuple([each + 1 for each in harder_idx])
        # easier_idx = tuple([each + 1 for each in easier_idx])
        # print("harder_idx = ", harder_idx)
        # print("easier_idx = ", easier_idx)
        # print("harder_rank = ", new_rank.index(harder_idx))
        # print("harder_res = ", new_res_list[new_rank.index(harder_idx)])
        # print("easier_rank = ", new_rank.index(easier_idx))
        # print("easier_res = ", new_res_list[new_rank.index(easier_idx)])

        # print("best_res = ", best_res)
        # print("best_permutation = ", best_rank)
        # print("worst_res = ", new_res_list[-1])
        # print("worst_permutation = ", new_rank[-1])

        # print("average_res = ", sum(new_res_list)/len(new_res_list))
        # print("mid_res = ", new_res_list[len(new_res_list) // 2])



        
            # print("")
            # count -= 1
            # change the order of the training sentences
            # combined = list(zip(train_sentences, train_labels))
            # random.shuffle(combined)
            # train_sentences, train_labels = zip(*combined)










if __name__ == '__main__':
    # Script entry point: declare the command-line options, parse them, and
    # hand every parsed value to main() as a keyword argument.
    parser = argparse.ArgumentParser()

    # Help sentences that are reused verbatim by more than one option.
    # NOTE(review): the same sentence is attached to many unrelated-looking
    # switches (and --lora_weight reuses the --model text); this looks
    # copy-pasted -- confirm the intended wording against how main() uses
    # each option before changing it.
    model_help = 'name of model(s), e.g., GPT2-XL'
    order_help = 'whether to test the sequence order of the model'
    compression_help = 'whether to use the compression generation mode'
    prompt_text_help = 'whether to use the prompt text'

    def add_switch(flag, description):
        """Register an on/off option that is False unless given on the command line."""
        parser.add_argument(
            '--' + flag,
            dest=flag,
            action='store_const',
            const=True,
            default=False,
            help=description,
        )

    # Model / data selection. Registration order is kept as-is because it
    # determines the order shown by --help.
    parser.add_argument('--model', required=True, help=model_help)
    parser.add_argument('--lora_weight', default=None, help=model_help)
    parser.add_argument('--dataset', required=True, help='name of dataset(s), e.g., agnews')
    parser.add_argument('--num_seeds', required=True, type=int, help='num seeds for the training set')

    # First group of on/off switches; all of them carry the same help sentence.
    for flag in (
        'add_final_article',
        'include_answer',
        'include_punctuation',
        'include_template',
        'include_content',
        'include_colon',
        'only_answer',
    ):
        add_switch(flag, order_help)

    # other arguments
    parser.add_argument(
        '--subsample_test_set',
        type=int,
        default=None,
        help='size of test set to use to speed up eval. None means using all test set',
    )

    # Second group of on/off switches, each paired with its own help sentence.
    for flag, description in (
        ('compression', compression_help),
        ('load_in_8bit', compression_help),
        ('load_in_4bit', compression_help),
        ('compression_without_input', compression_help),
        ('use_partial_mask', 'whether to use the partial_mask'),
        ('with_prompt_text', prompt_text_help),
        ('compression_without_prompt_text', prompt_text_help),
        ('with_sequence_order', order_help),
        ('compression_token_initialization', order_help),
    ):
        add_switch(flag, description)

    parser.add_argument('--compression_length', default=None, type=int, help='num of compression_tokens')
    parser.add_argument('--all_shots', required=True, type=int, help='num training examples to use')

    # Namespace -> plain dict so each option name becomes a keyword of main().
    args = vars(parser.parse_args())

    main(**args)