
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import json
import re
import pdb
# from bitsandbytes import BitsAndBytesConfig
from transformers import BitsAndBytesConfig
import time
import datetime
import argparse
from tqdm import tqdm

# Command-line options for one evaluation run.
parser = argparse.ArgumentParser()
# --digits is required: without it args.digits would be None and the script
# would only fail at the per-digit loop, after the model has been loaded.
parser.add_argument('--digits', type=int, nargs='+', required=True, help='value of digits')
parser.add_argument("--sample_size", help="sample_size", type=int, default=100)
parser.add_argument("--model", help="model size", type=str, default="4B")

# Example invocation:
# for M in 0.5B 1.8B; do python3 QwenTryAgainSept12.py --sample_size 1000 --digits 4 6 8 10 12 14 16 18 20 --model ${M}; done;

args = parser.parse_args()
# Echo the effective settings so they appear at the top of the run log.
print(args.sample_size)
print(args.model)
print(args.digits)

device = "cuda"  # device the tokenized prompt is moved to before generation

# bitsandbytes 4-bit quantization settings applied when the model is loaded.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Enable 4-bit quantization
    bnb_4bit_compute_dtype=torch.float16,  # Set computation dtype (e.g., float16)
    bnb_4bit_use_double_quant=True,  # Optionally enable double quantization
    bnb_4bit_quant_type="nf4",  # Quantization type (e.g., 'nf4')
)

model_name = f"Qwen/Qwen1.5-{args.model}-Chat"  # 14B y, 7B y, 4B y, 1.8B y, 0.5B y
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="cuda:0",
    # Pass the config object rather than the bare `load_in_4bit=True` kwarg:
    # the kwarg is deprecated and left `quant_config` unused, so its nf4 /
    # double-quant / fp16-compute settings never took effect.
    # NOTE(review): this changes the quantization actually applied, so outputs
    # may differ from runs made with the bare kwarg.
    quantization_config=quant_config,
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
def _format_chat_template(prompt, few_shots=None):
    """Render *prompt* as chat-formatted text for the loaded tokenizer.

    The conversation consists of a fixed system message, one user/assistant
    exchange per few-shot example, and *prompt* as the final user turn. It is
    rendered with the tokenizer's chat template, with the generation prompt
    appended.

    Args:
        prompt: Text of the final user message.
        few_shots: Mapping from example question to example answer. When
            ``None``, two built-in addition examples are used; pass an empty
            dict to send no examples.

    Returns:
        The rendered chat text as a string (not tokenized).
    """
    if few_shots is None:
        # Built per call so the default is never a shared mutable object.
        few_shots = {'3+4+5+6': '18', '1+2+4+6': '13'}

    messages = [
        {"role": "system", "content": "You are a helpful math AI that is good at summation."},
    ]
    for question, answer in few_shots.items():
        messages.append({"role": "user", "content": question})
        # Example answers must use the "assistant" role: the script loads Qwen
        # chat models, and "model" is not a role in their chat format.
        messages.append({"role": "assistant", "content": answer})
    messages.append({"role": "user", "content": prompt})

    return tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

def _extract_last_number(text):
    """Return the last run of digits in *text* as a string, or None if there is none.

    Commas are removed first so that a comma-grouped number such as
    ``1,234`` is read as ``1234``.
    """
    matches = re.findall(r'\d+', text.replace(',', ''))
    return matches[-1] if matches else None


def response(prompt):
    """Ask the model *prompt* and return the last number in its reply.

    The prompt is wrapped with ``_format_chat_template``, tokenized, and sent
    to ``model.generate``; only the newly generated tokens are decoded.

    Args:
        prompt: The user question to send to the model.

    Returns:
        The last run of digits in the decoded reply as a string, or ``None``
        when the reply contains no digits (the raw reply is printed then).
    """
    model_inputs = tokenizer([_format_chat_template(prompt)], return_tensors="pt").to(device)
    generated_ids = model.generate(
        model_inputs.input_ids,
        # Pass the tokenizer's attention mask explicitly instead of leaving
        # generate() to infer one.
        attention_mask=model_inputs.attention_mask,
        max_new_tokens=256,  # assumed to be enough for a summation answer
    )

    # generate() returns prompt + completion; strip the prompt tokens.
    completion_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    raw_text = tokenizer.batch_decode(completion_ids, skip_special_tokens=True)[0]

    last_number = _extract_last_number(raw_text)
    if last_number is None:
        # Show the unparsed reply so failures can be inspected in the log.
        print('raw', raw_text)
    return last_number


# For each requested digit count: load the problems, query the model on the
# first k of them, and save question / reference answer / model answer.
for digit in args.digits:
    print(f"digit={digit}")
    filename = f"{digit}_Digits_plus.json"
    with open(filename, 'r', encoding="utf-8") as json_file:
        data = json.load(json_file)  # indexed below as data[i]["x"] / data[i]["y"]

    k = args.sample_size  # number of problems to test
    if k > len(data):
        # Clamp instead of hitting an IndexError mid-run, which would discard
        # every answer generated so far (results are only written at the end).
        print(f"sample_size {k} exceeds the {len(data)} problems in {filename}; using {len(data)}")
        k = len(data)

    predictions = []
    question = []
    correct_answers = []
    count_None = 0
    for i in tqdm(range(k)):
        item = data[i]
        resp = response(f'What is the final answer of {item["x"]}?')
        if resp is None:
            count_None += 1
        question.append(item["x"])
        correct_answers.append(item["y"])
        # Stored as a string; an unparsed reply is recorded as "None".
        predictions.append(str(resp))

    print(question[:5])
    print(correct_answers[:5])
    print(predictions[:5])
    print(f"replies without a parsable number: {count_None}/{k}")

    # x = question, y = reference answer, z = model answer.
    dataset = [{"x": xi, "y": yi, "z": zi} for xi, yi, zi in zip(question, correct_answers, predictions)]
    filename_prefix = f"{digit}_Digits_plus_withAnswer"
    filename = f"{filename_prefix}_{args.model}-Chat_k={k}.json"
    print(filename)
    with open(filename, 'w', encoding="utf-8") as json_file:
        json.dump(dataset, json_file, indent=4)

    print(f"Dataset saved to {filename} for value k={k}, model={model_name}")