from transformers import AutoTokenizer 
import os 
import json 
import string 















def _build_record(token):
    """Return one chat-style example asking for the character length of *token*."""
    return {
        "input": [
            {
                "role": "user",
                # repr() produces a valid Python string literal even when the
                # token contains quotes or backslashes, so the snippet shown to
                # the model really evaluates to len(token). For plain tokens
                # the text is identical to wrapping the token in single quotes.
                "content": f"<code>\nprint(len({token!r}))\n</code>",
            }
        ],
        "output": {
            "role": "assistant",
            "content": f"<answer>\n{len(token)}\n</answer>",
        },
    }


def _write_batch(records, out_dir, file_index):
    """Write *records* as JSON Lines to the numbered output file in *out_dir*."""
    file_path = os.path.join(
        out_dir, f"LLTM-llama-token-len-{file_index}-numeric-depth-val.jsonl"
    )
    with open(file_path, "w", encoding="utf-8") as f:
        for item in records:
            f.write(json.dumps(item, ensure_ascii=False))
            f.write("\n")
    print(f"Wrote file: {file_path} ({len(records)} tokens)")


def main(
    *,
    vocab=None,
    out_dir="/path/to/home/llm-recipes/scripts/instruction/convert_datasets",
    batch_size=12280,
    model_name="meta-llama/Meta-Llama-3-8B-Instruct",
):
    """Generate "length of token" instruction examples from a tokenizer vocabulary.

    Vocabulary entries are visited in ascending token-id order. A token is kept
    only if every character is an ASCII letter, ASCII punctuation or ASCII
    whitespace, and it does not contain the substring "<|". Each kept token
    becomes one JSON record; records are written to numbered JSONL files of at
    most ``batch_size`` records each. Totals of kept and skipped tokens are
    printed at the end.

    Args:
        vocab: Optional mapping of token string to token id. When ``None``
            (the default) the vocabulary is loaded from ``model_name`` via
            ``AutoTokenizer``.
        out_dir: Directory that receives the JSONL files; created if missing.
        batch_size: Maximum number of records per output file.
        model_name: Tokenizer identifier used only when ``vocab`` is ``None``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    os.makedirs(out_dir, exist_ok=True)

    if vocab is None:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        vocab = tokenizer.get_vocab()

    # Built once so the per-character membership test in the loop is O(1).
    allowed_chars = set(string.ascii_letters + string.punctuation + string.whitespace)

    skip_cnt = 0
    data_cnt = 0
    file_count = 1
    tokens_data = []

    # Sort by token id so the output order is deterministic.
    for token, _token_id in sorted(vocab.items(), key=lambda x: x[1]):
        # "<|" tokens pass the character filter (all punctuation/letters) but
        # are excluded explicitly.
        if "<|" in token or not all(ch in allowed_chars for ch in token):
            skip_cnt += 1
            continue

        data_cnt += 1
        tokens_data.append(_build_record(token))

        # Flush a full batch to its own file and start the next one.
        if len(tokens_data) == batch_size:
            _write_batch(tokens_data, out_dir, file_count)
            tokens_data = []
            file_count += 1

    # Write whatever is left over as a final, shorter file.
    if tokens_data:
        _write_batch(tokens_data, out_dir, file_count)

    print("Total data tokens:", data_cnt)
    print("Total skipped tokens:", skip_cnt)

# Script entry point: run main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ =="__main__":
    main ()
