import json 
import torch 

import sys 
import os 

from tqdm import tqdm 

# Make project-local packages importable: append "<cwd>/src" and the current
# working directory itself to the module search path.
# NOTE(review): presumably needed by the llama_recipes / megatron_lm imports
# below, which would mean the script has to be launched from the repository
# root — confirm.
current_path :str =os .getcwd ()
sys .path .append (f"{current_path }/src")
sys .path .append (current_path )

from llama_recipes .utils .instruction_tuning import InstructDataset 
from transformers import AutoTokenizer 
from megatron_lm .megatron .global_vars import set_global_variables 

from llama_recipes .arguments import parse_args 


def main() -> None:
    """Print token-length statistics for the instruction-tuning dataset.

    Steps:
      1. Parse the command-line arguments and register them through
         ``set_global_variables`` (with ``build_tokenizer=False``).
      2. Load the Hugging Face tokenizer from
         ``args.hf_transformer_model_dir``.
      3. Build an ``InstructDataset`` from
         ``args.instruction_train_data_path``.
      4. Fetch every sample once and record ``len(sample["input_ids"])``.
      5. Print the min / mean / max of those lengths, then call
         ``dataset.print_length_bins()``.

    If the dataset contains no samples, a short notice is printed and the
    function returns without computing statistics.
    """
    args = parse_args()
    set_global_variables(args=args, build_tokenizer=False)

    hf_tokenizer = AutoTokenizer.from_pretrained(
        pretrained_model_name_or_path=args.hf_transformer_model_dir
    )

    dataset = InstructDataset(
        tokenizer=hf_tokenizer,
        data_path=args.instruction_train_data_path,
    )

    # Index explicitly instead of iterating the dataset object: only
    # __len__ and __getitem__ are relied upon here.
    lengths = [
        len(dataset[index]["input_ids"])
        for index in tqdm(range(len(dataset)))
    ]

    print("\n--- Token length stats ---")
    if not lengths:
        # min()/max() on an empty tensor raise a RuntimeError, so report the
        # situation explicitly instead of crashing with an opaque message.
        print("dataset is empty; no statistics to report\n")
        return

    length_tensor = torch.tensor(lengths)
    print(f"min:   {int(length_tensor.min())}")
    # Integer tensors do not support mean(); convert to float first.
    print(f"mean:  {float(length_tensor.float().mean()):.2f}")
    print(f"max:   {int(length_tensor.max())}\n")

    # NOTE(review): presumably reports bins that the dataset filled in while
    # the samples were fetched above — confirm against InstructDataset.
    dataset.print_length_bins()


# Run the report only when this file is executed as a script, not on import.
if __name__ =="__main__":
    main ()
