# import jsonlines
from transformers import AutoTokenizer
from utils.utils import load_jsonl, CodexTokenizer
import copy

# Configuration.
# Name of the model whose tokenizer and generation results are used; it is
# interpolated into both the tokenizer directory and the results path below.
model_name = "codegen2-7b"
# Whether tokenizer-specific special tokens (e.g. [CLS], [SEP], BOS/EOS)
# should be included in the count. Forwarded to `tokenizer.encode` below.
add_special_tokens = False

# Load the tokenizer from the local cache directory.
tokenizer = AutoTokenizer.from_pretrained(f"./models_cache/{model_name}-tokenizer/", trust_remote_code=True)
# Alternative tokenizer (disabled).
# NOTE(review): if this is re-enabled, confirm that CodexTokenizer.encode
# accepts the `add_special_tokens` keyword used in the loop below.
#tokenizer = CodexTokenizer()

# Running sum of token counts over every generated response.
total_tokens = 0

# One record per generation case; each record must carry 'generate_response'.
input_cases = load_jsonl(f"./generation_results/{model_name}/line_level.python.coarse2fine.10.0.retrieval.{model_name}.gen_res.jsonl")


for case in input_cases:
    # The record is only read, never mutated, so no defensive copy is needed.
    text = case['generate_response']

    # Honour the `add_special_tokens` setting; without it `encode` falls back
    # to its own default and the configured value would be silently ignored.
    tokens = tokenizer.encode(text, add_special_tokens=add_special_tokens)
    total_tokens += len(tokens)

print(f"Total tokens using '{model_name}' tokenizer: {total_tokens}")