import json 
# Location of the JSONL file to analyse (one JSON object per line).
dataset_path = "synthetic_generate/vllm_generated_1000_samples_temp0.6.jsonl"

# Decode explicitly as UTF-8 so parsing does not depend on the platform's
# default locale encoding, and skip blank lines (e.g. an extra empty line at
# the end of the file), which json.loads would reject.
with open(dataset_path, "r", encoding="utf-8") as f:
    dataset = [json.loads(line) for line in f if line.strip()]

print(f"dataset length = {len(dataset)}")


def length_statistics(dataset):
    """
    Print the average question and solution length over ``dataset``.

    "Length" is the number of pieces obtained by splitting the text on a
    single space character, i.e. an approximate word count: newlines and tabs
    do not separate pieces, and consecutive spaces produce empty pieces that
    are still counted.

    Args:
        dataset: Sized collection of mappings, each providing string values
            under the ``"question"`` and ``"solution"`` keys.

    Raises:
        KeyError: If a sample has no ``"question"`` or ``"solution"`` entry.
    """
    sample_total = len(dataset)
    if sample_total == 0:
        # An average over zero samples is undefined; report that instead of
        # failing with ZeroDivisionError.
        print("dataset is empty; no length statistics to report")
        return

    question_all_count = 0
    solution_all_count = 0
    for sample in dataset:
        question_all_count += len(sample["question"].split(" "))
        solution_all_count += len(sample["solution"].split(" "))

    print(f"Avg question length = {question_all_count / sample_total}")
    print(f"Avg solution length = {solution_all_count / sample_total}")

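# Disabled: uncomment to also report average question/solution lengths
# (each sample must then provide "question" and "solution" fields).
# length_statistics(dataset)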

def synthetic_generate(dataset):
    """
    Print summary statistics for the generated text of each sample.

    Two figures are reported:

    * the average length of ``"generated_text"``, measured as the number of
      pieces obtained by splitting on a single space character;
    * "Right count": how many samples contain the literal marker
      ``**Final Answer**`` followed by a newline and ``\\boxed``. This only
      checks that the output follows the expected format; it does not verify
      that the boxed answer itself is correct.

    Args:
        dataset: Sized collection of mappings, each providing a string value
            under the ``"generated_text"`` key.

    Raises:
        KeyError: If a sample has no ``"generated_text"`` entry.
    """
    sample_total = len(dataset)
    if sample_total == 0:
        # An average over zero samples is undefined; report that instead of
        # failing with ZeroDivisionError.
        print("dataset is empty; no generation statistics to report")
        return

    answer_marker = "**Final Answer**\n\\boxed"
    right_count = 0
    content_all_count = 0
    for sample in dataset:
        content = sample["generated_text"]
        if answer_marker in content:
            right_count += 1
        content_all_count += len(content.split(" "))

    print(f"Avg content length = {content_all_count / sample_total}")
    print(f"Right count = {right_count}")

# Run the generated-text report on the samples loaded into `dataset`.
synthetic_generate(dataset)


