import json
import os
import re
import sys
import time

import numpy as np
import torch
import transformers
from datasets import Dataset
from huggingface_hub.hf_api import HfFolder
from torch.utils.data import DataLoader
from tqdm import tqdm

# Take the Hugging Face access token from the HF_TOKEN environment variable
# rather than a literal in the source: saving a placeholder string would
# overwrite any valid cached token, and a real token must never be committed.
_hf_token = os.environ.get("HF_TOKEN")
if _hf_token:
    HfFolder.save_token(_hf_token)

def llama_pipeline(model_id="meta-llama/Meta-Llama-3-70B-Instruct"):
    """Create a text-generation pipeline configured for batched prompts.

    Args:
        model_id: Model identifier handed to ``transformers.pipeline``.

    Returns:
        The pipeline, loaded in bfloat16 with ``device_map="auto"``, whose
        tokenizer pads on the left using the model's EOS token id.
    """
    pipe = transformers.pipeline(
        "text-generation",
        model=model_id,
        model_kwargs={"torch_dtype": torch.bfloat16},
        device_map="auto",
    )
    eos_token_id = pipe.model.config.eos_token_id
    # Some checkpoints declare several EOS ids as a list; the pad token must
    # be one single id, so fall back to the first entry in that case.
    if isinstance(eos_token_id, (list, tuple)):
        eos_token_id = eos_token_id[0] if eos_token_id else None
    pipe.tokenizer.pad_token_id = eos_token_id
    # Pad on the left so that, in a batch, every prompt ends right where
    # generation starts.
    pipe.tokenizer.padding_side = 'left'
    return pipe

def extract_scores(generated_text):
    """Parse two criterion scores and the final score from model output.

    Searches (case-insensitively) for the first occurrence of text shaped
    like ``Evaluation Criteria: [9, 8], Final Score: 8.5``. The brackets and
    the ``:`` / ``-`` / ``,`` separators are optional, and the two criterion
    scores may be separated by ``,`` or ``/``.

    Args:
        generated_text: Raw text produced by the model.

    Returns:
        ``{'scores': [int, int], 'final_score': float}`` for the first match,
        or ``None`` when the text does not contain the expected form.
    """
    found = re.search(
        r"Evaluation Criteria[:\-]?\s*\[?\s*(\d+)\s*[,/]\s*(\d+)\s*\]?\s*[,\-]?\s*Final Score[:\-]?\s*([0-9]*\.?[0-9]+)",
        generated_text,
        flags=re.IGNORECASE,
    )
    if found is None:
        return None

    first, second, final = found.groups()
    return {'scores': [int(first), int(second)], 'final_score': float(final)}

def extract_hi_scores(generated_text):
    """Parse the single criterion score and the final score from model output.

    Searches (case-insensitively) for the first occurrence of text shaped
    like ``Evaluation Criteria: [9], Final Score: 9``. The brackets and the
    ``:`` / ``-`` / ``,`` separators are optional.

    Args:
        generated_text: Raw text produced by the model.

    Returns:
        ``{'scores': [int], 'final_score': float}`` for the first match, or
        ``None`` when the text does not contain the expected form.
    """
    found = re.search(
        r"Evaluation Criteria[:\-]?\s*\[?\s*(\d+)\s*\]?\s*[,\-]?\s*Final Score[:\-]?\s*([0-9]*\.?[0-9]+)",
        generated_text,
        flags=re.IGNORECASE,
    )
    if found is None:
        return None

    criterion, final = found.groups()
    return {'scores': [int(criterion)], 'final_score': float(final)}

def evaluate_similarity_batch(prompts, pipeline, max_new_tokens=500, batch_size=8,hi_flag=False):
    """Generate an answer for every prompt and parse the scores from it.

    Args:
        prompts: Prompt strings to send to the model.
        pipeline: Text-generation pipeline used for inference.
        max_new_tokens: Generation length limit passed to the pipeline.
        batch_size: Number of prompts per DataLoader batch and per pipeline call.
        hi_flag: When truthy, parse answers with ``extract_hi_scores``;
            otherwise with ``extract_scores``.

    Returns:
        One entry per prompt, in prompt order: the parsed score dict, or
        ``None`` when the answer did not match the expected form (the raw
        answer is printed in that case).
    """
    loader = DataLoader(Dataset.from_dict({'prompt': prompts}), batch_size=batch_size)
    # Stop on either the tokenizer's EOS token or the "<|eot_id|>" token.
    stop_token_ids = [
        pipeline.tokenizer.eos_token_id,
        pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]
    parse = extract_hi_scores if hi_flag else extract_scores

    results = []
    for chunk in tqdm(loader, desc="Processing batches"):
        generations = pipeline(
            chunk['prompt'],
            max_new_tokens=max_new_tokens,
            eos_token_id=stop_token_ids,
            do_sample=True,
            temperature=0.6,
            top_p=0.9,
            return_full_text=False,
            batch_size=batch_size
        )

        for generation in generations:
            # Each prompt yields a list of candidates; only the first is used.
            answer = generation[0]['generated_text']
            parsed = parse(answer)
            if parsed is None:
                print("No matching:")
                print(answer)
            results.append(parsed)

    return results

def process_json_file(json_file, instruction, repeated_num, batch_size=8, verbose_reasoning=False, pipeline=None,hi_flag=False, max_new_tokens=500):
    """Score every prediction/GT pair in a results file and save the outcome.

    The JSON file is read as a mapping of top-level keys to
    ``{video_id: [item, ...]}``, where each item has ``'prediction'`` and
    ``'matched_gts'``. A top-level ``'Con-sim'`` key is skipped. For each
    video, one prompt is built per (prediction, GT) pair and scored by the
    model. The final scores are then written back:

    * each item gets a ``'Con-sim'`` list (``None`` for unparsable answers),
    * a ``{"Con-sim": <video average or None>}`` dict is appended to the
      video's item list,
    * the top-level ``'Con-sim'`` key holds the overall average (or ``None``).

    The updated data is saved to ``json_file + f'_updated_{repeated_num}.json'``.

    Args:
        json_file: Path of the input JSON file.
        instruction: Instruction text placed at the start of every prompt.
        repeated_num: Run index used in the output file name.
        batch_size: Batch size forwarded to ``evaluate_similarity_batch``.
        verbose_reasoning: When truthy, use the step-by-step prompt variant.
        pipeline: Text-generation pipeline; required.
        hi_flag: When truthy, use the single-score prompt and parser.
        max_new_tokens: Generation length limit forwarded to
            ``evaluate_similarity_batch``. It is a parameter because the
            function previously referenced this name without defining it,
            which raised ``NameError`` on the first call.

    Returns:
        The updated data dictionary.

    Raises:
        ValueError: If ``pipeline`` is ``None``.
    """
    start_time = time.time()
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    if pipeline is None:
        raise ValueError("Pipeline is not initialized.")

    total_final_scores = []
    video_final_scores = {}

    for tiou_key in data:
        # The overall average stored by a previous run is not a result set.
        if tiou_key == 'Con-sim':
            continue
        tiou_results = data[tiou_key]
        for video_id in tqdm(tiou_results, desc=f"Processing tIoU {tiou_key}"):
            video_total_score = 0
            video_count = 0
            items = tiou_results[video_id]
            prompts = []
            indices = []
            for idx, item in enumerate(items):
                prediction = item['prediction']
                matched_gts = item['matched_gts']
                for gt in matched_gts:
                    # NOTE(review): the verbose prompt always requests two
                    # scores, even when hi_flag selects the single-score
                    # parser downstream; confirm this combination is intended.
                    if verbose_reasoning:
                        prompt = f"""{instruction}

                                (Q) Then, what is the evaluation result for the given prediction and GT below?

                                Prediction:
                                {prediction}

                                GT:
                                {gt}

                                Your output form is like below:
                                Evaluation Criteria: [score1, score2], Final Score: average of Score1-Score2 .

                                **Important Rule: Don't show your reasoning, just show me the score result.
                                Think step-by-step the result of the evaluation and output form, then answer me.  

                                (A)
                                """
                    else:
                        if hi_flag:
                            prompt = f"""{instruction}

                                    (Q) Then, what is the evaluation result for the given prediction and GT below?

                                    Prediction:
                                    {prediction}

                                    GT:
                                    {gt}

                                    Your output form is like below:
                                    Evaluation Criteria: [score1], Final Score: Score1 .

                                    (A)
                                """
                        else:
                            prompt = f"""{instruction}

                                        (Q) Then, what is the evaluation result for the given prediction and GT below?

                                        Prediction:
                                        {prediction}

                                        GT:
                                        {gt}

                                        Your output form is like below:
                                        Evaluation Criteria: [score1, score2], Final Score: average of Score1-Score2 .

                                        (A)
                                    """
                    prompts.append(prompt)
                    # Remember which item each prompt belongs to so the
                    # scores can be written back after batched inference.
                    indices.append((idx, gt))
            batch_scores = evaluate_similarity_batch(prompts, pipeline, max_new_tokens, batch_size,hi_flag)
            for (item_idx, gt), evaluation in zip(indices, batch_scores):
                final_score = evaluation['final_score'] if evaluation is not None else None
                items[item_idx].setdefault('Con-sim', []).append(final_score)
                # Unparsable answers are recorded as None but excluded from averages.
                if final_score is not None:
                    total_final_scores.append(final_score)
                    video_total_score += final_score
                    video_count += 1
            if video_count > 0:
                video_average = video_total_score / video_count
                video_final_scores[video_id] = video_average
                items.append({"Con-sim": video_average})
            else:
                video_final_scores[video_id] = None
                items.append({"Con-sim": None})

    if total_final_scores:
        overall_average = sum(total_final_scores) / len(total_final_scores)
        data['Con-sim'] = overall_average
    else:
        data['Con-sim'] = None

    with open(json_file+f'_updated_{repeated_num}.json', 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    end_time = time.time()
    total_time = end_time - start_time
    print(f"Total execution time: {total_time:.2f} seconds")

    return data

# Command-line entry point.
# Arguments: <folder_path> <batch_size> <verbose_reasoning 0|1> <repeat_num> <hi_flag 0|1>
# Runs the evaluation repeat_num times, prints every per-item result, and
# stores each run's overall Con-sim plus their mean/std in the summary file.
if __name__ == "__main__":
    instruction = """
                You are an intelligent chatbot designed for evaluating the quality of generative outputs for movie scene descriptions. Your task is to compare the predicted scene descriptions with the correct scene descriptions and determine its level of match, considering mainly the semantic similarity for the scene understanding.

                Here's how you can accomplish the task:

                **1. Semantic Similarity for Scene Understanding**: Evaluate whether the predicted scene description effectively captures the semantic information of the ground truth (GT) without relying on the specific words or expressions of the GT. The evaluation should consider whether the scene can be understood. Therefore, consider synonyms or paraphrases as valid matches.

                **2. Person Matching**: Consider pronouns like 'he' or 'she' as valid matches with character names.

                **3. Evaluation Method**: Evaluate each item with 1-10 points.
                1) "Semantic Similarity": What is the level of semantic similarity between the scene description prediction and the ground truth (GT)?
                2) "Temporal Alignment": Even if the prediction's scene and the GT's scene point to different times, is the description about a scene with the same theme?

                This is an example of evaluation situation.

                Given set:

                "prediction": "[126.568, 341.12], In a tranquil rural setting, a family photo is shown in a house. A man goes through his morning routine and visits the children's room. Noticing the absence of the children, the man, along with another man, runs towards the sound of the children in the cornfield. They find the daughter between the split corn rows. He asks the daughter about the whereabouts of the son and finds him. The family discovers a mysterious circle drawn in the field. Other circles around the family are shown from an aerial perspective."

                "matched_gts": "[180.0, 328.616], Graham, with a toothbrush in his mouth, walks to the children's room and checks inside. Merrill, a younger man, wakes suddenly and listens intently before both men rush outside towards a barn, crossing through a field of maize. They find Graham’s daughter, Bo, standing on a path. Graham questions her, asking about her brother, Morgan. Merrill picks up Bo, and Graham continues, finding Morgan, who explains that barking dogs woke them. Morgan directs his father's gaze to a large circular pattern of flattened maize stalks in the field. They approach, observing multiple crop circles. Graham stands in awe as the family surveys the eerie formations."

                (Q) What is the final evaluation score for the given example set?

                (Reasoning) To answer the question, we need to consider three aspects.

                1) "Semantic Similarity": The prediction's description is semantically similar to the GT, even though the detailed descriptions and expressions are different. (Score: 9)
                2) "Temporal Alignment": The prediction covers a time earlier than the GT, so it describes scenes not expressed in the GT. However, in the overlapping time interval, it describes the same themed scene. (Score: 8)

                (A)
                Considering each evaluation criterion, the result is ...
                Evaluation Criteria: [9,8], Final Score:8.5 .
                """

    hi_instruction = """
                    You are an intelligent chatbot designed for evaluating the quality of generative outputs for movie scene descriptions. Your task is to compare the predicted scene descriptions with the correct scene descriptions and determine its level of match, considering mainly the semantic similarity for the scene understanding.

                    Here's how you can accomplish the task:

                    **1. Semantic Similarity for Scene Understanding**: Evaluate whether the predicted scene description effectively captures the semantic information of the ground truth (GT) without relying on the specific words or expressions of the GT. The evaluation should consider whether the scene can be understood. Therefore, consider synonyms or paraphrases as valid matches.

                    **2. Person Matching**: Consider pronouns like 'he' or 'she' as valid matches with character names.

                    **3. Evaluation Method**: Evaluate each item with 1-10 points.
                    1) "Semantic Similarity": What is the level of semantic similarity between the scene description prediction and the ground truth (GT)?

                    This is an example of evaluation situation.

                    Given set:

                    "prediction": "[126.568, 341.12], In a tranquil rural setting, a family photo is shown in a house. A man goes through his morning routine and visits the children's room. Noticing the absence of the children, the man, along with another man, runs towards the sound of the children in the cornfield. They find the daughter between the split corn rows. He asks the daughter about the whereabouts of the son and finds him. The family discovers a mysterious circle drawn in the field. Other circles around the family are shown from an aerial perspective."

                    "matched_gts": "[180.0, 328.616], Graham, with a toothbrush in his mouth, walks to the children's room and checks inside. Merrill, a younger man, wakes suddenly and listens intently before both men rush outside towards a barn, crossing through a field of maize. They find Graham’s daughter, Bo, standing on a path. Graham questions her, asking about her brother, Morgan. Merrill picks up Bo, and Graham continues, finding Morgan, who explains that barking dogs woke them. Morgan directs his father's gaze to a large circular pattern of flattened maize stalks in the field. They approach, observing multiple crop circles. Graham stands in awe as the family surveys the eerie formations."

                    (Q) What is the final evaluation score for the given example set?

                    (Reasoning) To answer the question, we need to consider three aspects.

                    1) "Semantic Similarity": The prediction's description is semantically similar to the GT, even though the detailed descriptions and expressions are different. (Score: 9)

                    (A)
                    Considering each evaluation criterion, the result is ...
                    Evaluation Criteria: [9], Final Score:9.
                    """

    # Fail with a readable message instead of an IndexError on missing arguments.
    if len(sys.argv) < 6:
        sys.exit(
            f"Usage: {sys.argv[0]} <folder_path> <batch_size> "
            "<verbose_reasoning 0|1> <repeat_num> <hi_flag 0|1>"
        )

    folder_path = sys.argv[1]
    batch_size = int(sys.argv[2])
    verbose_reasoning = int(sys.argv[3]) == 1
    repeat_num = int(sys.argv[4])
    # Any non-zero value enables the single-score mode.
    hi_flag = bool(int(sys.argv[5]))

    if hi_flag:
        json_file = folder_path + '/HourHDVC_eval_hi_preds.json'
        # The single-score prompt and parser need the matching single-criterion
        # instruction; previously the two-criterion one was always sent.
        active_instruction = hi_instruction
    else:
        json_file = folder_path + '/HourHDVC_eval_0.3tiou_preds.json'
        active_instruction = instruction

    summary_file = folder_path + '/HourHDVC_sumsummary.json'

    with open(summary_file, 'r') as f:
        summary_dict = json.load(f)

    llm_sim_values = []
    pipeline = llama_pipeline()

    for i in range(repeat_num):
        updated_data = process_json_file(
            json_file, active_instruction, i + 1, batch_size, verbose_reasoning, pipeline, hi_flag
        )
        print(f"Result of {i + 1}th Con-sim.")

        llm_sim = updated_data['Con-sim']
        llm_sim_values.append(llm_sim)
        summary_dict[f'Con-sim_{i + 1}'] = llm_sim

        for tiou_key in updated_data:
            if tiou_key == 'Con-sim':
                continue
            tiou_results = updated_data[tiou_key]
            for video_id in tiou_results:
                print(f"Video ID: {video_id}")
                for item in tiou_results[video_id]:
                    if 'prediction' in item:
                        print(f"Prediction: {item['prediction']}")
                        print(f"Matched GTs: {item['matched_gts']}")
                        print(f"Con-sim: {item.get('Con-sim')}")
                    else:
                        # The trailing entry of each video holds its average.
                        print(f"Video Con-sim Average: {item['Con-sim']}")
                print("-" * 50)
        print(f"Overall Con-sim Average: {llm_sim}")
        print("=" * 50)

    # A run whose answers were all unparsable reports None; leave those out so
    # the statistics do not raise, and report None when no run produced a value.
    valid_llm_sim_values = [value for value in llm_sim_values if value is not None]
    if valid_llm_sim_values:
        mean_llm_sim = float(np.mean(valid_llm_sim_values))
        std_llm_sim = float(np.std(valid_llm_sim_values))
    else:
        mean_llm_sim = None
        std_llm_sim = None

    print(f"Mean of Con-sim values over {repeat_num} runs: {mean_llm_sim}")
    print(f"Standard deviation of Con-sim values over {repeat_num} runs: {std_llm_sim}")

    summary_dict['Con-sim_mean'] = mean_llm_sim
    summary_dict['Con-sim_std'] = std_llm_sim

    with open(summary_file, 'w') as f:
        json.dump(summary_dict, f)

    print("Updated summary_file with Con-sim values, mean, and std.")
