import os
import json
import tqdm
import time
import random
import argparse
from utlis.use_vllm_api import VLLMClient
from utlis.use_XXX_api import call_openai
from utlis.use_XXX_api import XXXClient



# Judge prompt template sent to the scoring model for every video.
# It is filled in with str.format() using two named fields, `model_analysis`
# and `golden_analysis`; the literal braces of the JSON example at the end are
# doubled ({{ }}) so that str.format() emits them as single braces.
# The template asks the judge to reply with a ```json fenced object holding
# "reasoning", "strict_score" (0 or 1) and "loose_score" (integer 0-10).
prompt = \
"""<<Task>>
You are an evaluation expert responsible for scoring metaphor interpretation generated by a VLM (Vision-Language Model). The metaphor interpretation typically explains "which video contents implicitly convey which underlying meanings".

The evaluation involves two scoring criteria with different strictness levels:
1. **Strict Score**: A binary score (0 or 1). Requires the interpretation to be both COMPLETE and ERROR-FREE — must cover all major metaphorical meanings from the golden analysis AND contain no significant errors or contradictions. Only award 1 if both conditions are fully satisfied.
2. **Loose Score**: A graded score (0 to 10). Requires only CORE CORRECTNESS — as long as the main interpretation direction is correct, minor omissions or small errors are acceptable.

<<Model-generated Interpretation>>
{model_analysis}

<<Golden Interpretation>>
{golden_analysis}

<<Scoring Guidelines>>

---

**I. Strict Score (Binary: 0 or 1)**

A strict binary assessment of whether the interpretation meets high standards.

| Score | Criteria |
|-------|----------|
| 0 | The interpretation fails to meet EITHER condition: (1) missing any major metaphorical meaning from the golden analysis, OR (2) containing any significant error or contradiction |
| 1 | The interpretation meets BOTH conditions: (1) covers ALL major metaphorical meanings from the golden analysis, AND (2) contains NO significant errors or contradictions |

**Key Principle**: This is a strict pass/fail evaluation. Any notable omission of major content OR any significant error should result in a score of 0. Only a comprehensive and accurate interpretation earns a score of 1.

---

**II. Loose Score (Graded: 0 to 10)**

A lenient assessment focusing on whether the core metaphorical meaning is captured correctly.

| Score | Criteria |
|-------|----------|
| 0 | Completely misses the core meaning, interpretation direction is fundamentally wrong |
| 1-2 | Barely touches the core meaning, interpretation direction is largely incorrect or confused |
| 3-4 | Shows some understanding of the metaphor but the core meaning is only partially correct or somewhat off-track |
| 5-6 | Captures the general direction of the core meaning, but with noticeable gaps in understanding; interpretation is on the right track but imprecise |
| 7-8 | Correctly captures the core metaphorical meaning, interpretation direction is accurate; minor omissions or small errors do not affect this score |
| 9-10 | Clearly and accurately captures the core meaning with good precision; interpretation demonstrates solid understanding of the main metaphor |

**Key Principle**: Focus on whether the main interpretation direction is correct. Minor omissions, small errors, or incomplete coverage should NOT significantly affect the score as long as the core meaning is captured.

---

<<Scoring Procedure>>

**Step 1: Identify Core vs. Supporting Elements**
- Identify the CORE metaphorical meaning from the golden analysis (the central message/theme)
- Identify SUPPORTING elements (specific visual details, secondary meanings, elaborations)

**Step 2: Evaluate for Strict Score**
- Check: Are ALL major metaphorical meanings covered?
- Check: Are there ANY significant errors or contradictions?
- If BOTH conditions are satisfied → Strict Score = 1
- If EITHER condition fails → Strict Score = 0

**Step 3: Evaluate for Loose Score**
- Focus primarily on core meaning correctness
- Be tolerant of omissions and minor errors
- Assign a graded score (0-10) based on how well the core meaning is captured

---

<<Important Notes>>

1. **Semantic Equivalence**: Focus on semantic essence during evaluation; exact wording match is not required. Content with different expressions but the same meaning should be considered a match.

2. **Score Independence**: The two scores evaluate different aspects. A model might score 0 on Strict (due to one missing element or one error) but still score high on Loose (if core meaning is correct).

3. **Reasonable Extensions**: If the model output contains content not mentioned in the golden analysis but is genuinely reasonable and grounded, do not consider it as an error.

4. **Definition of "Major" vs "Minor"**: 
   - Major elements: Central themes, primary metaphorical mappings, key messages
   - Minor elements: Specific details, secondary interpretations, elaborations
   - For Strict Score: Missing major elements → 0
   - For Loose Score: Missing minor elements → minimal impact

5. **Definition of "Significant Error"**:
   - Significant: Contradicts the golden analysis, misinterprets the core meaning, or introduces clearly wrong information
   - Minor: Slightly imprecise wording, over-elaboration that doesn't contradict the main meaning

---

<<Output Format>>

Please output strictly in the following JSON format, starting with ```json and ending with ```:

```json
{{
  "reasoning": "Your reasoning process, including: 1) Identification of core meaning and major elements from golden analysis; 2) Completeness check for Strict Score; 3) Error check for Strict Score; 4) Core meaning correctness evaluation for Loose Score; 5) Justification for both scores",
  "strict_score": score (0 or 1),
  "loose_score": score (integer from 0-10)
}}
```"""



MAX_TRIES = 5  # judge attempts per video before the video is given up on
SLEEP_SECONDS = 8  # pause before every judge call (including the first one)


def load_jsonl(path):
    """Return the records of a JSON Lines file as a list.

    Args:
        path: Path of the file to read; one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # The context manager closes the handle; blank lines are skipped so a
    # stray trailing newline does not abort loading.
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


def parse_judge_response(response):
    """Extract and validate the scores contained in a raw judge reply.

    The JSON payload is taken from the last ```json fenced section of the
    reply (or from the whole text up to the first ``` when no such fence is
    present).

    Args:
        response: Raw text returned by the judge model.

    Returns:
        A dict with the keys "strict_score" (int, 0 or 1), "loose_score"
        (int, 0..10) and "reasoning" (copied from the reply), in that order.

    Raises:
        ValueError: If the payload is not valid JSON (json.JSONDecodeError is
            a ValueError), is not an object, lacks a required key, or holds a
            score that is not an integer within the range the prompt demands.
    """
    payload = response.split("```json")[-1].split("```")[0]
    parsed = json.loads(payload)

    # Explicit raise instead of `assert`: assertions vanish under `python -O`.
    required_keys = ("reasoning", "strict_score", "loose_score")
    if not isinstance(parsed, dict) or any(key not in parsed for key in required_keys):
        raise ValueError(f"response format error: {parsed}")

    try:
        strict_score = int(parsed["strict_score"])
        loose_score = int(parsed["loose_score"])
    except (TypeError, ValueError) as err:
        raise ValueError(f"response format error: {parsed}") from err

    # Reject values outside the ranges requested by the prompt so that a
    # malformed judgement triggers a retry instead of polluting the results.
    if strict_score not in (0, 1) or not 0 <= loose_score <= 10:
        raise ValueError(f"response score out of range: {parsed}")

    return {
        "strict_score": strict_score,
        "loose_score": loose_score,
        "reasoning": parsed["reasoning"],
    }


def main():
    """Score every prediction of one model with the judge and append results.

    Reads the golden answers and the predictions of `--model_name`, skips the
    video ids already present in the score file (so an interrupted run can be
    resumed) and appends one JSON line per newly scored video.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", type=str, default="")
    args = parser.parse_args()
    print(f"model {args.model_name}")

    video_id_2_golden_answer = {
        data["video_id"]: data["golden_answers"]
        for data in load_jsonl("/XXXX/benchmark/datas.jsonl")
    }
    video_id_2_predict_answer = {
        data["video_id"]: data["analysis_dict"]
        for data in load_jsonl(f"/XXXX/output/{args.model_name}.jsonl")
    }

    save_path = f"/XXXX/score/{args.model_name}.jsonl"
    # Opening in append mode first guarantees the file exists before it is
    # read back; `with` closes it even when the run is interrupted.
    with open(save_path, "a", encoding="utf-8") as fw:
        # A set gives O(1) membership tests in the loop below.
        already_processed_ids = {data["video_id"] for data in load_jsonl(save_path)}
        print(f"already processed {len(already_processed_ids)} / {len(video_id_2_predict_answer)} data")

        client_list = [
            XXXClient(model_name=f"DeepSeek-V3_2_{i}") for i in range(12)
        ]

        for video_id in tqdm.tqdm(list(video_id_2_predict_answer)):
            if video_id in already_processed_ids:
                print(f"\nskip {video_id}")
                continue

            print(f"\nwill process {video_id}")

            # A missing golden answer can never succeed on retry, so skip the
            # video right away instead of sleeping through every attempt.
            if video_id not in video_id_2_golden_answer:
                print(f"error: no golden answer for {video_id}")
                continue

            # The prompt does not change between attempts; build it once.
            input_text = prompt.format(
                model_analysis=video_id_2_predict_answer[video_id],
                golden_analysis=video_id_2_golden_answer[video_id],
            )

            for try_time in range(MAX_TRIES):
                time.sleep(SLEEP_SECONDS)
                print(f"try {try_time} time")

                # Pick one of the judge clients at random for this attempt.
                client = random.choice(client_list)

                try:
                    response, _ = client.call_openai(input_text)
                    score = {"video_id": video_id, **parse_judge_response(response)}

                    fw.write(json.dumps(score, ensure_ascii=False) + "\n")
                    # Flush per record so finished work survives a crash.
                    fw.flush()
                    break
                except Exception as e:
                    # Retry boundary: the client's failure modes are not
                    # visible here, so any error just triggers another attempt.
                    print(f"error: {e}")
                    continue
            else:
                # Every attempt failed; the video stays unscored and will be
                # picked up again by the next run.
                print(f"failed to score {video_id} after {MAX_TRIES} tries")


if __name__ == "__main__":
    main()
