import json
import requests
import os
import re
import random
import time
from collections import defaultdict

from requests import models

# --- CONFIGURATION ---
# Fill in the values below before running the script.

# 1. Credentials and endpoint of the Gemini (proxy) service
API_KEY = "XX"  # <-- IMPORTANT: Replace with your actual API key
BASE_URL = "XXX"  # Base URL for the proxy service

# 2. Gemini model that performs the evaluation
EVALUATION_MODEL = "gemini-2.5-pro-thinking"

# Pool of three-letter aliases ("aaa" ... "zzz"), shuffled once at import time.
# parse_text_files() hands out one alias per input file, so the prompt shows
# the alias instead of the name derived from the file.
chars = [chr(code) for code in range(ord('a'), ord('z') + 1)]
list_ = [
    first + second + third
    for first in chars
    for second in chars
    for third in chars
]
random.shuffle(list_)

# Alias -> name derived from the file name; filled in by parse_text_files().
models_name_all = dict()

# 3. List of file paths for the models you want to evaluate.
#    The script will use the filename (e.g., "SDE_Pscore_10_1e3_120") as the model's name.
FILE_PATHS = [
    "XXXXX.txt",
    "XXXXX.txt",
]

# --- GEMINI API PROMPT TEMPLATE for RANKING ---
# This prompt instructs the Gemini model to rank multiple segments.
# It is rendered with str.format(): `{all_segments}` is the only placeholder,
# and the doubled braces `{{` / `}}` become literal `{` / `}` in the final prompt.
# NOTE(review): the example reply opens a ```json fence that is never closed and
# shows unquoted keys — confirm this is intentional before relying on strict JSON.
PROMPT_TEMPLATE = """
You are an impartial and expert evaluator of AI-generated text. Your task is to analyze the following text segments, each generated by a different AI model. Please rank them from best to worst based on an overall assessment of their quality.

Consider the following criteria in your judgment:
1.  **Coherence:** Logical flow and consistency.
2.  **Fluency:** Naturalness and grammatical correctness.
3.  **Informativeness:** The depth and value of the content.

Please provide your response strictly in the JSON format specified below. The "ranking" should be a list of the model identifiers, ordered from best to worst.

**Text Segments to Evaluate:**
---
{all_segments}
---

**Your Evaluation (JSON format only):**
```json
{{
    justification: <A detailed, overall justification for your ranking, explaining why the top model was better and the bottom model was worse.>,
    ranking: [<best_model_name>, <second_best_model_name>, ...]
}}
"""

def parse_text_files(file_paths):
    """
    Parses multiple text files to extract segments from different models.

    Each file is given an anonymous alias taken from the module-level ``list_``
    pool (by its position in ``file_paths``). The alias -> file-stem mapping is
    recorded in the module-level ``models_name_all`` dict as a side effect.

    Args:
        file_paths (list): A list of paths to the text files.

    Returns:
        dict: A dictionary mapping each model's alias to a list of its text
            segments. Returns None if file validation fails.
    """
    print("Parsing text files...")
    model_segments = {}
    for i, path in enumerate(file_paths):
        # isfile() also rejects directories, which exists() would let through
        # only for open() to fail on them.
        if not os.path.isfile(path):
            print(f"Error: File not found at '{path}'. Please check the path.")
            return None

        # splitext() removes only the final extension, so dotted names such as
        # "run_lr0.01.txt" keep their full stem ("run_lr0.01").
        model_name_ = os.path.splitext(os.path.basename(path))[0]
        model_name = list_[i]
        models_name_all[model_name] = model_name_
        with open(path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Segments are separated by a run of 60 or more '=' characters;
        # pieces that are empty after stripping are dropped.
        stripped_segments = (seg.strip() for seg in re.split(r'={60,}', content))
        cleaned_segments = [seg for seg in stripped_segments if seg]
        model_segments[model_name] = cleaned_segments
        print(f"  - Found {len(cleaned_segments)} segments for model: {model_name}")

    return model_segments
def evaluate_and_rank_segments(segments_dict, api_key, base_url):
    """
    Sends a dictionary of segments (model_name: segment) to the Gemini API for ranking.

    The segments are placed into the prompt in a random order. The reply is
    expected to contain a JSON object, either inside a fenced code block or
    as plain text.

    Args:
        segments_dict (dict): The dictionary of segments to evaluate and rank.
        api_key (str): The API key.
        base_url (str): The base URL for the API service.

    Returns:
        dict: The JSON object decoded from the model's reply, or None if the
            request fails or no valid JSON object can be extracted.
    """
    # Shuffle the presentation order of the segments before building the prompt.
    dic_keys = list(segments_dict.keys())
    random.shuffle(dic_keys)
    print("---Shuffled results:{}".format(dic_keys))
    formatted_segments = "".join(
        f"[Model: {name}]\n{segments_dict[name]}\n---\n" for name in dic_keys
    )

    final_prompt = PROMPT_TEMPLATE.format(all_segments=formatted_segments)

    # Tolerate a trailing slash in the configured base URL.
    url = f"{base_url.rstrip('/')}/v1/chat/completions"
    headers = {
        'Accept': 'application/json',
        'Authorization': f'Bearer {api_key}',
        'User-Agent': 'Python/Gemini-Ranking-Script',
        'Content-Type': 'application/json'
    }
    payload = json.dumps({
        "model": EVALUATION_MODEL,
        "messages": [
            {"role": "system", "content": "You are a helpful assistant designed to output JSON."},
            {"role": "user", "content": final_prompt}
        ]
    })

    try:
        response = requests.post(url, headers=headers, data=payload, timeout=90)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"  - Error making API request: {e}")
        return None

    # 1. Parse the entire response body as JSON. Decode errors are ValueError
    #    subclasses, so this message is only printed for the outer body.
    try:
        api_response_data = response.json()
    except ValueError:
        print("  - Error: Failed to decode the main JSON from the API response body.")
        return None

    # 2. Extract the content string from the nested structure. A missing key,
    #    an empty "choices" list or an unexpected type all count as "not found".
    try:
        content_string = api_response_data["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError):
        content_string = None

    if not content_string or not isinstance(content_string, str):
        print("  - Error: 'content' field not found in the API response structure.")
        print("  - Raw Response:", json.dumps(api_response_data))
        return None

    # 3. Decode the JSON object embedded in the content string.
    parsed = _extract_json_object(content_string)
    if parsed is None:
        print("  - Error: Could not find a valid JSON object inside the 'content' field.")
        print("  - Content received:", content_string)
    return parsed


def _extract_json_object(text):
    """Return the first JSON object that can be decoded from *text*, or None."""
    candidates = []

    # Preferred form: a fenced block (the "json" tag and surrounding whitespace are optional).
    fenced = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
    if fenced:
        candidates.append(fenced.group(1))

    # Fallback: everything between the first '{' and the last '}' (unfenced reply).
    start = text.find('{')
    end = text.rfind('}')
    if 0 <= start < end:
        candidates.append(text[start:end + 1])

    for candidate in candidates:
        try:
            decoded = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(decoded, dict):
            return decoded
    return None
    
def main():
    """
    Main function to run the evaluation protocol.

    Steps:
        1. Parse every file in FILE_PATHS into per-model segment lists.
        2. For 6 * (number of models)^2 rounds, take two models, draw one
           random segment from each and ask the evaluator to rank the pair.
           Models are consumed two at a time from a shuffled list that is
           reshuffled whenever it is exhausted; with an odd number of models
           one of them sits out each pass.
        3. Print the average rank of every model (lower is better).
    """
    print("--- Starting Text Generation Quality Evaluation (Ranking Method) ---")

    # "XX" is the placeholder shipped in the configuration section; the older
    # "YOUR_GEMINI_API_KEY" placeholder is still recognised.
    if not API_KEY or API_KEY in ("XX", "YOUR_GEMINI_API_KEY"):
        print("\nWARNING: Please replace the placeholder API_KEY with your actual key in the script.\n")
        return

    # 1. Parse all model files
    model_segments = parse_text_files(FILE_PATHS)
    if not model_segments:
        return

    model_names = list(model_segments.keys())
    if len(model_names) < 2:
        print("\nError: You need at least two files/models to perform a ranking evaluation.")
        return

    # Sample only from the first 30 segments of each model, but never index
    # past the end of the shortest segment list.
    max_segments = 30
    shortest = min(len(segments) for segments in model_segments.values())
    num_segments = min(max_segments, shortest)
    if num_segments < 1:
        print("\nError: Every model file must contain at least one segment.")
        return

    repeat_num = 6 * len(model_names) ** 2
    # Largest even number of models, so the shuffled list splits into whole pairs.
    len_clip = (len(model_names) // 2) * 2
    # Ranks collected per model name, one entry per successful round.
    results = defaultdict(list)
    temp_names = []

    # 2. Start the evaluation loops
    for i in range(repeat_num):
        print(f"\n--- Starting Evaluation Round {i + 1}/{repeat_num} ---")

        # a. Pick the two models for this round and one random segment for each
        idx = [random.randint(0, num_segments - 1) for _ in range(2)]
        offset = (2 * i) % len_clip
        if offset == 0:
            # Every pair of the previous pass has been used: start a new pass.
            random.shuffle(model_names)
            temp_names = model_names[:len_clip]
        collec_names = temp_names[offset:offset + 2]
        round_segments = {
            name: model_segments[name][position]
            for name, position in zip(collec_names, idx)
        }

        # b. Send for evaluation and ranking
        print("  - Sending segments to Gemini for ranking...")
        evaluation = evaluate_and_rank_segments(round_segments, API_KEY, BASE_URL)

        # c. Process the ranking result
        ranking = evaluation.get("ranking") if isinstance(evaluation, dict) else None
        if isinstance(ranking, list):
            print(f"  - Received ranking: {ranking}")
            print(f"  - Justification: {evaluation.get('justification', 'N/A')}")
            normalized = [entry.strip() for entry in ranking if isinstance(entry, str)]
            # Only accept a ranking that lists exactly this round's two aliases;
            # anything else would raise KeyError below or bias the averages.
            if len(normalized) != len(ranking) or sorted(normalized) != sorted(collec_names):
                print(f"  Current ranking: {ranking} does not match the models of this round "
                      f"{collec_names}, thus jump to next iter")
            else:
                # Store the rank (position in the list, 1-based) for each model
                for rank, model_name in enumerate(normalized, 1):
                    results[models_name_all[model_name]].append(rank)
                print("Current results:{}".format(results))
        else:
            print("  - Failed: Could not get a valid ranking for this round.")

        time.sleep(1)  # Small delay to avoid rate limiting

    # 3. Calculate and display the average ranks
    print("\n\n--- Final Evaluation Results (Lower Average Rank is Better) ---")

    # Calculate average ranks; models without a successful round get infinity
    # so they still appear (last) in the table.
    final_scores = {}
    for alias in model_segments:
        display_name = models_name_all[alias]
        ranks = results.get(display_name)
        final_scores[display_name] = sum(ranks) / len(ranks) if ranks else float('inf')

    # Sort models by their average rank (best to worst)
    sorted_models = sorted(final_scores.items(), key=lambda item: item[1])

    header = f"{'Final Rank':<12} | {'Model':<30} | {'Average Rank':<15}"
    print(header)
    print("-" * len(header))

    for final_rank, (model_name, avg_rank) in enumerate(sorted_models, 1):
        print(f"{final_rank:<12} | {model_name:<30} | {avg_rank:<15.2f}")

    print("\n--- Evaluation Complete ---")
# Run the evaluation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()