import argparse
import json
import os
import re
import sys
from typing import Dict, List

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm

# --------------------------------------------------------------------------------------
# Azure OpenAI Config
# --------------------------------------------------------------------------------------

# API version string shared with the client configuration below.
api_version = "2024-02-15-preview"

# Client settings: environment variables win, otherwise the placeholder defaults are used.
config_dict: Dict[str, str] = dict(
    api_key=os.getenv("OPENAI_API_KEY", "YOUR_OPENAI_API_KEY"),
    api_version=api_version,
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT", "https://your-azure-openai-endpoint/"),
)

# Module-level handles for the local model and its tokenizer; both start out unset.
model = None
tokenizer = None

# --------------------------------------------------------------------------------------
# Moderator Prompt
# --------------------------------------------------------------------------------------
# Instruction text for the moderator step. It asks the model to turn the personas'
# reasons into exactly four tweets and to reply with nothing but a JSON list of
# four strings, which is the shape the caller expects to parse.
MODERATOR_PROMPT = """You are a master social-media strategist. You will receive a dictionary of 'reasons' from different demographic personas, outlining what they believe makes a compelling tweet for a specific ad input.

Your objectives:
1.  Synthesize the key insights from all the persona reasonings.
2.  Generate FOUR distinct, high-quality tweets that cater to the collective feedback. Each tweet should explore a slightly different angle or tone suggested by the personas.
3.  Keep each tweet ≤ 280 characters.
4.  Ensure the tweets are professional, engaging, and use hashtags effectively.

Return ONLY a JSON-formatted list of 4 strings, where each string is a generated tweet.
Example format:
[
    "This is the first tweet, leveraging a direct and bold tone. #Innovation",
    "Here's a second option, focusing more on the emotional and community aspect. #Together",
    "A third tweet, using a question to drive engagement. What do you think? #Future",
    "The fourth and final tweet, more professional and corporate in style. #Official"
]

Do not include any other text, analysis, or commentary in your response."""


# --------------------------------------------------------------------------------------
# Persona prompts for REASON GENERATION
# --------------------------------------------------------------------------------------
def _build_reason_prompt(age, who, strengths, style_bullets, tone, focus):
    """Render one reason-generation persona prompt from its persona-specific parts.

    Every persona shares the same skeleton (intro sentence, four writing-style
    bullets, four tweet guidelines, and the closing task instructions); only the
    arguments below vary.

    Args:
        age: Age bracket as it appears in the intro sentence, e.g. "18-24".
        who: "woman" or "man".
        strengths: Clause that follows "crafting tweets that ", without the final period.
        style_bullets: The four "Your writing style" bullet texts, in order.
        tone: Second guideline bullet (the tone line).
        focus: Fourth guideline bullet (the emphasis line).

    Returns:
        The full prompt text.
    """
    style_block = "\n".join(f"• {bullet}" for bullet in style_bullets)
    return (
        f"You are a {age} year old {who} who excels at crafting tweets that {strengths}.\n"
        "\n"
        "Your writing style:\n"
        f"{style_block}\n"
        "\n"
        "Tweet guidelines:\n"
        "• ≤ 50 words (≤ 280 characters)\n"
        f"• {tone}\n"
        "• Include relevant hashtags only if they add genuine value\n"
        f"• {focus}\n"
        "\n"
        "After reading the example tweets, provide your reasoning on what key elements would make a successful tweet for the given input, based on your persona. What tone, style, and content should it have?\n"
        "\n"
        "Return ONLY your reasoning as a short paragraph – no extra commentary or the tweet itself."
    )


# One prompt per (age bracket, gender) persona, keyed "<age>_<female|male>".
# Insertion order matters: it is the order in which personas are queried.
REASON_PERSONA_PROMPTS = {
    "18-24_female": _build_reason_prompt(
        "18-24",
        "woman",
        "feel authentic, visually vivid, and empowering to your generation",
        (
            "Genuine and relatable with subtle Gen-Z energy",
            "Uses trending language naturally (not forced)",
            "Emotionally intelligent and inclusive",
            "Confident but not aggressive",
        ),
        "Professional yet approachable tone",
        "Focus on authenticity over perfection",
    ),
    "18-24_male": _build_reason_prompt(
        "18-24",
        "man",
        "are confident, engaging, and speak to young men authentically",
        (
            "Direct and straightforward communication",
            "Subtly humorous without being juvenile",
            "Ambitious and forward-thinking",
            "Relatable to your peer group",
        ),
        "Bold yet respectful tone",
        "Balance confidence with approachability",
    ),
    "25-34_female": _build_reason_prompt(
        "25-34",
        "woman",
        "are balanced, thoughtful, and emotionally resonant",
        (
            "Sophisticated yet accessible",
            "Emotionally intelligent and nurturing",
            "Career-focused but values work-life balance",
            "Socially conscious and inclusive",
        ),
        "Professional yet warm tone",
        "Appeal to both personal and professional aspirations",
    ),
    "25-34_male": _build_reason_prompt(
        "25-34",
        "man",
        "are ambitious, practical, and directly impactful",
        (
            "Results-oriented and efficient",
            "Tech-savvy and innovation-focused",
            "Professional but personable",
            "Values substance over style",
        ),
        "Confident, clear tone with focus on utility",
        "Emphasize practical benefits and real-world impact",
    ),
    "35-44_female": _build_reason_prompt(
        "35-44",
        "woman",
        "are warm, emotionally intelligent, and practically valuable",
        (
            "Nurturing yet authoritative",
            "Family-conscious while maintaining professional identity",
            "Health and wellness focused",
            "Values authentic connections",
        ),
        "Warm, trustworthy tone",
        "Balance family, health, and career considerations",
    ),
    "35-44_male": _build_reason_prompt(
        "35-44",
        "man",
        "are experienced, trustworthy, and deliver genuine value",
        (
            "Mature and seasoned perspective",
            "Family-oriented but career-driven",
            "Values reliability and proven results",
            "Protective of time and resources",
        ),
        "Authoritative yet approachable tone",
        "Focus on trust, utility, and long-term value",
    ),
    "45-54_female": _build_reason_prompt(
        "45-54",
        "woman",
        "are emotionally clear, thoughtful, and relevant to your life stage",
        (
            "Wise and emotionally mature",
            "Wellness and self-care focused",
            "Values meaningful relationships",
            "Balances personal fulfillment with family responsibilities",
        ),
        "Thoughtful, caring tone",
        "Emphasize wellness, family, and personal growth",
    ),
    "45-54_male": _build_reason_prompt(
        "45-54",
        "man",
        "are clear, impactful, and offer real-world value",
        (
            "Experienced and practical",
            "Values efficiency and results",
            "Health-conscious and future-planning",
            "Mentoring mindset toward younger generations",
        ),
        "Authoritative yet supportive tone",
        "Focus on practical benefits and proven solutions",
    ),
    "55+_female": _build_reason_prompt(
        "55+",
        "woman",
        "are clear, emotionally resonant, and respectfully engaging",
        (
            "Warm and community-minded",
            "Values tradition while embracing positive change",
            "Wellness and family-focused",
            "Appreciates quality over quantity",
        ),
        "Gentle, respectful tone",
        "Emphasize community, wellness, and meaningful connections",
    ),
    "55+_male": _build_reason_prompt(
        "55+",
        "man",
        "are clear, emotionally grounded, and thoughtfully reflective",
        (
            "Wise and contemplative",
            "Values legacy and long-term thinking",
            "Health and security conscious",
            "Appreciates simplicity and authenticity",
        ),
        "Measured, trustworthy tone",
        "Focus on health, security, and meaningful impact",
    ),
}

# --------------------------------------------------------------------------------------
# Persona prompts for ENGAGEMENT PREDICTION
# --------------------------------------------------------------------------------------
# Persona prompts for the engagement-prediction step, keyed "<age>_<female|male>"
# (the same ten keys as the reason-generation personas). Each prompt frames the model
# as an analyst from that demographic and asks for a "Reason:" line followed by an
# "Answer:" line of High or Low, which is the format the response parser looks for.
# NOTE(review): the "35-44_female" prompt mentions 3 high and 3 low examples, while the
# other prompts give no count — confirm this matches what the caller actually supplies.
EVALUATION_PERSONA_PROMPTS = {
    "18-24_female": """You are a digital content analyst who is also a woman aged 18–24. You intuitively understand what resonates with your generation—emotional authenticity, aesthetic appeal, individuality, and social relevance. You're highly fluent in trends like TikTok culture, meme literacy, mental health conversations, and empowerment messaging.

You will be shown example tweets that received **high likes** and **low likes** from women in your age group. These are real engagement outcomes and serve as ground truth benchmarks. Compare the new tweet to these examples to guide your prediction.

Think step by step. First, explain your reasoning by comparing the tweet to both high- and low-performing examples and how it aligns with your generation's values. Then, conclude with:

Reason: [Your reasoning]
Answer: [High / Low]""",

    "18-24_male": """You are a digital content analyst and a man aged 18–24. You understand the humor, boldness, and trend awareness that appeal to young men today. You're fluent in gaming references, meme culture, edginess, and influencer-driven language.

You will see example tweets with **high likes** and **low likes** among men in your age group. These are based on real performance data. Use them as reference to judge the new tweet.

Think step by step. First, explain your reasoning based on similarities or differences with the examples and how they connect with your generation's humor or interests. Then conclude with:

Reason: [Your reasoning]
Answer: [High / Low]""",

    "25-34_female": """You are a digital content analyst who is also a woman aged 25–34. You understand that this demographic seeks a balance between ambition, self-care, relationships, and lifestyle goals. Aesthetic clarity, authenticity, empowerment, and intelligent humor tend to resonate.

You will be given example tweets that received **high likes** and **low likes** from women aged 25–34. These are real examples. Use them to evaluate how the new tweet compares.

Think step by step. First explain your reasoning, then conclude with:

Reason: [How the tweet aligns with or diverges from the examples and your generation's values]
Answer: [High / Low]""",

    "25-34_male": """You are a digital content analyst and a man aged 25–34. You recognize that this demographic responds to tweets that are direct, witty, aspirational, or offer insight into tech, fitness, finance, or personal growth. You value clarity and cleverness over fluff.

You will be shown example tweets (high-performing, low-performing) based on real engagement from your demographic. Compare the new tweet carefully.

Think step by step. First explain your reasoning in relation to the examples and what your generation values. Then conclude with:

Reason: [Your analysis]
Answer: [High / Low]""",

    "35-44_female": """You are a digital content analyst and a woman aged 35–44. You understand your generation values emotional intelligence, practical wisdom, family, and health. Tweets that offer warmth, relatability, humor grounded in real life, or meaningful advice tend to perform best.

You'll be shown example tweets—3 that got **high likes** and 3 that got **low likes** from women 35–44. These are ground truth signals. Use them to reason about the new tweet.

Think step by step. Start with your reasoning, then provide your final judgment:

Reason: [Comparison to examples and fit with your generation's mindset]
Answer: [High / Low]""",

    "35-44_male": """You are a digital content analyst and a man aged 35–44. You know your generation appreciates authenticity, practical humor, and substance. Career, family, health, and finance topics—when treated with respect and clarity—tend to earn strong engagement.

You will be shown grounded examples—high-performing and low-performing tweets for your demographic. Use these to assess the next tweet.

Think step by step. Compare thoughtfully, then conclude:

Reason: [Your demographic-specific analysis]
Answer: [High / Low]""",

    "45-54_female": """You are a digital content analyst and a woman aged 45–54. Your generation values trust, clarity, emotional depth, and lived experience. Wellness, family, community, and resilience are key themes that resonate.

You'll be shown tweets with real performance outcomes—that received **high likes**, and that received **low likes** from women 45–54. Use these to assess the next tweet.

Think step by step. Explain your evaluation, then provide:

Reason: [Your comparative reasoning and generational fit]
Answer: [High / Low]""",

    "45-54_male": """You are a digital content analyst and a man aged 45–54. You've seen many cultural trends come and go, and you appreciate sincerity, intelligence, and practical messaging. Health, family, finance, and meaningful humor tend to engage your peers.

You will be given examples—tweets with **high likes** and with **low likes**, based on real engagement by men aged 45–54. Use them to compare the new tweet.

Think step by step. Give your reasoning first, then your prediction:

Reason: [Your analysis based on values and comparison to the examples]
Answer: [High / Low]""",

    "55+_female": """You are a seasoned digital content analyst and a woman over 55. You see content through decades of shifting cultural values. You and your peers favor messages that are clear, emotionally resonant, and meaningful, centered around wellness, family, security, and community.

You will receive grounded tweet examples—high-engagement and low-engagement tweets from women 55+. Use these benchmarks to analyze the new tweet.

Think step by step. Begin with your reasoning, then give your conclusion:

Reason: [Your logic based on life experience, examples, and values]
Answer: [High / Low]""",

    "55+_male": """You are a digital content analyst and a man over 55. You've witnessed the evolution of media and appreciate messaging that is sincere, wise, clear, and grounded in family, health, and security values.

You will be shown real-life examples—tweets that performed well and that did not among men over 55. Use them to guide your judgment.

Think step by step. Reflect on the tone, message, and relevance, then write:

Reason: [Your reasoning and comparison]
Answer: [High / Low]"""
}

# Strengthen response format requirements for all personas for tweet task
# Trailer that pins the reply to a two-line "Reason:" / "Answer:" format.
_RESPONSE_FORMAT_SUFFIX = "".join(
    [
        "\n\nIMPORTANT: Provide your response in exactly two lines:\n",
        "Reason: <brief justification>\n",
        "Answer: [High / Low] (predict if the tweet will get high or low likes)\n",
        "Only output 'High' or 'Low' after 'Answer:'.",
    ]
)
# Snapshot the keys first; only values are rewritten, the key set stays the same.
for _k in list(EVALUATION_PERSONA_PROMPTS):
    EVALUATION_PERSONA_PROMPTS[_k] = EVALUATION_PERSONA_PROMPTS[_k] + _RESPONSE_FORMAT_SUFFIX

# --------------------------------------------------------------------------------------
# Helper functions
# --------------------------------------------------------------------------------------

def _get_ground_truth_label(prompt_text: str) -> str:
    """Return the engagement label mentioned in *prompt_text*.

    The text is searched case-insensitively for "high likes" and then for
    "low likes". "high" takes precedence when both phrases occur, regardless of
    which comes first in the text.

    Returns:
        "high", "low", or "unknown" when neither phrase is present.
    """
    # Order encodes the precedence: "high" is tested before "low".
    for label in ("high", "low"):
        if re.search(label + " likes", prompt_text, flags=re.IGNORECASE) is not None:
            return label
    return "unknown"

def _parse_engagement_prediction(response_text: str) -> dict:
    """Extract the justification and the High/Low verdict from a model reply.

    Both markers are matched case-insensitively. The reason is everything after
    the first "Reason:" up to (but excluding) the first "Answer:" that follows
    it; the prediction is the first "high"/"low" found right after an "Answer:".

    Args:
        response_text: Raw model output. ``None`` or an empty string is treated
            as a reply without any markers.

    Returns:
        ``{"reason": str, "prediction": "high" | "low" | None}``. ``reason`` is
        "" when no "Reason:" marker exists; ``prediction`` is ``None`` when no
        parsable "Answer:" exists.
    """
    text = response_text or ""

    reason_match = re.search(r"Reason:(.*)", text, re.DOTALL | re.IGNORECASE)
    answer_match = re.search(r"Answer:\s*(high|low)", text, re.IGNORECASE)

    reason = reason_match.group(1) if reason_match else ""
    # Cut the trailing answer off with the same case-insensitivity used to find the
    # markers; a case-sensitive cut would leave e.g. "answer: low" inside the reason.
    reason = re.split(r"Answer:", reason, maxsplit=1, flags=re.IGNORECASE)[0].strip()

    label = answer_match.group(1).lower() if answer_match else None

    return {"reason": reason, "prediction": label}

def verbalize(full_prompt: str, *_, **__) -> str:
    """Run one sampled generation for *full_prompt* and return the completion text.

    Uses the module-level ``model`` and ``tokenizer``; any extra positional or
    keyword arguments are accepted and ignored.

    Args:
        full_prompt: Prompt text, sent as a single user turn prefixed with "/no_think".

    Returns:
        The decoded newly generated tokens (prompt excluded), with special tokens
        removed and surrounding whitespace stripped.
    """
    # Single-turn conversation in the shape the tokenizer's chat template consumes.
    chat = [{"role": "user", "content": "/no_think" + full_prompt}]

    prompt_ids = tokenizer.apply_chat_template(
        chat,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
        enable_thinking=False,
    )
    prompt_ids = prompt_ids.to(model.device)

    sampling_options = dict(
        max_new_tokens=300,
        temperature=0.85,
        use_cache=True,
        do_sample=True,
        min_p=0.1,
    )
    # Inference only: no gradients are needed.
    with torch.no_grad():
        generated = model.generate(input_ids=prompt_ids, **sampling_options)

    # generate() returns prompt + completion; slice off the prompt tokens.
    prompt_length = prompt_ids.shape[1]
    completion_ids = generated[0][prompt_length:]
    completion = tokenizer.decode(completion_ids, skip_special_tokens=True)
    return completion.strip()

# --------------------------------------------------------------------------------------
# CLI
# --------------------------------------------------------------------------------------

def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the GMO evaluation script.

    Returns:
        Namespace with ``start``, ``end``, ``output_dir``, ``gpu_id``,
        ``dataset_paths`` (required), ``max_examples`` and ``tweet_eval``.
    """
    cli = argparse.ArgumentParser(
        description="Run GMO evaluation over a slice of a campaign dataset.",
    )

    # (flag, add_argument keyword arguments), registered in this order.
    option_table = [
        ("--start", dict(type=int, default=0, help="Start index (inclusive) of the slice.")),
        ("--end", dict(type=int, default=None, help="End index (inclusive) of the slice.")),
        ("--output_dir", dict(type=str, default="gmo_results", help="Directory to write JSON results.")),
        ("--gpu_id", dict(type=int, default=0, help="GPU ID to use for inference.")),
        ("--dataset_paths", dict(type=str, required=True, help="Comma-separated list of *.jsonl datasets to evaluate.")),
        ("--max_examples", dict(type=int, default=None, help="(Optional) truncate dataset to this many examples – useful for quick smoke tests.")),
        # --tweet_eval is kept only so the existing run.sh invocation keeps working;
        # it can go once the runner stops passing it.
        ("--tweet_eval", dict(action="store_true", help="Legacy flag to trigger this evaluation path.")),
    ]
    for flag, options in option_table:
        cli.add_argument(flag, **options)

    return cli.parse_args()

# --------------------------------------------------------------------------------------
# Main evaluation logic
# --------------------------------------------------------------------------------------

def main() -> None:
    """Script entry point: parse the CLI, load the model once, run the evaluation.

    Populates the module-level ``model`` and ``tokenizer`` that ``verbalize``
    reads, then hands the parsed arguments to ``run_gmo_evaluation``.
    """
    global model, tokenizer
    args = parse_args()

    # Imported here so the file's top-level import block does not need to change.
    from transformers import BitsAndBytesConfig

    # -----------------------------------------------------------------------------
    # Load Qwen model once at startup
    # -----------------------------------------------------------------------------
    # NOTE(review): --gpu_id is parsed but not consulted here; device placement is
    # left entirely to device_map="auto".
    model_name = "Qwen/Qwen3-32B"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype="auto",
        device_map="auto",
        # Passing load_in_4bit directly to from_pretrained is deprecated; the
        # supported route is an explicit quantization config.
        quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    )

    # The script performs a single task, so it is called directly.
    run_gmo_evaluation(args)


def _load_similarity_file(path):
    """Load one pre-computed similarity JSON file; return {} if it is missing or invalid."""
    print(f"[INFO] Loading similarity data from: {path}")
    try:
        with open(path, "r", encoding="utf-8") as sim_file:
            return json.load(sim_file)
    except (FileNotFoundError, json.JSONDecodeError) as e:
        print(f"[ERROR] Failed to load similarity file {path}: {e}", file=sys.stderr)
        return {}  # Empty mapping so lookups simply find no examples


def _majority_label(labels):
    """Return "high" only when "high" strictly outnumbers "low"; ties and no votes give "low"."""
    votes = list(labels)
    return "high" if votes.count("high") > votes.count("low") else "low"


def _parse_moderator_tweets(raw_response):
    """Decode the moderator reply into a list of 4 tweets, or four empty strings on failure."""
    fallback = [""] * 4
    # The model might wrap the list in a markdown code block.
    cleaned = re.sub(r"```json\n?|\n?```", "", raw_response)

    # Try the whole reply first; if that fails, retry on the outermost [...] span so
    # a sentence before or after the list does not discard otherwise valid output.
    candidates = [cleaned]
    first, last = cleaned.find("["), cleaned.rfind("]")
    if 0 <= first < last:
        candidates.append(cleaned[first:last + 1])

    for candidate in candidates:
        try:
            tweets = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        break
    else:
        print(f"[ERROR] Failed to decode JSON from moderator: {raw_response}")
        return fallback

    if not isinstance(tweets, list) or len(tweets) != 4:
        print(f"[WARNING] Moderator did not return a list of 4 tweets. Got: {tweets}")
        return fallback
    return tweets


def _save_results_atomically(results, out_path):
    """Write *results* as JSON via a temp file so an interrupted write cannot corrupt *out_path*."""
    tmp_path = out_path + ".tmp"
    with open(tmp_path, "w", encoding="utf-8") as f_out:
        json.dump(results, f_out, indent=2)
    os.replace(tmp_path, out_path)


def run_gmo_evaluation(args):
    """
    End-to-end evaluation on GMO datasets containing {"prompt":..., "response":...} per line.
    The 'prompt' field contains ICL examples and the final query.

    For every record of every dataset the pipeline runs three stages:
      1. each reason persona explains what would make a good tweet for the input;
      2. the moderator turns those reasons into four candidate tweets;
      3. each evaluation persona predicts high/low likes for each candidate.
    Votes are then aggregated per tweet, per persona, and overall.

    Args:
        args: Parsed CLI namespace. Reads ``dataset_paths`` (comma-separated),
            ``output_dir``, ``max_examples``, ``start`` and ``end`` (inclusive slice).

    Side effects:
        Writes ``gmo_results_<dataset>_<start>_<end>.json`` into ``output_dir`` for
        each dataset, rewriting it after every record. Exits with status 1 when no
        dataset path is given.
    """
    # Resolve dataset paths; a value made only of commas/whitespace counts as missing.
    dset_paths = [p.strip() for p in (args.dataset_paths or "").split(",") if p.strip()]
    if not dset_paths:
        # This path should not be taken if run via run.sh
        print("[ERROR] --dataset_paths is a required argument.", file=sys.stderr)
        sys.exit(1)

    overall_out_dir = args.output_dir
    os.makedirs(overall_out_dir, exist_ok=True)

    # Similarity files live next to this script; both are loaded up front.
    # NOTE(review): the files carry a .jsonl extension but are read as one JSON
    # document mapping record id -> list of examples — confirm the on-disk format.
    script_dir = os.path.dirname(__file__) or "."
    similarity_sets = {
        "comp": _load_similarity_file(os.path.join(script_dir, "similarity_test_tweet_comp_con.jsonl")),
        "ran": _load_similarity_file(os.path.join(script_dir, "similarity_test_tweet_ran_con.jsonl")),
    }

    # Positional pairing: first dataset -> 'comp', second -> 'ran'. This is brittle;
    # any further dataset falls back to 'comp' below.
    dset_to_sim_key = dict(zip(dset_paths[:2], ("comp", "ran")))

    for dpath in dset_paths:
        dataset_name = os.path.basename(dpath)
        print(f"\n[INFO] Processing dataset: {dataset_name}")

        sim_key = dset_to_sim_key.get(dpath, "comp")  # Default to 'comp'
        similarity_data = similarity_sets[sim_key]
        print(f"[INFO] Using '{sim_key}' similarity data for this dataset.")

        records = []
        skipped_lines = 0
        with open(dpath, "r", encoding="utf-8") as f_in:
            for line_idx, line in enumerate(f_in):
                if args.max_examples and line_idx >= args.max_examples:
                    break
                try:
                    records.append(json.loads(line))
                except ValueError:  # json.JSONDecodeError: malformed or blank line
                    skipped_lines += 1
        if skipped_lines:
            print(f"[WARNING] Skipped {skipped_lines} malformed line(s) in {dataset_name}", file=sys.stderr)

        # --- Apply slicing if --start/--end are provided (both bounds inclusive) ---
        start_arg = getattr(args, "start", None)
        end_arg = getattr(args, "end", None)
        last_index = len(records) - 1
        slice_start = max(0, start_arg) if start_arg is not None else 0
        slice_end = last_index if end_arg is None else min(end_arg, last_index)
        if slice_start > 0 or slice_end < last_index:
            records = records[slice_start : slice_end + 1]
            print(f"[INFO] Processing slice {slice_start}-{slice_end} (n={len(records)}) of {dataset_name}")
        else:
            print(f"[INFO] Processing full dataset {dataset_name} (n={len(records)})")

        slice_suffix = f"_{slice_start}_{slice_end}"
        # Dataset name and slice bounds in the filename keep parallel runs from clashing.
        out_path = os.path.join(overall_out_dir, f"gmo_results_{dataset_name}{slice_suffix}.json")

        all_results = []

        for idx, rec in enumerate(tqdm(records, desc=dataset_name)):
            ad_prompt = rec.get("prompt", "")
            gt_resp_tweet = rec.get("response", "")
            gt_label = _get_ground_truth_label(ad_prompt)

            # Look up pre-computed neighbours by record id, falling back to the
            # record's position in the (unsliced) dataset; keep at most 5.
            record_id_str = str(rec.get("id", idx + slice_start))
            top_examples = similarity_data.get(record_id_str, [])[:5]

            few_shot_lines = [
                f"Input: {ex.get('prompt', '')}\nTweet: {ex.get('response', '').replace('<hyperlink>', '').strip()}"
                for ex in top_examples
            ]
            few_shot_context = (
                "You are given up to 5 similar examples. Study them carefully.\n\n" + "\n\n".join(few_shot_lines)
            )

            # The evaluation few-shot block depends only on the record, not on the
            # candidate tweet, so it is built once here rather than once per tweet.
            eval_few_shot_lines = []
            for ex in top_examples:
                ex_label = _get_ground_truth_label(ex.get("prompt", ""))
                ex_tweet = ex.get("response", "").replace('<hyperlink>', '').strip()
                eval_few_shot_lines.append(f"Example Tweet ({ex_label} likes):\n{ex_tweet}")
            eval_few_shot_context = (
                "Here are some example tweets and their engagement levels:\n\n" + "\n\n".join(eval_few_shot_lines)
            )

            # STAGE 1: Generate reasons from personas
            persona_reasons = {}
            for persona_name, persona_text in REASON_PERSONA_PROMPTS.items():
                full_prompt = (
                    f"{persona_text}\n\n"
                    f"{few_shot_context}\n\n"
                    f"Now, based on the examples and your persona, generate reasoning for the following input.\n"
                    f"Input: {ad_prompt}\n"
                    f"Reasoning:"
                )
                persona_reasons[persona_name] = verbalize(full_prompt, model, tokenizer, args)

            # STAGE 2: Moderator generates 4 tweets from reasons
            moderator_input_prompt = (
                f"{MODERATOR_PROMPT}\n\n"
                f"Here are the reasons provided by the personas:\n"
                f"{json.dumps(persona_reasons, ensure_ascii=False, indent=2)}\n\n"
                f"Generate your list of 4 tweets now:"
            )
            generated_tweets_str = verbalize(moderator_input_prompt, model, tokenizer, args)
            generated_tweets = _parse_moderator_tweets(generated_tweets_str)

            # STAGE 3: Evaluate the 4 generated tweets
            evaluation_results = []
            for gen_tweet in generated_tweets:
                persona_evaluations = {}
                for persona_name, persona_text in EVALUATION_PERSONA_PROMPTS.items():
                    eval_prompt = (
                        f"{persona_text}\n\n"
                        f"{eval_few_shot_context}\n\n"
                        f"Now, evaluate the following new tweet.\n"
                        f"New Tweet: {gen_tweet}\n"
                    )
                    eval_resp_text = verbalize(eval_prompt, model, tokenizer, args)
                    persona_evaluations[persona_name] = _parse_engagement_prediction(eval_resp_text)

                # Majority vote for the current tweet; unparsable replies (None) cast no vote.
                evaluation_results.append({
                    "generated_tweet": gen_tweet,
                    "persona_evaluations": persona_evaluations,
                    "majority_prediction": _majority_label(
                        ev["prediction"] for ev in persona_evaluations.values()
                    ),
                })

            # Majority predictions for the 4 tweets (across personas)
            tweet_majority_predictions = [ev["majority_prediction"] for ev in evaluation_results]

            # Majority per persona across the 4 tweets
            persona_majority_predictions = {
                persona_name: _majority_label(
                    ev["persona_evaluations"].get(persona_name, {}).get("prediction")
                    for ev in evaluation_results
                )
                for persona_name in EVALUATION_PERSONA_PROMPTS
            }

            # Overall majority across the per-persona labels
            overall_majority_prediction = _majority_label(persona_majority_predictions.values())

            all_results.append({
                "original_prompt": ad_prompt,
                "ground_truth_tweet": gt_resp_tweet,
                "ground_truth_label": gt_label,
                "stage1_reasons": persona_reasons,
                "stage2_evaluations": evaluation_results,
                # --- Convenience summaries ---
                "tweet_majority_predictions": tweet_majority_predictions,           # majority across personas for each of 4 tweets
                "persona_majority_predictions": persona_majority_predictions,       # majority across 4 tweets for each persona
                "overall_majority_prediction": overall_majority_prediction,         # majority across persona labels
            })

            # Incremental save after every example to avoid data loss. Best effort:
            # a failed write is reported and the run continues.
            try:
                _save_results_atomically(all_results, out_path)
            except (OSError, TypeError, ValueError) as _e:
                print(f"[WARNING] Incremental save failed: {_e}")

        # Final save of per-dataset results (also creates the file when there were no records)
        _save_results_atomically(all_results, out_path)

        print(f"[INFO] Completed processing {dataset_name} slice {slice_start}-{slice_end}. Results saved to {out_path}")

    print("\n[INFO] GMO ad evaluation complete.")

# Run the pipeline only when the file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()