
import sys
import os
import json
import numpy as np
from typing import List, Dict, Any, Tuple, Optional
from collections import defaultdict
from tqdm import tqdm
from scipy.stats import chi2_contingency
from concurrent.futures import ThreadPoolExecutor, as_completed

# Absolute path of the directory one level above this file. It is put on
# sys.path (once) and is also the base for the data/output paths built in
# the functions below. Presumably the sys.path entry is what lets the
# project-local imports that follow resolve — TODO confirm.
parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)

from utils import load_jsonl, METAINFO_DIR
from evaluate_utils import get_duration_by_youtube_id
from visualization_utils import plot_likelihood_distribution, plot_scatter_with_trend
from model_config import get_all_model_ids
from need_type_utils import normalize_need_type_label
import csv

# Likelihood score that get_likelihood_score() returns for queries whose
# "data_type" is "human".
HUMAN_ANNOTATION_LIKELIHOOD_SCORE = 9.0

# openpyxl is an optional dependency: the Excel loaders in this file check
# HAS_OPENPYXL first and return empty results when it is not installed.
try:
    from openpyxl import load_workbook
    HAS_OPENPYXL = True
except ImportError:
    HAS_OPENPYXL = False

def load_video_classifications(csv_file: Optional[str] = None) -> Dict[str, Dict[str, Any]]:
    """Load per-video classification metadata from a CSV file.

    Args:
        csv_file: Path to the CSV file. When omitted, defaults to
            ``data/metainfo/video_classess_patch.csv`` under ``parent_dir``.

    Returns:
        Mapping of ``youtube_id`` to a dict with the keys ``content_format``,
        ``content_focus``, ``production_style`` and ``duration_cat`` (strings,
        ``'Unknown'`` when the column is missing or blank) plus
        ``duration_sec`` (float, ``0.0`` when missing, blank or unparsable).
        Rows without a ``youtube_id`` are ignored. An empty dict is returned
        when the file does not exist.
    """
    if csv_file is None:
        csv_file = os.path.join(parent_dir, "data", "metainfo", "video_classess_patch.csv")

    classifications: Dict[str, Dict[str, Any]] = {}
    if not os.path.exists(csv_file):
        return classifications

    label_columns = ('content_format', 'content_focus', 'production_style', 'duration_cat')

    # newline='' is what the csv module expects from its file objects, and
    # utf-8-sig transparently drops a leading BOM so the first header is
    # really 'youtube_id' even for spreadsheet-exported files.
    with open(csv_file, 'r', encoding='utf-8-sig', newline='') as f:
        for row in csv.DictReader(f):
            youtube_id = row.get('youtube_id', '')
            if not youtube_id:
                continue

            # `or` (rather than a .get default) also covers blank cells and
            # the None that DictReader fills in for short rows.
            entry: Dict[str, Any] = {
                column: row.get(column) or 'Unknown' for column in label_columns
            }

            # One malformed duration must not abort the whole load.
            try:
                entry['duration_sec'] = float(row.get('duration_sec') or 0)
            except (TypeError, ValueError):
                entry['duration_sec'] = 0.0

            classifications[youtube_id] = entry
    return classifications

def get_temporal_position(start_time: float, duration: float) -> str:
    """Classify where ``start_time`` falls within a video of length ``duration``.

    Returns ``"unknown"`` when ``duration`` is not positive. Otherwise the
    ratio ``start_time / duration`` is bucketed: below 0.33 is ``"early"``,
    below 0.67 is ``"middle"``, anything else is ``"late"``.
    """
    if duration <= 0:
        return "unknown"

    ratio = start_time / duration
    # Buckets are checked in ascending order of their exclusive upper bound.
    for upper_bound, label in ((0.33, "early"), (0.67, "middle")):
        if ratio < upper_bound:
            return label
    return "late"

def get_likelihood_score(query: Dict[str, Any]) -> Optional[float]:
    """Return the likelihood score of a query, or ``None`` if it has none.

    Args:
        query: Query record. Only the keys ``data_type`` and
            ``likelihood_scores`` are read.

    Returns:
        ``HUMAN_ANNOTATION_LIKELIHOOD_SCORE`` when ``data_type`` is
        ``"human"``; otherwise the arithmetic mean of the numeric ``score``
        values found in the dict entries of ``likelihood_scores``; ``None``
        when no usable score exists.
    """
    if query.get("data_type") == "human":
        return HUMAN_ANNOTATION_LIKELIHOOD_SCORE

    # Entries that are not dicts, lack "score", or carry a non-numeric score
    # (e.g. null) are skipped so one bad entry cannot raise out of sum().
    scores = [
        entry["score"]
        for entry in query.get("likelihood_scores") or []
        if isinstance(entry, dict) and isinstance(entry.get("score"), (int, float))
    ]
    if not scores:
        return None
    return sum(scores) / len(scores)

def load_annotated_queries_from_excel(excel_dir: Optional[str] = None) -> Dict[str, List[Dict[str, Any]]]:
    """Collect the human-annotated queries stored in per-video Excel workbooks.

    Every ``*.xlsx`` file in ``excel_dir`` is treated as one video; the file
    name without the extension is used as the video id. Only the first four
    sheets of a workbook are read. On each row, column A may hold a
    ``youtu.be`` URL carrying a ``t=<seconds>s`` timestamp, column B a field
    name and column C its value. The recognised field names are
    ``Start_time``, ``End_time``, ``Reason``, ``Need`` and ``Question``; a
    ``Question`` row closes the current record.

    Args:
        excel_dir: Directory holding the workbooks. Defaults to
            ``analysis/human_annotations`` under ``parent_dir``.

    Returns:
        Mapping of video id to a list of dicts with the keys ``start_time``,
        ``end_time``, ``reason``, ``need`` and ``question``. Records are
        de-duplicated per workbook on (start time, lower-cased question).
        An empty dict is returned when openpyxl is unavailable or the
        directory does not exist. Workbooks that cannot be read are skipped
        with a warning on stderr.
    """
    if not HAS_OPENPYXL:
        return {}
    if excel_dir is None:
        excel_dir = os.path.join(parent_dir, "analysis", "human_annotations")
    if not os.path.exists(excel_dir):
        return {}

    import re
    timestamp_pattern = re.compile(r't=(\d+)s')
    annotated_queries_by_video = {}

    for filename in os.listdir(excel_dir):
        # "~$name.xlsx" files are Excel's lock files, not workbooks.
        if not filename.endswith(".xlsx") or filename.startswith("~$"):
            continue
        # Strip only the extension; str.replace would also remove an inner ".xlsx".
        video_id = os.path.splitext(filename)[0]
        filepath = os.path.join(excel_dir, filename)
        try:
            workbook = load_workbook(filename=filepath, data_only=True)
            annotated_queries = []
            seen_queries = set()
            for sheetname in workbook.sheetnames[:4]:
                sheet = workbook[sheetname]
                # The start time is deliberately NOT reset after a record:
                # it carries over until a later URL or Start_time row
                # replaces it.
                current_start_time = None
                current_end_time = None
                current_reason = None
                current_need = None

                rows = sheet.iter_rows(min_row=1, min_col=1, max_col=3, values_only=True)
                for url_cell, field_cell, value_cell in rows:
                    if isinstance(url_cell, str) and 'youtu.be' in url_cell:
                        match = timestamp_pattern.search(url_cell)
                        if match:
                            current_start_time = int(match.group(1))

                    # Test for an absent value explicitly: a plain truthiness
                    # check would also discard a legitimate numeric 0
                    # (e.g. Start_time = 0).
                    if not field_cell or value_cell is None or value_cell == "":
                        continue

                    field_str = str(field_cell).strip()
                    value_str = str(value_cell).strip()

                    if field_str == "Start_time":
                        try:
                            current_start_time = int(float(value_str))
                        except (ValueError, OverflowError):
                            pass  # keep the previous start time on unparsable input
                    elif field_str == "End_time":
                        try:
                            current_end_time = float(value_str)
                        except ValueError:
                            pass  # keep the previous end time on unparsable input
                    elif field_str == "Reason":
                        current_reason = value_str
                    elif field_str == "Need":
                        current_need = value_str
                    elif field_str == "Question":
                        if current_start_time is None or not value_str:
                            continue
                        query_key = (current_start_time, value_str.lower())
                        if query_key not in seen_queries:
                            seen_queries.add(query_key)
                            annotated_queries.append({
                                'start_time': current_start_time,
                                'end_time': current_end_time,
                                'reason': current_reason,
                                'need': current_need,
                                'question': value_str
                            })
                        # Start a fresh record; see the note on start time above.
                        current_end_time = None
                        current_reason = None
                        current_need = None
            if annotated_queries:
                annotated_queries_by_video[video_id] = annotated_queries
        except Exception as e:
            # Best effort: one unreadable workbook must not abort the scan,
            # but say so instead of dropping it silently.
            print(f"Warning: skipping annotation workbook {filepath}: {e}", file=sys.stderr)
            continue
    return annotated_queries_by_video

def load_human_annotation_scores_from_excel(excel_dir: Optional[str] = None) -> Dict[str, Any]:
    """Read the human rating scores stored in per-video Excel workbooks.

    Every ``*.xlsx`` file in ``excel_dir`` is treated as one video; the file
    name without the extension is used as the video id. Only the first four
    sheets of a workbook are read. On each sheet the reason / need / question
    scores are taken from cells D5 / D6 / D7 and again from D12 / D13 / D14;
    cells that do not hold a number are ignored.

    Args:
        excel_dir: Directory holding the workbooks. Defaults to
            ``analysis/human_annotations`` under ``parent_dir``.

    Returns:
        Dict with the keys ``reason_scores``, ``need_scores`` and
        ``question_scores`` (lists of floats over all videos),
        ``total_needs`` (number of question scores) and ``by_video``
        (video id -> the same three lists for that video plus ``count``).
        When openpyxl is unavailable or the directory is missing, the lists
        are empty and an ``error`` message is included. Workbooks that
        cannot be read are skipped with a warning on stderr and contribute
        nothing to the totals.
    """
    def _empty_result(error: str) -> Dict[str, Any]:
        return {
            "reason_scores": [],
            "need_scores": [],
            "question_scores": [],
            "total_needs": 0,
            "by_video": {},
            "error": error
        }

    if not HAS_OPENPYXL:
        return _empty_result("openpyxl not available")
    if excel_dir is None:
        excel_dir = os.path.join(parent_dir, "analysis", "human_annotations")
    if not os.path.exists(excel_dir):
        return _empty_result(f"Directory not found: {excel_dir}")

    # Score category -> the cells read for it on every sheet.
    score_cells = {
        "reason_scores": ("D5", "D12"),
        "need_scores": ("D6", "D13"),
        "question_scores": ("D7", "D14"),
    }
    totals: Dict[str, List[float]] = {category: [] for category in score_cells}
    by_video = {}

    for filename in os.listdir(excel_dir):
        # "~$name.xlsx" files are Excel's lock files, not workbooks.
        if not filename.endswith(".xlsx") or filename.startswith("~$"):
            continue
        # Strip only the extension; str.replace would also remove an inner ".xlsx".
        video_id = os.path.splitext(filename)[0]
        filepath = os.path.join(excel_dir, filename)

        video_scores: Dict[str, List[float]] = {category: [] for category in score_cells}
        try:
            workbook = load_workbook(filename=filepath, data_only=True)
            for sheetname in workbook.sheetnames[:4]:
                sheet = workbook[sheetname]
                for category, addresses in score_cells.items():
                    for address in addresses:
                        value = sheet[address].value
                        if isinstance(value, (int, float)):
                            video_scores[category].append(float(value))
        except Exception as e:
            # Best effort: skip the unreadable workbook, but report it.
            print(f"Warning: skipping score workbook {filepath}: {e}", file=sys.stderr)
            continue

        # Merge into the global lists only after the whole workbook has been
        # read, so the totals always agree with what `by_video` reports.
        if any(video_scores.values()):
            for category, values in video_scores.items():
                totals[category].extend(values)
            by_video[video_id] = {
                "reason_scores": video_scores["reason_scores"],
                "need_scores": video_scores["need_scores"],
                "question_scores": video_scores["question_scores"],
                "count": len(video_scores["question_scores"])
            }

    return {
        "reason_scores": totals["reason_scores"],
        "need_scores": totals["need_scores"],
        "question_scores": totals["question_scores"],
        "total_needs": len(totals["question_scores"]),
        "by_video": by_video
    }

def _content_words(text: str) -> set:
    """Return the set of whitespace-separated words longer than two characters."""
    return {word for word in text.split() if len(word) > 2}


def _matches_annotation(query_start_time, query_question: str, ann_time, ann_question: str) -> bool:
    """Decide whether a query corresponds to one human annotation.

    Both questions are expected to be stripped and lower-cased already.
    The start times must lie within 2 seconds of each other. The questions
    then match when they are equal, when one contains the other, or (both
    longer than 10 characters, each with at least 3 content words) when they
    share at least ``min(3, 0.4 * larger word count)`` content words. As a
    fallback, start times within 1 second plus at least 2 shared content
    words also count as a match.
    """
    time_gap = abs(query_start_time - ann_time)
    if time_gap > 2:
        return False

    if query_question == ann_question:
        return True

    if ann_question and query_question:
        if ann_question in query_question or query_question in ann_question:
            return True
        if len(ann_question) > 10 and len(query_question) > 10:
            ann_words = _content_words(ann_question)
            query_words = _content_words(query_question)
            if len(ann_words) >= 3 and len(query_words) >= 3:
                required = min(3, max(len(ann_words), len(query_words)) * 0.4)
                if len(ann_words & query_words) >= required:
                    return True

    # Looser wording check, allowed only for near-identical start times.
    if time_gap <= 1:
        ann_words = _content_words(ann_question)
        query_words = _content_words(query_question)
        return bool(ann_words and query_words and len(ann_words & query_words) >= 2)
    return False


def process_single_video_annotation(args):
    """Build per-query annotation records and per-model metrics for one video.

    Args:
        args: Tuple ``(youtube_id, evaluation_dir, model_ids,
            video_classifications, annotated_queries_by_video)``, packed
            into a single argument so the function can be submitted to an
            executor.

    Returns:
        ``None`` when the ground-truth file
        ``output/<youtube_id>/jir_references_relevance_score.jsonl`` (under
        ``parent_dir``) does not exist or processing fails (a warning is
        written to stderr in the latter case). Otherwise a dict with:

        * ``queries``: one record per ground-truth query holding its
          normalized type, likelihood score, ``is_annotated`` flag, the
          video's classification fields, need density (queries per minute),
          temporal position and start time.
        * ``performance``: one record per model whose evaluation file
          ``<evaluation_dir>/rebuttal_baseline_stream_runs_<model_id>/<youtube_id>.json``
          exists and can be read, holding ``recall``, ``precision``,
          ``relevance``, ``timeliness`` and ``has_annotated``.
    """
    youtube_id, evaluation_dir, model_ids, video_classifications, annotated_queries_by_video = args
    try:
        ground_truth_file = os.path.join(parent_dir, f"output/{youtube_id}/jir_references_relevance_score.jsonl")
        if not os.path.exists(ground_truth_file):
            return None
        queries = load_jsonl(ground_truth_file)

        video_class = video_classifications.get(youtube_id, {})
        content_format = video_class.get('content_format', 'Unknown')
        content_focus = video_class.get('content_focus', 'Unknown')
        production_style = video_class.get('production_style', 'Unknown')
        duration_cat = video_class.get('duration_cat', 'Unknown')
        duration_sec = video_class.get('duration_sec', 0)

        # Needs per minute of video; 0.0 when the duration is unknown.
        need_density = len(queries) / (duration_sec / 60) if duration_sec > 0 else 0.0

        # Normalize the annotations once instead of once per query.
        annotations = [
            (ann.get('start_time', 0), (ann.get('question') or '').strip().lower())
            for ann in annotated_queries_by_video.get(youtube_id, [])
        ]

        # Rounded annotation times widened by +/-1s; used below as a cheap
        # pre-filter before the detailed comparison.
        annotated_times = set()
        for ann_time, _ in annotations:
            rounded_ann = round(ann_time)
            annotated_times.update((rounded_ann - 1, rounded_ann, rounded_ann + 1))

        video_queries = []
        for query in queries:
            # Treat an explicit null like a missing key rather than letting
            # it raise and discard the whole video.
            query_start_time = query.get("start_time", 0)
            if query_start_time is None:
                query_start_time = 0
            query_question = (query.get("question") or "").strip().lower()
            rounded_time = round(query_start_time)

            is_annotated = False
            if (
                rounded_time in annotated_times
                or (rounded_time - 1) in annotated_times
                or (rounded_time + 1) in annotated_times
            ):
                is_annotated = any(
                    _matches_annotation(query_start_time, query_question, ann_time, ann_question)
                    for ann_time, ann_question in annotations
                )

            video_queries.append({
                "youtube_id": youtube_id,
                "type": normalize_need_type_label(query.get("type", "Unknown")),
                "likelihood_score": get_likelihood_score(query),
                "is_annotated": is_annotated,
                "content_format": content_format,
                "content_focus": content_focus,
                "production_style": production_style,
                "duration_cat": duration_cat,
                "duration_sec": duration_sec,
                "need_density": need_density,
                "temporal_position": get_temporal_position(query_start_time, duration_sec),
                "start_time": query_start_time
            })

        # Video-level flag, identical for every model, so computed once.
        # NOTE(review): this is based on data_type == "human", not on the
        # per-query is_annotated flags computed above — confirm that the
        # difference is intended.
        has_annotated = any(q.get("data_type") == "human" for q in queries)

        performance_data = []
        for model_id in model_ids:
            model_dir = os.path.join(evaluation_dir, f"rebuttal_baseline_stream_runs_{model_id}")
            evaluation_file = os.path.join(model_dir, f"{youtube_id}.json")
            if not os.path.exists(evaluation_file):
                continue
            try:
                # Context manager so the handle is closed even if parsing fails.
                with open(evaluation_file, "r", encoding="utf-8") as f:
                    evaluation_data = json.load(f)
                performance_data.append({
                    "has_annotated": has_annotated,
                    "recall": evaluation_data.get("recall", {}).get("recall", 0.0),
                    "precision": evaluation_data.get("precision", {}).get("precision", 0.0),
                    "relevance": evaluation_data.get("relevance", {}).get("weighted_ndcg", 0.0),
                    "timeliness": evaluation_data.get("timeliness", {}).get("weighted_time_match", 0.0)
                })
            except Exception:
                # Best effort: an unreadable or malformed evaluation file
                # only drops this model's entry, not the whole video.
                continue
        return {
            "queries": video_queries,
            "performance": performance_data
        }
    except Exception as e:
        # Best effort per video, but report the failure instead of silently
        # shrinking the analysed sample.
        print(f"Warning: failed to process video {youtube_id}: {e}", file=sys.stderr)
        return None

def analyze_annotation_coverage(
    evaluation_dir: str,
    output_dir: str,
    model_ids: List[str],
    num_workers: int = 8
) -> Dict[str, Any]:
    os.makedirs(output_dir, exist_ok=True)

    youtube_ids = load_jsonl(os.path.join(METAINFO_DIR, "lecture_.jsonl")) + \
                  load_jsonl(os.path.join(METAINFO_DIR, "paper_.jsonl"))
    youtube_ids = [item["youtube_id"] for item in youtube_ids]

    video_classifications = load_video_classifications()

    annotated_queries_by_video = load_annotated_queries_from_excel()
    total_annotated_queries = sum(len(queries) for queries in annotated_queries_by_video.values())
    for video_id, queries in list(annotated_queries_by_video.items())[:5]:
        pass

    human_scores = load_human_annotation_scores_from_excel()

    tasks = [(youtube_id, evaluation_dir, model_ids, video_classifications, annotated_queries_by_video) for youtube_id in youtube_ids]

    all_queries = []
    annotated_queries = []
    unannotated_queries = []
    type_distribution_annotated = defaultdict(int)
    type_distribution_unannotated = defaultdict(int)
    likelihood_annotated = []
    likelihood_unannotated = []

    content_format_dist_annotated = defaultdict(int)
    content_format_dist_unannotated = defaultdict(int)
    content_focus_dist_annotated = defaultdict(int)
    content_focus_dist_unannotated = defaultdict(int)
    production_style_dist_annotated = defaultdict(int)
    production_style_dist_unannotated = defaultdict(int)
    duration_dist_annotated = defaultdict(int)
    duration_dist_unannotated = defaultdict(int)
    temporal_dist_annotated = defaultdict(int)
    temporal_dist_unannotated = defaultdict(int)

    need_density_annotated = []
    need_density_unannotated = []
    performance_annotated = {
        "recall": [], "precision": [], "relevance": [], "timeliness": []
    }
    performance_unannotated = {
        "recall": [], "precision": [], "relevance": [], "timeliness": []
    }
    quality_checks = {
        "llm_quality_check": 0,
        "pre_retrieval_check": 0,
        "relevance_score": 0,
        "total": 0
    }

    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        futures = {executor.submit(process_single_video_annotation, task): task for task in tasks}
        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing videos"):
            result = future.result()
            if result is None:
                continue

            for query_info in result["queries"]:
                all_queries.append(query_info)
                if query_info["is_annotated"]:
                    annotated_queries.append(query_info)
                    type_distribution_annotated[query_info["type"]] += 1
                    if query_info["likelihood_score"] is not None:
                        likelihood_annotated.append(query_info["likelihood_score"])

                    content_format_dist_annotated[query_info.get("content_format", "Unknown")] += 1
                    content_focus_dist_annotated[query_info.get("content_focus", "Unknown")] += 1
                    production_style_dist_annotated[query_info.get("production_style", "Unknown")] += 1
                    duration_dist_annotated[query_info.get("duration_cat", "Unknown")] += 1
                    temporal_dist_annotated[query_info.get("temporal_position", "unknown")] += 1
                    if query_info.get("need_density", 0) > 0:
                        need_density_annotated.append(query_info["need_density"])
                else:
                    unannotated_queries.append(query_info)
                    type_distribution_unannotated[query_info["type"]] += 1
                    if query_info["likelihood_score"] is not None:
                        likelihood_unannotated.append(query_info["likelihood_score"])

                    content_format_dist_unannotated[query_info.get("content_format", "Unknown")] += 1
                    content_focus_dist_unannotated[query_info.get("content_focus", "Unknown")] += 1
                    production_style_dist_unannotated[query_info.get("production_style", "Unknown")] += 1
                    duration_dist_unannotated[query_info.get("duration_cat", "Unknown")] += 1
                    temporal_dist_unannotated[query_info.get("temporal_position", "unknown")] += 1
                    if query_info.get("need_density", 0) > 0:
                        need_density_unannotated.append(query_info["need_density"])

            for perf in result["performance"]:
                if perf["has_annotated"]:
                    performance_annotated["recall"].append(perf["recall"])
                    performance_annotated["precision"].append(perf["precision"])
                    performance_annotated["relevance"].append(perf["relevance"])
                    performance_annotated["timeliness"].append(perf["timeliness"])
                else:
                    performance_unannotated["recall"].append(perf["recall"])
                    performance_unannotated["precision"].append(perf["precision"])
                    performance_unannotated["relevance"].append(perf["relevance"])
                    performance_unannotated["timeliness"].append(perf["timeliness"])

    for youtube_id in youtube_ids:
        ground_truth_file = os.path.join(parent_dir, f"output/{youtube_id}/jir_references_relevance_score.jsonl")
        if not os.path.exists(ground_truth_file):
            continue
        queries = load_jsonl(ground_truth_file)
        for query in queries:
            quality_checks["total"] += 1
            if "llm_quality_check" in query:
                quality_checks["llm_quality_check"] += 1
            if "references" in query and "llm_quality_check" in query.get("references", {}):
                quality_checks["pre_retrieval_check"] += 1
            if "references" in query and "document_relevance_score" in query.get("references", {}):
                quality_checks["relevance_score"] += 1

    total_samples = len(all_queries)
    annotated_samples = len(annotated_queries)
    coverage_percentage = annotated_samples / total_samples if total_samples > 0 else 0.0

    coverage_by_video_type = {}
    for youtube_id in youtube_ids:
        video_type = "lecture" if youtube_id in [item["youtube_id"] for item in load_jsonl(os.path.join(METAINFO_DIR, "lecture_.jsonl"))] else "paper"
        video_queries = [q for q in all_queries if q["youtube_id"] == youtube_id]
        video_annotated = [q for q in video_queries if q["is_annotated"]]
        if video_queries:
            if video_type not in coverage_by_video_type:
                coverage_by_video_type[video_type] = {"total": 0, "annotated": 0}
            coverage_by_video_type[video_type]["total"] += len(video_queries)
            coverage_by_video_type[video_type]["annotated"] += len(video_annotated)
    coverage_by_video_type = {
        vtype: {
            "total": stats["total"],
            "annotated": stats["annotated"],
            "coverage": stats["annotated"] / stats["total"] if stats["total"] > 0 else 0.0
        }
        for vtype, stats in coverage_by_video_type.items()
    }

    coverage_by_need_type = {}
    all_types = set(type_distribution_annotated.keys()) | set(type_distribution_unannotated.keys())
    for need_type in all_types:
        total = type_distribution_annotated[need_type] + type_distribution_unannotated[need_type]
        annotated = type_distribution_annotated[need_type]
        coverage_by_need_type[need_type] = {
            "total": total,
            "annotated": annotated,
            "coverage": annotated / total if total > 0 else 0.0
        }

    all_need_densities = need_density_annotated + need_density_unannotated
    need_density_percentiles = {}
    if all_need_densities:
        need_density_percentiles = {
            "p33": float(np.percentile(all_need_densities, 33)),
            "p67": float(np.percentile(all_need_densities, 67))
        }

    def group_need_density(density_list):

        grouped = defaultdict(int)
        if not density_list or not need_density_percentiles:
            return grouped
        p33 = need_density_percentiles["p33"]
        p67 = need_density_percentiles["p67"]
        for density in density_list:
            if density < p33:
                grouped["low"] += 1
            elif density < p67:
                grouped["medium"] += 1
            else:
                grouped["high"] += 1
        return grouped
    need_density_dist_annotated = group_need_density(need_density_annotated)
    need_density_dist_unannotated = group_need_density(need_density_unannotated)

    def calculate_dimension_similarity(dist_annotated, dist_unannotated):

        return calculate_distribution_similarity(dist_annotated, dist_unannotated)
    dimension_similarities = {
        "content_format": calculate_dimension_similarity(content_format_dist_annotated, content_format_dist_unannotated),
        "content_focus": calculate_dimension_similarity(content_focus_dist_annotated, content_focus_dist_unannotated),
        "production_style": calculate_dimension_similarity(production_style_dist_annotated, production_style_dist_unannotated),
        "duration": calculate_dimension_similarity(duration_dist_annotated, duration_dist_unannotated),
        "temporal_position": calculate_dimension_similarity(temporal_dist_annotated, temporal_dist_unannotated),
        "need_density": calculate_dimension_similarity(need_density_dist_annotated, need_density_dist_unannotated)
    }

    def calculate_coverage_by_dimension(dist_annotated, dist_unannotated):

        coverage = {}
        all_keys = set(dist_annotated.keys()) | set(dist_unannotated.keys())
        for key in all_keys:
            total = dist_annotated.get(key, 0) + dist_unannotated.get(key, 0)
            annotated = dist_annotated.get(key, 0)
            coverage[key] = {
                "total": total,
                "annotated": annotated,
                "unannotated": dist_unannotated.get(key, 0),
                "coverage": annotated / total if total > 0 else 0.0
            }
        return coverage
    coverage_by_content_format = calculate_coverage_by_dimension(content_format_dist_annotated, content_format_dist_unannotated)
    coverage_by_content_focus = calculate_coverage_by_dimension(content_focus_dist_annotated, content_focus_dist_unannotated)
    coverage_by_production_style = calculate_coverage_by_dimension(production_style_dist_annotated, production_style_dist_unannotated)
    coverage_by_duration = calculate_coverage_by_dimension(duration_dist_annotated, duration_dist_unannotated)
    coverage_by_temporal = calculate_coverage_by_dimension(temporal_dist_annotated, temporal_dist_unannotated)
    coverage_by_need_density = calculate_coverage_by_dimension(need_density_dist_annotated, need_density_dist_unannotated)

    type_similarity = calculate_distribution_similarity(
        type_distribution_annotated, type_distribution_unannotated
    )
    likelihood_similarity = 0.0
    if likelihood_annotated and likelihood_unannotated:
        mean_annotated = np.mean(likelihood_annotated)
        mean_unannotated = np.mean(likelihood_unannotated)
        std_annotated = np.std(likelihood_annotated)
        std_unannotated = np.std(likelihood_unannotated)

        mean_diff = abs(mean_annotated - mean_unannotated) / max(mean_annotated, mean_unannotated, 1.0)
        std_diff = abs(std_annotated - std_unannotated) / max(std_annotated, std_unannotated, 1.0)
        likelihood_similarity = 1.0 - (mean_diff + std_diff) / 2.0

    is_sufficient = annotated_samples >= 30

    perf_annotated = {
        "recall": float(np.mean(performance_annotated["recall"])) if performance_annotated["recall"] else 0.0,
        "precision": float(np.mean(performance_annotated["precision"])) if performance_annotated["precision"] else 0.0,
        "relevance": float(np.mean(performance_annotated["relevance"])) if performance_annotated["relevance"] else 0.0,
        "timeliness": float(np.mean(performance_annotated["timeliness"])) if performance_annotated["timeliness"] else 0.0
    }
    perf_unannotated = {
        "recall": float(np.mean(performance_unannotated["recall"])) if performance_unannotated["recall"] else 0.0,
        "precision": float(np.mean(performance_unannotated["precision"])) if performance_unannotated["precision"] else 0.0,
        "relevance": float(np.mean(performance_unannotated["relevance"])) if performance_unannotated["relevance"] else 0.0,
        "timeliness": float(np.mean(performance_unannotated["timeliness"])) if performance_unannotated["timeliness"] else 0.0
    }

    quality_coverage = {
        "llm_quality_check": quality_checks["llm_quality_check"] / quality_checks["total"] if quality_checks["total"] > 0 else 0.0,
        "pre_retrieval_check": quality_checks["pre_retrieval_check"] / quality_checks["total"] if quality_checks["total"] > 0 else 0.0,
        "relevance_score": quality_checks["relevance_score"] / quality_checks["total"] if quality_checks["total"] > 0 else 0.0
    }

    human_scores = load_human_annotation_scores_from_excel()

    human_score_statistics = {}
    if human_scores.get("total_needs", 0) > 0:
        human_score_statistics = {
            "total_annotated_needs": human_scores["total_needs"],
            "reason_scores": {
                "count": len(human_scores["reason_scores"]),
                "mean": float(np.mean(human_scores["reason_scores"])) if human_scores["reason_scores"] else 0.0,
                "std": float(np.std(human_scores["reason_scores"])) if human_scores["reason_scores"] else 0.0
            },
            "need_scores": {
                "count": len(human_scores["need_scores"]),
                "mean": float(np.mean(human_scores["need_scores"])) if human_scores["need_scores"] else 0.0,
                "std": float(np.std(human_scores["need_scores"])) if human_scores["need_scores"] else 0.0
            },
            "question_scores": {
                "count": len(human_scores["question_scores"]),
                "mean": float(np.mean(human_scores["question_scores"])) if human_scores["question_scores"] else 0.0,
                "std": float(np.std(human_scores["question_scores"])) if human_scores["question_scores"] else 0.0
            },
            "videos_annotated": len(human_scores.get("by_video", {})),
            "by_video": {
                video_id: {
                    "count": stats["count"],
                    "reason_mean": float(np.mean(stats["reason_scores"])) if stats["reason_scores"] else 0.0,
                    "need_mean": float(np.mean(stats["need_scores"])) if stats["need_scores"] else 0.0,
                    "question_mean": float(np.mean(stats["question_scores"])) if stats["question_scores"] else 0.0
                }
                for video_id, stats in human_scores.get("by_video", {}).items()
            }
        }
    else:
        human_score_statistics = {
            "total_annotated_needs": 0,
            "error": human_scores.get("error", "Unknown error")
        }

    if human_scores.get("total_needs", 0) > 0:
        annotated_samples = human_scores["total_needs"]
        coverage_percentage = annotated_samples / total_samples if total_samples > 0 else 0.0

    results = {
        "coverage_statistics": {
            "human_annotated": annotated_samples,
            "total_samples": total_samples,
            "coverage_percentage": coverage_percentage,
            "coverage_by_video_type": coverage_by_video_type,
            "coverage_by_need_type": coverage_by_need_type,
            "human_annotation_source": "Excel files in analysis/human_annotations/ directory",
            "human_annotation_count": human_scores.get("total_needs", 0),
            "coverage_by_content_format": coverage_by_content_format,
            "coverage_by_content_focus": coverage_by_content_focus,
            "coverage_by_production_style": coverage_by_production_style,
            "coverage_by_duration": coverage_by_duration,
            "coverage_by_temporal_position": coverage_by_temporal,
            "coverage_by_need_density": coverage_by_need_density,
            "need_density_percentiles": need_density_percentiles
        },
        "distribution_similarities": {
            "type_distribution": type_similarity,
            "likelihood_distribution": likelihood_similarity,
            "content_format": dimension_similarities["content_format"],
            "content_focus": dimension_similarities["content_focus"],
            "production_style": dimension_similarities["production_style"],
            "duration": dimension_similarities["duration"],
            "temporal_position": dimension_similarities["temporal_position"],
            "need_density": dimension_similarities["need_density"]
        },
        "dimension_distributions": {
            "content_format": {
                "annotated": dict(content_format_dist_annotated),
                "unannotated": dict(content_format_dist_unannotated),
                "similarity": dimension_similarities["content_format"]
            },
            "content_focus": {
                "annotated": dict(content_focus_dist_annotated),
                "unannotated": dict(content_focus_dist_unannotated),
                "similarity": dimension_similarities["content_focus"]
            },
            "production_style": {
                "annotated": dict(production_style_dist_annotated),
                "unannotated": dict(production_style_dist_unannotated),
                "similarity": dimension_similarities["production_style"]
            },
            "duration": {
                "annotated": dict(duration_dist_annotated),
                "unannotated": dict(duration_dist_unannotated),
                "similarity": dimension_similarities["duration"]
            },
            "temporal_position": {
                "annotated": dict(temporal_dist_annotated),
                "unannotated": dict(temporal_dist_unannotated),
                "similarity": dimension_similarities["temporal_position"]
            },
            "need_density": {
                "annotated": dict(need_density_dist_annotated),
                "unannotated": dict(need_density_dist_unannotated),
                "similarity": dimension_similarities["need_density"],
                "percentiles": need_density_percentiles
            }
        },
        "human_annotation_scores": human_score_statistics,
        "sufficiency_analysis": {
            "is_sufficient": is_sufficient,
            "annotated_samples": annotated_samples,
            "minimum_required": 30,
            "statistical_significance": "sufficient" if is_sufficient else "insufficient",
            "sample_representativeness": {
                "type_distribution_similarity": type_similarity,
                "likelihood_distribution_similarity": likelihood_similarity
            }
        },
        "quality_assurance": {
            "automated_checks": quality_coverage,
            "total_samples_checked": quality_checks["total"]
        },
        "annotated_vs_unannotated_comparison": {
            "type_distribution": {
                "annotated": dict(type_distribution_annotated),
                "unannotated": dict(type_distribution_unannotated),
                "similarity": type_similarity
            },
            "likelihood_distribution": {
                "annotated": {
                    "mean": float(np.mean(likelihood_annotated)) if likelihood_annotated else 0.0,
                    "std": float(np.std(likelihood_annotated)) if likelihood_annotated else 0.0
                },
                "unannotated": {
                    "mean": float(np.mean(likelihood_unannotated)) if likelihood_unannotated else 0.0,
                    "std": float(np.std(likelihood_unannotated)) if likelihood_unannotated else 0.0
                },
                "similarity": likelihood_similarity
            },
            "performance_comparison": {
                "annotated": perf_annotated,
                "unannotated": perf_unannotated,
                "difference": {
                    "recall": perf_annotated["recall"] - perf_unannotated["recall"],
                    "precision": perf_annotated["precision"] - perf_unannotated["precision"],
                    "relevance": perf_annotated["relevance"] - perf_unannotated["relevance"],
                    "timeliness": perf_annotated["timeliness"] - perf_unannotated["timeliness"]
                }
            }
        }
    }

    output_file = os.path.join(output_dir, "annotation_coverage_analysis.json")
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    return results

def calculate_distribution_similarity(dist1: Dict[str, int], dist2: Dict[str, int]) -> float:
    """Return the cosine similarity between two categorical count distributions.

    Each distribution is normalised to proportions over the union of both key
    sets (a key missing from one side counts as 0), and the cosine of the angle
    between the two proportion vectors is returned.

    Args:
        dist1: Mapping of category -> count for the first sample.
        dist2: Mapping of category -> count for the second sample.

    Returns:
        1.0 when both mappings have no keys at all; 0.0 when either mapping
        sums to zero or either proportion vector has zero magnitude; otherwise
        the cosine similarity, capped at 1.0.
    """
    all_keys = set(dist1.keys()) | set(dist2.keys())
    if not all_keys:
        # Two empty distributions are treated as identical.
        return 1.0

    total1 = sum(dist1.values())
    total2 = sum(dist2.values())
    if total1 == 0 or total2 == 0:
        # One side carries no mass, so it cannot be normalised.
        return 0.0

    # Proportion vectors aligned on the same key iteration order.
    vec1 = [dist1.get(key, 0) / total1 for key in all_keys]
    vec2 = [dist2.get(key, 0) / total2 for key in all_keys]

    dot_product = sum(a * b for a, b in zip(vec1, vec2))
    magnitude1 = np.sqrt(sum(a ** 2 for a in vec1))
    magnitude2 = np.sqrt(sum(b ** 2 for b in vec2))
    if magnitude1 == 0 or magnitude2 == 0:
        return 0.0

    similarity = float(dot_product / (magnitude1 * magnitude2))
    # Floating-point rounding can push the ratio a hair above 1.0 for identical
    # (or proportional) distributions; cap it so the reported similarity never
    # exceeds the mathematical maximum.
    return min(1.0, similarity)

if __name__ == "__main__":
    # Script entry point: run the annotation coverage analysis over every
    # configured model except those whose id contains "oracle".

    # Input/output locations can be overridden through the environment;
    # the defaults are relative to the current working directory.
    evaluation_dir = os.environ.get("EVALUATION_DIR", "../evaluation_output")
    output_dir = os.environ.get(
        "OUTPUT_DIR", "../iclr_rebuttal/annotation_coverage_analysis"
    )

    all_model_ids = get_all_model_ids()
    # Case-insensitive exclusion of "oracle" model ids.
    model_ids = list(
        filter(lambda model_id: "oracle" not in model_id.lower(), all_model_ids)
    )

    results = analyze_annotation_coverage(
        evaluation_dir=evaluation_dir,
        output_dir=output_dir,
        model_ids=model_ids,
        num_workers=8,
    )
