"""
Shared topic list for all experiments (CKM, Pool, Ablations).
50 topics across diverse research domains.

Selection criteria (verified against arXiv 2019-2027):
  - Init (2019-2024): >= 30 papers
  - Evolution (2024-2025): >= 50 papers
  - Validation (2025-2027): >= 30 papers
  - Total >= 100 papers
"""
import re


def slugify_topic_name(name: str) -> str:
    """Return a lowercase, underscore-separated slug for a topic name.

    The name is lowercased, every run of characters outside ``a-z0-9`` is
    replaced by a single underscore, and leading/trailing underscores are
    removed. Characters outside ASCII letters and digits (including accented
    letters) act as separators; they are not transliterated.

    >>> slugify_topic_name("Low-resource machine translation")
    'low_resource_machine_translation'
    """
    # The ``+`` quantifier already collapses each separator run (literal
    # underscores included) into one "_", so no extra squeeze pass is needed.
    return re.sub(r"[^a-z0-9]+", "_", name.lower()).strip("_")


# Each entry is a (domain, topic) pair of strings, grouped by research area.
# The TOPICS comprehension in this module unpacks the first element as the
# record's "domain" and the second as its "name", and derives "slug" from the
# second element.
TOPIC_SPECS = [
    # --- NLP / Translation ---
    ("Low-resource translation", "Low-resource machine translation"),
    ("Multilingual NLP", "Cross-lingual transfer for low-resource languages"),
    ("Multilingual LLM", "Multilingual large language models"),
    # --- Speech ---
    ("Speech recognition", "Low-resource speech recognition"),
    # --- Software Engineering ---
    ("Software engineering intelligence", "AI for software engineering"),
    ("Code security", "Security of code LLMs"),
    ("Code generation", "Code generation with large language models"),
    # --- LLM Evaluation ---
    ("LLM benchmarking", "Accuracy of automated evaluation for LLMs"),
    # --- LLM Core Capabilities ---
    ("Factual reliability", "Factual consistency in large language models"),
    ("Long-context modeling", "Long-context understanding in large language models"),
    ("LLM reasoning", "Chain-of-thought reasoning in large language models"),
    ("Prompt engineering", "Prompt engineering and in-context learning"),
    ("Instruction tuning", "Instruction tuning for large language models"),
    # --- Agents ---
    ("Agent reasoning", "Complex reasoning for AI agents"),
    ("LLM agents", "Tool-using agents with large language models"),
    # --- Alignment & Safety ---
    ("RLHF alignment", "Reinforcement learning from human feedback"),
    ("Explainable AI", "Explainability and interpretability of neural networks"),
    ("Adversarial robustness", "Adversarial robustness of deep learning models"),
    ("Fairness in ML", "Fairness and bias mitigation in machine learning"),
    # --- Text Understanding ---
    ("Text generation diversity", "Diversity evaluation in text generation"),
    ("Question answering", "Open-domain question answering with language models"),
    ("Sentiment analysis", "Aspect-based sentiment analysis"),
    ("Relation extraction", "Relation extraction from text"),
    ("Document understanding", "Document understanding and information extraction"),
    # --- Multimodal ---
    ("Vision-language models", "Vision-language models and multimodal learning"),
    ("Visual question answering", "Visual question answering with multimodal models"),
    ("Text-to-image generation", "Text-to-image generation and diffusion models"),
    # --- Information Retrieval ---
    ("Recommendation systems", "Deep learning for recommendation systems"),
    ("Conversational search", "Conversational information retrieval"),
    # --- Medical & Bio ---
    ("Medical NLP", "Clinical natural language processing"),
    ("Drug discovery", "Machine learning for drug discovery"),
    ("Medical image analysis", "Deep learning for medical image analysis"),
    ("Protein structure", "Protein structure prediction with deep learning"),
    # --- Scientific Discovery ---
    ("Scientific discovery support", "AI for hypothesis generation in science"),
    ("Knowledge graphs", "Knowledge graph completion and reasoning"),
    # --- Efficiency & Optimization ---
    ("LLM efficiency", "Efficient fine-tuning of large language models"),
    ("Small model scaling", "Knowledge distillation for small language models"),
    ("Model compression", "Model compression and pruning for neural networks"),
    ("Resource-aware serving", "Mixture of experts routing for language models"),
    # --- Data ---
    ("Data curation", "Data filtering for domain-specific models"),
    ("Synthetic data quality", "Synthetic data quality evaluation"),
    ("Data augmentation", "Data augmentation for natural language processing"),
    # --- Robustness & Generalization ---
    ("Distribution shift analysis", "Out-of-distribution detection for language models"),
    ("Continual learning", "Continual learning and catastrophic forgetting"),
    ("Transfer learning", "Domain adaptation and transfer learning"),
    ("Federated learning", "Federated learning for language models"),
    # --- Human-AI Interaction ---
    ("Human-AI interaction", "User experience evaluation for large language models"),
    # --- Scientific ML ---
    ("Scientific machine learning", "Surrogate modeling in physics-informed machine learning"),
    # --- Graph ---
    ("Graph neural networks", "Graph neural networks for natural language processing"),
    # --- Active Learning ---
    ("Active learning", "Active learning for natural language processing"),
]

# One record per TOPIC_SPECS entry: the first tuple element becomes "domain",
# the second becomes "name", and "slug" is the slugified form of "name".
TOPICS = [
    {"slug": slugify_topic_name(full_name), "domain": label, "name": full_name}
    for label, full_name in TOPIC_SPECS
]
