"""
Core data structures for AI Scientist system.
Following Linus principle: "Bad programmers worry about the code. Good programmers worry about data structures."
"""

from dataclasses import dataclass
from typing import Dict, List, Any, Optional
from datetime import datetime
from enum import Enum

class HypothesisType(Enum):
    """Categories of methodological hypothesis the AI can investigate.

    Each member's value is a lowercase string. ``ResearchHypothesis.to_dict``
    emits that string (``.value``) instead of the enum member.
    """
    # Category of H1_CALIBRATION (evidence grounding vs. prompt-only calibration).
    CALIBRATION = "calibration"
    # Category of H2_ARCHITECTURE (specialized agents vs. a monolithic LLM).
    ARCHITECTURE = "architecture" 
    # Category of H3_CONSTRAINTS (Bass constraints vs. unconstrained forecasts).
    CONSTRAINTS = "constraints"

class ExperimentStatus(Enum):
    """Status of a single experimental run.

    Stored on ``ExperimentResult.status``. ``ResearchResults.success_rate``
    counts only ``COMPLETED`` runs as successful.
    """
    # NOTE(review): presumably "defined but not started yet" — confirm with the runner
    PLANNED = "planned"
    # NOTE(review): presumably "currently executing" — confirm with the runner
    RUNNING = "running"
    # The only status that success_rate treats as a success.
    COMPLETED = "completed"
    # NOTE(review): presumably paired with ExperimentResult.error_message — confirm
    FAILED = "failed"

@dataclass
class ResearchHypothesis:
    """A single methodological research hypothesis generated by AI.

    All fields are stored exactly as given; ``confidence`` is the only one
    that is validated at construction time.

    Raises:
        ValueError: If ``confidence`` is not a number in the closed interval
            [0, 1]. NaN is rejected as well.
    """
    id: str
    type: HypothesisType
    question: str
    method_a: str
    method_b: str
    metrics: List[str]
    expected_outcome: str
    confidence: float
    reasoning: str

    def __post_init__(self):
        """Validate hypothesis structure.

        Raises:
            ValueError: If ``confidence`` lies outside [0, 1] or is NaN.
        """
        # A chained comparison is False for NaN, so negating it rejects NaN
        # too. The previous form (`< 0 or > 1`) silently accepted NaN because
        # every ordering comparison against NaN is False.
        if not (0 <= self.confidence <= 1):
            raise ValueError("Confidence must be between 0 and 1")

    def to_dict(self) -> Dict[str, Any]:
        """Convert hypothesis to a plain dictionary.

        Returns:
            A dict with one key per field. ``type`` is emitted as the enum's
            ``.value`` when it has one, otherwise as ``str(type)``, so a
            hypothesis built with a plain string still serializes.
        """
        return {
            "id": self.id,
            "type": self.type.value if hasattr(self.type, "value") else str(self.type),
            "question": self.question,
            "method_a": self.method_a,
            "method_b": self.method_b,
            "metrics": self.metrics,
            "expected_outcome": self.expected_outcome,
            "confidence": self.confidence,
            "reasoning": self.reasoning,
        }

@dataclass
class ExperimentalProtocol:
    """Complete experimental design generated by AI Meta-Scientist.

    ``created_by`` defaults to ``"AI-Meta-Scientist"``. ``created_at`` may be
    omitted (or passed as ``None``), in which case it is stamped with the
    construction time in ``__post_init__``.
    """
    hypotheses: List[ResearchHypothesis]
    data_split: Dict[str, str]
    baselines: List[str]
    metrics: List[str]
    statistical_tests: List[str]
    sample_size: int
    random_seed: int
    created_by: str = "AI-Meta-Scientist"
    # None is a placeholder meaning "use the construction time".
    created_at: Optional[datetime] = None

    def __post_init__(self):
        """Fill in ``created_at`` with the current time when not supplied."""
        if self.created_at is None:
            # Naive local time, kept so isoformat() output is unchanged.
            self.created_at = datetime.now()

    @staticmethod
    def _hypothesis_to_dict(hypothesis: Any) -> Dict[str, Any]:
        """Serialize one hypothesis entry to a plain dict."""
        if hasattr(hypothesis, "to_dict"):
            return hypothesis.to_dict()
        if isinstance(hypothesis, dict):
            # Entries reloaded from JSON are already dicts; they have no
            # __dict__, so the attribute fallback below would raise.
            return dict(hypothesis)
        # Copy so callers cannot mutate the object through the result.
        return dict(hypothesis.__dict__)

    def to_dict(self) -> Dict[str, Any]:
        """Convert protocol to dictionary for JSON serialization.

        Returns:
            A dict with one key per field. Each hypothesis is converted with
            its own ``to_dict()`` when available; plain dicts and other
            attribute-bearing objects are shallow-copied. ``created_at`` is
            emitted as an ISO-8601 string, or ``None`` if it was cleared
            after construction.
        """
        return {
            "hypotheses": [self._hypothesis_to_dict(h) for h in self.hypotheses],
            "data_split": self.data_split,
            "baselines": self.baselines,
            "metrics": self.metrics,
            "statistical_tests": self.statistical_tests,
            "sample_size": self.sample_size,
            "random_seed": self.random_seed,
            "created_by": self.created_by,
            "created_at": self.created_at.isoformat() if self.created_at is not None else None,
        }

@dataclass
class ExperimentResult:
    """Results from a single experimental run.

    The first six fields are required; ``error_message`` and
    ``artifacts_path`` default to ``None``.
    """
    # NOTE(review): presumably matches ResearchHypothesis.id — confirm with the producer
    hypothesis_id: str
    method: str
    baseline: str
    # NOTE(review): presumably metric name -> measured value — confirm
    metrics_values: Dict[str, float]
    runtime_seconds: float
    # ResearchResults.success_rate counts runs whose status is COMPLETED.
    status: ExperimentStatus
    # NOTE(review): presumably only populated for failed runs — confirm
    error_message: Optional[str] = None
    # NOTE(review): presumably the location of files the run produced — confirm
    artifacts_path: Optional[str] = None

@dataclass
class ResearchResults:
    """Aggregated output of every experimental run under one protocol."""
    protocol: ExperimentalProtocol
    experiment_results: List[ExperimentResult]
    statistical_analysis: Dict[str, Any]
    insights: List[str]
    figures: List[str]  # Paths to generated figures
    tables: List[str]   # Paths to generated tables

    @property
    def success_rate(self) -> float:
        """Share of runs that finished with status ``COMPLETED``.

        Returns:
            A fraction between 0.0 and 1.0 (not a 0-100 percentage), or 0.0
            when there are no results at all.
        """
        runs = self.experiment_results
        if not runs:
            return 0.0
        completed = [run for run in runs if run.status == ExperimentStatus.COMPLETED]
        return len(completed) / len(runs)

@dataclass
class AuthorshipLedger:
    """Record of AI versus human contributions, kept for conference compliance.

    ``ai_percentage`` is a stored field and is not updated by
    ``calculate_ai_percentage``; callers decide whether to sync the two.
    """
    ai_contributions: Dict[str, int]
    human_contributions: Dict[str, int]
    total_tokens_generated: int
    claude_api_calls: int
    lines_code_generated: int
    figures_created: int
    ai_percentage: float
    lineage_complete: bool
    reproducible: bool

    def calculate_ai_percentage(self) -> float:
        """Derive the AI share of all recorded contributions.

        Returns:
            The sum of ``ai_contributions`` values divided by the combined AI
            and human sums, scaled to 0-100. Returns 0.0 when the combined
            sum is zero.
        """
        ai_total = sum(self.ai_contributions.values())
        combined = ai_total + sum(self.human_contributions.values())
        # Guard against dividing by zero when nothing has been recorded.
        return 0.0 if combined == 0 else (ai_total / combined) * 100

@dataclass
class ConferenceSubmission:
    """Everything bundled together for one conference submission."""
    paper_latex: str
    figures_paths: List[str]
    authorship_ledger: AuthorshipLedger
    ai_contribution_disclosure: Dict[str, str]
    responsible_ai_statement: str
    reproducibility_statement: str
    anonymized: bool = False

    def validate_conference_requirements(self) -> List[str]:
        """Check the submission against the Stanford conference requirements.

        Returns:
            One human-readable message per unmet requirement, in a fixed
            order: AI share, lineage, responsible-AI statement,
            reproducibility statement, anonymization. An empty list means
            every check passed.
        """
        ledger = self.authorship_ledger
        problems: List[str] = []

        # The AI-share message embeds the stored percentage, so it is built
        # separately from the fixed-text checks below.
        if ledger.ai_percentage < 80:
            problems.append(f"AI contribution {ledger.ai_percentage:.1f}% below 80% threshold")

        # Each remaining requirement is a truthiness check with a fixed message.
        required = (
            (ledger.lineage_complete, "Incomplete source lineage - some claims lack provenance"),
            (self.responsible_ai_statement, "Missing responsible AI statement"),
            (self.reproducibility_statement, "Missing reproducibility statement"),
            (self.anonymized, "Paper not anonymized for blind review"),
        )
        problems.extend(message for satisfied, message in required if not satisfied)

        return problems

# Pre-defined research hypotheses following GPT-5's framework.
# There is one module-level instance per HypothesisType member;
# create_standard_protocol() bundles all three into a single protocol.

# H1 - calibration: evidence-grounded system vs. prompt-only baseline.
H1_CALIBRATION = ResearchHypothesis(
    id="H1_calibration",
    type=HypothesisType.CALIBRATION,
    question="Does evidence grounding improve PTRS calibration vs prompt-only?",
    method_a="Evidence-grounded multi-agent system",
    method_b="Prompt-only LLM baseline",
    metrics=[
        "brier_score",
        "log_loss",
        "calibration_slope",
        "pi_coverage_80",
    ],
    expected_outcome="Evidence grounding reduces calibration error by >20%",
    confidence=0.8,
    reasoning="Source-grounded claims should provide more reliable probability estimates",
)

# H2 - architecture: specialized multi-agent system vs. a single LLM.
H2_ARCHITECTURE = ResearchHypothesis(
    id="H2_architecture",
    type=HypothesisType.ARCHITECTURE,
    question="Do specialized agents outperform monolithic LLM?",
    method_a="Multi-agent pharmaceutical system",
    method_b="Single LLM with prompt engineering",
    metrics=[
        "mape_peak_sales",
        "portfolio_rnpv",
        "decision_accuracy",
    ],
    expected_outcome="Multi-agent system achieves 15%+ better accuracy",
    confidence=0.85,
    reasoning="Task specialization should reduce cognitive load and improve domain reasoning",
)

# H3 - constraints: Bass-constrained forecasts vs. unconstrained LLM forecasts.
H3_CONSTRAINTS = ResearchHypothesis(
    id="H3_constraints",
    type=HypothesisType.CONSTRAINTS,
    question="Do Bass constraints improve prediction intervals?",
    method_a="Bass diffusion with pharmaceutical constraints",
    method_b="Unconstrained LLM forecasts",
    metrics=[
        "pi_coverage_80",
        "pi_coverage_90",
        "rmse",
        "constraint_violations",
    ],
    expected_outcome="Constraints improve PI coverage to >75% from baseline ~60%",
    confidence=0.75,
    reasoning="Domain constraints should prevent physically impossible forecasts",
)

def create_standard_protocol(
    *,
    sample_size: int = 100,
    random_seed: int = 42,
    created_by: str = "Claude-3.5-Sonnet-20241022",
) -> ExperimentalProtocol:
    """Create the standard protocol for pharmaceutical AI research.

    Calling it with no arguments yields the same protocol as before. The
    keyword-only parameters let a caller scale the run up or change the seed
    without editing this module.

    Args:
        sample_size: Value stored on the protocol's ``sample_size`` field.
            Defaults to 100 (start modest, scale up).
        random_seed: Value stored on the protocol's ``random_seed`` field.
            Defaults to 42.
        created_by: Author label recorded on the protocol.

    Returns:
        A new ``ExperimentalProtocol`` holding the three pre-defined
        hypotheses. The hypothesis objects are the shared module-level
        instances, not copies.
    """
    return ExperimentalProtocol(
        hypotheses=[H1_CALIBRATION, H2_ARCHITECTURE, H3_CONSTRAINTS],
        data_split={
            "train": "≤2018 pharmaceutical launches",
            "validation": "2019-2021 launches",
            "test": "2022-2024 launches (held-out)",
        },
        baselines=[
            "analog_spreadsheet_method",
            "deterministic_epidemiological_funnel",
            "prompt_only_llm_no_tools",
        ],
        metrics=["mape", "brier_score", "pi_coverage", "portfolio_rnpv"],
        statistical_tests=[
            "paired_bootstrap_significance",
            "spiegelhalter_calibration_test",
            "portfolio_value_under_budget_constraint",
        ],
        sample_size=sample_size,
        random_seed=random_seed,
        created_by=created_by,
    )