"""
Research Framework for Offline AI Educational Chatbot System
This file defines the comprehensive research framework including tasks, datasets, 
baselines, evaluation metrics, and implementation specifications.
"""

# Task definition: the free-text problem statement for the framework.
# The text is assembled one physical line at a time so that the leading
# newline, the trailing spaces on the wrapped sentences, and the final
# newline are all visible in the source instead of hidden in a block string.
TASK = (
    "\n"
    "Develop an offline AI chatbot system specifically designed for educational deployment \n"
    "in underserved regions. The system must combine Open Educational Resources (OER) with \n"
    "lightweight language models to provide ChatGPT-like educational experiences without \n"
    "internet connectivity. Key requirements include:\n"
    "- Educational focus without general AI distractions\n"
    "- On-premise deployment for institutional control\n"
    "- Two-way knowledge exchange capabilities\n"
    "- Resource-efficient operation (<4GB RAM)\n"
    "- Multi-institutional deployment (schools, prisons, military bases, training centers)\n"
)

# Dataset specifications, keyed by a short dataset identifier.
# Each entry is a flat mapping of descriptive fields. The field set is not
# uniform: some entries carry "subjects", others "type", and the last field is
# one of "preprocessing", "generation" or "validation". Sizes are approximate,
# human-readable strings rather than numbers.
DATASETS = dict(
    primary_educational_content=dict(
        source="Kolibri Open Educational Platform",
        subjects=["Mathematics", "Science", "Language Arts"],
        grade_levels="6-12",
        format="Structured educational modules with learning objectives",
        size="~50GB educational content",
        preprocessing="Content extraction, curriculum alignment, Q&A generation",
    ),
    mathematics_curriculum=dict(
        source="OpenStax Mathematics Textbooks",
        subjects=["Algebra", "Calculus", "Statistics", "Geometry"],
        format="PDF textbooks converted to structured text",
        size="~10GB mathematical content",
        preprocessing="Mathematical notation parsing, problem-solution extraction",
    ),
    interactive_content=dict(
        source="Khan Academy Open Resources",
        type="Video transcripts, practice problems, explanations",
        format="JSON structured educational interactions",
        size="~20GB interactive content",
        preprocessing="Transcript cleaning, concept mapping, difficulty grading",
    ),
    teacher_student_interactions=dict(
        source="Synthetic generation based on educational patterns",
        type="Conversation logs, query-response pairs",
        format="Multi-turn educational dialogues",
        size="~5GB interaction data",
        generation="Template-based with curriculum-aligned variations",
    ),
    local_cultural_content=dict(
        source="Community contribution portal",
        type="Local knowledge, cultural context, regional curricula",
        format="User-contributed educational materials",
        size="Variable, community-dependent",
        validation="Teacher review and quality assessment",
    ),
)

# Baseline methods to compare against, grouped into three families:
# non-AI offline material, online AI systems, and small language models.
# Within a family, each key names a baseline and maps to a one-line description.
BASELINES = dict(
    traditional_methods=dict(
        offline_textbooks="Standard offline educational materials without AI",
        kolibri_vanilla="Kolibri platform without AI interaction capabilities",
        static_educational_apps="Pre-programmed educational software",
    ),
    online_ai_systems=dict(
        chatgpt_educational="ChatGPT with educational prompting (online)",
        educational_chatbots="Existing online educational AI assistants",
        tutoring_platforms="AI-powered online tutoring systems",
    ),
    lightweight_ai_baselines=dict(
        distilbert_baseline="Standard DistilBERT without educational fine-tuning",
        bert_tiny="Ultra-lightweight BERT for resource comparison",
        gpt2_small="Small GPT-2 model for generative comparison",
    ),
)

# Evaluation framework: five metric categories, each mapping a metric name to
# a prose description of what is measured. Numeric targets, where given, are
# embedded in the description text (e.g. response time and memory usage).
EVALUATION = dict(
    educational_effectiveness=dict(
        curriculum_alignment_accuracy="Percentage of responses aligned with curriculum standards",
        learning_objective_coverage="Completeness of educational objective addressing",
        concept_explanation_quality="Human-evaluated explanation clarity and correctness",
        adaptive_difficulty_adjustment="System's ability to match user learning level",
    ),
    technical_performance=dict(
        response_time="Average inference time per query (target: <500ms)",
        memory_usage="Peak RAM consumption during operation (target: <4GB)",
        model_size="Total system storage requirements",
        offline_reliability="System stability without internet connectivity",
    ),
    user_experience=dict(
        satisfaction_surveys="Likert scale user satisfaction ratings (1-7)",
        engagement_metrics="Session duration, query frequency, return usage",
        usability_assessment="Task completion rates for educational goals",
        teacher_effectiveness="Impact on teacher curriculum creation efficiency",
    ),
    deployment_success=dict(
        institutional_adoption="Number of institutions successfully deploying system",
        user_reach="Total unique users across deployment sites",
        geographic_coverage="Diversity of deployment regions and contexts",
        scalability_metrics="Performance under increasing user loads",
    ),
    domain_specific=dict(
        educational_query_classification="Accuracy in identifying educational vs. non-educational queries",
        hallucination_prevention="Rate of factually incorrect or off-topic responses",
        cultural_sensitivity="Appropriateness of responses across different cultural contexts",
        safety_compliance="Adherence to educational content safety standards",
    ),
)

# Results comparison template: a Markdown-style table, the list of headline
# metrics, and the statistical tests to report.
# The table text is built one physical line at a time so that the four-space
# indent on every row, the whitespace-only spacer line, and the trailing indent
# at the end of the string are explicit.
# NOTE(review): the figures in the table look like illustrative/target values
# rather than measured results -- confirm before reusing them.
COMPARISON_TEMPLATE = dict(
    table_format=(
        "\n"
        "    | Method | Accuracy | Response Time | Memory Usage | User Satisfaction | Deployment Success |\n"
        "    |--------|----------|---------------|--------------|------------------|-------------------|\n"
        "    | Offline Textbooks | N/A | N/A | Minimal | 3.2/7 | High |\n"
        "    | Kolibri Vanilla | N/A | Fast | 1GB | 4.1/7 | High |\n"
        "    | ChatGPT Educational | 85% | 2000ms* | N/A | 6.2/7 | Requires Internet |\n"
        "    | Our Method | 92% | 450ms | 3.8GB | 5.7/7 | High |\n"
        "    \n"
        "    *Includes network latency\n"
        "    "
    ),
    key_metrics=[
        "Educational Query Accuracy",
        "Average Response Time",
        "Peak Memory Usage",
        "User Satisfaction Score",
        "Successful Deployments",
        "Total User Reach",
    ],
    statistical_significance="Chi-square tests for categorical outcomes, t-tests for continuous metrics",
)

# Ablation study components.
# Each row below is (component, description, expected_impact); the
# comprehension expands every row into a dict with exactly those three keys,
# in that order, preserving the row order in the resulting list.
ABLATIONS = [
    {
        "component": component,
        "description": description,
        "expected_impact": expected_impact,
    }
    for component, description, expected_impact in (
        (
            "Educational Fine-tuning",
            "Remove curriculum-specific fine-tuning layer",
            "Reduced educational accuracy, increased off-topic responses",
        ),
        (
            "Response Filtering",
            "Disable educational content filtering mechanisms",
            "Higher hallucination rates, inappropriate content generation",
        ),
        (
            "Multi-turn Context",
            "Remove conversation history tracking",
            "Reduced coherence in extended educational conversations",
        ),
        (
            "Cultural Adaptation",
            "Remove local content integration capabilities",
            "Reduced relevance for local educational contexts",
        ),
        (
            "Model Compression",
            "Use full-size model instead of compressed version",
            "Improved accuracy but increased resource requirements",
        ),
        (
            "Curriculum Alignment",
            "Remove explicit curriculum standard alignment",
            "Reduced pedagogical effectiveness",
        ),
    )
]

# Implementation specifications: tooling, model architecture, training
# hyperparameters, deployment hardware requirements and QA expectations.
# Values are descriptive strings except for the numeric model dimensions and
# training hyperparameters.
IMPLEMENTATION = {
    # Software stack used to build and serve the system.
    "development_environment": {
        "language": "Python 3.9+",
        "ml_framework": "PyTorch 1.11+",
        "transformers_library": "Hugging Face Transformers 4.15+",
        "deployment_framework": "FastAPI for REST interface",
        "database": "SQLite for local content storage",
        "ui_framework": "React for web interface"
    },
    # Architecture figures for the chosen base checkpoint.
    "model_specifications": {
        "base_model": "DistilBERT-base-uncased",
        "vocabulary_size": 30522,
        "hidden_size": 768,
        "num_attention_heads": 12,
        "num_hidden_layers": 6,
        "max_sequence_length": 512,
        # DistilBERT is about 40% smaller than BERT-base (roughly 66M vs. 110M
        # parameters); it is not "6x smaller" -- 6 is its layer count.
        "compression_ratio": "40% fewer parameters than BERT-base"
    },
    # Fine-tuning hyperparameters.
    "training_configuration": {
        "optimizer": "AdamW",
        "learning_rate": 2e-5,
        "weight_decay": 0.01,
        "warmup_steps": 1000,
        "max_epochs": 10,
        "early_stopping_patience": 3,
        "validation_split": 0.1  # fraction of the data held out for validation
    },
    # Hardware and OS requirements for a deployment site.
    "deployment_requirements": {
        "minimum_ram": "4GB",
        "recommended_ram": "8GB",
        # NOTE(review): the sizes listed in DATASETS add up to roughly 85GB of
        # source material, which does not fit in 20GB as written -- presumably
        # this refers to the processed/compressed corpus; confirm.
        "storage_space": "20GB (including all educational content)",
        "cpu_requirements": "2+ cores, 2GHz+",
        "operating_systems": ["Windows 10+", "macOS 10.14+", "Ubuntu 18.04+"],
        "offline_capability": "100% offline operation after initial installation"
    },
    # Testing and quality expectations.
    "quality_assurance": {
        "testing_framework": "pytest for unit tests",
        "coverage_requirement": "95% code coverage",
        "integration_testing": "End-to-end educational workflow validation",
        "performance_testing": "Load testing with simulated user interactions",
        "security_testing": "Vulnerability assessment for institutional deployment"
    }
}

# Success criteria: three lists of free-text milestones/claims, grouped by
# technical targets, user impact, and intended research contributions.
# NOTE(review): "85%+ user satisfaction rating" is a percentage, while
# EVALUATION describes satisfaction as a 1-7 Likert rating -- confirm how the
# two are meant to be reconciled.
SUCCESS_CRITERIA = dict(
    technical_milestones=[
        "Achieve 90%+ educational query accuracy",
        "Maintain <500ms average response time",
        "Operate within 4GB RAM constraint",
        "Deploy successfully in 50+ institutions",
    ],
    user_impact_milestones=[
        "Serve 10,000+ unique users",
        "Achieve 85%+ user satisfaction rating",
        "Demonstrate improved learning engagement",
        "Enable educational access in previously underserved regions",
    ],
    research_contributions=[
        "First offline AI educational chatbot system",
        "Novel OER-AI integration methodology",
        "Two-way knowledge exchange framework",
        "Scalable deployment model for underserved regions",
    ],
)