#!/usr/bin/env python3
"""
Data download script for KSKT project
Downloads and preprocesses required datasets
"""

import os
import json
import requests
from typing import Dict, List
import argparse
from tqdm import tqdm
import zipfile
import shutil


class KSKTDataDownloader:
    """Create sample datasets and placeholder model metadata for KSKT training.

    Nothing is fetched over the network by this class: it writes locally
    generated sample JSON files under ``<data_dir>/raw`` and a placeholder
    ``model_info.json`` under ``<data_dir>/models``.
    """

    # Persona templates every sample dataset is derived from.
    _CHARACTER_TEMPLATES = (
        {
            "character_name": "Medieval Knight Sir Gareth",
            "character_description": "A noble knight from the 12th century, devoted to chivalry and honor. Values courage, loyalty, and protecting the innocent.",
            "personality_traits": ["honorable", "brave", "devout", "traditional"],
            "background": "Born into nobility, trained from childhood in combat and courtly manners.",
            "time_period": "medieval",
        },
        {
            "character_name": "Victorian Lady Eleanor",
            "character_description": "An educated Victorian lady from the 1880s, interested in literature and social reform but bound by social conventions.",
            "personality_traits": ["intelligent", "reserved", "compassionate", "proper"],
            "background": "Well-educated daughter of a merchant family, advocates for women's education.",
            "time_period": "victorian",
        },
        {
            "character_name": "Stoic Philosopher Marcus",
            "character_description": "A follower of Stoic philosophy, believes in virtue, reason, and accepting what cannot be changed.",
            "personality_traits": ["rational", "calm", "disciplined", "wise"],
            "background": "Scholar and teacher of Stoic philosophy in ancient Rome.",
            "time_period": "ancient",
        },
    )

    # Prompts paired with every persona to build instruction-following samples.
    _INSTRUCTION_TEMPLATES = (
        {"instruction": "Explain the concept of honor", "category": "philosophy"},
        {"instruction": "Describe your daily routine", "category": "lifestyle"},
        {"instruction": "What is your opinion on modern technology?", "category": "opinion"},
        {"instruction": "How would you handle a moral dilemma?", "category": "ethics"},
        {"instruction": "Tell me about your beliefs", "category": "worldview"},
    )

    # Metric names used to label the sample evaluation records.
    _EVAL_METRICS = ("Memory", "Knowledge_FA", "Persona_AC_b", "Emotion_ES")

    # Upper bound on personas used for evaluation samples. With only three
    # templates defined above, this cap currently selects all of them.
    _EVAL_CHARACTER_LIMIT = 5

    # Model name recorded in model_info.json; the download URL is derived from
    # it so the two values cannot drift apart.
    _BASE_MODEL = "Qwen3-4B-Thinking-2507"

    def __init__(self, data_dir: str = "data"):
        """Remember the directory layout and create the raw/processed folders.

        Args:
            data_dir: Root directory that will hold ``raw``, ``processed`` and
                (once model info is written) ``models``.
        """
        self.data_dir = data_dir
        self.raw_dir = os.path.join(data_dir, "raw")
        self.processed_dir = os.path.join(data_dir, "processed")

        os.makedirs(self.raw_dir, exist_ok=True)
        os.makedirs(self.processed_dir, exist_ok=True)

    @staticmethod
    def _write_json(path: str, payload) -> None:
        """Serialize ``payload`` to ``path`` as indented UTF-8 JSON."""
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(payload, f, indent=2, ensure_ascii=False)

    def _build_character_profiles(self, examples_per_character: int) -> List[dict]:
        """Return ``examples_per_character`` profile records per persona."""
        profiles = []
        for char in self._CHARACTER_TEMPLATES:
            # The id prefix only depends on the persona, so compute it once.
            slug = char["character_name"].lower().replace(" ", "_")
            profiles.extend(
                {
                    "character_id": f"{slug}_{i}",
                    "character_description": char["character_description"],
                    "character_name": char["character_name"],
                    "personality_traits": char["personality_traits"],
                    "background": char["background"],
                    "character_response": f"As {char['character_name']}, I embody the values of {char['time_period']} era...",
                    "metadata": char,
                }
                for i in range(examples_per_character)
            )
        return profiles

    def _build_instruction_examples(self) -> List[dict]:
        """Return one instruction record for every persona/prompt pair."""
        return [
            {
                "system_prompt": f"You are {char['character_name']}. {char['character_description']}",
                "instruction": template["instruction"],
                "response": f"As {char['character_name']}, I would say...",
                "category": template["category"],
                "character_context": char,
            }
            for char in self._CHARACTER_TEMPLATES
            for template in self._INSTRUCTION_TEMPLATES
        ]

    def _build_eval_examples(self) -> List[dict]:
        """Return one evaluation record for every persona/metric pair."""
        eval_characters = self._CHARACTER_TEMPLATES[:self._EVAL_CHARACTER_LIMIT]
        return [
            {
                "character_profile": char["character_description"],
                "query": f"Test query for {metric.lower()} evaluation",
                "expected_response": f"Expected response for {char['character_name']}",
                "metric": metric,
                "expected_facts": ["fact1", "fact2"],
                "personality_traits": char["personality_traits"],
                "expected_emotion": "neutral",
            }
            for char in eval_characters
            for metric in self._EVAL_METRICS
        ]

    def create_sample_data(self, examples_per_character: int = 10) -> Dict[str, str]:
        """Create sample datasets for development and testing.

        Writes three JSON files into ``self.raw_dir``, overwriting any
        existing files with the same names.

        Args:
            examples_per_character: Number of profile records generated for
                each persona template. Defaults to 10.

        Returns:
            Mapping of dataset name (``character_profiles``,
            ``instruction_following``, ``evaluation_data``) to the path of
            the file that was written.

        Raises:
            ValueError: If ``examples_per_character`` is negative.
        """
        if examples_per_character < 0:
            raise ValueError(
                f"examples_per_character must be >= 0, got {examples_per_character}"
            )

        print("Creating sample datasets...")

        # Character profiles
        sample_characters = self._build_character_profiles(examples_per_character)
        char_file = os.path.join(self.raw_dir, "character_profiles.json")
        self._write_json(char_file, sample_characters)
        print(f"Created {len(sample_characters)} character profiles in {char_file}")

        # Instruction-following data
        sample_instructions = self._build_instruction_examples()
        inst_file = os.path.join(self.raw_dir, "instruction_following.json")
        self._write_json(inst_file, sample_instructions)
        print(f"Created {len(sample_instructions)} instruction examples in {inst_file}")

        # CharacterBench-style evaluation data
        sample_eval_data = self._build_eval_examples()
        eval_file = os.path.join(self.raw_dir, "character_bench_test.json")
        self._write_json(eval_file, sample_eval_data)
        print(f"Created {len(sample_eval_data)} evaluation examples in {eval_file}")

        return {
            "character_profiles": char_file,
            "instruction_following": inst_file,
            "evaluation_data": eval_file
        }

    def download_pretrained_models(self):
        """Write placeholder model metadata (no model files are downloaded).

        Creates ``<data_dir>/models`` if needed and writes ``model_info.json``
        describing the base model a real implementation would fetch.
        """
        print("Downloading pretrained models...")

        # In a real implementation, this would download:
        # - Qwen3-4B-Thinking base model
        # - Tokenizer files
        # - Any other required model files

        models_dir = os.path.join(self.data_dir, "models")
        os.makedirs(models_dir, exist_ok=True)

        # Placeholder model info. The URL is built from the model name so it
        # carries the same "-2507" suffix as "base_model".
        # NOTE(review): "model_files" is a placeholder list - confirm against
        # the actual repository contents before implementing the download.
        model_info = {
            "base_model": self._BASE_MODEL,
            "download_url": f"https://huggingface.co/Qwen/{self._BASE_MODEL}",
            "model_files": ["pytorch_model.bin", "config.json", "tokenizer.json"],
            "status": "placeholder - implement actual download"
        }

        info_file = os.path.join(models_dir, "model_info.json")
        self._write_json(info_file, model_info)

        print(f"Model info saved to {info_file}")
        print("Note: Implement actual model download in production")


def main(argv=None) -> None:
    """Command-line entry point: parse flags and run the requested setup steps.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``. ``None``
            (the default) keeps the normal command-line behaviour.
    """
    parser = argparse.ArgumentParser(description="Download and setup KSKT data")
    parser.add_argument('--data_dir', type=str, default='data', help='Data directory')
    parser.add_argument('--download_models', action='store_true', help='Download pretrained models')
    parser.add_argument('--create_samples', action='store_true', help='Create sample datasets')

    args = parser.parse_args(argv)

    if not (args.create_samples or args.download_models):
        # Bail out before constructing the downloader: its constructor creates
        # the data directories, which a no-op invocation should not do.
        print("No action specified. Use --create_samples or --download_models")
        parser.print_help()
        return

    downloader = KSKTDataDownloader(args.data_dir)

    if args.create_samples:
        data_files = downloader.create_sample_data()
        print("Sample data created successfully!")
        print("Files created:")
        for key, path in data_files.items():
            print(f"  {key}: {path}")

    if args.download_models:
        downloader.download_pretrained_models()
        print("Model download setup complete!")


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
