#!/usr/bin/env python3
"""
Data preprocessing utilities for KSKT training data
"""

import json
import os
from typing import Dict, List, Optional, Tuple
import re
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
import argparse


class KSKTDataPreprocessor:
    """Preprocessor for KSKT training data.

    Converts raw character-profile and instruction-following JSON files into
    a common record layout (``type``, ``role_description``, ``conversation``,
    ``metadata``), generates role-user conflict scenarios from processed
    character profiles, and validates processed files.
    """

    # Query templates for each conflict scenario type. Placeholders in braces
    # are filled from _TEMPLATE_PARAMS.
    _CONFLICT_TEMPLATES = {
        'knowledge_boundary': [
            "Can you explain {advanced_topic}?",
            "What do you know about {modern_concept}?",
            "How does {technology} work?"
        ],
        'value_system_conflict': [
            "What do you think about {controversial_topic}?",
            "Should people {moral_dilemma}?",
            "Is it okay to {ethical_question}?"
        ],
        'emotional_support': [
            "I'm feeling really {emotion} about {situation}. Can you help?",
            "I'm going through {difficult_situation}. What should I do?",
            "I need someone to talk to about {personal_issue}."
        ],
        'expertise_boundary': [
            "Can you {professional_task}?",
            "I need help with {specialized_skill}.",
            "How do I {expert_knowledge}?"
        ]
    }

    # Example values for every placeholder used in _CONFLICT_TEMPLATES.
    # Each placeholder needs an entry here; a missing one would leave the raw
    # "{placeholder}" text in the generated user query.
    # NOTE: only the first value of each list is currently used.
    _TEMPLATE_PARAMS = {
        'advanced_topic': ['quantum mechanics', 'calculus', 'molecular biology'],
        'modern_concept': ['artificial intelligence', 'cryptocurrency', 'social media'],
        'technology': ['smartphones', 'internet', 'GPS'],
        'controversial_topic': ['modern politics', 'genetic engineering', 'privacy rights'],
        'moral_dilemma': ['lie to protect someone they love',
                          'break the law for a good cause',
                          'put family before duty'],
        'ethical_question': ['steal food to feed a starving family',
                             'keep a secret that harms others',
                             'break a promise to help a stranger'],
        'emotion': ['depressed', 'anxious', 'overwhelmed'],
        'situation': ['losing my job', 'relationship problems', 'family issues'],
        'difficult_situation': ['a divorce', 'the loss of a loved one', 'a serious illness'],
        'personal_issue': ['my loneliness', 'my self-doubt', 'my grief'],
        'professional_task': ['perform surgery', 'write legal documents', 'fix my car'],
        'specialized_skill': ['programming', 'accounting', 'engineering design'],
        'expert_knowledge': ['file my taxes', 'diagnose a medical condition', 'design a bridge']
    }

    # Fields every processed record must contain.
    _REQUIRED_FIELDS = ('type', 'role_description', 'conversation')

    def __init__(self, tokenizer_name: str = "Qwen3-4B-Thinking"):
        """Load the tokenizer and register the role/dialogue marker tokens.

        Args:
            tokenizer_name: Name or path passed to
                ``AutoTokenizer.from_pretrained``.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

        # Add special tokens
        special_tokens = ['<role>', '</role>', '<user>', '</user>',
                         '<assistant>', '</assistant>', '<think>', '</think>']
        self.tokenizer.add_special_tokens({'additional_special_tokens': special_tokens})

    @staticmethod
    def _load_json(path: str):
        """Read and parse a UTF-8 encoded JSON file."""
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)

    @staticmethod
    def _dump_json(data, path: str) -> None:
        """Write ``data`` to ``path`` as indented, non-ASCII-preserving JSON."""
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

    @classmethod
    def _fill_template(cls, template: str) -> str:
        """Replace each known placeholder in ``template`` with its first example value."""
        filled = template
        for param, values in cls._TEMPLATE_PARAMS.items():
            placeholder = f'{{{param}}}'
            if placeholder in filled:
                filled = filled.replace(placeholder, values[0])
        return filled

    def process_character_profiles(self, input_file: str, output_file: str):
        """Convert raw character profiles into processed records.

        Each input item becomes a ``character_profile`` record whose
        conversation holds a single assistant turn. Keys missing from an
        input item fall back to empty defaults.

        Args:
            input_file: Path to a JSON file containing a list of profile objects.
            output_file: Path the processed JSON list is written to.
        """
        print(f"Processing character profiles from {input_file}...")

        raw_data = self._load_json(input_file)

        processed_data = [
            {
                'type': 'character_profile',
                'role_description': item.get('character_description', ''),
                'conversation': [
                    {'speaker': 'assistant', 'content': item.get('character_response', '')}
                ],
                'metadata': {
                    'character_id': item.get('character_id', ''),
                    'character_name': item.get('character_name', ''),
                    'personality_traits': item.get('personality_traits', []),
                    'background': item.get('background', '')
                }
            }
            for item in raw_data
        ]

        self._dump_json(processed_data, output_file)

        print(f"Processed {len(processed_data)} character profiles -> {output_file}")

    def process_instruction_following(self, input_file: str, output_file: str):
        """Convert raw instruction-following examples into processed records.

        Each input item becomes an ``instruction_following`` record with a
        user turn followed by an assistant turn. Keys missing from an input
        item fall back to defaults (``difficulty`` defaults to ``'medium'``).

        Args:
            input_file: Path to a JSON file containing a list of example objects.
            output_file: Path the processed JSON list is written to.
        """
        print(f"Processing instruction following data from {input_file}...")

        raw_data = self._load_json(input_file)

        processed_data = [
            {
                'type': 'instruction_following',
                'role_description': item.get('system_prompt', ''),
                'conversation': [
                    {'speaker': 'user', 'content': item.get('instruction', '')},
                    {'speaker': 'assistant', 'content': item.get('response', '')}
                ],
                'metadata': {
                    'category': item.get('category', ''),
                    'difficulty': item.get('difficulty', 'medium')
                }
            }
            for item in raw_data
        ]

        self._dump_json(processed_data, output_file)

        print(f"Processed {len(processed_data)} instruction examples -> {output_file}")

    def create_conflict_scenarios(self, characters_file: str, output_file: str,
                                  max_characters: Optional[int] = 50,
                                  templates_per_type: Optional[int] = 2):
        """Create role-user conflict scenarios for evaluation.

        For each character, one scenario is emitted per selected template of
        every conflict type, in character -> type -> template order.

        Args:
            characters_file: Path to a processed character-profile JSON list
                (records with ``role_description`` and ``metadata``).
            output_file: Path the scenario JSON list is written to.
            max_characters: Use at most this many characters from the start
                of the file; ``None`` uses all of them.
            templates_per_type: Use at most this many templates from the
                start of each conflict type; ``None`` uses all of them.
        """
        print(f"Creating conflict scenarios from {characters_file}...")

        characters = self._load_json(characters_file)

        # The filled queries do not depend on the character, so build them
        # once instead of re-filling every template for every character.
        queries_by_type = {
            scenario_type: [self._fill_template(template)
                            for template in templates[:templates_per_type]]
            for scenario_type, templates in self._CONFLICT_TEMPLATES.items()
        }

        conflict_scenarios = []

        for character in characters[:max_characters]:
            char_profile = character.get('role_description', '')
            metadata = character.get('metadata', {})
            char_name = metadata.get('character_name', 'Character')
            char_id = metadata.get('character_id', '')

            for scenario_type, queries in queries_by_type.items():
                for user_query in queries:
                    conflict_scenarios.append({
                        'scenario_type': scenario_type,
                        'character_name': char_name,
                        'character_profile': char_profile,
                        'user_query': user_query,
                        'expected_self_awareness': f"As {char_name}, I should...",
                        'expected_other_awareness': "The user is asking for...",
                        'metadata': {
                            'difficulty': 'high',
                            'character_id': char_id
                        }
                    })

        self._dump_json(conflict_scenarios, output_file)

        print(f"Created {len(conflict_scenarios)} conflict scenarios -> {output_file}")

    def validate_data_format(self, data_file: str) -> bool:
        """Validate that a processed data file follows the expected format.

        Only the first 10 items are inspected. The file must be a JSON list
        of objects, each with the required fields and a ``conversation`` list
        whose turns are objects with ``speaker`` and ``content`` keys.

        Args:
            data_file: Path to the JSON file to check.

        Returns:
            True if the sampled items are well-formed; False otherwise,
            including when the file cannot be read or parsed. The reason is
            printed.
        """
        try:
            data = self._load_json(data_file)

            if not isinstance(data, list):
                print(f"Top-level JSON value must be a list in {data_file}")
                return False

            for item in data[:10]:  # Check first 10 items
                if not isinstance(item, dict):
                    print(f"Each item must be an object in {data_file}")
                    return False

                for field in self._REQUIRED_FIELDS:
                    if field not in item:
                        print(f"Missing required field '{field}' in {data_file}")
                        return False

                # Check conversation format
                conversation = item['conversation']
                if not isinstance(conversation, list):
                    print(f"Conversation field must be list in {data_file}")
                    return False

                for turn in conversation:
                    if (not isinstance(turn, dict)
                            or 'speaker' not in turn
                            or 'content' not in turn):
                        print(f"Invalid conversation turn format in {data_file}")
                        return False

            print(f"Data format validation passed for {data_file}")
            return True

        except Exception as e:
            # The validator reports every failure (unreadable file, invalid
            # JSON, ...) as False rather than raising to the caller.
            print(f"Validation error for {data_file}: {e}")
            return False


def main():
    """Command-line entry point for KSKT data preprocessing.

    Creates the output directory, then either validates every ``.json`` file
    in it (``--validate_only``) or runs the requested processing steps:
    character profiles, instruction-following data, and conflict scenario
    generation. Conflict scenarios are built from the processed
    ``character_profiles.json`` in the output directory, whether it was
    written in this run or an earlier one.
    """
    parser = argparse.ArgumentParser(description="Preprocess data for KSKT training")
    parser.add_argument('--character_profiles', type=str, help='Input character profiles file')
    parser.add_argument('--instruction_data', type=str, help='Input instruction following data file')
    parser.add_argument('--output_dir', type=str, default='./data/processed', help='Output directory')
    parser.add_argument('--create_conflicts', action='store_true', help='Create conflict scenarios')
    parser.add_argument('--validate_only', action='store_true', help='Only validate data format')

    args = parser.parse_args()

    # Create output directory
    os.makedirs(args.output_dir, exist_ok=True)

    # Initialize preprocessor
    preprocessor = KSKTDataPreprocessor()

    if args.validate_only:
        # Validate existing processed data; sorted so the report order is
        # deterministic across platforms.
        for filename in sorted(os.listdir(args.output_dir)):
            if filename.endswith('.json'):
                file_path = os.path.join(args.output_dir, filename)
                preprocessor.validate_data_format(file_path)
        return

    # Process character profiles
    if args.character_profiles:
        output_file = os.path.join(args.output_dir, 'character_profiles.json')
        preprocessor.process_character_profiles(args.character_profiles, output_file)

    # Process instruction following data
    if args.instruction_data:
        output_file = os.path.join(args.output_dir, 'instruction_following.json')
        preprocessor.process_instruction_following(args.instruction_data, output_file)

    # Create conflict scenarios. This depends only on the processed profile
    # file existing, not on --character_profiles being passed in this run;
    # otherwise --create_conflicts on its own would silently do nothing.
    if args.create_conflicts:
        output_file = os.path.join(args.output_dir, 'conflict_scenarios.json')
        processed_chars = os.path.join(args.output_dir, 'character_profiles.json')
        if os.path.exists(processed_chars):
            preprocessor.create_conflict_scenarios(processed_chars, output_file)
        else:
            print("Process character profiles first before creating conflict scenarios")


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

