#!/usr/bin/env python3

import json
import numpy as np
from pathlib import Path
from typing import Dict, List

def load_trait_definitions(filepath: Path) -> Dict:
    """Read a JSON file and return its parsed content.

    Args:
        filepath: Location of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file; callers are expected
        to supply a file whose top-level value is a JSON object.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's
    # locale-dependent default, which breaks on non-ASCII content.
    with open(filepath, 'r', encoding='utf-8') as f:
        return json.load(f)

def prepare_prompt_pairs(trait: str, definition: Dict) -> List[tuple]:
    """Build every (positive, negative) prompt pairing from a definition.

    Each entry of ``definition['positive']`` is combined with each entry of
    ``definition['negative']``. Both sides of a pair are prefixed with
    ``definition['base_prompt']`` followed by a single space.

    Args:
        trait: Accepted for interface compatibility; not used here.
        definition: Mapping read for the keys ``'base_prompt'``
            (default ``''``), ``'positive'`` and ``'negative'``
            (both default ``[]``).

    Returns:
        A list of ``(positive_prompt, negative_prompt)`` tuples, ordered by
        positive example first and negative example second. Empty when
        either example list is empty.
    """
    prefix = definition.get('base_prompt', '')
    positives = definition.get('positive', [])
    negatives = definition.get('negative', [])

    # Cartesian product: the outer clause walks positives, the inner negatives.
    return [
        (f"{prefix} {good}", f"{prefix} {bad}")
        for good in positives
        for bad in negatives
    ]

def load_dataset_splits(data_dir: Path) -> Dict:
    """Load the ``train``/``val``/``test`` JSON files found in a directory.

    For each split name the file ``<data_dir>/<split>.json`` is read if it
    is present; splits whose file is absent are left out of the result.

    Args:
        data_dir: Directory that holds the split files.

    Returns:
        A dict mapping each split name whose file was found to the parsed
        JSON content of that file.

    Raises:
        json.JSONDecodeError: If a split file exists but is not valid JSON.
        OSError: If a split file exists but cannot be read.
    """
    splits = {}

    for split in ('train', 'val', 'test'):
        split_file = data_dir / f'{split}.json'
        try:
            # Open directly rather than checking exists() first, so a file
            # removed between check and open cannot crash the loader.
            # UTF-8 is set explicitly to avoid the locale-dependent default.
            with open(split_file, 'r', encoding='utf-8') as f:
                splits[split] = json.load(f)
        except (FileNotFoundError, NotADirectoryError):
            # A missing split is optional: skip it.
            continue

    return splits

def preprocess_text(text: str) -> str:
    """Normalize whitespace in ``text``.

    Leading and trailing whitespace is removed and every internal run of
    whitespace (spaces, tabs, newlines) is collapsed to a single space.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string; empty if ``text`` holds only whitespace.
    """
    # split() with no arguments already discards leading/trailing whitespace
    # and treats any whitespace run as a single separator.
    return ' '.join(text.split())

def batch_texts(texts: List[str], batch_size: int = 32) -> List[List[str]]:
    """Split ``texts`` into consecutive batches of at most ``batch_size``.

    Args:
        texts: The items to batch; order is preserved.
        batch_size: Maximum number of items per batch. Must be positive.

    Returns:
        A list of slices of ``texts``. Every batch has ``batch_size`` items
        except possibly the last, which holds the remainder. Empty when
        ``texts`` is empty.

    Raises:
        ValueError: If ``batch_size`` is zero or negative.
    """
    # A negative step would make range() empty and silently drop every
    # text, so reject non-positive sizes up front. Zero already raised
    # ValueError from range().
    if batch_size <= 0:
        raise ValueError(
            f"batch_size must be a positive integer, got {batch_size}"
        )

    return [
        texts[start:start + batch_size]
        for start in range(0, len(texts), batch_size)
    ]

def save_prepared_data(data: Dict, output_path: Path):
    """Write ``data`` to ``output_path`` in a format chosen by its suffix.

    ``.json`` files are written with ``json.dump`` (2-space indent, UTF-8).
    ``.npy`` files are written by passing ``data`` straight to ``np.save``.
    Missing parent directories are created.

    Args:
        data: The object to persist.
        output_path: Destination file; its suffix must be ``.json`` or
            ``.npy``.

    Raises:
        ValueError: If ``output_path`` has an unsupported suffix.
        TypeError: If ``data`` is not JSON-serializable on the ``.json`` path.
        OSError: If the directory or file cannot be created.
    """
    suffix = output_path.suffix
    # Fail loudly before touching the filesystem. Previously an unknown
    # suffix created the directories and then wrote nothing.
    if suffix not in ('.json', '.npy'):
        raise ValueError(
            f"Unsupported output suffix {suffix!r} for {output_path}; "
            "expected '.json' or '.npy'"
        )

    output_path.parent.mkdir(parents=True, exist_ok=True)

    if suffix == '.json':
        # Explicit UTF-8 so the result does not depend on the locale.
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)
    else:
        # NOTE(review): a dict is not an ndarray, so NumPy stores it as a
        # pickled object array; reading it back needs
        # np.load(..., allow_pickle=True). Confirm this is intended.
        np.save(output_path, data)

def main():
    """Prepare one JSON file per configured trait.

    For every trait in ``TRAITS`` this reads
    ``BASE_DIR/data/<trait>_prompts.json``, normalizes the whitespace of
    its ``'prompt'`` value, and writes the result together with its
    ``'examples'`` list to ``BASE_DIR/prepared_data/<trait>_prepared.json``.
    Traits with no prompt file are reported and skipped.
    """
    # NOTE: this is a package-relative import, so it only resolves when the
    # module is loaded as part of its package (e.g. via ``python -m``).
    # Executing the file directly as a script raises ImportError.
    from ..utils.config import BASE_DIR, TRAITS

    data_dir = BASE_DIR / 'data'
    output_dir = BASE_DIR / 'prepared_data'

    for trait in TRAITS:
        print(f"Processing {trait}...")

        trait_file = data_dir / f'{trait}_prompts.json'
        if not trait_file.exists():
            # Say so explicitly, so a missing input is not mistaken for a
            # successfully processed trait.
            print(f"Skipping {trait}: {trait_file} not found")
            continue

        # Explicit UTF-8 so reading does not depend on the locale.
        with open(trait_file, 'r', encoding='utf-8') as f:
            trait_data = json.load(f)

        # The input key is 'prompt' (singular); the output key is 'prompts'.
        processed = {
            'trait': trait,
            'prompts': preprocess_text(trait_data.get('prompt', '')),
            'examples': trait_data.get('examples', []),
        }

        output_path = output_dir / f'{trait}_prepared.json'
        save_prepared_data(processed, output_path)

    print("Dataset preparation complete")

# Script entry point: run the preparation pipeline only when this module is
# executed as the main program, not when it is imported.
if __name__ == "__main__":
    main()