#!/usr/bin/env python3

import json
import numpy as np
from pathlib import Path
from typing import List, Tuple, Dict
from datasets import load_dataset
import logging

# Configure the root logger at INFO level so the progress messages emitted
# through `logger.info` in this module are displayed.
logging.basicConfig(level=logging.INFO)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

class HuggingFaceHarvester:
    """Harvest contrastive text pairs from HuggingFace datasets into JSON files.

    Every ``harvest_*`` method loads one dataset with ``load_dataset`` and
    returns ``(positive, negative)`` text tuples built from its ``train``
    split. ``save_pairs`` writes such a list to
    ``<output_dir>/<trait>_pairs.json`` and ``harvest_all`` runs every
    harvester and saves its result.
    """

    # GoEmotions label ids, following the label order of the dataset
    # (0=admiration, 1=amusement, 2=anger, 3=annoyance, 4=approval, 5=caring,
    # 6=confusion, 7=curiosity, 8=desire, 9=disappointment, 10=disapproval,
    # 11=disgust, 12=embarrassment, 13=excitement, 14=fear, 15=gratitude,
    # 16=grief, 17=joy, 18=love, 19=nervousness, 20=optimism, 21=pride,
    # 22=realization, 23=relief, 24=remorse, 25=sadness, 26=surprise,
    # 27=neutral). The grouping is the positive/negative sentiment split of
    # the GoEmotions taxonomy; ambiguous emotions (confusion, curiosity,
    # realization, surprise) and neutral belong to neither side.
    GOEMOTIONS_POSITIVE_LABELS = frozenset({
        0,   # admiration
        1,   # amusement
        4,   # approval
        5,   # caring
        8,   # desire
        13,  # excitement
        15,  # gratitude
        17,  # joy
        18,  # love
        20,  # optimism
        21,  # pride
        23,  # relief
    })
    GOEMOTIONS_NEGATIVE_LABELS = frozenset({
        2,   # anger
        3,   # annoyance
        9,   # disappointment
        10,  # disapproval
        11,  # disgust
        12,  # embarrassment
        14,  # fear
        16,  # grief
        19,  # nervousness
        24,  # remorse
        25,  # sadness
    })

    # Dataset name -> trait name. No method of this class harvests these
    # datasets; presumably they are planned follow-ups -- TODO confirm.
    UNHARVESTED_TRAIT_MAPPING = {
        'newsroom': 'clarity',
        'empathetic_dialogues': 'register',
        'emotion': 'enthusiasm',
        'debatesum': 'assertiveness',
        'silicone': 'directness',
        'hate_speech': 'inclusivity',
        'debiased_news': 'objectivity',
        'arxiv': 'specificity',
    }

    def __init__(self, output_dir='harvested_data'):
        """Create the harvester and make sure its output directory exists.

        Args:
            output_dir: Directory (string or ``Path``) that receives the JSON
                files. Defaults to ``harvested_data`` relative to the current
                working directory. Missing parent directories are created.
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

    @staticmethod
    def _head(rows, max_rows):
        """Yield at most ``max_rows`` leading items of an indexable ``rows``.

        Each row is fetched by index exactly once; a non-positive
        ``max_rows`` yields nothing.
        """
        for index in range(min(max_rows, len(rows))):
            yield rows[index]

    @staticmethod
    def _pair_by_side(rows, side_of, max_pairs, text_key='text'):
        """Bucket row texts into two sides and zip them into pairs.

        Args:
            rows: Iterable of mapping-like rows.
            side_of: Callable taking a row and returning ``True`` (text goes
                to the first element of a pair), ``False`` (second element)
                or ``None`` (row is skipped).
            max_pairs: Upper bound on the number of pairs returned.
            text_key: Key under which a row stores its text.

        Returns:
            A list of ``(first_side_text, second_side_text)`` tuples. The
            i-th pair joins the i-th text of each side, in row order, and the
            list is as long as the shorter side (capped at ``max_pairs``).
        """
        if max_pairs <= 0:
            return []

        first_side = []
        second_side = []
        for row in rows:
            side = side_of(row)
            if side is None:
                continue
            bucket = first_side if side else second_side
            # Only the first `max_pairs` texts of a side can end up in a pair.
            if len(bucket) < max_pairs:
                bucket.append(row[text_key])
            # Stop scanning as soon as both sides are full: the remaining
            # rows cannot change the result.
            if len(first_side) >= max_pairs and len(second_side) >= max_pairs:
                break

        return list(zip(first_side, second_side))

    @classmethod
    def _goemotions_polarity(cls, labels):
        """Classify a GoEmotions label list as positive, negative or neither.

        Returns:
            ``True`` when the labels contain positive emotions only,
            ``False`` when they contain negative emotions only, and ``None``
            when they contain both polarities or none of them.
        """
        label_set = set(labels)
        has_positive = not label_set.isdisjoint(cls.GOEMOTIONS_POSITIVE_LABELS)
        has_negative = not label_set.isdisjoint(cls.GOEMOTIONS_NEGATIVE_LABELS)
        if has_positive == has_negative:
            # Either mixed polarity or no polar emotion at all: such a text
            # is not a clean example for either side of a pair.
            return None
        return has_positive

    @staticmethod
    def _format_pairs(pairs):
        """Convert ``(positive, negative)`` tuples into JSON-ready dicts."""
        return [{'positive': pos, 'negative': neg} for pos, neg in pairs]

    def harvest_cnn_dailymail(self, max_pairs: int = 5000) -> Dict[str, List[Tuple[str, str]]]:
        """Build (article, highlights) pairs from CNN/DailyMail 3.0.0.

        Args:
            max_pairs: Maximum number of leading train rows to use.

        Returns:
            A dict with keys ``'accessibility'`` and ``'verbosity'``, each
            holding its own list of ``(article, highlights)`` tuples.
        """
        dataset = load_dataset('cnn_dailymail', '3.0.0')
        train_data = dataset['train']

        pairs = [
            (row['article'], row['highlights'])
            for row in self._head(train_data, max_pairs)
        ]

        # NOTE(review): both traits receive identical pairs in the same
        # (article, highlights) orientation -- confirm this is intended.
        return {
            'accessibility': list(pairs),
            'verbosity': list(pairs),
        }

    def harvest_goemotions(self, max_pairs: int = 5000) -> List[Tuple[str, str]]:
        """Build (positive text, negative text) pairs from GoEmotions.

        A text counts as positive (negative) when it carries at least one
        positive (negative) emotion label and no label of the opposite
        polarity; see ``GOEMOTIONS_POSITIVE_LABELS`` and
        ``GOEMOTIONS_NEGATIVE_LABELS``. The two texts of a pair are unrelated
        apart from their position in the scan order.

        Args:
            max_pairs: Maximum number of pairs to return.
        """
        dataset = load_dataset('go_emotions')
        train_data = dataset['train']

        return self._pair_by_side(
            train_data,
            lambda row: self._goemotions_polarity(row['labels']),
            max_pairs,
        )

    def harvest_paradetox(self, max_pairs: int = 4577) -> List[Tuple[str, str]]:
        """Build (neutral comment, toxic comment) pairs from ParaDetox.

        Args:
            max_pairs: Maximum number of leading train rows to use.

        Returns:
            ``(neutral_comment, toxic_comment)`` tuples, one per row.
        """
        # NOTE(review): the 'en' config and the column names below are kept
        # exactly as they were; verify them against the current dataset card.
        dataset = load_dataset('s-nlp/paradetox', 'en')
        train_data = dataset['train']

        return [
            (row['neutral_comment'], row['toxic_comment'])
            for row in self._head(train_data, max_pairs)
        ]

    def harvest_imdb(self, max_pairs: int = 5000) -> List[Tuple[str, str]]:
        """Build (label-1 review, other review) pairs from the IMDB train split.

        Rows whose ``label`` equals 1 fill the first element of a pair, all
        other rows the second element.

        Args:
            max_pairs: Maximum number of pairs to return.
        """
        dataset = load_dataset('imdb')
        train_data = dataset['train']

        return self._pair_by_side(
            train_data,
            lambda row: row['label'] == 1,
            max_pairs,
        )

    def harvest_civil_comments(self, max_pairs: int = 5000) -> List[Tuple[str, str]]:
        """Build (low-toxicity text, high-toxicity text) pairs from Civil Comments.

        Rows with ``toxicity`` below 0.3 fill the first element of a pair,
        rows above 0.7 the second; everything in between is ignored.

        Args:
            max_pairs: Maximum number of pairs to return.
        """
        dataset = load_dataset('civil_comments')
        train_data = dataset['train']

        def side_of(row):
            toxicity = row['toxicity']
            if toxicity < 0.3:
                return True
            if toxicity > 0.7:
                return False
            return None

        return self._pair_by_side(train_data, side_of, max_pairs)

    def save_pairs(self, pairs: List[Tuple[str, str]], trait: str):
        """Write pairs to ``<output_dir>/<trait>_pairs.json``.

        The file holds a JSON list of ``{"positive": ..., "negative": ...}``
        objects, one per input tuple, indented by two spaces.

        Args:
            pairs: ``(positive, negative)`` text tuples.
            trait: Trait name used as the file-name prefix.
        """
        output_file = self.output_dir / f'{trait}_pairs.json'

        # Explicit encoding keeps the output independent of the platform's
        # default locale encoding.
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(self._format_pairs(pairs), f, indent=2)

        logger.info("Saved %d pairs for %s to %s", len(pairs), trait, output_file)

    def harvest_all(self):
        """Run every harvester with its default limit and save the results.

        Files are written for the traits ``accessibility``, ``verbosity``,
        ``emotional_tone``, ``formality``, ``optimism`` and
        ``professionalism``.
        """
        logger.info("Starting HuggingFace dataset harvesting...")

        logger.info("Harvesting CNN/DailyMail...")
        cnn_results = self.harvest_cnn_dailymail()
        self.save_pairs(cnn_results['accessibility'], 'accessibility')
        self.save_pairs(cnn_results['verbosity'], 'verbosity')

        logger.info("Harvesting Go-Emotions...")
        emotion_pairs = self.harvest_goemotions()
        self.save_pairs(emotion_pairs, 'emotional_tone')

        logger.info("Harvesting ParaDetox...")
        formality_pairs = self.harvest_paradetox()
        self.save_pairs(formality_pairs, 'formality')

        logger.info("Harvesting IMDB...")
        optimism_pairs = self.harvest_imdb()
        self.save_pairs(optimism_pairs, 'optimism')

        logger.info("Harvesting Civil Comments...")
        professionalism_pairs = self.harvest_civil_comments()
        self.save_pairs(professionalism_pairs, 'professionalism')

        logger.info("Dataset harvesting complete!")

def main():
    """Create a default ``HuggingFaceHarvester`` and run its full harvest."""
    HuggingFaceHarvester().harvest_all()

# Run the harvest only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()