#!/usr/bin/env python3

import json
import numpy as np
from pathlib import Path
from typing import List, Tuple, Dict
from datasets import load_dataset
import logging

# Configure the root logger at import time with level INFO so the progress
# messages logged by this module are not filtered out.
logging.basicConfig(level=logging.INFO)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

class CompleteHuggingFaceHarvester:
    def __init__(self, output_dir='harvested_data'):
        """Create the harvester and make sure its output directory exists.

        Args:
            output_dir: Directory (``str`` or ``Path``) that the generated
                ``<trait>_pairs.json`` files are written to. Defaults to
                ``'harvested_data'``, resolved against the current working
                directory.
        """
        self.output_dir = Path(output_dir)
        # parents=True lets callers pass a nested path that does not exist
        # yet; exist_ok=True keeps repeated runs from failing.
        self.output_dir.mkdir(parents=True, exist_ok=True)
        
    def harvest_cnn_dailymail(self, max_pairs: int = 5000,
                              max_authority_pairs: int = 3000) -> Dict[str, List[Tuple[str, str]]]:
        """Build accessibility, verbosity and authority pairs from CNN/DailyMail.

        Reads at most the first ``max_pairs`` rows of the ``train`` split of
        ``cnn_dailymail`` (config ``3.0.0``) and derives three pair lists from
        each row's ``article`` and ``highlights`` fields.

        Args:
            max_pairs: Maximum number of rows to read from the train split.
            max_authority_pairs: Maximum number of authority pairs to keep
                (the first ones encountered are kept).

        Returns:
            A dict with three keys:

            * ``'accessibility'``: ``(highlights, article)`` for every row read.
            * ``'verbosity'``: ``(article, highlights)`` for every row read.
            * ``'authority'``: ``(article[:500], highlights)`` for rows whose
              article contains ``'Reuters'``, ``'CNN'`` or
              ``'Associated Press'``.
        """
        dataset = load_dataset('cnn_dailymail', '3.0.0')
        train_data = dataset['train']

        # Case-sensitive substrings used as a crude "cites a news agency" test.
        authority_markers = ('Reuters', 'CNN', 'Associated Press')

        accessibility_pairs = []
        verbosity_pairs = []
        authority_pairs = []

        for i in range(min(max_pairs, len(train_data))):
            # Fetch the row once instead of issuing a separate train_data[i]
            # lookup for every field.
            row = train_data[i]
            article = row['article']
            summary = row['highlights']

            accessibility_pairs.append((summary, article))
            verbosity_pairs.append((article, summary))

            # Stop collecting once the cap is reached; this keeps the same
            # leading pairs that slicing the full list afterwards would keep.
            if (len(authority_pairs) < max_authority_pairs
                    and any(marker in article for marker in authority_markers)):
                authority_pairs.append((article[:500], summary))

        return {
            'accessibility': accessibility_pairs,
            'verbosity': verbosity_pairs,
            'authority': authority_pairs
        }
    
    def harvest_goemotions(self, max_pairs: int = 5000,
                           positive_labels: Tuple[int, ...] = (3, 4, 13, 15),
                           negative_labels: Tuple[int, ...] = (2, 7, 8, 14)) -> List[Tuple[str, str]]:
        """Build (positive, negative) emotional-tone pairs from GoEmotions.

        Iterates the ``train`` split of ``go_emotions`` (config
        ``simplified``) and sorts each text by its ``labels`` list. The i-th
        positive text is paired with the i-th negative text.

        A text whose labels hit both id sets is skipped, so the same text can
        never appear on both sides of the resulting pairs.

        Args:
            max_pairs: Maximum number of pairs to return.
            positive_labels: Label ids that mark a text as positive.
            negative_labels: Label ids that mark a text as negative.

        Returns:
            A list of ``(positive_text, negative_text)`` tuples, at most
            ``max_pairs`` long.
        """
        # NOTE(review): in the published go_emotions label list, id 3 appears
        # to be "annoyance" and ids 7/8 "curiosity"/"desire" -- confirm the
        # default id sets really match the intended positive/negative split.
        dataset = load_dataset('go_emotions', 'simplified')
        train_data = dataset['train']

        positive_ids = set(positive_labels)
        negative_ids = set(negative_labels)
        limit = max(max_pairs, 0)

        positive_texts = []
        negative_texts = []

        for item in train_data:
            # Only the first `limit` texts of each kind are ever paired, so
            # there is no point scanning the rest of the split.
            if len(positive_texts) >= limit and len(negative_texts) >= limit:
                break

            labels = item['labels']
            is_positive = not positive_ids.isdisjoint(labels)
            is_negative = not negative_ids.isdisjoint(labels)
            if is_positive == is_negative:
                # Either no relevant label at all, or an ambiguous text that
                # carries both a positive and a negative label.
                continue

            bucket = positive_texts if is_positive else negative_texts
            if len(bucket) < limit:
                bucket.append(item['text'])

        # zip() stops at the shorter list.
        return list(zip(positive_texts, negative_texts))
    
    def harvest_paradetox(self, max_pairs: int = 4577) -> List[Tuple[str, str]]:
        """Build (neutral, toxic) pairs from the ParaDetox train split.

        Args:
            max_pairs: Maximum number of rows to read, and therefore the
                maximum number of pairs returned.

        Returns:
            A list of ``(neutral_comment, toxic_comment)`` tuples taken from
            the first rows of the ``train`` split.
        """
        # NOTE(review): confirm that 's-nlp/paradetox' really offers an 'en'
        # config with 'toxic_comment' / 'neutral_comment' columns; a mismatch
        # would raise here.
        dataset = load_dataset('s-nlp/paradetox', 'en')
        train_data = dataset['train']

        pairs = []
        for i in range(min(max_pairs, len(train_data))):
            # Fetch the row once instead of once per column.
            row = train_data[i]
            pairs.append((row['neutral_comment'], row['toxic_comment']))

        return pairs
    
    def harvest_imdb(self, max_pairs: int = 5000) -> List[Tuple[str, str]]:
        """Build (positive review, negative review) pairs from IMDB.

        Iterates the ``train`` split of ``imdb``; rows with ``label == 1`` are
        treated as positive and every other row as negative. The i-th
        positive review is paired with the i-th negative review.

        Args:
            max_pairs: Maximum number of pairs to return.

        Returns:
            A list of ``(positive_text, negative_text)`` tuples, at most
            ``max_pairs`` long.
        """
        dataset = load_dataset('imdb')
        train_data = dataset['train']

        limit = max(max_pairs, 0)
        positive_reviews = []
        negative_reviews = []

        for item in train_data:
            # Only the first `limit` reviews of each kind are ever paired, so
            # stop scanning (and buffering text) once both lists are full.
            if len(positive_reviews) >= limit and len(negative_reviews) >= limit:
                break

            bucket = positive_reviews if item['label'] == 1 else negative_reviews
            if len(bucket) < limit:
                bucket.append(item['text'])

        # zip() stops at the shorter list.
        return list(zip(positive_reviews, negative_reviews))
    
    def harvest_civil_comments(self, max_pairs: int = 5000) -> List[Tuple[str, str]]:
        """Build (professional, unprofessional) pairs from Civil Comments.

        Iterates the ``train`` split of ``google/civil_comments``. Texts with
        ``toxicity < 0.2`` count as professional, texts with
        ``toxicity > 0.7`` as unprofessional; everything in between is
        ignored. The i-th professional text is paired with the i-th
        unprofessional text.

        Args:
            max_pairs: Maximum number of pairs to return.

        Returns:
            A list of ``(professional_text, unprofessional_text)`` tuples, at
            most ``max_pairs`` long.
        """
        dataset = load_dataset('google/civil_comments')
        train_data = dataset['train']

        limit = max(max_pairs, 0)
        professional_texts = []
        unprofessional_texts = []

        for item in train_data:
            # Only the first `limit` texts of each kind are ever paired, so
            # stop scanning (and buffering text) once both lists are full.
            if len(professional_texts) >= limit and len(unprofessional_texts) >= limit:
                break

            toxicity = item['toxicity']
            if toxicity < 0.2:
                if len(professional_texts) < limit:
                    professional_texts.append(item['text'])
            elif toxicity > 0.7:
                if len(unprofessional_texts) < limit:
                    unprofessional_texts.append(item['text'])

        # zip() stops at the shorter list.
        return list(zip(professional_texts, unprofessional_texts))
    
    def harvest_newsroom(self, max_pairs: int = 5000) -> List[Tuple[str, str]]:
        """Build (clear, complex) clarity pairs from the Newsroom train split.

        Only the first ``max_pairs`` rows are inspected, and only rows whose
        ``density`` exceeds 3.0 contribute a pair, so fewer than
        ``max_pairs`` pairs may come back.

        Returns:
            A list of ``(summary, text[:1000])`` tuples.
        """
        train_split = load_dataset('lil-lab/newsroom')['train']
        rows_to_scan = min(max_pairs, len(train_split))

        collected = []
        for index in range(rows_to_scan):
            row = train_split[index]
            if not row['density'] > 3.0:
                continue
            # The article body is truncated to its first 1000 characters.
            truncated_body = row['text'][:1000]
            collected.append((row['summary'], truncated_body))

        return collected
    
    def harvest_empathetic_dialogues(self, max_pairs: int = 4000) -> List[Tuple[str, str]]:
        """Build (formal, informal) register pairs from Empathetic Dialogues.

        Iterates the ``train`` split of ``facebook/empathetic_dialogues`` and
        looks only at utterances longer than 50 characters. An utterance is
        formal when its lower-cased text contains 'therefore', 'furthermore'
        or 'however'; otherwise it is informal when it contains 'yeah',
        'gonna', 'wanna' or 'kinda'. Matching is by plain substring. The i-th
        formal utterance is paired with the i-th informal one.

        Args:
            max_pairs: Maximum number of pairs to return.

        Returns:
            A list of ``(formal_utterance, informal_utterance)`` tuples, at
            most ``max_pairs`` long.
        """
        dataset = load_dataset('facebook/empathetic_dialogues')
        train_data = dataset['train']

        formal_markers = ('therefore', 'furthermore', 'however')
        informal_markers = ('yeah', 'gonna', 'wanna', 'kinda')
        limit = max(max_pairs, 0)

        formal_responses = []
        informal_responses = []

        for item in train_data:
            # Only the first `limit` utterances of each kind are ever paired,
            # so stop scanning once both lists are full.
            if len(formal_responses) >= limit and len(informal_responses) >= limit:
                break

            utterance = item['utterance']
            if len(utterance) <= 50:
                continue

            # Lower-case once instead of once per keyword test.
            lowered = utterance.lower()
            if any(marker in lowered for marker in formal_markers):
                # Formal markers take precedence: a formal utterance is never
                # reconsidered as informal, even when the formal list is full.
                if len(formal_responses) < limit:
                    formal_responses.append(utterance)
            elif any(marker in lowered for marker in informal_markers):
                if len(informal_responses) < limit:
                    informal_responses.append(utterance)

        # zip() stops at the shorter list.
        return list(zip(formal_responses, informal_responses))
    
    def harvest_emotion(self, max_pairs: int = 3000,
                        enthusiastic_label: int = 0,
                        neutral_label: int = 4) -> List[Tuple[str, str]]:
        """Build (enthusiastic, neutral) pairs from the ``emotion`` dataset.

        Iterates the ``train`` split; rows whose ``label`` equals
        ``enthusiastic_label`` go on the enthusiastic side and rows whose
        ``label`` equals ``neutral_label`` on the neutral side. The i-th
        enthusiastic text is paired with the i-th neutral text.

        Args:
            max_pairs: Maximum number of pairs to return.
            enthusiastic_label: Label id treated as enthusiastic.
            neutral_label: Label id treated as neutral.

        Returns:
            A list of ``(enthusiastic_text, neutral_text)`` tuples, at most
            ``max_pairs`` long.
        """
        # NOTE(review): the dataset card appears to list label 0 as "sadness"
        # and label 4 as "fear" (with 1 = "joy" and no neutral class) --
        # confirm the defaults; the ids are parameters so callers can
        # override them without editing this method.
        dataset = load_dataset('emotion')
        train_data = dataset['train']

        limit = max(max_pairs, 0)
        enthusiastic_texts = []
        neutral_texts = []

        for item in train_data:
            # Only the first `limit` texts of each kind are ever paired, so
            # stop scanning once both lists are full.
            if len(enthusiastic_texts) >= limit and len(neutral_texts) >= limit:
                break

            label = item['label']
            if label == enthusiastic_label:
                if len(enthusiastic_texts) < limit:
                    enthusiastic_texts.append(item['text'])
            elif label == neutral_label:
                if len(neutral_texts) < limit:
                    neutral_texts.append(item['text'])

        # zip() stops at the shorter list.
        return list(zip(enthusiastic_texts, neutral_texts))
    
    def harvest_debatesum(self, max_pairs: int = 3000) -> List[Tuple[str, str]]:
        """Build (assertive, question) pairs from the DebateSum train split.

        Only the first ``max_pairs`` rows are inspected. Rows without an
        ``'Argument'`` key are skipped. When a row has no ``'Extract'`` key,
        the truncated argument is used for both sides of the pair.

        Returns:
            A list of ``(Argument[:500], Extract)`` tuples.
        """
        train_split = load_dataset('Hellisotherpeople/DebateSum')['train']
        rows_to_scan = min(max_pairs, len(train_split))

        collected = []
        for index in range(rows_to_scan):
            row = train_split[index]
            if 'Argument' not in row:
                continue
            # The argument is truncated to its first 500 characters.
            truncated_argument = row['Argument'][:500]
            counterpart = row.get('Extract', truncated_argument)
            collected.append((truncated_argument, counterpart))

        return collected
    
    def harvest_silicone(self, max_pairs: int = 3000) -> List[Tuple[str, str]]:
        """Build (direct, indirect) pairs from SILICONE ``dyda_da``.

        Iterates the ``train`` split of ``eusip/silicone`` (config
        ``dyda_da``). Utterances with ``Label`` 0, 1 or 2 are treated as
        direct and those with ``Label`` 3 or 4 as indirect. The i-th direct
        utterance is paired with the i-th indirect one.

        Args:
            max_pairs: Maximum number of pairs to return.

        Returns:
            A list of ``(direct_utterance, indirect_utterance)`` tuples, at
            most ``max_pairs`` long.
        """
        dataset = load_dataset('eusip/silicone', 'dyda_da')
        train_data = dataset['train']

        direct_labels = (0, 1, 2)
        indirect_labels = (3, 4)
        limit = max(max_pairs, 0)

        direct_texts = []
        indirect_texts = []

        for item in train_data:
            # Only the first `limit` utterances of each kind are ever paired,
            # so stop scanning once both lists are full.
            if len(direct_texts) >= limit and len(indirect_texts) >= limit:
                break

            label = item['Label']
            if label in direct_labels:
                if len(direct_texts) < limit:
                    direct_texts.append(item['Utterance'])
            elif label in indirect_labels:
                if len(indirect_texts) < limit:
                    indirect_texts.append(item['Utterance'])

        # zip() stops at the shorter list.
        return list(zip(direct_texts, indirect_texts))
    
    def harvest_hate_speech(self, max_pairs: int = 3000) -> List[Tuple[str, str]]:
        """Build (inclusive, exclusive) pairs from Measuring Hate Speech.

        Iterates the ``train`` split of
        ``ucberkeley-dlab/measuring-hate-speech``. Texts with
        ``hate_speech_score < -1.0`` are treated as inclusive and texts with
        ``hate_speech_score > 1.0`` as exclusive; scores in between are
        ignored. The i-th inclusive text is paired with the i-th exclusive
        text.

        Args:
            max_pairs: Maximum number of pairs to return.

        Returns:
            A list of ``(inclusive_text, exclusive_text)`` tuples, at most
            ``max_pairs`` long.
        """
        dataset = load_dataset('ucberkeley-dlab/measuring-hate-speech')
        train_data = dataset['train']

        limit = max(max_pairs, 0)
        inclusive_texts = []
        exclusive_texts = []

        for item in train_data:
            # Only the first `limit` texts of each kind are ever paired, so
            # stop scanning once both lists are full.
            if len(inclusive_texts) >= limit and len(exclusive_texts) >= limit:
                break

            score = item['hate_speech_score']
            if score < -1.0:
                if len(inclusive_texts) < limit:
                    inclusive_texts.append(item['text'])
            elif score > 1.0:
                if len(exclusive_texts) < limit:
                    exclusive_texts.append(item['text'])

        # zip() stops at the shorter list.
        return list(zip(inclusive_texts, exclusive_texts))
    
    def harvest_objectivity(self, max_pairs: int = 4000) -> List[Tuple[str, str]]:
        """Build (objective, biased) pairs from the debiased news dataset.

        Only the first ``max_pairs`` rows of the ``train`` split are
        inspected; rows lacking either ``'biased_text'`` or
        ``'debiased_text'`` are skipped.

        Returns:
            A list of ``(debiased_text, biased_text)`` tuples.
        """
        train_split = load_dataset('newsmediabias/debiased_dataset')['train']
        rows_to_scan = min(max_pairs, len(train_split))

        # Lazily fetch one row per index so each row is looked up only once.
        rows = (train_split[index] for index in range(rows_to_scan))
        return [
            (row['debiased_text'], row['biased_text'])
            for row in rows
            if 'biased_text' in row and 'debiased_text' in row
        ]
    
    def harvest_arxiv(self, max_pairs: int = 4000) -> List[Tuple[str, str]]:
        """Build (specific, general) pairs from the arXiv summarization set.

        Takes the first ``max_pairs`` rows of the ``train`` split and pairs
        the first 500 characters of each ``article`` with its ``abstract``.

        Returns:
            A list of ``(article[:500], abstract)`` tuples.
        """
        train_split = load_dataset('ccdv/arxiv-summarization')['train']
        rows_to_scan = min(max_pairs, len(train_split))

        # Lazily fetch one row per index so each row is looked up only once.
        rows = (train_split[index] for index in range(rows_to_scan))
        return [(row['article'][:500], row['abstract']) for row in rows]
    
    def save_pairs(self, pairs: List[Tuple[str, str]], trait: str):
        """Write the pairs of one trait to ``<output_dir>/<trait>_pairs.json``.

        Each ``(first, second)`` tuple is stored as an object with the keys
        ``'positive'`` and ``'negative'``; the file is a JSON array indented
        by two spaces. An INFO message with the pair count is logged
        afterwards.

        Args:
            pairs: The two-element tuples to serialize.
            trait: Trait name, used as the file-name prefix and in the log.
        """
        destination = self.output_dir / f'{trait}_pairs.json'

        records = []
        for first, second in pairs:
            records.append({'positive': first, 'negative': second})

        with open(destination, 'w') as handle:
            json.dump(records, handle, indent=2)

        logger.info(f"Saved {len(pairs)} pairs for {trait} to {destination}")
    
    def harvest_all(self):
        """Run every harvester in order and save its pairs per trait.

        The steps are table-driven so the ``n/total`` progress labels always
        match the real number of datasets. Failure handling is unchanged from
        the hand-written sequence: the first eight datasets are required and
        let exceptions propagate, while the remaining five are best-effort
        and only log a warning when harvesting or saving fails.

        Raises:
            Exception: Whatever a required harvester or ``save_pairs`` raises.
        """
        logger.info("Starting complete HuggingFace dataset harvesting for 15 traits...")

        # Each step is (progress description, harvester, trait, failure label).
        # trait is None when the harvester already returns {trait: pairs}.
        # failure label is None for required steps whose errors must propagate.
        steps = [
            ("CNN/DailyMail (accessibility, authority, verbosity)",
             self.harvest_cnn_dailymail, None, None),
            ("Go-Emotions (emotional_tone)",
             self.harvest_goemotions, 'emotional_tone', None),
            ("ParaDetox (formality)",
             self.harvest_paradetox, 'formality', None),
            ("IMDB (optimism)",
             self.harvest_imdb, 'optimism', None),
            ("Civil Comments (professionalism)",
             self.harvest_civil_comments, 'professionalism', None),
            ("Newsroom (clarity)",
             self.harvest_newsroom, 'clarity', None),
            ("Empathetic Dialogues (register)",
             self.harvest_empathetic_dialogues, 'register', None),
            ("Emotion dataset (enthusiasm)",
             self.harvest_emotion, 'enthusiasm', None),
            ("Objectivity dataset (objectivity)",
             self.harvest_objectivity, 'objectivity', "Debiased News dataset"),
            ("DebateSum (assertiveness)",
             self.harvest_debatesum, 'assertiveness', "DebateSum"),
            ("SILICONE (directness)",
             self.harvest_silicone, 'directness', "SILICONE"),
            ("Hate Speech dataset (inclusivity)",
             self.harvest_hate_speech, 'inclusivity', "Hate Speech dataset"),
            ("ArXiv (specificity)",
             self.harvest_arxiv, 'specificity', "ArXiv dataset"),
        ]

        total = len(steps)
        for number, (description, harvest, trait, failure_label) in enumerate(steps, start=1):
            logger.info(f"{number}/{total}: Harvesting {description}...")
            try:
                result = harvest()
                pairs_by_trait = result if trait is None else {trait: result}
                for trait_name, pairs in pairs_by_trait.items():
                    self.save_pairs(pairs, trait_name)
            except Exception as e:
                if failure_label is None:
                    # Required dataset: keep the original fail-fast behavior.
                    raise
                logger.warning(f"Could not harvest {failure_label}: {e}")

        logger.info("Complete dataset harvesting finished!")

def main():
    """Instantiate the harvester and run every harvest step."""
    CompleteHuggingFaceHarvester().harvest_all()

# Run the harvest only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()