import pandas as pd
import numpy as np
from datasets import Dataset, concatenate_datasets
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import KFold
import torch
import json
import wandb
from typing import Dict, List
from data_utils import load_metaculus_data, load_cladder
from accelerate import Accelerator

def load_data(file_path: str) -> List[Dict[str, object]]:
    """Load labelled examples from a JSON file.

    The file must hold a JSON array of objects, each carrying a ``text`` and a
    ``resolution`` key. Any other keys on an item are dropped.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        One ``{'text': ..., 'label': ...}`` dict per input item, in file
        order, where ``label`` is the item's ``resolution`` value unchanged.

    Raises:
        KeyError: If an item lacks ``text`` or ``resolution``.
    """
    # JSON is UTF-8 by specification; pinning the encoding keeps non-ASCII
    # text intact on platforms whose locale default is something else.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return [
        {'text': item['text'], 'label': item['resolution']}
        for item in data
    ]

def compute_metrics(pred):
    """Turn an evaluation prediction object into a metrics dict.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores). The predicted class is the
            argmax over the last axis of ``predictions``.

    Returns:
        Dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
        Precision, recall and F1 are computed with ``average='binary'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # The fourth element of the result is the support, which is not reported.
    prf_scores = precision_recall_fscore_support(y_true, y_pred, average='binary')
    precision, recall, f1 = prf_scores[:3]

    return dict(
        accuracy=accuracy_score(y_true, y_pred),
        f1=f1,
        precision=precision,
        recall=recall,
    )

def train_fold(
    train_dataset,
    val_dataset,
    model_name: str,
    fold_idx: int,
    base_output_dir: str
) -> Dict[str, float]:
    """Fine-tune a two-class sequence classifier and evaluate it.

    The model is trained with a class-weighted cross-entropy loss whose
    weights are inversely proportional to the label frequencies found in
    ``train_dataset``.

    Args:
        train_dataset: Dataset with a ``text`` column (tokenized here) and a
            ``label`` column of non-negative integer class ids (0 or 1).
        val_dataset: Dataset with a ``text`` column; used as the Trainer's
            evaluation set. Presumably carries the same ``label`` column as
            ``train_dataset`` -- verify against the caller.
        model_name: Checkpoint name or path passed to ``from_pretrained`` for
            both the tokenizer and the model.
        fold_idx: Identifier of this run. It is only interpolated into the
            output directory and the run name, so any value that formats
            cleanly in an f-string works.
        base_output_dir: Parent directory; outputs go to
            ``{base_output_dir}/{fold_idx}``.

    Returns:
        The metrics dict produced by ``trainer.evaluate()`` after training.

    Raises:
        ValueError: If ``train_dataset`` contains a label id that does not
            fit the two-class head.
    """
    num_labels = 2

    # Initialize tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
        problem_type="single_label_classification"
    )

    def tokenize_function(examples):
        # Every example is padded/truncated to exactly 512 tokens.
        return tokenizer(
            examples['text'],
            padding="max_length",
            truncation=True,
            max_length=512
        )

    # Tokenize datasets
    tokenized_train = train_dataset.map(tokenize_function, batched=True)
    tokenized_val = val_dataset.map(tokenize_function, batched=True)

    # Balanced class weights: total / (num_classes * class_count).
    # minlength guarantees one weight per output class even when a class is
    # absent from the training labels; without it the weight vector could be
    # shorter than the classifier head and the loss would fail.
    labels = train_dataset['label']
    class_counts = np.bincount(labels, minlength=num_labels)
    if len(class_counts) > num_labels:
        raise ValueError(
            f"Found label id {len(class_counts) - 1} but the model only has "
            f"{num_labels} classes"
        )
    total_samples = len(labels)
    # A class with no examples never contributes to the loss, so give it a
    # weight of 0 instead of dividing by zero.
    class_weights = torch.tensor(
        [
            float(total_samples) / (num_labels * int(count)) if count > 0 else 0.0
            for count in class_counts
        ],
        dtype=torch.float32,
    )

    # Define training arguments
    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy` -- confirm against the pinned version.
    training_args = TrainingArguments(
        output_dir=f"{base_output_dir}/{fold_idx}",
        learning_rate=2e-6,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        # 4 samples x 4 accumulation steps = 16 samples per optimizer step
        # on each device.
        gradient_accumulation_steps=4,
        num_train_epochs=60,
        weight_decay=0.01,
        evaluation_strategy="steps",
        eval_steps=100,
        # No checkpoints are saved (save_strategy="no"), so the best model is
        # not reloaded at the end; the metric below is set but has no
        # checkpoint to select.
        metric_for_best_model="f1",
        report_to="wandb",
        save_strategy="no",
        run_name=f"{fold_idx}",
        warmup_ratio=0.1,
        logging_steps=2,
        lr_scheduler_type="cosine"
    )

    class WeightedTrainer(Trainer):
        """Trainer that swaps the default loss for weighted cross-entropy."""

        def __init__(self, class_weights, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.class_weights = class_weights.to(self.args.device)
            # Built once here rather than on every training step.
            self.loss_fct = torch.nn.CrossEntropyLoss(weight=self.class_weights)

        def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
            # Labels are removed from the inputs so the model does not also
            # compute its own (unweighted) loss.
            labels = inputs.pop("labels")
            outputs = model(**inputs)
            logits = outputs.logits

            loss = self.loss_fct(
                logits.view(-1, self.model.config.num_labels),
                labels.view(-1)
            )

            return (loss, outputs) if return_outputs else loss

    # Initialize trainer with class weights
    trainer = WeightedTrainer(
        class_weights=class_weights,
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        compute_metrics=compute_metrics,
    )

    # The Trainer is deliberately not passed through Accelerator.prepare():
    # prepare() is meant for models, optimizers, dataloaders and schedulers,
    # and the Trainer already drives its own Accelerator internally.

    # Train the model
    trainer.train()

    # Evaluate on validation set
    eval_results = trainer.evaluate()

    return eval_results
def cross_validate(n_folds: int = 5, test_size: float = 0.2) -> Dict[str, float]:
    """Train one classifier on a train split and evaluate on the held-out split.

    Despite the name, no k-fold loop is run at present: the dataset returned
    by ``load_cladder()`` is split once, a single model is trained with
    ``train_fold`` and its evaluation metrics are logged to wandb from the
    main process.

    Args:
        n_folds: Currently unused; kept so existing callers keep working.
        test_size: Fraction of the dataset held out for evaluation. The
            default of 0.2 gives the 80% train / 20% test split this
            function is documented to use.

    Returns:
        The metrics dict returned by ``train_fold`` (keys such as
        ``eval_accuracy``, ``eval_f1``, ``eval_precision``, ``eval_recall``).
    """
    # Initialize accelerator (used here only to detect the main process)
    accelerator = Accelerator()

    # Load the dataset
    dataset = load_cladder()

    # Split into train and test sets (80% train, 20% test by default)
    splits = dataset.train_test_split(test_size=test_size, shuffle=True, seed=42)
    train_dataset = splits['train']
    test_dataset = splits['test']

    # Binarize the label column: "yes" -> 1, anything else -> 0
    train_dataset = train_dataset.map(lambda x: {"label": 1 if x["label"] == "yes" else 0})
    test_dataset = test_dataset.map(lambda x: {"label": 1 if x["label"] == "yes" else 0})

    # Rename prompt column to text, the column name train_fold tokenizes
    train_dataset = train_dataset.rename_column("prompt", "text")
    test_dataset = test_dataset.rename_column("prompt", "text")

    # Shuffle both datasets
    train_dataset = train_dataset.shuffle(seed=42)
    test_dataset = test_dataset.shuffle(seed=42)

    train_label_counts = np.bincount(train_dataset['label'])
    test_label_counts = np.bincount(test_dataset['label'])

    # Console output and wandb setup happen only on the main process so that
    # multi-process launches do not duplicate them.
    if accelerator.is_main_process:
        print(f"Train label counts: {train_label_counts}")
        print(f"Test label counts: {test_label_counts}")

        # Show the first 2 rows of each split as a sanity check
        print(f"Train dataset: {train_dataset[:2]}")
        print(f"Test dataset: {test_dataset[:2]}")

        wandb.init(project="forecasting-classifier", name="cladder-v1.5")

    model_name = "microsoft/deberta-v3-large"

    # Train on the full training split and evaluate on the held-out split
    if accelerator.is_main_process:
        print("Training final model on all training data...")
    # NOTE(review): the run is tagged "metaculus" although the data comes
    # from load_cladder() -- confirm the intended run/output name.
    final_results = train_fold(
        train_dataset,
        test_dataset,
        model_name,
        fold_idx="metaculus",
        base_output_dir="./results"
    )

    # Log results only on the main process
    if accelerator.is_main_process:
        wandb.log({
            "final_accuracy": final_results['eval_accuracy'],
            "final_f1": final_results['eval_f1'],
            "final_precision": final_results['eval_precision'],
            "final_recall": final_results['eval_recall']
        })

        wandb.finish()

    return final_results

if __name__ == "__main__":
    # Script entry point. The cuDNN flags are set before any training starts:
    # autotuning is switched off and deterministic mode is switched on, for
    # reproducibility across GPUs.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    # The return value, if any, is not used here.
    cross_validate(n_folds=5)