import pandas as pd
import numpy as np
from datasets import Dataset, concatenate_datasets, load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
import torch
import json
import wandb
from typing import Dict, List
from accelerate import Accelerator

def load_data(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)
        
    for item in data:
        options_txt = ""
        for i, option in enumerate(item['options']):
            options_txt += f"{chr(65 + i)}. {option}\n"
        item['just_options'] = options_txt
        # print(item['just_options'])
        
    return [{
        'text': item['just_options'],
        'label': item['answer_idx']
    } for item in data]

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

def train_model(
    train_dataset, 
    eval_dataset, 
    model_name: str,
    output_dir: str
) -> Dict[str, float]:
    
    # Initialize accelerator
    accelerator = Accelerator()
    
    # Initialize tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=4,  # For A, B, C, D options
        problem_type="single_label_classification"
    )

    # Freeze all layers except the classification head
    # for param in model.parameters():
    #     param.requires_grad = False
    
    # # # Unfreeze only the classification head
    # for param in model.classifier.parameters():
    #     param.requires_grad = True
    
    # # Print number of trainable parameters
    # trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    # total_params = sum(p.numel() for p in model.parameters())
    # print(f"Trainable parameters: {trainable_params} ({trainable_params/total_params:.2%} of total)")

    def tokenize_function(examples):
        return tokenizer(
            examples['text'],
            padding="max_length",
            truncation=True,
            max_length=512
        )

    # Tokenize datasets
    tokenized_train = train_dataset.map(tokenize_function, batched=True)
    tokenized_eval = eval_dataset.map(tokenize_function, batched=True)

    # Calculate class weights
    labels = train_dataset['label']
    class_counts = np.bincount(labels)
    total_samples = len(labels)
    class_weights = torch.FloatTensor([total_samples / (len(class_counts) * count) for count in class_counts])

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        learning_rate=1e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        gradient_accumulation_steps=1,
        num_train_epochs=50,
        weight_decay=0.01,
        eval_strategy="steps",
        eval_steps=100,
        metric_for_best_model="accuracy",
        save_total_limit=1,
        run_name="mengye-mcq",
        warmup_ratio=0.1,
        logging_steps=10,
        report_to="wandb",
        save_strategy="no",
        lr_scheduler_type="cosine"
    )

    # Create custom trainer with weighted loss
    class WeightedTrainer(Trainer):
        def __init__(self, class_weights, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.class_weights = class_weights.to(self.args.device)

        def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
            labels = inputs.pop("labels")
            outputs = model(**inputs)
            logits = outputs.logits
            
            # Apply weighted cross entropy loss
            loss_fct = torch.nn.CrossEntropyLoss(weight=self.class_weights)
            loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
            
            return (loss, outputs) if return_outputs else loss

    # Initialize trainer with class weights
    trainer = Trainer(
        # class_weights=class_weights,
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
        compute_metrics=compute_metrics,
    )

    # Prepare everything with accelerator
    trainer = accelerator.prepare(trainer)

    # Train the model
    trainer.train()
    
    # Evaluate on validation set
    eval_results = trainer.evaluate()
    
    return eval_results

def main():
    # Initialize accelerator
    accelerator = Accelerator()
    
    # Load all datasets
    train_data = load_data("/fast/XXXX-3/forecasting/datasets/menge/mcq_train.json")
    val_data = load_data("/fast/XXXX-3/forecasting/datasets/menge/mcq_validation.json")
    test_data = load_data("/fast/XXXX-3/forecasting/datasets/menge/mcq_test.json")
    # test_data = load_data("/fast/XXXX-3/forecasting/datasets/infinitegames/binary_balanced_test.json")
    
    # Convert to HuggingFace datasets
    train_dataset = Dataset.from_list(train_data)
    val_dataset = Dataset.from_list(val_data)
    test_dataset = Dataset.from_list(test_data)
    
    # print kength of eahc dataset
    print(f"Train dataset length: {len(train_dataset)}")
    print(f"Val dataset length: {len(val_dataset)}")
    print(f"Test dataset length: {len(test_dataset)}")
    
    # Initialize wandb only on the main process
    if accelerator.is_main_process:
        wandb.init(project="mcq-classifier", name="mengye-mcq-only-options")
    
    # Use a strong model for multiple-choice tasks
    model_name = "google/flan-t5-xl"  # Alternative: "microsoft/deberta-v3-large"
    model_name = "microsoft/deberta-v3-large"
    
    # Train on combined train and validation data, evaluate on test
    if accelerator.is_main_process:
        print("Training model on training data...")
    
    # First train on training data and evaluate on validation data
    test_results = train_model(
        train_dataset,
        test_dataset,
        model_name,
        output_dir="./results/mengye-mcq/test"
    )
    
    # Log validation results
    # if accelerator.is_main_process:
    #     wandb.log({
    #         "val_accuracy": val_results['eval_accuracy'],
    #         "val_f1": val_results['eval_f1'],
    #         "val_precision": val_results['eval_precision'],
    #         "val_recall": val_results['eval_recall']
    #     })
        
    #     print("\nValidation performance:")
    #     print(val_results)
    
    # Then train on combined train+validation and evaluate on test
    # if accelerator.is_main_process:
    #     print("\nTraining final model on combined train and validation data...")
    
    # combined_train = concatenate_datasets([train_dataset, val_dataset])
    
    # test_results = train_model(
    #     combined_train,
    #     test_dataset,
    #     model_name,
    #     output_dir="./results/mengye-mcq/test"
    # )
    
    # Log test results
    if accelerator.is_main_process:
        wandb.log({
            "test_accuracy": test_results['eval_accuracy'],
            "test_f1": test_results['eval_f1'],
            "test_precision": test_results['eval_precision'],
            "test_recall": test_results['eval_recall']
        })
        
        print("\nTest performance:")
        print(test_results)
        
        wandb.finish()
    
    return test_results

if __name__ == "__main__":
    # Enable deterministic behavior for reproducibility
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    
    test_results = main()
    print("\nTest Performance:")
    print(test_results)