import requests
import json
import time
from openai import OpenAI
from peft import PeftModel
import torch
from transformers import (
    PreTrainedModel,
    PretrainedConfig,
    Qwen2ForCausalLM,
    Qwen2Tokenizer,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq
)
from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
)


class OpenAILLM:
    """Single-turn chat-completion wrapper around the ``OpenAI`` client.

    Each call to :meth:`run` sends one user message and keeps retrying until
    the service returns a non-empty message content (or the optional retry
    budget is exhausted).
    """

    def __init__(self, model_name, max_tokens=128, temperature=0,
                 api_key='openai_api_key', base_url='openai_url',
                 max_retries=None, retry_delay=1):
        """Create the client and store the request settings.

        Args:
            model_name: Model identifier sent as ``model`` on every request.
            max_tokens: Value sent as ``max_tokens`` on every request.
            temperature: Value sent as ``temperature`` on every request.
            api_key: Key handed to ``OpenAI``. The default is a placeholder
                string and must be overridden for real use.
            base_url: Endpoint handed to ``OpenAI``. The default is a
                placeholder string and must be overridden for real use.
            max_retries: Number of retries allowed after the first attempt.
                ``None`` (default) retries forever.
            retry_delay: Seconds to sleep between attempts.
        """
        self.client = OpenAI(api_key=api_key, base_url=base_url)
        self.config = {
            'model': model_name,
            'max_tokens': max_tokens,
            'temperature': temperature,
        }
        self.max_retries = max_retries
        self.retry_delay = retry_delay

    def run(self, prompt):
        """Send ``prompt`` as a single user message and return the reply text.

        Args:
            prompt: Content of the user message.

        Returns:
            The ``content`` of the first choice's message (always non-empty).

        Raises:
            RuntimeError: If ``max_retries`` is set and every attempt either
                raised or produced an empty reply. The last exception, if
                any, is attached as the cause.
        """
        messages = [{"role": "user", "content": prompt}]
        attempt = 0
        while True:
            attempt += 1
            failure = None
            try:
                output = self.client.chat.completions.create(
                    model=self.config['model'],
                    messages=messages,
                    temperature=self.config['temperature'],
                    max_tokens=self.config['max_tokens'],
                    top_p=1.0,
                    frequency_penalty=0.0,
                    presence_penalty=0.0,
                )
                response = output.choices[0].message.content
            except Exception as e:
                # Deliberately broad: any client/transport failure is treated
                # as retryable, matching the best-effort intent of this class.
                failure = e
                print(e)
            else:
                if response:
                    return response

            if self.max_retries is not None and attempt > self.max_retries:
                raise RuntimeError(
                    f"no non-empty completion after {attempt} attempt(s)"
                ) from failure

            # Pause before every retry, including after an empty reply, so a
            # misbehaving endpoint is not hit in a tight loop.
            print("retrying")
            time.sleep(self.retry_delay)


class HuggingLocalLLM:
    """Batched text generation with a locally loaded Qwen2 causal LM.

    The tokenizer is configured for left padding with the EOS token as the
    pad token, so every prompt in a batch ends at the same column and the
    generated continuation can be sliced off by position.
    """

    def __init__(self, model_name, max_tokens=512, temperature=0, top_p=1, n=1, max_length=2048, do_sample=False):
        """Load tokenizer and model and store the generation settings.

        Args:
            model_name: Name or path passed to ``from_pretrained`` for both
                the tokenizer and the model.
            max_tokens: Passed to ``generate`` as ``max_new_tokens``.
            temperature: Passed to ``generate`` as ``temperature``.
            top_p: Passed to ``generate`` as ``top_p``.
            n: Passed to ``generate`` as ``num_return_sequences``.
            max_length: Prompts longer than this many tokens are truncated.
            do_sample: Passed to ``generate`` as ``do_sample``.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = Qwen2Tokenizer.from_pretrained(
            model_name,
            padding_side="left"
        )
        # Reuse EOS as the padding token for batched inputs.
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.max_length = max_length
        self.model = Qwen2ForCausalLM.from_pretrained(model_name).to(self.device)
        self.generation_kwargs = {
            "max_new_tokens": max_tokens,
            "temperature": temperature,
            "top_p": top_p,
            "num_return_sequences": n,
            "num_beams": 1,
            "do_sample": do_sample,
            "repetition_penalty": 1.0,
            "pad_token_id": self.tokenizer.eos_token_id
        }

    def run(self, prompt):
        """Generate continuations for one prompt or a batch of prompts.

        Args:
            prompt: A single string or an iterable of strings.

        Returns:
            A list of decoded continuations with the prompt removed, one per
            sequence returned by ``generate``. An empty batch returns ``[]``.
        """
        # A bare string must be wrapped, otherwise it would be treated as a
        # sequence of one-character prompts further down.
        prompts = [prompt] if isinstance(prompt, str) else list(prompt)
        if not prompts:
            return []

        self.model.eval()
        encodings = self.tokenizer(
            prompts,
            return_tensors="pt",
            # Pad only to the longest prompt in the batch instead of always
            # padding to max_length.
            padding=True,
            truncation=True,
            max_length=self.max_length
        )
        input_ids = encodings["input_ids"].to(self.device)
        attention_mask = encodings["attention_mask"].to(self.device)

        with torch.no_grad():
            generated_ids = self.model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                **self.generation_kwargs
            )

        # With left padding the prompt occupies the first `prompt_width`
        # columns of every row, so everything after that is new text. Slicing
        # token ids is robust to truncated prompts and to prompts that do not
        # round-trip exactly through the tokenizer, unlike string replacement.
        prompt_width = input_ids.shape[1]
        completion_ids = generated_ids[:, prompt_width:]
        return self.tokenizer.batch_decode(completion_ids, skip_special_tokens=True)


class HuggingLocalLoraLLM:
    """Batched text generation with a local Qwen2 model plus a LoRA adapter.

    The tokenizer is configured for left padding with the EOS token as the
    pad token, so every prompt in a batch ends at the same column and the
    generated continuation can be sliced off by position.
    """

    def __init__(self, model_name, lora_path, max_tokens=512, temperature=0, top_p=1, n=1, max_length=2048,
                 do_sample=False):
        """Load tokenizer, base model and adapter; store generation settings.

        Args:
            model_name: Name or path passed to ``from_pretrained`` for both
                the tokenizer and the base model.
            lora_path: Adapter location passed to ``PeftModel.from_pretrained``.
            max_tokens: Passed to ``generate`` as ``max_new_tokens``.
            temperature: Passed to ``generate`` as ``temperature``.
            top_p: Passed to ``generate`` as ``top_p``.
            n: Passed to ``generate`` as ``num_return_sequences``.
            max_length: Prompts longer than this many tokens are truncated.
            do_sample: Passed to ``generate`` as ``do_sample``. Defaults to
                ``False``, the value this class previously hard-coded.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            padding_side="left"
        )
        # Reuse EOS as the padding token for batched inputs.
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.max_length = max_length

        # Wrap the base model with the adapter weights, then move to device.
        self.model = PeftModel.from_pretrained(Qwen2ForCausalLM.from_pretrained(model_name),
                                               lora_path).to(self.device)

        self.generation_kwargs = {
            "max_new_tokens": max_tokens,
            "temperature": temperature,
            "top_p": top_p,
            "num_return_sequences": n,
            "num_beams": 1,
            "do_sample": do_sample,
            "repetition_penalty": 1.0,
            "pad_token_id": self.tokenizer.eos_token_id
        }

    def run(self, prompt):
        """Generate continuations for one prompt or a batch of prompts.

        Args:
            prompt: A single string or an iterable of strings.

        Returns:
            A list of decoded continuations with the prompt removed, one per
            sequence returned by ``generate``. An empty batch returns ``[]``.
        """
        # A bare string must be wrapped, otherwise it would be treated as a
        # sequence of one-character prompts further down.
        prompts = [prompt] if isinstance(prompt, str) else list(prompt)
        if not prompts:
            return []

        self.model.eval()
        encodings = self.tokenizer(
            prompts,
            return_tensors="pt",
            # Pad only to the longest prompt in the batch instead of always
            # padding to max_length.
            padding=True,
            truncation=True,
            max_length=self.max_length
        )
        input_ids = encodings["input_ids"].to(self.device)
        attention_mask = encodings["attention_mask"].to(self.device)

        with torch.no_grad():
            generated_ids = self.model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                **self.generation_kwargs
            )

        # With left padding the prompt occupies the first `prompt_width`
        # columns of every row, so everything after that is new text. Slicing
        # token ids is robust to truncated prompts and to prompts that do not
        # round-trip exactly through the tokenizer, unlike string replacement.
        prompt_width = input_ids.shape[1]
        completion_ids = generated_ids[:, prompt_width:]
        return self.tokenizer.batch_decode(completion_ids, skip_special_tokens=True)

