from flask import Flask, request, jsonify
from threading import Lock
from vllm import LLM, SamplingParams

HUMAN_HEADER = "Question:"
AI_HEADER = "Answer:"


class Inferencer:
    def __init__(self, model_name):
        self.model_name = model_name
        self.model_lock = Lock()
        self.model = None

    def get_model(self):
        if self.model is None:
            self.model = LLM(self.model_name)
        return self.model

    def predict_from_json(self, inputs):
        if inputs is None:
            return jsonify({"error": "no json provided"})

        if 'conversation' not in inputs or not isinstance(inputs['conversation'], list):
            return jsonify({"error": "conversation not found"})

        convo = inputs['conversation']

        if len(convo) == 0 or not all(isinstance(x, str) for x in convo):
            return jsonify({"error": "conversation must be a list of strings"})

        # must be odd
        if len(convo) % 2 == 0:
            return jsonify({"error": "conversation must have an odd number of strings; last one is the user input"})

        max_tokens = inputs.get('max_tokens', (len(inputs) * 3) + 1024)
        temperature = inputs.get('temperature', 0.4)
        top_p = inputs.get('top_p', 0.9)
        n = inputs.get('n', 1)

        with self.model_lock:
            model = self.get_model()
            params = SamplingParams(
                max_tokens=max_tokens, temperature=temperature, top_p=top_p, stop=[
                    HUMAN_HEADER]
            )
            prompt = ""
            for i, text in enumerate(convo):
                if i % 2 == 0:
                    prompt += f"{HUMAN_HEADER}\n{text}\n"
                else:
                    prompt += f"{AI_HEADER}\n{text}\n"

            prompt += f"{AI_HEADER}\n"

            result = model.generate(
                [prompt] * n, sampling_params=params,
            )
            outs = [x.outputs[0].text for x in result]

        return jsonify(outs)


app = Flask(__name__)
inferencer = Inferencer("bigcode/starcoder")


@app.after_request  # pragma: no cover
def after_request(response):
    response.headers.add('Access-Control-Allow-Origin', '*')
    response.headers.add('Access-Control-Allow-Headers',
                         'Content-Type,Authorization')
    response.headers.add('Access-Control-Allow-Methods',
                         'GET,PUT,POST,DELETE,OPTIONS')
    return response


@app.route('/predict', methods=['POST'])  # pragma: no cover
def predict():
    return inferencer.predict_from_json(request.json)
