import re
import torch
import transformers
from transformers import AutoTokenizer

# Local directory holding the Zephyr-7B-alpha checkpoint in Hugging Face format.
zephyr_path = '/data/models/huggingface-format/zephyr-7b-alpha/'

# Load the tokenizer once and hand the same instance to the pipeline below, so
# `tokenizer` and `pipeline.tokenizer` are guaranteed to be the same object and
# the tokenizer files are not read from disk a second time.
tokenizer = AutoTokenizer.from_pretrained(zephyr_path)

# Text-generation pipeline over the same checkpoint; weights are loaded in
# bfloat16 and device placement is delegated to device_map="auto".
pipeline = transformers.pipeline(
    "text-generation",
    model=zephyr_path,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

def call(message, max_tokens: int = 1000) -> str:
    """Generate a chat reply for ``message`` using the module-level pipeline.

    Args:
        message: Conversation passed to the tokenizer's ``apply_chat_template``;
            the ``__main__`` demo passes a list of
            ``{'role': ..., 'content': ...}`` dicts.
        max_tokens: Upper bound on newly generated tokens (forwarded as
            ``max_new_tokens``).

    Returns:
        The generated reply only (the prompt is not included), with runs of
        consecutive newlines collapsed into a single newline.
    """
    # Use one tokenizer for both the prompt template and the EOS id so the two
    # can never disagree.
    chat_tokenizer = pipeline.tokenizer
    prompt = chat_tokenizer.apply_chat_template(
        message,
        tokenize=False,
        add_generation_prompt=True,
    )

    with torch.no_grad():
        sequences = pipeline(
            prompt,
            do_sample=True,
            eos_token_id=chat_tokenizer.eos_token_id,
            max_new_tokens=max_tokens,
            top_k=50,
            top_p=0.95,
            temperature=0.2,
            # Ask for the completion only. The previous approach dropped the
            # first five lines of the full text, which is only correct when
            # the rendered prompt is exactly five lines long (two single-line
            # turns); multi-line or multi-turn prompts leaked into the reply.
            return_full_text=False,
        )

    response = sequences[0]['generated_text']

    # Collapse any run of blank lines; a plain '\n\n' -> '\n' substitution
    # leaves a blank line behind for runs of three or more newlines.
    response = re.sub(r'\n{2,}', '\n', response)

    # A newline at the very start of the completion used to be merged with the
    # prompt's trailing newline and then discarded; keep that outcome.
    return response.lstrip('\n')

if __name__ == '__main__':
    # Manual smoke test: send a two-turn conversation (system + user) through
    # call() and print whatever it returns.
    system_turn = {'role': 'system', 'content': 'You are a useful assistant.'}
    user_turn = {
        'role': 'user',
        'content': 'How can I improve my time management skills?',
    }
    message = [system_turn, user_turn]
    reply = call(message)
    print(reply)

