"""
The fast api implementation of LLM
Using VLLM framework to adapt for multiple mainstream LLM architectures
"""
from fastapi import FastAPI, Request
import argparse
import fcntl
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
import json
import datetime
import torch
import time
import uuid
import sys
import os
import uvicorn
import subprocess
import re
import numpy as np
import torch.nn.functional as F
from global_utils.reward_models import auto_get_rm, rm_path_dict
import logging
from uvicorn.config import LOGGING_CONFIG

# Ask vLLM to start its worker processes with the "spawn" method.
os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'

# Prefix uvicorn's default and access log lines with a timestamp.
LOGGING_CONFIG["formatters"]["default"]["fmt"] = "%(asctime)s - %(levelprefix)s %(message)s"
LOGGING_CONFIG["formatters"]["access"]["fmt"] = "%(asctime)s - %(levelprefix)s %(client_addr)s - \"%(request_line)s\" %(status_code)s"

# Root logger: INFO level, emitted through a stream handler.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
handler1 = logging.StreamHandler()
formatter = logging.Formatter(
    "%(asctime)s - %(module)s - %(funcName)s - line:%(lineno)d - %(levelname)s - %(message)s"
)
# The formatter must be attached explicitly; without this call the handler
# falls back to its bare default format and the format above is never used.
handler1.setFormatter(formatter)
logger.addHandler(handler1)

# Command line argument parsing
parser = argparse.ArgumentParser()
parser.add_argument("--model_name", type=str, default='Qwen2.5-Math-PRM-7B')
# Parse the port as an int so argparse rejects a non-numeric value up front
parser.add_argument("--port", type=int, default=6007)
# parser.add_argument("--rope_scaling", action="store_true")

args = parser.parse_args()
gpu_num = torch.cuda.device_count()
# Create the FastAPI application
app = FastAPI()

# GPU cleanup function
def torch_gc():
    """Release cached CUDA memory; do nothing when CUDA is unavailable."""
    if not torch.cuda.is_available():
        return
    # Free the allocator's cached blocks, then collect CUDA IPC memory.
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

# The main function for POST request
@app.post("/")
async def create_item(request: Request):
    """Score a question/response payload with the loaded reward model.

    The request body is read as JSON and must provide the keys ``question``,
    ``response`` and ``batch_size``; their values are passed, in that order,
    to ``args.model.obtain_reward``.

    Args:
        request: Incoming HTTP request whose body is the JSON payload.

    Returns:
        The value returned by ``obtain_reward``, handed back to FastAPI as
        the response content.

    Raises:
        KeyError: If the JSON object lacks one of the required keys.
    """
    # request.json() already returns parsed Python objects, so no extra
    # dumps/loads round trip is needed before reading the fields.
    payload = await request.json()
    # NOTE(review): obtain_reward is called synchronously inside this
    # coroutine, so presumably the event loop is blocked while scoring —
    # confirm that this serialization of requests is intended.
    return args.model.obtain_reward(
        payload['question'], payload['response'], payload['batch_size']
    )


# Best-effort lookup of this host's internal address
def _detect_internal_ip():
    """Return the first 172.x.x.x IPv4 address listed by ``ip addr``, or None.

    None is returned when the ``ip`` command cannot be executed or when its
    output contains no matching ``inet`` entry.
    """
    try:
        ip_output = subprocess.run(['ip', 'addr'], capture_output=True, text=True).stdout
    except OSError:
        # The `ip` binary is missing or not executable; address registration
        # is optional, so report "not found" instead of aborting startup.
        return None
    # match = re.search(r'net\d+:\s+.*?inet\s+(10\.\d+\.\d+\.\d+)', ip_output, re.DOTALL)
    match = re.search(r'inet\s+(172\.\d+\.\d+\.\d+)', ip_output)
    return match.group(1) if match else None


# Main server function
def main():
    """Load the reward model, record this server's address, and serve requests.

    The model selected by ``--model_name`` is built via ``auto_get_rm`` and
    stored on ``args.model`` for the request handler. A server name made of
    the model name plus a short random suffix is stored on
    ``args.server_name``. When an internal IP address is found, a line with
    the server name and address is appended to ``./all_server.txt``. Finally
    uvicorn serves ``app`` on all interfaces at the configured port; this
    call blocks until the server stops.
    """
    model = auto_get_rm(args.model_name)(rm_path_dict[args.model_name], args.model_name, device='auto')
    args.model = model
    # First group of a random UUID keeps concurrently started servers distinct.
    server_name = args.model_name + '-' + str(uuid.uuid4()).split('-')[0]
    args.server_name = server_name
    print(f'Server {server_name} started and waiting for requests!')

    internal_ip = _detect_internal_ip()
    if internal_ip is not None:
        print("The id address is:", internal_ip)
        with open('./all_server.txt', 'a') as f:
            f.write(f"server_name: {server_name} ip: {internal_ip} \n")
    else:
        print("No match ip address!")

    # Start FastAPI on the port given by --port
    uvicorn.run(app, host='0.0.0.0', port=int(args.port), workers=1)


# Start the server only when this file is executed as a script.
if __name__ == '__main__':
    main()