import re
import os
import sys
import json
import spacy

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

from argparse import ArgumentParser
from module_04_upgrade.rule_rewrite import rule_rewrite
from utils.set_random_seed import set_random_seed
from utils.json_reader import jsonl_loader, json_loader
from utils.model_loader import Model, VLLM_Model
from reader.llama_reader import llama_reader
from reader.vllm_reader import vllm_reader, vllm_reader_batch
from module_04_upgrade.llm_rewrite import llm_feature
from random import sample

def dataset_rewrite(jsonl_input, jsonl_output, *, model_name="qwen_32b_model", path_to_yml="configs/config.yml"):
    """Apply the rule-based rewrite to every record and save the results.

    Each record read from ``jsonl_input`` is handed to ``rule_rewrite``
    together with the spaCy pipeline and the loaded language model;
    ``constraint_selection()`` is then invoked on the resulting object and
    its ``to_dict()`` output is collected. The collected list is written to
    ``jsonl_output``.

    Args:
        jsonl_input: Path of the file to read. It is read with
            ``json_loader`` (not ``jsonl_loader``); each record must be
            subscriptable with the string keys used in the loop below.
        jsonl_output: Path of the file to write. May be the same path as
            ``jsonl_input``; the input is fully loaded before the output is
            opened for writing.
        model_name: Name passed to ``VLLM_Model`` to pick the model.
            Defaults to the previously hard-coded ``"qwen_32b_model"``.
        path_to_yml: Config path passed to ``VLLM_Model``. Defaults to the
            previously hard-coded ``"configs/config.yml"``, which is
            resolved relative to the current working directory.

    Raises:
        KeyError: If a record lacks one of the required keys (assuming
            records are plain dicts -- TODO confirm against json_loader).
    """
    # load the nlp tool
    nlp_tool = spacy.load("en_core_web_sm")

    set_random_seed(42)
    json_list = json_loader(jsonl_input)

    # load the model
    language_model = VLLM_Model(model_name, path_to_yml)
    model = language_model.load_model()
    tokenizer = language_model.load_tokenizer()
    params = language_model.load_config()

    # Named `rewritten` rather than `object` so the builtin is not shadowed.
    rewritten = []
    for line in json_list:
        # Positional order must match the rule_rewrite signature.
        feature = rule_rewrite(
            line["constraints_instructions"],
            line["selected_constraints"],
            line["modification"],
            line["question"],
            line["new_question"],
            line["question_refine"],
            line["answer"],
            line["answer_split"],
            line["original_answer"],
            line["short_answer"],
            line["keywords"],
            line["title"],
            line["sentence_count"],
            line["word_count"],
            nlp_tool,
            model,
            tokenizer,
            params,
        )
        feature.constraint_selection()
        rewritten.append(feature.to_dict())

    # NOTE: despite the "jsonl" parameter name, the output is one indented
    # JSON array, not one JSON object per line.
    with open(jsonl_output, "w", encoding="utf-8") as w:
        json.dump(rewritten, w, indent=2, ensure_ascii=False)

if __name__ == '__main__':
    # Command-line entry point: run the LLM feature stage first, then the
    # rule-based rewrite stage on the file that stage produced.
    cli = ArgumentParser()
    cli.add_argument(
        "--jsonl_input",
        type=str,
        default="dataset/natural_question/natural_question_03.jsonl",
        help="Input JSONL file path",
    )
    cli.add_argument(
        "--jsonl_output",
        type=str,
        default="dataset/natural_question/natural_question_04.jsonl",
        help="Output JSONL file path",
    )
    options = cli.parse_args()

    source_path = options.jsonl_input
    target_path = options.jsonl_output

    llm_feature(source_path, target_path)
    # The second stage reads the first stage's output and overwrites it in place.
    dataset_rewrite(target_path, target_path)
