import jsonlines
import random
import copy
import argparse
from common.utils import async_http_process_requests, simple_promptify, list_to_string
from common.model_configs import config_model, config_aliyun
from utils import return_disturb_num
import itertools


# Prompt sent to the model for every generated variant. It is filled with
# `str.format` in `main()` using four placeholders:
#   {goal}              - the item's 'goal' field
#   {information}       - the information entries that were kept (or "(empty)")
#   {background}        - the item's 'background' field
#   {original_question} - the item's 'raw_task' field
# The text is runtime data: any edit to it changes what the model receives.
template = """
# Goal

{goal}

# Information

{information}

# Background

{background}

# Instruction

DO NOT consider rationality, assemble the Goal, Information, and Background into a coherent text to get an Unclear Question.

Because the Unclear Question contains Missing Information compared with the Original Question shown below, \
the Unclear Question should not be able to be answered directly.

***
Original Question:

{original_question}
***

Provide the reasons that why the Unclear Question is Unclear (i.e, can not be answered directly).
Generate your output STRICTLY in the following format.
Unclear Question:
Reason of Unclearness:

# Requirements

1. **The Unclear Question SHOULD NOT CONTAIN the title of Goal, Information, and Background.**
2. **The Unclear Question SHOULD CONTAIN ALL contents in the Goal, Information, and Background (if the Information and Background are not empty).**
3. **DO NOT ADD ANY content that is not included in the Goal, Information and Background from the Original Question to the Unclear Question.**
"""


def parse_results(result, metadata):
    """Bundle a model response with the metadata of the query that produced it.

    Args:
        result: The response to store under the ``'response'`` key.
        metadata: The object to store under the ``'metadata'`` key; it is
            stored by reference, not copied.

    Returns:
        A new dict with exactly the keys ``'response'`` and ``'metadata'``.
    """
    record = dict(response=result, metadata=metadata)
    return record


def _choose_deletions(information, task_level, variants):
    """Pick up to ``variants`` random sets of indices to drop from ``information``.

    Args:
        information: The item's list of information entries.
        task_level: Value of ``--task_level``; when it is not 1 the number of
            entries to drop comes from ``return_disturb_num``.
        variants: Maximum number of index sets to return.

    Returns:
        A list of index collections (single-element lists when one entry is
        dropped, tuples from ``itertools.combinations`` otherwise), shuffled
        with the global ``random`` state and truncated to ``variants``.
    """
    # Outside task-level construction exactly one entry is removed per variant.
    if task_level != 1:
        drop_count = return_disturb_num(information, task_level)
    else:
        drop_count = 1

    positions = list(range(len(information)))
    if drop_count == 1:
        candidates = [[pos] for pos in positions]
    else:
        candidates = list(itertools.combinations(positions, drop_count))

    random.shuffle(candidates)
    return candidates[:variants]


def main():
    """Build "unclear question" prompts from an input JSONL file and query a model.

    Reads the command-line arguments, loads all records from ``--input_file``,
    and for each record creates up to ``--variants`` prompts in which a random
    subset of ``filtered_info_list`` is left out. All prompts are sent through
    ``async_http_process_requests`` and each response is written to
    ``--output_file`` together with a deep copy of the source record extended
    with a ``'missed_information'`` key listing the omitted entries.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('--input_file', type=str, required=True)
    cli.add_argument('--output_file', type=str, required=True)
    cli.add_argument('--task_level', type=int, default=1)
    cli.add_argument('--max_queries', type=int, default=-1)
    cli.add_argument('--variants', type=int, default=1)
    cli.add_argument('--seed', type=int, default=42)
    args = cli.parse_args()

    # Seed first so both the record shuffle and the per-record shuffles are reproducible.
    random.seed(args.seed)

    with jsonlines.open(args.input_file) as reader:
        data = list(reader)

    # Task-level data requires at least 3 entries in the filtered info list.
    if args.task_level != 1:
        data = [record for record in data if len(record['filtered_info_list']) >= 3]

    # -1 means "no limit"; any other value takes a random sample via shuffle + slice.
    if args.max_queries != -1:
        random.shuffle(data)
        data = data[:args.max_queries]

    all_queries = []
    all_metadata = []
    for item in data:
        goal = item['goal']
        information = item['filtered_info_list']
        background = item['background']

        for dropped in _choose_deletions(information, args.task_level, args.variants):
            # Split the entries into those kept in the prompt and those withheld.
            kept_info = []
            missed_info = []
            for position, entry in enumerate(information):
                if position in dropped:
                    missed_info.append(entry)
                else:
                    kept_info.append(entry)

            # Each variant gets its own copy so 'missed_information' does not leak between variants.
            variant_meta = copy.deepcopy(item)

            if kept_info:
                information_text = list_to_string(kept_info, '-')
            else:
                information_text = "(empty)"
            prompt = template.format(
                original_question=item['raw_task'],
                goal=goal,
                background=background,
                information=information_text,
            )

            variant_meta['missed_information'] = missed_info
            all_queries.append(prompt)
            all_metadata.append(variant_meta)

    all_requests = simple_promptify(all_queries)
    # NOTE(review): the meaning of 0.6 and 250 is defined by config_model - confirm there.
    model_config = config_model(config_aliyun, 'deepseek-r1', 0.6, 250)
    raw_results = async_http_process_requests(all_requests, model_config)

    # Results are paired with metadata by position; only element 0 of each result is kept.
    # NOTE(review): assumes results come back in request order - confirm against the helper.
    all_res = []
    for res, metadata in zip(raw_results, all_metadata):
        all_res.append(parse_results(res[0], metadata))

    with jsonlines.open(args.output_file, 'w') as writer:
        writer.write_all(all_res)



# Run the pipeline only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
