import argparse
import jsonlines
from tqdm import tqdm
from datasets import load_dataset
from common.utils import async_http_process_requests, simple_promptify
from common.model_configs import config_model, config_aliyun

# Prompt asking the model to split a task description into "Goal",
# "Necessary Information", and "Background" by copying text verbatim.
# `{problem}` is the only placeholder; it is filled with `str.format` in
# main(), so any literal braces added to this text must be doubled (`{{`, `}}`).
template = """Given a TEXT, its "Goal", "Necessary Information", and "Background" are as follows:
- "Goal" refers to the task that the TEXT requires to solve. "Goal" should be short and concise.
- "Necessary Information" and "Background" refer to everything apart from the Goal, including data, facts, examples, etc. Among them:
-- "Necessary Information" refers to the contents that are strictly needed to accomplish the Goal.
-- "Background" refers to the other contents that can be ignored, such as examples and descriptions.

Based on the above definitions, please SPLIT the following TEXT into "Goal", "Necessary Information", and "Background".

***
TEXT:

{problem}
***

If Background is None, leave Background empty.
Split long Necessary Information to short items.
For "Necessary Information", provide a numbered list.

**Requirements**
1. **You should EXACTLY COPY contents from the TEXT to Goal, Necessary Information, and Background!!!**
2. **DO NOT ADD, REWRITE, REPHRASE, ANY CONTENT WHEN COPYING TEXT to YOUR OUTPUTS!!!**
3. **KEEP ALL SYMBOLS, such as EMPTY LINES, SPACES, MATH SYMBOLS, and FORMATTING SYMBOLS WHEN COPYING TEXT to YOUR OUTPUTS!!!**
4. **Goal, Necessary Information, and Background SHOULD NOT OVERLAP!!!**
5. **Goal, Necessary Information, and Background SHOULD COVER ALL contents of the TEXT!!!**
"""

# Alternative prompt variant, currently disabled (kept for reference only):
# template = """
# Given a Problem, its "Goal", "Necessary Premise", and "Background" are defined as follows:
# - "Goal" refers to the result that the problem requires to infer. "Goal" should be short and concise.
# - "Necessary Premise" and "Background" refer to everything apart from the "Goal" in the problem:
#   - "Necessary Premise" refers to the contents that are strictly needed to accomplish the "Goal".
#   - "Background" refers to the other contents that can be ignored and does not affect the accomplishment of the Goal, such as examples and descriptions.
#
# Based on the above definitions, please split the following Problem into "Goal", "Necessary Premise", and "Background".
#
# **********************************
# Problem:
# {problem}
# **********************************
#
# If Background is None, leave it empty.
# Split long Necessary Premise to short items.
# For "Necessary Premise", provide a numbered list.
#
# Strictly follow the requirement below.
#
# **Requirements**
# 1. Goal, Necessary Premise, and Background should not overlap.
# 2. Goal, Necessary Premise, and Background should cover all contents in the Problem.
# 3. You should exactly copy contents from the Problem as your outputs of Goal, Necessary Premise, and Background.
# 4. Do not add, rewrite, rephrase any content.
# 5. In your output, keep all symbols the same as the Problem, such as empty lines, spaces, math symbols, and formatting symbols.
# """


def parse_results(response, metadata):
    """Combine one model response with the metadata of the record it came from.

    Args:
        response: The model output to store under ``'decomposed_task'``.
        metadata: Indexable holding, at positions 0-3, the raw task text,
            the solution, the answer, and the level of the source record.

    Returns:
        A dict with the keys ``'decomposed_task'``, ``'raw_task'``,
        ``'solution'``, ``'answer'`` and ``'level'``.

    Raises:
        IndexError: If ``metadata`` is a sequence with fewer than four items.
    """
    # Index explicitly (rather than unpacking the whole sequence) so that
    # metadata carrying extra trailing fields is still accepted.
    raw_task, solution, answer, level = (metadata[pos] for pos in range(4))
    return {
        'decomposed_task': response,
        'raw_task': raw_task,
        'solution': solution,
        'answer': answer,
        'level': level,
    }


def _derive_output_path(input_file):
    """Return ``<input path without its extension>_parsed.jsonl``.

    Only the extension of the final path component is stripped, so dots in
    directory names (``./data/in.jsonl``, ``runs.v2/in``) and inputs without
    any extension keep their full stem.
    """
    import os  # function-scope import keeps this block self-contained

    stem, _ext = os.path.splitext(input_file)
    return f'{stem}_parsed.jsonl'


def main():
    """Decompose every problem of a JSON dataset with the model and save the results.

    Reads the file given by ``--input_file`` as a JSON dataset whose records
    provide the fields ``problem``, ``solution``, ``answer`` and ``level``,
    renders one prompt per record from ``template``, submits all prompts via
    ``async_http_process_requests`` and writes one JSON line per record to
    ``<input stem>_parsed.jsonl`` next to the input file.

    Raises:
        SystemExit: If the dataset contains no records.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_file', type=str, required=True)
    args = parser.parse_args()

    ds = load_dataset('json', data_files=args.input_file, split='train')
    # Fail with a clear message (and a non-zero exit status) instead of an
    # IndexError further down when there is nothing to process.
    if len(ds) == 0:
        raise SystemExit(f'No records found in {args.input_file}')

    # Build prompts and their per-record metadata in lockstep so that index i
    # of one list always describes index i of the other.
    all_inputs = []
    all_metadata = []
    for item in ds:
        problem = item['problem']
        all_inputs.append(template.format(problem=problem))
        all_metadata.append([problem, item['solution'], item['answer'], item['level']])

    # Show the first rendered prompt as a quick sanity check.
    print(all_inputs[0])

    all_requests = simple_promptify(all_inputs)
    # The positional 0.6 and 300 are forwarded as-is; their meaning is defined
    # by config_model (not visible from this file).
    model_config = config_model(config_aliyun, 'deepseek-r1', 0.6, 300)
    all_res = async_http_process_requests(all_requests, model_config)

    # NOTE(review): assumes the helper returns one result per request, in
    # request order, and that element 0 of each result is the text to keep —
    # confirm against common.utils.
    all_res = [parse_results(res[0], metadata) for res, metadata in zip(all_res, all_metadata)]

    output_file = _derive_output_path(args.input_file)
    with jsonlines.open(output_file, 'w') as writer:
        writer.write_all(all_res)

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
