"""
In this file, we extract the questions from the dataset and save them in a separate file.
Questions are at forecastbench-datasets/datasets/question_sets/*-llm.json files.


There is a `questions` array in each such file. It has entries of the following kind (example):

{
    "id": "njIsNltHOS7eR4ZUgaXX",
    "source": "manifold",
    "question": "Will a New York sports team win a championship by the end of the 2025 sports season?",
    "resolution_criteria": "Resolves to the outcome of the question found at XXXX.",
    "background": "This market will resolve YES if one of the following major New York metropolitan area-based teams wins their respective championship by the conclusion of the 2025 NFL season (Latest Feb 9, 2026 \u2013 Super Bowl 60)\n\nNew York Mets \u2013  World Series \n\nNew York Yankees \u2013 World Series\n\nNew York Giants \u2013 Super Bowl\n\nNew York Jets \u2013 Super Bowl \n\nNew York Red Bulls \u2013 MLS Cup or Leagues Cup\n\nNew York City FC \u2013 MLS Cup or Leagues Cup\n\nNew York Nets \u2013 NBA Final\n\nNew York Knicks \u2013 NBA Final\n\nNew York Rangers \u2013 Stanley Cup\n\nNew York Islanders \u2013 Stanley Cup\n\nThe two New York football teams, although not physically based in the NY Metropolitan area, will be considered New York teams for the purpose of the market. \n\nOther small-market teams, such as the WNBA's NY Liberty, MLR New York Rugby, or NLL Lacrosse will not be counted.",
    "market_info_open_datetime": "2023-09-12T20:58:36+00:00",
    "market_info_close_datetime": "2026-02-10T04:59:00+00:00",
    "market_info_resolution_criteria": "N/A",
    "url": "XXXX",
    "freeze_datetime": "2024-11-28T00:00:00+00:00",
    "freeze_datetime_value": "0.325439612073177",
    "freeze_datetime_value_explanation": "The market value.",
    "source_intro": "We would like you to predict the outcome of a prediction market. A prediction market, in this context, is the aggregate of predictions submitted by users on the website Manifold. You're going to predict the probability that the market will resolve as 'Yes'.",
    "combination_of": "N/A",
    "resolution_dates": "N/A"
},

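Extract all the questions whose source is one of the trusted sources (manifold, metaculus, infer), whose market_info_close_datetime is before Feb 1, 2025, and whose "combination_of" is "N/A". Only question-set files whose names start with "2024" are read.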

"""


import json
import os
from datetime import datetime
import pandas as pd

# import library for pretty print 
from pprint import pprint

# Load the question sets.
dataset_path = 'forecastbench-datasets/datasets/question_sets/'
# os.listdir() returns names in arbitrary order. Sort them so that, when a
# question appears in several files, the copy that is kept (the first one
# encountered) is the same on every run.
files = sorted(os.listdir(dataset_path))

# Extract the questions
questions = []
trusted_sources = ['manifold', 'metaculus', 'infer']
IDs = []  # ids of the kept questions, in the same order as `questions`
CUTOFF_DATE = '2025-02-01'

# Set mirror of the string ids stored in IDs, used for O(1) duplicate checks.
seen_ids = set()

for file in files:
    # Only the LLM question sets whose file names start with 2024 are read.
    if not file.endswith('-llm.json') or not file.startswith('2024'):
        continue

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(os.path.join(dataset_path, file), encoding='utf-8') as f:
        data = json.load(f)

    for question in data['questions']:
        if question['source'] not in trusted_sources:
            continue

        question_id = question['id']
        # Ids that are not plain strings may be unhashable (presumably lists
        # on combination entries -- TODO confirm), so those fall back to the
        # linear scan of IDs instead of the set lookup.
        if isinstance(question_id, str):
            is_duplicate = question_id in seen_ids
        else:
            is_duplicate = question_id in IDs
        if is_duplicate:
            continue

        close_datetime = question['market_info_close_datetime']
        # Plain string comparison: ISO-8601 timestamps order chronologically
        # as text (assuming they all use the same UTC offset -- TODO confirm).
        # A placeholder such as 'N/A' never passes, since 'N' sorts after
        # the digits.
        if close_datetime >= CUTOFF_DATE:
            continue

        if question['combination_of'] != 'N/A':
            continue

        # Keep the date part of the close timestamp (text before the 'T')
        # as a separate field.
        question['close_date'] = close_datetime.split('T')[0]
        questions.append(question)
        IDs.append(question_id)
        if isinstance(question_id, str):
            seen_ids.add(question_id)
        # pprint(question)

print(f"Total questions: {len(questions)}")
print(f"Total IDs: {len(IDs)}")
# pprint(IDs)
assert len(IDs) == len(questions), "Length mismatch between IDs and questions"

# Print the duplicate IDs
# df = pd.DataFrame(IDs, columns=['ID'])
# df['is_duplicate'] = df.duplicated()
# print(df[df['is_duplicate']])
# assert not df['is_duplicate'].any(), "Duplicate IDs found"


# Question 11164 is BROKEN?


# Next, go over the resolution sets in forecastbench-datasets/datasets/resolution_sets/*.json.
# Each file has a 'resolutions' array in its JSON, with entries of the following kind (example):
#  {
#             "id": "66b6a2d010663acd0151084028b848f1fabb2f0104caef851775a7cb75e634b2",
#             "source": "acled",
#             "direction": null,
#             "resolution_date": "2024-07-28",
#             "resolved_to": 0.0,
#             "resolved": true
#         },
# Build a dictionary that maps each question ID to its resolution entry (a dictionary shaped like the example above).


# Load the resolution sets.
dataset_path = 'forecastbench-datasets/datasets/resolution_sets/'
# os.listdir() returns names in arbitrary order, and a later file overwrites
# an earlier file's entry for the same id. Sorting makes the winner the same
# on every run: the entry from the alphabetically last file (the newest set
# if the file names start with a date -- TODO confirm naming).
files = sorted(os.listdir(dataset_path))

# Build the lookup set once instead of scanning the IDs list for every
# resolution entry. Only string ids go in the set; anything else is handled
# by the list fallback below.
wanted_ids = {question_id for question_id in IDs if isinstance(question_id, str)}

# Extract the resolutions: question id -> resolution entry.
resolutions = {}
for file in files:
    if not file.endswith('.json'):
        continue

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(os.path.join(dataset_path, file), encoding='utf-8') as f:
        data = json.load(f)

    for resolution in data['resolutions']:
        resolution_id = resolution['id']
        # Ids that are not plain strings may be unhashable (presumably lists
        # on combination entries -- TODO confirm); a set lookup would raise
        # TypeError for them, so they use the linear scan of IDs instead.
        if isinstance(resolution_id, str):
            is_wanted = resolution_id in wanted_ids
        else:
            is_wanted = resolution_id in IDs
        if is_wanted:
            # Within and across files, the last entry seen for an id wins.
            resolutions[resolution_id] = resolution
            # pprint(resolution)

print(f"Total resolutions: {len(resolutions)}")
# assert len(resolutions) == len(questions), "Length mismatch between resolutions and questions"

# Copy the fields of each question's resolution entry into the question.
# Keys the question already has are left untouched, so only keys that are
# new to the question are added.
for entry in questions:
    record = resolutions.get(entry['id'])
    if record is None:
        # No resolution entry was collected for this question.
        continue

    for field, value in record.items():
        entry.setdefault(field, value)

# Print those questions which do not have resolution data
# for question in questions:
#     if 'resolved' not in question:
#         pprint(question)

# Keep only the questions that now carry a 'resolved' key, i.e. those for
# which resolution data was merged in above.
questions = [entry for entry in questions if 'resolved' in entry]
print(f"Total questions with resolution data: {len(questions)}")

# Save these questions in a separate file (.json)
OUTPUT_DIR = '/fast/XXXX-3/forecasting/datasets/forecast-bench'

# Build the file name from CUTOFF_DATE and the trusted sources.
file_name = f'resolved_by_{CUTOFF_DATE}__{"_".join(trusted_sources)}.json'
print(f"Saving the questions in {file_name}")

# TODO: shuffle the questions (not implemented; the list keeps its load order).

# Print one sample question as a sanity check. Guard the index so the script
# does not crash with IndexError when fewer than three questions remain.
if len(questions) > 2:
    print(questions[2])
else:
    print(f"Fewer than 3 questions available ({len(questions)}); no sample to print")

# Save the questions to the file.
# NOTE: the write is currently disabled, so nothing is saved despite the
# "Saving the questions" message printed above.
# with open(os.path.join(OUTPUT_DIR, file_name), 'w') as f:
#     json.dump(questions, f, indent=4)