import re
import json
# Extracts the last quoted span from every recorded attempt in a dataset JSON file.

# Match a span delimited by a matching pair of double or single quotes.
# Group 1 is the opening quote, group 2 the quoted text (a backslash-escaped
# copy of the delimiter is allowed inside), group 3 the closing quote.
pattern = r'(["\'])((?:\\\1|(?:(?!\1)).)*)(\1)'

# Compiled once at import time so each call reuses the same pattern object.
_QUOTED_SPAN_RE = re.compile(pattern)

# Both typographic double quotes (U+201C left, U+201D right) become a straight
# double quote; otherwise a curly-quoted span never finds its closing delimiter.
_CURLY_TO_STRAIGHT = str.maketrans({"\u201c": '"', "\u201d": '"'})


def extract_format(sentence):
    """Return the text inside the last quoted span of *sentence*.

    Typographic double quotes are first normalized to straight double quotes,
    then every span enclosed in a matching pair of ``"`` or ``'`` is located
    and the contents of the final one are returned (delimiters excluded).

    Args:
        sentence: The text to search.

    Returns:
        The contents of the last quoted span, or ``None`` when *sentence*
        contains no quoted span at all.
    """
    normalized = sentence.translate(_CURLY_TO_STRAIGHT)
    matches = _QUOTED_SPAN_RE.findall(normalized)
    # findall yields (opening quote, contents, closing quote) tuples; guard the
    # empty case so unquoted input does not raise IndexError.
    extracted = matches[-1][1] if matches else None
    print(f"Extracted sentences: {extracted}")
    return extracted


def read_dataset(json_address):
    """Load and return the parsed contents of the JSON file at *json_address*.

    The file is decoded explicitly as UTF-8 instead of the locale's default
    encoding, so non-ASCII content (e.g. typographic quotes) is read the same
    way on every platform.

    Args:
        json_address: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(json_address, encoding="utf-8") as json_file:
        return json.load(json_file)


def extract_pattern_from_dataset(json_address):
    """Load a dataset and attach the extracted format of every attempt.

    The JSON file at *json_address* is read with ``read_dataset``. For each
    entry, ``extract_format`` is applied to every item of the entry's
    ``"all_attempts"`` value and the results are stored, in the same order,
    under a new ``"formats"`` key on that entry.

    Args:
        json_address: Path of the dataset JSON file.

    Returns:
        The loaded dataset, with a ``"formats"`` list added to each entry.
    """
    records = read_dataset(json_address)
    for entry in records.values():
        attempts = entry["all_attempts"]
        entry["formats"] = [extract_format(text) for text in attempts]
    return records


if __name__ == '__main__':
    # Script entry point: extract the formats for one dataset and write the
    # augmented dataset back out as JSON.
    dataset_name = "analytical_entailment"
    source_path = '/Volumes/Academic/Projects/PRoMTd/outputs/analytical_entailment/analytical_entailment_final.json'
    formatted_data = extract_pattern_from_dataset(source_path)

    # The output lands in the relative "outputs" directory, named after the dataset.
    target_path = f'outputs/{dataset_name}_formatted_data.json'
    with open(target_path, 'w') as outfile:
        json.dump(formatted_data, outfile)