import os
import re
import tqdm
import json
import time
import random
from typing import Literal
from core.domain.schema import ProblemDomain, Problem, BinaryProblem
from utils.io_utils import load_file

class CMV(ProblemDomain):
    """Change My View (Reddit site for value-laden questions) as a problem domain.

    Every dataset record that carries an ``"op-title"`` is turned into a
    Yes/No :class:`BinaryProblem` whose question is the cleaned-up post title
    and whose ``aux_info["context"]`` is the post body.

    NOTE(review): the dataset is described as offering "delta(s) from OP" as
    ground truth, but this class sets ``correct_option=None`` on every
    problem, so no ground-truth label is attached at load time.
    """

    def __init__(
        self,
        dataset_file: str = "changemymind.json",
        train_size: float = 0.8,
    ):
        """Instantiate a CMV problem set.

        The questions are shuffled with the global ``random`` state and then
        split into a train and a test partition; the sizes of both partitions
        are printed.

        :param dataset_file: dataset filepath relative to `data/questions/`, defaults to "changemymind.json"
        :type dataset_file: str, optional
        :param train_size: the portion of samples to serve as training samples, defaults to 0.8
        :type train_size: float, optional
        :raises ValueError: if ``train_size`` is not within ``[0, 1]``
        """
        # A portion outside [0, 1] would silently produce an empty test split
        # (> 1) or slice with a negative index (< 0), so reject it up front.
        if not 0.0 <= train_size <= 1.0:
            raise ValueError(
                f"train_size must be within [0, 1], got {train_size!r}"
            )

        self.train_size = train_size

        # Resolve the dataset relative to this source file (two directories
        # up) rather than the current working directory, so loading works no
        # matter where the process was started from.
        self.dataset_path = os.path.abspath(
            os.path.join(
                os.path.dirname(__file__), "../..", "data", "questions", dataset_file
            )
        )
        # presumably a mapping of post id -> post record; verify against load_file
        self.dataset_content = load_file(self.dataset_path)

        # Parse questions; records without a title cannot form a question and
        # are skipped.
        options = ("Yes", "No")
        self.questions_all = [
            BinaryProblem(
                id=cmv_id,
                question=self._construct_question(cmv_prob["op-title"], options),
                options=options,
                correct_option=None,
                aux_info={
                    # A record with a title but no body should not abort the
                    # whole load; fall back to an empty context instead.
                    "context": cmv_prob.get("op-text-body", "")
                },
            )
            for cmv_id, cmv_prob in self.dataset_content.items()
            if "op-title" in cmv_prob
        ]
        random.shuffle(self.questions_all)

        # Partition questions: the first `train_size` share of the shuffled
        # list is the training split, the remainder is the test split.
        train_examples = int(len(self.questions_all) * self.train_size)
        self.partitioned_data = {
            "train": self.questions_all[:train_examples],
            "test": self.questions_all[train_examples:],
        }
        print(f"Training set size: {len(self.partitioned_data['train'])}")
        print(f"Test set size: {len(self.partitioned_data['test'])}")

    @staticmethod
    def _construct_question(title: str, options: tuple[str, str]) -> str:
        """Turn a raw post title into a binary question string.

        Appends ``" - <option0> or <option1>?"`` (options lower-cased), removes
        the standalone token "CMV" in any letter case together with the
        punctuation directly attached to it, and collapses whitespace runs.

        :param title: the raw post title
        :param options: the two answer options to present in the question
        :return: the cleaned question text
        """
        title += f" - {options[0].lower()} or {options[1].lower()}?"
        # The word boundaries keep the match to the complete word "CMV";
        # without them, "cmv" embedded in a longer token would be deleted too.
        title = re.sub(r"[.?!:,]*\bcmv\b[.?!:,]*", "", title, flags=re.IGNORECASE)
        title = re.sub(r"\s+", " ", title)
        return title.strip()

    def sample_problems(
        self, n: int = 1, split: Literal["train", "test"] = "train"
    ) -> list[BinaryProblem]:
        """Sample a number of problems from a dataset split.

        The splitting is performed during instantiation. Problems are drawn
        without replacement, and ``shuffle_options()`` is called on each drawn
        problem; its results are what is returned.

        :param n: the number of problems to draw, defaults to 1
        :param split: which partition to draw from, defaults to "train"
        :return: the results of ``shuffle_options()`` for the drawn problems
        :raises ValueError: if ``n`` is negative or exceeds the size of the split
        """
        pool = self.partitioned_data[split]
        # random.sample raises ValueError here as well; raise the same type
        # with a message that names the split and its actual size.
        if not 0 <= n <= len(pool):
            raise ValueError(
                f"Cannot sample {n} problems from the '{split}' split, "
                f"which holds {len(pool)}."
            )

        samples = random.sample(pool, n)
        return [s.shuffle_options() for s in samples]
