import random
from pathlib import Path
from typing import Any
from datasets import load_dataset, Dataset, ClassLabel
from collections import Counter

import yaml
from datasets import Dataset, DatasetDict, concatenate_datasets

from src.data.utils import CustomColName, DatasetConfig
from src.utils.logging_utils import get_logger
from src.utils.pipeline_utils import PipelineStep

# Module-level logger, named after this module via ``__name__``.
logger = get_logger(name=__name__)


class CreateInstagSubset(PipelineStep):
    """Pipeline step that loads one split of a dataset via ``load_dataset``.

    The step is a skeleton in its current form: ``_call`` performs the load
    and then finishes without transforming or returning the result.
    """

    class Config(PipelineStep.Config):
        """Settings for :class:`CreateInstagSubset`, extending the base step config."""

        # Project ``DatasetConfig`` for the source data; not read by the
        # methods of this class as written.
        source_dataset: DatasetConfig
        # Declared but not read by the methods of this class as written.
        # NOTE(review): presumably a Hugging Face access token - confirm.
        hf_token: str
        # Passed as ``split=`` to ``load_dataset`` in ``_call``.
        split_name: str = "train"
        # Declared but not read by the methods of this class as written.
        num_proc: int = 1
        # Seed handed to ``random.seed`` when the step is constructed.
        data_seed: int = 42
        # Declared but not read by the methods of this class as written.
        dataset_cfg_template: Path = Path("config/train/dataset/_dataset_template.yaml")

    def __init__(self, **kwargs) -> None:
        """Build the step from keyword arguments and seed the ``random`` module.

        Args:
            **kwargs: Forwarded unchanged to both ``PipelineStep.__init__`` and
                ``Config``.
        """
        super().__init__(**kwargs)
        self.config = self.Config(**kwargs)
        # This seeds the process-wide ``random`` state, not a private generator.
        seed = self.config.data_seed
        random.seed(seed)

    def _call(self, **kwargs) -> Any:
        """Load the configured split with ``load_dataset``.

        NOTE(review): ``original_dataset_name`` is not among the fields declared
        on ``Config`` above (which declares ``source_dataset`` instead). Unless
        ``PipelineStep.Config`` provides it, this lookup will fail - confirm
        which attribute is intended.

        Args:
            **kwargs: Accepted but not used.

        Returns:
            ``None``; the loaded dataset is currently discarded.
        """
        step_cfg = self.config
        loaded_split = load_dataset(
            step_cfg.original_dataset_name,
            split=step_cfg.split_name,
        )