# coding=utf-8
# Copyright 2022 The Conceptual Learning Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Conceptual SCAN dataset suite spec definitions."""

from typing import Mapping, Sequence, Tuple

import immutabledict

from conceptual_learning.cscan import inputs

# Names of the dataset specs used in the main leaderboard and breakdown
# analyses. The 'main' suite consists of exactly these, and the 'full' suite
# starts with them.
_DATASET_SPECS_MAIN: Tuple[str, ...] = (
    'base',
    'extended',  # 14_rules_output_size_6_8_40
    'base_mcd',
    'extended_mcd',
)

# Base-grammar dataset specs used for the dataset size effect comparison.
# `base` itself takes part in that comparison too, but it is deliberately left
# out here because it already belongs to the main datasets.
_DATA_SIZE_EFFECT_BASE_SPECS_EXCEPT_MAIN: Tuple[str, ...] = (
    'base_100_contexts',
    'base_5000_contexts',
    'base_8K_contexts',
)

# Extended-grammar dataset specs used for the dataset size effect comparison.
# `extended` itself takes part in that comparison too, but it is deliberately
# left out here because it already belongs to the main datasets.
_DATA_SIZE_EFFECT_EXTENDED_SPECS_EXCEPT_MAIN: Tuple[str, ...] = (
    'extended_100_contexts',
    'extended_5000_contexts',
    'extended_8K_contexts',
)

# Base-grammar dataset specs used for the contexts diversity effect
# comparison. `base` itself takes part in that comparison too, but it is
# deliberately left out here because it already belongs to the main datasets.
_CONTEXTS_DIVERSITY_EFFECT_BASE_SPECS_EXCEPT_MAIN: Tuple[str, ...] = (
    'base_100_contexts_1000_examples',
    'base_10K_contexts_10_examples',
)

# Extended-grammar dataset specs used for the contexts diversity effect
# comparison. `extended` itself takes part in that comparison too, but it is
# deliberately left out here because it already belongs to the main datasets.
_CONTEXTS_DIVERSITY_EFFECT_EXTENDED_SPECS_EXCEPT_MAIN: Tuple[str, ...] = (
    'extended_100_contexts_1000_examples',
    'extended_10K_contexts_10_examples',
)

# Dataset specs used in side experiments but not in the main leaderboard.
# Built by flattening the four per-experiment tuples, in the order listed, so
# a spec added to any of them is picked up here automatically.
_DATASET_SPECS_FULL_EXCEPT_MAIN: Tuple[str, ...] = (
    # Effect of data size (comparison with base).
    *_DATA_SIZE_EFFECT_BASE_SPECS_EXCEPT_MAIN,

    # Effect of data size (comparison with extended).
    *_DATA_SIZE_EFFECT_EXTENDED_SPECS_EXCEPT_MAIN,

    # Effect of diversity of contexts (comparison with base).
    *_CONTEXTS_DIVERSITY_EFFECT_BASE_SPECS_EXCEPT_MAIN,

    # Effect of diversity of contexts (comparison with extended).
    *_CONTEXTS_DIVERSITY_EFFECT_EXTENDED_SPECS_EXCEPT_MAIN,
)

# Collection of all valid dataset suite specs.
#
# Each entry names a suite (`id`), explains its purpose (`description`) and
# lists the dataset spec names it contains (`dataset_specs`). Every suite
# passes `dataset_specs` as its own list -- including the suites that are
# defined by one of the module-level spec tuples -- so that the container type
# is uniform across suites and no suite holds a reference to a shared
# module-level sequence.
_DATASET_SUITE_SPECS: Sequence[inputs.DatasetSuiteSpec] = (
    inputs.DatasetSuiteSpec(
        id='full',
        description=(
            'The full suite of datasets that we might possibly use in the '
            'experiments in the paper. Subsumes most of the other suites.'),
        # Main datasets first, followed by the side-experiment datasets.
        dataset_specs=list(
            _DATASET_SPECS_MAIN + _DATASET_SPECS_FULL_EXCEPT_MAIN)),
    inputs.DatasetSuiteSpec(
        id='full_except_main',
        description=(
            'The full suite of datasets used in the experiments in the paper, '
            'except for the dataset in the main suite. Experiments using this '
            'suite can be safely run in parallel with experiments on the main '
            'suite, since they are disjoint.'),
        dataset_specs=list(_DATASET_SPECS_FULL_EXCEPT_MAIN)),
    inputs.DatasetSuiteSpec(
        id='main',
        description=(
            'Just the datasets used for the main leaderboard and breakdown '
            'metrics'),
        dataset_specs=list(_DATASET_SPECS_MAIN)),
    inputs.DatasetSuiteSpec(
        id='base_random_and_mcd',
        description=(
            'The datasets from the leaderboard based on the base rule space.'),
        dataset_specs=[
            'base',
            'base_mcd',
        ]),
    # The "effect of ..." suites below come in pairs: a hand-written list that
    # includes the main dataset (`base` or `extended`), and an `_except_main`
    # variant built from the corresponding module-level tuple.
    inputs.DatasetSuiteSpec(
        id='effect_of_data_size_base',
        description=('Datasets needed for evaluating effect of data size '
                     '(comparison with base).'),
        dataset_specs=[
            'base_100_contexts',
            'base',
            'base_5000_contexts',
            'base_8K_contexts',
        ]),
    inputs.DatasetSuiteSpec(
        id='effect_of_data_size_base_except_main',
        description=('Datasets needed for evaluating effect of data size '
                     '(comparison with base) excluding `base`.'),
        dataset_specs=list(_DATA_SIZE_EFFECT_BASE_SPECS_EXCEPT_MAIN)),
    inputs.DatasetSuiteSpec(
        id='effect_of_data_size_extended',
        description=('Datasets needed for evaluating effect of data size '
                     '(comparison with extended).'),
        dataset_specs=[
            'extended_100_contexts',
            'extended',
            'extended_5000_contexts',
            'extended_8K_contexts',
        ]),
    inputs.DatasetSuiteSpec(
        id='effect_of_data_size_extended_except_main',
        description=('Datasets needed for evaluating effect of data size '
                     '(comparison with extended) excluding `extended`.'),
        dataset_specs=list(_DATA_SIZE_EFFECT_EXTENDED_SPECS_EXCEPT_MAIN)),
    inputs.DatasetSuiteSpec(
        id='effect_of_diversity_of_contexts_base',
        description=('Datasets needed for evaluating effect of diversity of '
                     'contexts (100K examples total, using base).'),
        dataset_specs=[
            'base_100_contexts_1000_examples',
            'base',
            'base_10K_contexts_10_examples',
        ]),
    inputs.DatasetSuiteSpec(
        id='effect_of_diversity_of_contexts_base_except_main',
        description=('Datasets needed for evaluating effect of diversity of '
                     'contexts (100K examples total, using the base grammar.) '
                     'excluding `base`.'),
        dataset_specs=list(_CONTEXTS_DIVERSITY_EFFECT_BASE_SPECS_EXCEPT_MAIN)),
    inputs.DatasetSuiteSpec(
        id='effect_of_diversity_of_contexts_extended',
        description=('Datasets needed for evaluating effect of diversity of '
                     'contexts (100K examples total, using extended).'),
        dataset_specs=[
            'extended_100_contexts_1000_examples',
            'extended',
            'extended_10K_contexts_10_examples',
        ]),
    inputs.DatasetSuiteSpec(
        id='effect_of_diversity_of_contexts_extended_except_main',
        description=('Datasets needed for evaluating effect of diversity of '
                     'contexts (100K examples total, using the extended '
                     'grammar) excluding `extended`.'),
        dataset_specs=list(
            _CONTEXTS_DIVERSITY_EFFECT_EXTENDED_SPECS_EXCEPT_MAIN)),
    inputs.DatasetSuiteSpec(
        id='100_contexts',
        description=(
            'A single smaller dataset containing 100 train contexts, for use '
            'in quickly verifying end-to-end benchmark generation '
            'functionality including splitting.'),
        dataset_specs=['base_100_contexts']),
    inputs.DatasetSuiteSpec(
        id='test',
        description=(
            'A small suite suitable for use in automated testing of the '
            'benchmark generation pipeline. Not used in the paper.'),
        dataset_specs=[
            'small_with_validation_and_test',
            'small_with_unreliable_and_omitted',
        ]),
)

# Immutable lookup table from dataset suite spec ID to the spec itself, with
# one entry per element of _DATASET_SUITE_SPECS. Should two specs ever share
# an ID, the later one silently replaces the earlier one.
DATASET_SUITE_SPEC_BY_ID: Mapping[str, inputs.DatasetSuiteSpec] = (
    immutabledict.immutabledict({
        suite_spec.id: suite_spec for suite_spec in _DATASET_SUITE_SPECS
    }))
