# ruff: noqa: E501

"""Developer tools.

This module is supposed to be used by authors in their private scripts and notebooks.
This module must not be imported from the main lib/ and bin/.
"""

import datetime
import functools
import json
import warnings

# fmt: off
from copy import deepcopy
from pathlib import Path
from typing import Any

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import lib

# ======================================================================================
# Datasets
# ======================================================================================
# Identifiers of the default datasets. The same strings are used as keys
# of the per-dataset tables defined later in this module.
CHURN = 'churn'
CALIFORNIA = 'california'
HOUSE = 'house'
ADULT = 'adult'
DIAMOND = 'diamond'
OTTO = 'otto'
HIGGS_SMALL = 'higgs-small'
BLACK_FRIDAY = 'black-friday'
COVTYPE2 = 'covtype2'
MICROSOFT = 'microsoft'

# All default datasets. `wrap_dataset_name` leaves these names without a prefix.
DATASETS_DEFAULT = [
    CHURN,
    CALIFORNIA,
    HOUSE,
    ADULT,
    DIAMOND,
    OTTO,
    HIGGS_SMALL,
    BLACK_FRIDAY,
    COVTYPE2,
    MICROSOFT,
]

# Identifiers of the TabReD datasets.
SBERBANK_HOUSING = 'sberbank-housing'
ECOM_OFFERS = 'ecom-offers'
MAPS_ROUTING = 'maps-routing'
HOMESITE_INSURANCE = 'homesite-insurance'
COOKING_TIME = 'cooking-time'
HOMECREDIT_DEFAULT = 'homecredit-default'
DELIVERY_ETA = 'delivery-eta'
WEATHER = 'weather'

# All TabReD datasets. `wrap_dataset_name` prefixes these names with 'tabred/'.
DATASETS_TABRED = [
    SBERBANK_HOUSING,
    ECOM_OFFERS,
    MAPS_ROUTING,
    HOMESITE_INSURANCE,
    COOKING_TIME,
    HOMECREDIT_DEFAULT,
    DELIVERY_ETA,
    WEATHER,
]

# Identifiers of the candidate datasets of the "why" benchmark.
# Each identifier has five dash-separated fields:
#   <task>-<feature kind>-<size>-<split>-<name>
# where <task> is 'classif' or 'regression', <feature kind> is 'cat' or 'num',
# <size> is 'medium' or 'large', and <split> is an integer.
# The code in this module parses identifiers with `.split('-', 4)`,
# so only the last field (<name>) may itself contain dashes.
CLASSIF_CAT_LARGE_0_COVERTYPE = 'classif-cat-large-0-covertype'
CLASSIF_CAT_LARGE_0_ROAD_SAFETY = 'classif-cat-large-0-road-safety'
CLASSIF_CAT_MEDIUM_0_KDDCUP09_UPSELLING = 'classif-cat-medium-0-KDDCup09_upselling'
CLASSIF_CAT_MEDIUM_0_COMPASS = 'classif-cat-medium-0-compass'
CLASSIF_CAT_MEDIUM_0_COVERTYPE = 'classif-cat-medium-0-covertype'
CLASSIF_CAT_MEDIUM_0_ELECTRICITY = 'classif-cat-medium-0-electricity'
CLASSIF_CAT_MEDIUM_0_EYE_MOVEMENTS = 'classif-cat-medium-0-eye_movements'
CLASSIF_CAT_MEDIUM_0_RL = 'classif-cat-medium-0-rl'
CLASSIF_CAT_MEDIUM_0_ROAD_SAFETY = 'classif-cat-medium-0-road-safety'
CLASSIF_CAT_MEDIUM_1_KDDCUP09_UPSELLING = 'classif-cat-medium-1-KDDCup09_upselling'
CLASSIF_CAT_MEDIUM_1_COMPASS = 'classif-cat-medium-1-compass'
CLASSIF_CAT_MEDIUM_1_EYE_MOVEMENTS = 'classif-cat-medium-1-eye_movements'
CLASSIF_CAT_MEDIUM_1_RL = 'classif-cat-medium-1-rl'
CLASSIF_CAT_MEDIUM_2_KDDCUP09_UPSELLING = 'classif-cat-medium-2-KDDCup09_upselling'
CLASSIF_CAT_MEDIUM_2_EYE_MOVEMENTS = 'classif-cat-medium-2-eye_movements'
CLASSIF_CAT_MEDIUM_2_RL = 'classif-cat-medium-2-rl'
CLASSIF_NUM_LARGE_0_HIGGS = 'classif-num-large-0-Higgs'
CLASSIF_NUM_LARGE_0_MINIBOONE = 'classif-num-large-0-MiniBooNE'
CLASSIF_NUM_LARGE_0_COVERTYPE = 'classif-num-large-0-covertype'
CLASSIF_NUM_LARGE_0_JANNIS = 'classif-num-large-0-jannis'
CLASSIF_NUM_MEDIUM_0_HIGGS = 'classif-num-medium-0-Higgs'
CLASSIF_NUM_MEDIUM_0_MAGICTELESCOPE = 'classif-num-medium-0-MagicTelescope'
CLASSIF_NUM_MEDIUM_0_MINIBOONE = 'classif-num-medium-0-MiniBooNE'
CLASSIF_NUM_MEDIUM_0_BANK_MARKETING = 'classif-num-medium-0-bank-marketing'
CLASSIF_NUM_MEDIUM_0_CALIFORNIA = 'classif-num-medium-0-california'
CLASSIF_NUM_MEDIUM_0_COVERTYPE = 'classif-num-medium-0-covertype'
CLASSIF_NUM_MEDIUM_0_CREDIT = 'classif-num-medium-0-credit'
CLASSIF_NUM_MEDIUM_0_ELECTRICITY = 'classif-num-medium-0-electricity'
CLASSIF_NUM_MEDIUM_0_EYE_MOVEMENTS = 'classif-num-medium-0-eye_movements'
CLASSIF_NUM_MEDIUM_0_HOUSE_16H = 'classif-num-medium-0-house_16H'
CLASSIF_NUM_MEDIUM_0_JANNIS = 'classif-num-medium-0-jannis'
CLASSIF_NUM_MEDIUM_0_KDD_IPUMS_LA_97_SMALL = 'classif-num-medium-0-kdd_ipums_la_97-small'
CLASSIF_NUM_MEDIUM_0_PHONEME = 'classif-num-medium-0-phoneme'
CLASSIF_NUM_MEDIUM_0_POL = 'classif-num-medium-0-pol'
CLASSIF_NUM_MEDIUM_0_WINE = 'classif-num-medium-0-wine'
CLASSIF_NUM_MEDIUM_1_MAGICTELESCOPE = 'classif-num-medium-1-MagicTelescope'
CLASSIF_NUM_MEDIUM_1_BANK_MARKETING = 'classif-num-medium-1-bank-marketing'
CLASSIF_NUM_MEDIUM_1_CREDIT = 'classif-num-medium-1-credit'
CLASSIF_NUM_MEDIUM_1_EYE_MOVEMENTS = 'classif-num-medium-1-eye_movements'
CLASSIF_NUM_MEDIUM_1_HOUSE_16H = 'classif-num-medium-1-house_16H'
CLASSIF_NUM_MEDIUM_1_KDD_IPUMS_LA_97_SMALL = 'classif-num-medium-1-kdd_ipums_la_97-small'
CLASSIF_NUM_MEDIUM_1_PHONEME = 'classif-num-medium-1-phoneme'
CLASSIF_NUM_MEDIUM_1_POL = 'classif-num-medium-1-pol'
CLASSIF_NUM_MEDIUM_1_WINE = 'classif-num-medium-1-wine'
CLASSIF_NUM_MEDIUM_2_MAGICTELESCOPE = 'classif-num-medium-2-MagicTelescope'
CLASSIF_NUM_MEDIUM_2_BANK_MARKETING = 'classif-num-medium-2-bank-marketing'
CLASSIF_NUM_MEDIUM_2_EYE_MOVEMENTS = 'classif-num-medium-2-eye_movements'
CLASSIF_NUM_MEDIUM_2_HOUSE_16H = 'classif-num-medium-2-house_16H'
CLASSIF_NUM_MEDIUM_2_KDD_IPUMS_LA_97_SMALL = 'classif-num-medium-2-kdd_ipums_la_97-small'
CLASSIF_NUM_MEDIUM_2_PHONEME = 'classif-num-medium-2-phoneme'
CLASSIF_NUM_MEDIUM_2_POL = 'classif-num-medium-2-pol'
CLASSIF_NUM_MEDIUM_2_WINE = 'classif-num-medium-2-wine'
CLASSIF_NUM_MEDIUM_3_PHONEME = 'classif-num-medium-3-phoneme'
CLASSIF_NUM_MEDIUM_3_WINE = 'classif-num-medium-3-wine'
CLASSIF_NUM_MEDIUM_4_PHONEME = 'classif-num-medium-4-phoneme'
CLASSIF_NUM_MEDIUM_4_WINE = 'classif-num-medium-4-wine'
REGRESSION_CAT_LARGE_0_SGEMM_GPU_KERNEL_PERFORMANCE = 'regression-cat-large-0-SGEMM_GPU_kernel_performance'
REGRESSION_CAT_LARGE_0_BLACK_FRIDAY = 'regression-cat-large-0-black_friday'
REGRESSION_CAT_LARGE_0_DIAMONDS = 'regression-cat-large-0-diamonds'
REGRESSION_CAT_LARGE_0_NYC_TAXI_GREEN_DEC_2016 = 'regression-cat-large-0-nyc-taxi-green-dec-2016'
REGRESSION_CAT_LARGE_0_PARTICULATE_MATTER_UKAIR_2017 = 'regression-cat-large-0-particulate-matter-ukair-2017'
REGRESSION_CAT_MEDIUM_0_BIKE_SHARING_DEMAND = 'regression-cat-medium-0-Bike_Sharing_Demand'
REGRESSION_CAT_MEDIUM_0_BRAZILIAN_HOUSES = 'regression-cat-medium-0-Brazilian_houses'
REGRESSION_CAT_MEDIUM_0_MERCEDES_BENZ_GREENER_MANUFACTURING = 'regression-cat-medium-0-Mercedes_Benz_Greener_Manufacturing'
REGRESSION_CAT_MEDIUM_0_ONLINENEWSPOPULARITY = 'regression-cat-medium-0-OnlineNewsPopularity'
REGRESSION_CAT_MEDIUM_0_SGEMM_GPU_KERNEL_PERFORMANCE = 'regression-cat-medium-0-SGEMM_GPU_kernel_performance'
REGRESSION_CAT_MEDIUM_0_ANALCATDATA_SUPREME = 'regression-cat-medium-0-analcatdata_supreme'
REGRESSION_CAT_MEDIUM_0_BLACK_FRIDAY = 'regression-cat-medium-0-black_friday'
REGRESSION_CAT_MEDIUM_0_DIAMONDS = 'regression-cat-medium-0-diamonds'
REGRESSION_CAT_MEDIUM_0_HOUSE_SALES = 'regression-cat-medium-0-house_sales'
REGRESSION_CAT_MEDIUM_0_NYC_TAXI_GREEN_DEC_2016 = 'regression-cat-medium-0-nyc-taxi-green-dec-2016'
REGRESSION_CAT_MEDIUM_0_PARTICULATE_MATTER_UKAIR_2017 = 'regression-cat-medium-0-particulate-matter-ukair-2017'
REGRESSION_CAT_MEDIUM_0_VISUALIZING_SOIL = 'regression-cat-medium-0-visualizing_soil'
REGRESSION_CAT_MEDIUM_0_YPROP_4_1 = 'regression-cat-medium-0-yprop_4_1'
REGRESSION_CAT_MEDIUM_1_BIKE_SHARING_DEMAND = 'regression-cat-medium-1-Bike_Sharing_Demand'
REGRESSION_CAT_MEDIUM_1_BRAZILIAN_HOUSES = 'regression-cat-medium-1-Brazilian_houses'
REGRESSION_CAT_MEDIUM_1_MERCEDES_BENZ_GREENER_MANUFACTURING = 'regression-cat-medium-1-Mercedes_Benz_Greener_Manufacturing'
REGRESSION_CAT_MEDIUM_1_ANALCATDATA_SUPREME = 'regression-cat-medium-1-analcatdata_supreme'
REGRESSION_CAT_MEDIUM_1_VISUALIZING_SOIL = 'regression-cat-medium-1-visualizing_soil'
REGRESSION_CAT_MEDIUM_1_YPROP_4_1 = 'regression-cat-medium-1-yprop_4_1'
REGRESSION_CAT_MEDIUM_2_BRAZILIAN_HOUSES = 'regression-cat-medium-2-Brazilian_houses'
REGRESSION_CAT_MEDIUM_2_MERCEDES_BENZ_GREENER_MANUFACTURING = 'regression-cat-medium-2-Mercedes_Benz_Greener_Manufacturing'
REGRESSION_CAT_MEDIUM_2_ANALCATDATA_SUPREME = 'regression-cat-medium-2-analcatdata_supreme'
REGRESSION_CAT_MEDIUM_2_VISUALIZING_SOIL = 'regression-cat-medium-2-visualizing_soil'
REGRESSION_CAT_MEDIUM_2_YPROP_4_1 = 'regression-cat-medium-2-yprop_4_1'
REGRESSION_CAT_MEDIUM_3_MERCEDES_BENZ_GREENER_MANUFACTURING = 'regression-cat-medium-3-Mercedes_Benz_Greener_Manufacturing'
REGRESSION_CAT_MEDIUM_3_ANALCATDATA_SUPREME = 'regression-cat-medium-3-analcatdata_supreme'
REGRESSION_CAT_MEDIUM_4_MERCEDES_BENZ_GREENER_MANUFACTURING = 'regression-cat-medium-4-Mercedes_Benz_Greener_Manufacturing'
REGRESSION_CAT_MEDIUM_4_ANALCATDATA_SUPREME = 'regression-cat-medium-4-analcatdata_supreme'
REGRESSION_NUM_LARGE_0_DIAMONDS = 'regression-num-large-0-diamonds'
REGRESSION_NUM_LARGE_0_NYC_TAXI_GREEN_DEC_2016 = 'regression-num-large-0-nyc-taxi-green-dec-2016'
REGRESSION_NUM_LARGE_0_YEAR = 'regression-num-large-0-year'
REGRESSION_NUM_MEDIUM_0_AILERONS = 'regression-num-medium-0-Ailerons'
REGRESSION_NUM_MEDIUM_0_BIKE_SHARING_DEMAND = 'regression-num-medium-0-Bike_Sharing_Demand'
REGRESSION_NUM_MEDIUM_0_BRAZILIAN_HOUSES = 'regression-num-medium-0-Brazilian_houses'
REGRESSION_NUM_MEDIUM_0_MIAMIHOUSING2016 = 'regression-num-medium-0-MiamiHousing2016'
REGRESSION_NUM_MEDIUM_0_CALIFORNIA = 'regression-num-medium-0-california'
REGRESSION_NUM_MEDIUM_0_CPU_ACT = 'regression-num-medium-0-cpu_act'
REGRESSION_NUM_MEDIUM_0_DIAMONDS = 'regression-num-medium-0-diamonds'
REGRESSION_NUM_MEDIUM_0_ELEVATORS = 'regression-num-medium-0-elevators'
REGRESSION_NUM_MEDIUM_0_FIFA = 'regression-num-medium-0-fifa'
REGRESSION_NUM_MEDIUM_0_HOUSE_16H = 'regression-num-medium-0-house_16H'
REGRESSION_NUM_MEDIUM_0_HOUSE_SALES = 'regression-num-medium-0-house_sales'
REGRESSION_NUM_MEDIUM_0_HOUSES = 'regression-num-medium-0-houses'
REGRESSION_NUM_MEDIUM_0_ISOLET = 'regression-num-medium-0-isolet'
REGRESSION_NUM_MEDIUM_0_MEDICAL_CHARGES = 'regression-num-medium-0-medical_charges'
REGRESSION_NUM_MEDIUM_0_NYC_TAXI_GREEN_DEC_2016 = 'regression-num-medium-0-nyc-taxi-green-dec-2016'
REGRESSION_NUM_MEDIUM_0_POL = 'regression-num-medium-0-pol'
REGRESSION_NUM_MEDIUM_0_SULFUR = 'regression-num-medium-0-sulfur'
REGRESSION_NUM_MEDIUM_0_SUPERCONDUCT = 'regression-num-medium-0-superconduct'
REGRESSION_NUM_MEDIUM_0_WINE_QUALITY = 'regression-num-medium-0-wine_quality'
REGRESSION_NUM_MEDIUM_0_YEAR = 'regression-num-medium-0-year'
REGRESSION_NUM_MEDIUM_1_AILERONS = 'regression-num-medium-1-Ailerons'
REGRESSION_NUM_MEDIUM_1_BIKE_SHARING_DEMAND = 'regression-num-medium-1-Bike_Sharing_Demand'
REGRESSION_NUM_MEDIUM_1_BRAZILIAN_HOUSES = 'regression-num-medium-1-Brazilian_houses'
REGRESSION_NUM_MEDIUM_1_MIAMIHOUSING2016 = 'regression-num-medium-1-MiamiHousing2016'
REGRESSION_NUM_MEDIUM_1_CPU_ACT = 'regression-num-medium-1-cpu_act'
REGRESSION_NUM_MEDIUM_1_ELEVATORS = 'regression-num-medium-1-elevators'
REGRESSION_NUM_MEDIUM_1_FIFA = 'regression-num-medium-1-fifa'
REGRESSION_NUM_MEDIUM_1_ISOLET = 'regression-num-medium-1-isolet'
REGRESSION_NUM_MEDIUM_1_POL = 'regression-num-medium-1-pol'
REGRESSION_NUM_MEDIUM_1_SULFUR = 'regression-num-medium-1-sulfur'
REGRESSION_NUM_MEDIUM_1_WINE_QUALITY = 'regression-num-medium-1-wine_quality'
REGRESSION_NUM_MEDIUM_2_AILERONS = 'regression-num-medium-2-Ailerons'
REGRESSION_NUM_MEDIUM_2_BRAZILIAN_HOUSES = 'regression-num-medium-2-Brazilian_houses'
REGRESSION_NUM_MEDIUM_2_MIAMIHOUSING2016 = 'regression-num-medium-2-MiamiHousing2016'
REGRESSION_NUM_MEDIUM_2_CPU_ACT = 'regression-num-medium-2-cpu_act'
REGRESSION_NUM_MEDIUM_2_ISOLET = 'regression-num-medium-2-isolet'
REGRESSION_NUM_MEDIUM_2_SULFUR = 'regression-num-medium-2-sulfur'
REGRESSION_NUM_MEDIUM_2_WINE_QUALITY = 'regression-num-medium-2-wine_quality'
# fmt: on

# The unfiltered list of the "why" benchmark candidates.
# `_filter_why_datasets` iterates over this list to build DATASETS_WHY;
# the order of this list determines the order of the filtered result.
_DATASETS_WHY_RAW = [
    CLASSIF_CAT_LARGE_0_COVERTYPE,
    CLASSIF_CAT_LARGE_0_ROAD_SAFETY,
    CLASSIF_CAT_MEDIUM_0_KDDCUP09_UPSELLING,
    CLASSIF_CAT_MEDIUM_0_COMPASS,
    CLASSIF_CAT_MEDIUM_0_COVERTYPE,
    CLASSIF_CAT_MEDIUM_0_ELECTRICITY,
    CLASSIF_CAT_MEDIUM_0_EYE_MOVEMENTS,
    CLASSIF_CAT_MEDIUM_0_RL,
    CLASSIF_CAT_MEDIUM_0_ROAD_SAFETY,
    CLASSIF_CAT_MEDIUM_1_KDDCUP09_UPSELLING,
    CLASSIF_CAT_MEDIUM_1_COMPASS,
    CLASSIF_CAT_MEDIUM_1_EYE_MOVEMENTS,
    CLASSIF_CAT_MEDIUM_1_RL,
    CLASSIF_CAT_MEDIUM_2_KDDCUP09_UPSELLING,
    CLASSIF_CAT_MEDIUM_2_EYE_MOVEMENTS,
    CLASSIF_CAT_MEDIUM_2_RL,
    CLASSIF_NUM_LARGE_0_HIGGS,
    CLASSIF_NUM_LARGE_0_MINIBOONE,
    CLASSIF_NUM_LARGE_0_COVERTYPE,
    CLASSIF_NUM_LARGE_0_JANNIS,
    CLASSIF_NUM_MEDIUM_0_HIGGS,
    CLASSIF_NUM_MEDIUM_0_MAGICTELESCOPE,
    CLASSIF_NUM_MEDIUM_0_MINIBOONE,
    CLASSIF_NUM_MEDIUM_0_BANK_MARKETING,
    CLASSIF_NUM_MEDIUM_0_CALIFORNIA,
    CLASSIF_NUM_MEDIUM_0_COVERTYPE,
    CLASSIF_NUM_MEDIUM_0_CREDIT,
    CLASSIF_NUM_MEDIUM_0_ELECTRICITY,
    CLASSIF_NUM_MEDIUM_0_EYE_MOVEMENTS,
    CLASSIF_NUM_MEDIUM_0_HOUSE_16H,
    CLASSIF_NUM_MEDIUM_0_JANNIS,
    CLASSIF_NUM_MEDIUM_0_KDD_IPUMS_LA_97_SMALL,
    CLASSIF_NUM_MEDIUM_0_PHONEME,
    CLASSIF_NUM_MEDIUM_0_POL,
    CLASSIF_NUM_MEDIUM_0_WINE,
    CLASSIF_NUM_MEDIUM_1_MAGICTELESCOPE,
    CLASSIF_NUM_MEDIUM_1_BANK_MARKETING,
    CLASSIF_NUM_MEDIUM_1_CREDIT,
    CLASSIF_NUM_MEDIUM_1_EYE_MOVEMENTS,
    CLASSIF_NUM_MEDIUM_1_HOUSE_16H,
    CLASSIF_NUM_MEDIUM_1_KDD_IPUMS_LA_97_SMALL,
    CLASSIF_NUM_MEDIUM_1_PHONEME,
    CLASSIF_NUM_MEDIUM_1_POL,
    CLASSIF_NUM_MEDIUM_1_WINE,
    CLASSIF_NUM_MEDIUM_2_MAGICTELESCOPE,
    CLASSIF_NUM_MEDIUM_2_BANK_MARKETING,
    CLASSIF_NUM_MEDIUM_2_EYE_MOVEMENTS,
    CLASSIF_NUM_MEDIUM_2_HOUSE_16H,
    CLASSIF_NUM_MEDIUM_2_KDD_IPUMS_LA_97_SMALL,
    CLASSIF_NUM_MEDIUM_2_PHONEME,
    CLASSIF_NUM_MEDIUM_2_POL,
    CLASSIF_NUM_MEDIUM_2_WINE,
    CLASSIF_NUM_MEDIUM_3_PHONEME,
    CLASSIF_NUM_MEDIUM_3_WINE,
    CLASSIF_NUM_MEDIUM_4_PHONEME,
    CLASSIF_NUM_MEDIUM_4_WINE,
    REGRESSION_CAT_LARGE_0_SGEMM_GPU_KERNEL_PERFORMANCE,
    REGRESSION_CAT_LARGE_0_BLACK_FRIDAY,
    REGRESSION_CAT_LARGE_0_DIAMONDS,
    REGRESSION_CAT_LARGE_0_NYC_TAXI_GREEN_DEC_2016,
    REGRESSION_CAT_LARGE_0_PARTICULATE_MATTER_UKAIR_2017,
    REGRESSION_CAT_MEDIUM_0_BIKE_SHARING_DEMAND,
    REGRESSION_CAT_MEDIUM_0_BRAZILIAN_HOUSES,
    REGRESSION_CAT_MEDIUM_0_MERCEDES_BENZ_GREENER_MANUFACTURING,
    REGRESSION_CAT_MEDIUM_0_ONLINENEWSPOPULARITY,
    REGRESSION_CAT_MEDIUM_0_SGEMM_GPU_KERNEL_PERFORMANCE,
    REGRESSION_CAT_MEDIUM_0_ANALCATDATA_SUPREME,
    REGRESSION_CAT_MEDIUM_0_BLACK_FRIDAY,
    REGRESSION_CAT_MEDIUM_0_DIAMONDS,
    REGRESSION_CAT_MEDIUM_0_HOUSE_SALES,
    REGRESSION_CAT_MEDIUM_0_NYC_TAXI_GREEN_DEC_2016,
    REGRESSION_CAT_MEDIUM_0_PARTICULATE_MATTER_UKAIR_2017,
    REGRESSION_CAT_MEDIUM_0_VISUALIZING_SOIL,
    REGRESSION_CAT_MEDIUM_0_YPROP_4_1,
    REGRESSION_CAT_MEDIUM_1_BIKE_SHARING_DEMAND,
    REGRESSION_CAT_MEDIUM_1_BRAZILIAN_HOUSES,
    REGRESSION_CAT_MEDIUM_1_MERCEDES_BENZ_GREENER_MANUFACTURING,
    REGRESSION_CAT_MEDIUM_1_ANALCATDATA_SUPREME,
    REGRESSION_CAT_MEDIUM_1_VISUALIZING_SOIL,
    REGRESSION_CAT_MEDIUM_1_YPROP_4_1,
    REGRESSION_CAT_MEDIUM_2_BRAZILIAN_HOUSES,
    REGRESSION_CAT_MEDIUM_2_MERCEDES_BENZ_GREENER_MANUFACTURING,
    REGRESSION_CAT_MEDIUM_2_ANALCATDATA_SUPREME,
    REGRESSION_CAT_MEDIUM_2_VISUALIZING_SOIL,
    REGRESSION_CAT_MEDIUM_2_YPROP_4_1,
    REGRESSION_CAT_MEDIUM_3_MERCEDES_BENZ_GREENER_MANUFACTURING,
    REGRESSION_CAT_MEDIUM_3_ANALCATDATA_SUPREME,
    REGRESSION_CAT_MEDIUM_4_MERCEDES_BENZ_GREENER_MANUFACTURING,
    REGRESSION_CAT_MEDIUM_4_ANALCATDATA_SUPREME,
    REGRESSION_NUM_LARGE_0_DIAMONDS,
    REGRESSION_NUM_LARGE_0_NYC_TAXI_GREEN_DEC_2016,
    REGRESSION_NUM_LARGE_0_YEAR,
    REGRESSION_NUM_MEDIUM_0_AILERONS,
    REGRESSION_NUM_MEDIUM_0_BIKE_SHARING_DEMAND,
    REGRESSION_NUM_MEDIUM_0_BRAZILIAN_HOUSES,
    REGRESSION_NUM_MEDIUM_0_MIAMIHOUSING2016,
    REGRESSION_NUM_MEDIUM_0_CALIFORNIA,
    REGRESSION_NUM_MEDIUM_0_CPU_ACT,
    REGRESSION_NUM_MEDIUM_0_DIAMONDS,
    REGRESSION_NUM_MEDIUM_0_ELEVATORS,
    REGRESSION_NUM_MEDIUM_0_FIFA,
    REGRESSION_NUM_MEDIUM_0_HOUSE_16H,
    REGRESSION_NUM_MEDIUM_0_HOUSE_SALES,
    REGRESSION_NUM_MEDIUM_0_HOUSES,
    REGRESSION_NUM_MEDIUM_0_ISOLET,
    REGRESSION_NUM_MEDIUM_0_MEDICAL_CHARGES,
    REGRESSION_NUM_MEDIUM_0_NYC_TAXI_GREEN_DEC_2016,
    REGRESSION_NUM_MEDIUM_0_POL,
    REGRESSION_NUM_MEDIUM_0_SULFUR,
    REGRESSION_NUM_MEDIUM_0_SUPERCONDUCT,
    REGRESSION_NUM_MEDIUM_0_WINE_QUALITY,
    REGRESSION_NUM_MEDIUM_0_YEAR,
    REGRESSION_NUM_MEDIUM_1_AILERONS,
    REGRESSION_NUM_MEDIUM_1_BIKE_SHARING_DEMAND,
    REGRESSION_NUM_MEDIUM_1_BRAZILIAN_HOUSES,
    REGRESSION_NUM_MEDIUM_1_MIAMIHOUSING2016,
    REGRESSION_NUM_MEDIUM_1_CPU_ACT,
    REGRESSION_NUM_MEDIUM_1_ELEVATORS,
    REGRESSION_NUM_MEDIUM_1_FIFA,
    REGRESSION_NUM_MEDIUM_1_ISOLET,
    REGRESSION_NUM_MEDIUM_1_POL,
    REGRESSION_NUM_MEDIUM_1_SULFUR,
    REGRESSION_NUM_MEDIUM_1_WINE_QUALITY,
    REGRESSION_NUM_MEDIUM_2_AILERONS,
    REGRESSION_NUM_MEDIUM_2_BRAZILIAN_HOUSES,
    REGRESSION_NUM_MEDIUM_2_MIAMIHOUSING2016,
    REGRESSION_NUM_MEDIUM_2_CPU_ACT,
    REGRESSION_NUM_MEDIUM_2_ISOLET,
    REGRESSION_NUM_MEDIUM_2_SULFUR,
    REGRESSION_NUM_MEDIUM_2_WINE_QUALITY,
]


def _filter_why_datasets():
    """Yield the identifiers from `_DATASETS_WHY_RAW` that are kept in the benchmark.

    The raw identifiers are grouped by the dataset name (the fifth dash-separated
    field of "<task>-<feature kind>-<size>-<split>-<name>"). Then, for each name:

    1. Names from the exclusion lists below are dropped entirely.
    2. If the name has regression variants, only they are kept.
    3. Of those, if there are variants with categorical features ('cat'),
       only they are kept.
    4. Of those, if there are 'large' variants, only they are kept.

    All identifiers that survive these steps (possibly several splits of
    the same dataset) are yielded, grouped by name in the order in which
    the names first appear in `_DATASETS_WHY_RAW`.
    """
    datasets_by_name: dict[str, list[str]] = {}
    for dataset in _DATASETS_WHY_RAW:
        # The name is everything after the fourth dash. It can contain dashes
        # itself (e.g. "road-safety" or "nyc-taxi-green-dec-2016"), so taking
        # only the text after the *last* dash would group datasets by a
        # fragment of the name and could merge unrelated datasets.
        name = dataset.split('-', 4)[-1]
        datasets_by_name.setdefault(name, []).append(dataset)

    for name, datasets in datasets_by_name.items():
        if name in [
            'compass',  # Contains a data leak.
            'electricity',  # Contains a data leak (incorrectly split time series).
            'eye_movements',  # Contains a data leak.
            'rl',  # Anonymous datasets with weird results (also contains a leak?).
            'yprop_4_1',  # Not representative (all models have the same score)
        ]:
            continue
        elif name == 'houses':
            # This dataset is almost a copy of the "California housing" dataset
            # that is already presented in DATASETS_DEFAULT
            # (in fact, that dataset is also presented among the "why" datasets
            #  under the name "california").
            continue
        elif name in [
            'california',
            'house_16H',
            'covertype',
            'black_friday',
            'diamonds',
            'Higgs',
        ]:
            # These datasets are already presented in DATASETS_DEFAULT.
            continue
        if any(x.startswith('regression-') for x in datasets):
            # The original task is regression.
            datasets = [x for x in datasets if x.startswith('regression-')]
        if any(x.split('-', 4)[1] == 'cat' for x in datasets):
            # The original task has categorical features.
            datasets = [x for x in datasets if x.split('-', 4)[1] == 'cat']
        if any(x.split('-', 4)[2] == 'large' for x in datasets):
            # The original task is large.
            datasets = [x for x in datasets if x.split('-', 4)[2] == 'large']
        yield from datasets


# The "why" datasets that passed the filtering in `_filter_why_datasets`.
DATASETS_WHY = list(_filter_why_datasets())
# The three groups must be pairwise disjoint: functions in this module
# (e.g. `wrap_dataset_name`) decide the group of a dataset by its identifier.
assert not (set(DATASETS_DEFAULT) & set(DATASETS_TABRED))
assert not (set(DATASETS_DEFAULT) & set(DATASETS_WHY))
assert not (set(DATASETS_TABRED) & set(DATASETS_WHY))

# All known datasets: the default ones, then TabReD, then the "why" datasets.
DATASETS_ALL = [*DATASETS_DEFAULT, *DATASETS_TABRED, *DATASETS_WHY]

# The batch size for each dataset. `make_nn_config` reads this table
# with `NN_BATCH_SIZE[dataset]`, so a dataset without an entry here
# results in a KeyError in that function.
NN_BATCH_SIZE = {
    # Default datasets.
    CHURN: 256,
    CALIFORNIA: 256,
    HOUSE: 256,
    ADULT: 256,
    DIAMOND: 512,
    OTTO: 512,
    HIGGS_SMALL: 512,
    BLACK_FRIDAY: 512,
    COVTYPE2: 1024,
    MICROSOFT: 1024,
    # TabReD datasets.
    SBERBANK_HOUSING: 256,
    ECOM_OFFERS: 1024,
    MAPS_ROUTING: 1024,
    HOMESITE_INSURANCE: 1024,
    COOKING_TIME: 1024,
    HOMECREDIT_DEFAULT: 1024,
    DELIVERY_ETA: 1024,
    WEATHER: 1024,
    # "Why" datasets.
    CLASSIF_CAT_LARGE_0_ROAD_SAFETY: 512,
    CLASSIF_CAT_MEDIUM_0_KDDCUP09_UPSELLING: 64,
    CLASSIF_CAT_MEDIUM_1_KDDCUP09_UPSELLING: 64,
    CLASSIF_CAT_MEDIUM_2_KDDCUP09_UPSELLING: 64,
    CLASSIF_NUM_LARGE_0_MINIBOONE: 512,
    CLASSIF_NUM_LARGE_0_JANNIS: 512,
    CLASSIF_NUM_MEDIUM_0_MAGICTELESCOPE: 256,
    CLASSIF_NUM_MEDIUM_1_MAGICTELESCOPE: 256,
    CLASSIF_NUM_MEDIUM_2_MAGICTELESCOPE: 256,
    CLASSIF_NUM_MEDIUM_0_BANK_MARKETING: 256,
    CLASSIF_NUM_MEDIUM_1_BANK_MARKETING: 256,
    CLASSIF_NUM_MEDIUM_2_BANK_MARKETING: 256,
    CLASSIF_NUM_MEDIUM_0_CREDIT: 256,
    CLASSIF_NUM_MEDIUM_1_CREDIT: 256,
    CLASSIF_NUM_MEDIUM_0_KDD_IPUMS_LA_97_SMALL: 64,
    CLASSIF_NUM_MEDIUM_1_KDD_IPUMS_LA_97_SMALL: 64,
    CLASSIF_NUM_MEDIUM_2_KDD_IPUMS_LA_97_SMALL: 64,
    CLASSIF_NUM_MEDIUM_0_PHONEME: 32,
    CLASSIF_NUM_MEDIUM_1_PHONEME: 32,
    CLASSIF_NUM_MEDIUM_2_PHONEME: 32,
    CLASSIF_NUM_MEDIUM_3_PHONEME: 32,
    CLASSIF_NUM_MEDIUM_4_PHONEME: 32,
    REGRESSION_NUM_MEDIUM_0_POL: 256,
    REGRESSION_NUM_MEDIUM_1_POL: 256,
    CLASSIF_NUM_MEDIUM_0_WINE: 32,
    CLASSIF_NUM_MEDIUM_1_WINE: 32,
    CLASSIF_NUM_MEDIUM_2_WINE: 32,
    CLASSIF_NUM_MEDIUM_3_WINE: 32,
    CLASSIF_NUM_MEDIUM_4_WINE: 32,
    REGRESSION_CAT_LARGE_0_SGEMM_GPU_KERNEL_PERFORMANCE: 512,
    REGRESSION_CAT_LARGE_0_NYC_TAXI_GREEN_DEC_2016: 512,
    REGRESSION_CAT_LARGE_0_PARTICULATE_MATTER_UKAIR_2017: 512,
    REGRESSION_CAT_MEDIUM_0_BIKE_SHARING_DEMAND: 256,
    REGRESSION_CAT_MEDIUM_1_BIKE_SHARING_DEMAND: 256,
    REGRESSION_CAT_MEDIUM_0_BRAZILIAN_HOUSES: 256,
    REGRESSION_CAT_MEDIUM_1_BRAZILIAN_HOUSES: 256,
    REGRESSION_CAT_MEDIUM_2_BRAZILIAN_HOUSES: 256,
    REGRESSION_CAT_MEDIUM_0_MERCEDES_BENZ_GREENER_MANUFACTURING: 64,
    REGRESSION_CAT_MEDIUM_1_MERCEDES_BENZ_GREENER_MANUFACTURING: 64,
    REGRESSION_CAT_MEDIUM_2_MERCEDES_BENZ_GREENER_MANUFACTURING: 64,
    REGRESSION_CAT_MEDIUM_3_MERCEDES_BENZ_GREENER_MANUFACTURING: 64,
    REGRESSION_CAT_MEDIUM_4_MERCEDES_BENZ_GREENER_MANUFACTURING: 64,
    REGRESSION_CAT_MEDIUM_0_ONLINENEWSPOPULARITY: 256,
    REGRESSION_CAT_MEDIUM_0_ANALCATDATA_SUPREME: 64,
    REGRESSION_CAT_MEDIUM_1_ANALCATDATA_SUPREME: 64,
    REGRESSION_CAT_MEDIUM_2_ANALCATDATA_SUPREME: 64,
    REGRESSION_CAT_MEDIUM_3_ANALCATDATA_SUPREME: 64,
    REGRESSION_CAT_MEDIUM_4_ANALCATDATA_SUPREME: 64,
    REGRESSION_CAT_MEDIUM_0_HOUSE_SALES: 256,
    REGRESSION_CAT_MEDIUM_0_VISUALIZING_SOIL: 128,
    REGRESSION_CAT_MEDIUM_1_VISUALIZING_SOIL: 128,
    REGRESSION_CAT_MEDIUM_2_VISUALIZING_SOIL: 128,
    REGRESSION_NUM_LARGE_0_YEAR: 512,
    REGRESSION_NUM_MEDIUM_0_AILERONS: 256,
    REGRESSION_NUM_MEDIUM_1_AILERONS: 256,
    REGRESSION_NUM_MEDIUM_2_AILERONS: 256,
    REGRESSION_NUM_MEDIUM_0_MIAMIHOUSING2016: 256,
    REGRESSION_NUM_MEDIUM_1_MIAMIHOUSING2016: 256,
    REGRESSION_NUM_MEDIUM_2_MIAMIHOUSING2016: 256,
    REGRESSION_NUM_MEDIUM_0_CPU_ACT: 128,
    REGRESSION_NUM_MEDIUM_1_CPU_ACT: 128,
    REGRESSION_NUM_MEDIUM_2_CPU_ACT: 128,
    REGRESSION_NUM_MEDIUM_0_ELEVATORS: 256,
    REGRESSION_NUM_MEDIUM_1_ELEVATORS: 256,
    REGRESSION_NUM_MEDIUM_0_FIFA: 256,
    REGRESSION_NUM_MEDIUM_1_FIFA: 256,
    REGRESSION_NUM_MEDIUM_0_ISOLET: 128,
    REGRESSION_NUM_MEDIUM_1_ISOLET: 128,
    REGRESSION_NUM_MEDIUM_2_ISOLET: 128,
    REGRESSION_NUM_MEDIUM_0_MEDICAL_CHARGES: 512,
    REGRESSION_NUM_MEDIUM_0_SULFUR: 256,
    REGRESSION_NUM_MEDIUM_1_SULFUR: 256,
    REGRESSION_NUM_MEDIUM_2_SULFUR: 256,
    REGRESSION_NUM_MEDIUM_0_SUPERCONDUCT: 256,
    REGRESSION_NUM_MEDIUM_0_WINE_QUALITY: 128,
    REGRESSION_NUM_MEDIUM_1_WINE_QUALITY: 128,
    REGRESSION_NUM_MEDIUM_2_WINE_QUALITY: 128,
}

# NOTE
# The 'noisy-quantile' normalization is used for most of the datasets,
# with several exceptions. Datasets that are excluded below and not added
# back afterwards have no entry in this table; for them, `make_nn_config`
# removes the 'num_policy' field from the config.
NN_NUM_POLICY = {
    x: 'noisy-quantile'
    for x in DATASETS_ALL
    if x
    not in {
        # The 'noisy-quantile' normalization works poorly for the OTTO dataset.
        OTTO,
        # The *MERCEDES* datasets do not have numerical features.
        REGRESSION_CAT_MEDIUM_0_MERCEDES_BENZ_GREENER_MANUFACTURING,
        REGRESSION_CAT_MEDIUM_1_MERCEDES_BENZ_GREENER_MANUFACTURING,
        REGRESSION_CAT_MEDIUM_2_MERCEDES_BENZ_GREENER_MANUFACTURING,
        REGRESSION_CAT_MEDIUM_3_MERCEDES_BENZ_GREENER_MANUFACTURING,
        REGRESSION_CAT_MEDIUM_4_MERCEDES_BENZ_GREENER_MANUFACTURING,
        # The following TabReD datasets are already normalized.
        MAPS_ROUTING,
        COOKING_TIME,
        DELIVERY_ETA,
    }
}
# The already normalized TabReD datasets explicitly get the 'identity' policy.
for d in [
    MAPS_ROUTING,
    COOKING_TIME,
    DELIVERY_ETA,
]:
    NN_NUM_POLICY[d] = 'identity'


def wrap_dataset_name(name: str) -> str:
    """Prepend the benchmark prefix to a dataset identifier.

    Args:
        name: the dataset identifier.

    Returns:
        'why/<name>' for the datasets from DATASETS_WHY,
        'tabred/<name>' for the datasets from DATASETS_TABRED,
        and the unchanged name otherwise.
    """
    if name in DATASETS_WHY:
        return f'why/{name}'
    if name in DATASETS_TABRED:
        return f'tabred/{name}'
    return name

def try_get_relative_path(path: str | Path) -> Path:
    """Express the path relative to the project directory when possible.

    Args:
        path: any path; it is resolved first.

    Returns:
        The resolved path relative to `lib.PROJECT_DIR` if the project directory
        is one of its parents, and the resolved path as is otherwise.
    """
    resolved = Path(path).resolve()
    root = lib.PROJECT_DIR
    if root in resolved.parents:
        return resolved.relative_to(root)
    return resolved


def load_dataset_info(path: str | Path) -> dict[str, Any]:
    """Collect basic properties of a dataset stored in a directory.

    Args:
        path: the dataset directory; it is joined onto `lib.PROJECT_DIR`.
            If a string is passed, a leading ':' is removed first.

    Returns:
        A dictionary with the following keys:

        - 'path': the dataset path (relative to the project directory when possible).
        - 'name': the dataset name with the benchmark prefix
          (see `wrap_dataset_name`); for the "why" datasets, the name is
          the last field of the identifier.
        - 'split': the split index parsed from the identifier for the "why"
          datasets, and 0 for other datasets.
        - 'task_type': the "task_type" field of the dataset's "info.json".
        - 'train_size', 'val_size', 'test_size': the lengths of "Y_<part>.npy".
        - 'n_num_features', 'n_bin_features', 'n_cat_features': the number
          of columns in "X_<type>_val.npy" (0 if the file does not exist).
        - 'n_features': the sum of the three numbers above.

    Raises:
        AssertionError: if the dataset directory does not exist.
    """
    if isinstance(path, str):
        path = path.removeprefix(':')
    path = lib.PROJECT_DIR / Path(path)
    assert path.exists(), f'Dataset {path} does not exist.'

    relative_path = str(try_get_relative_path(path))
    raw_name = relative_path.removeprefix('data/')
    prefix = wrap_dataset_name(raw_name).removesuffix(raw_name)
    if raw_name in DATASETS_WHY:
        # "<task>-<feature kind>-<size>-<split>-<name>"
        _, _, _, split, name = raw_name.split('-', 4)
        split = int(split)
    else:
        name = raw_name
        split = 0
    info = {
        'path': relative_path,
        'name': prefix + name,
        'split': split,
        'task_type': json.loads(path.joinpath('info.json').read_text())['task_type'],
    }

    # Only the array shapes are needed, so the arrays are memory-mapped
    # instead of being fully read into memory.
    for part in ['train', 'val', 'test']:
        info[f'{part}_size'] = len(np.load(path / f'Y_{part}.npy', mmap_mode='r'))
    info['n_features'] = 0
    for ftype in ['num', 'bin', 'cat']:
        x_ftype_path = path / f'X_{ftype}_val.npy'
        n = (
            np.load(x_ftype_path, mmap_mode='r').shape[1]
            if x_ftype_path.exists()
            else 0
        )
        info[f'n_{ftype}_features'] = n
        info['n_features'] += n

    return info


def load_datasets_info() -> pd.DataFrame:
    """Build a table with the properties of all datasets from DATASETS_ALL.

    Returns:
        A dataframe with one row per dataset (see `load_dataset_info`
        for the columns), sorted by the training set size.
    """
    records = [load_dataset_info(f'data/{dataset}') for dataset in DATASETS_ALL]
    table = pd.DataFrame.from_records(records)
    return table.sort_values('train_size')


# ======================================================================================
# Configs
# ======================================================================================
def make_nn_config(
    reference_config: dict, dataset: str, n_trials: None | int = None
) -> dict:
    """Derive a config for the given dataset from a reference config.

    The intended workflow: carefully compose the reference config by hand
    for one dataset (the California Housing dataset is recommended),
    then call this function to produce configs for the remaining datasets.

    The reference config is not modified. A config with the "space" key is
    treated as a hyperparameter tuning config, and the dataset-specific fields
    are written to config["space"]. Otherwise, the config is treated as
    an evaluation config, and the fields are written to the config itself.

    Args:
        reference_config: a fully functioning reference config.
        dataset: the dataset for which the new config is made.
        n_trials: the number of hyperparameter tuning rounds
            (required for tuning configs, not used otherwise).

    Returns:
        The new config.
    """
    config = deepcopy(reference_config)

    # >>> Tuning-related fields.
    is_tuning_config = 'space' in config
    if is_tuning_config:
        assert n_trials is not None, 'For tuning configs, n_trials must be provided'
        config['n_trials'] = n_trials
    target = config['space'] if is_tuning_config else config

    # >>> Data-related fields.
    dataset_dir = lib.DATA_DIR / dataset
    relative_dataset_dir = str(dataset_dir.relative_to(lib.PROJECT_DIR))
    data_config = target['data']
    data_config['path'] = relative_dataset_dir

    # The policies of the reference config are always discarded
    # and set again only if they apply to this dataset.
    data_config.pop('num_policy', None)
    policy = NN_NUM_POLICY.get(dataset)
    if policy is not None:
        data_config['num_policy'] = policy

    data_config.pop('cat_policy', None)
    has_cat_features = dataset_dir.joinpath('X_cat_train.npy').exists()
    if has_cat_features:
        data_config['cat_policy'] = 'ordinal'

    # >>> Training-related fields.
    target['batch_size'] = NN_BATCH_SIZE[dataset]

    return config


# ======================================================================================
# Results
# ======================================================================================
@functools.lru_cache(maxsize=2048)
def _load_dataset_info_cached(*args, **kwargs) -> dict[str, Any]:
    """Memoized `load_dataset_info` (at most 2048 distinct calls are remembered).

    The arguments are forwarded as is, so they must be hashable.
    The same dictionary object is returned for repeated calls.
    """
    info = load_dataset_info(*args, **kwargs)
    return info


def parse_timedelta(timedelta: str) -> datetime.timedelta:
    """Parse a duration written as "H:MM:SS[.ffffff]" or "D day(s), H:MM:SS[.ffffff]".

    Args:
        timedelta: the string representation of the duration.

    Returns:
        The parsed duration.
    """
    tokens = timedelta.split()
    # Three whitespace-separated tokens mean that the string starts
    # with the number of days: "<D> day(s), <H:MM:SS>".
    day_seconds = int(tokens[0]) * 60 * 60 * 24 if len(tokens) == 3 else 0
    hours, minutes, secs = tokens[-1].split(':')
    clock_seconds = float(secs) + int(minutes) * 60 + int(hours) * 60 * 60
    return datetime.timedelta(seconds=0.0 + day_seconds + clock_seconds)


def load_result(
    path: str | Path,
    name: str = '',
    *,
    patch: bool = True,
    enrich: bool = True,
    add_dataset_info: bool = False,
) -> dict[str, Any]:
    """Load the report of one experiment and add helper fields to it.

    Args:
        path: the path to the experiment directory.
        name: the value of the "Name" field of the result.
        patch: if True, some parts of the loaded report are updated.
            This includes patching "foreign" reports from other projects,
            rescaling metrics for some datasets, etc.
        enrich: if True, additional helpful fields are added.
            By convention, their names start with capital letters.
            NOTE(review): the code below does not consult this argument,
            the additional fields are always added.
        add_dataset_info: if True, dataset properties are added
            with the prefix "Dataset."

    Returns:
        A dictionary with the additional fields ("Name", "Dataset.name",
        "Dataset.split", "Seed", "TimeSeconds" (only if the report has
        the "time" field), "Path"), followed by the content of the report and,
        if requested, by the dataset properties.
    """
    path = Path(path).resolve()
    project_dir = lib.PROJECT_DIR

    # >>> Load the report.
    report_file = path / 'report.json'
    assert (
        report_file.exists()
    ), f'The experiment {path} is missing a report ("report.json")'

    report = json.loads(report_file.read_text())
    is_tuning_report = 'best' in report
    if is_tuning_report:
        assert report['function'] == 'bin.tune.main'
    # For tuning reports, the run of interest is stored under the "best" key.
    subreport = report['best'] if is_tuning_report else report

    # >>> Fix certain reports from the TabR project.
    if patch and 'data' in report:
        report['config']['data'] = {'path': report.pop('data')}

    # >>> Recover the dataset identifier from the data path of the config.
    config_data_path = subreport['config']['data']['path'].removeprefix(':')
    data_dir = str(lib.DATA_DIR.relative_to(project_dir))
    raw_dataset_name = config_data_path.removeprefix(data_dir).removeprefix('/')

    if raw_dataset_name in DATASETS_WHY:
        # Here, the code relies on the fact that all duplicated datasets
        # are removed from the "Why" benchmark.
        _task, _kind, _size, split_field, dataset_name = raw_dataset_name.split(
            '-', 4
        )
        dataset_split = int(split_field)
    else:
        dataset_name = raw_dataset_name
        dataset_split = 0
    benchmark_prefix = wrap_dataset_name(raw_dataset_name).removesuffix(
        raw_dataset_name
    )

    # >>> Add new fields. By convention, their names start with capital letters.
    x: dict[str, Any] = {
        'Name': name,
        'Dataset.name': benchmark_prefix + dataset_name,
        'Dataset.split': dataset_split,
    }

    run_config = subreport['config']
    if report['function'] in ('bin.ensemble.main', 'bin.ensemble-multiple.main'):
        # Ensembles are built from several runs with different seeds.
        x['Seed'] = ','.join(str(seed) for seed in run_config['seeds'])
    else:
        x['Seed'] = run_config['seed']
    if 'time' in subreport:
        x['TimeSeconds'] = parse_timedelta(subreport['time']).total_seconds()

    if project_dir in path.parents:
        x['Path'] = str(path.relative_to(project_dir))
    else:
        x['Path'] = str(path)

    # >>> Make metrics more human-friendly on certain datasets.
    if patch and raw_dataset_name == HOUSE:
        metrics = report['metrics']
        for part in metrics:
            metrics[part]['score'] /= 10000

    # >>> Append the report itself and, optionally, the dataset properties.
    x.update(report)
    if add_dataset_info:
        dataset_info = _load_dataset_info_cached(subreport['config']['data']['path'])
        x.update({f'Dataset.{k}': v for k, v in dataset_info.items()})
    return x


def _flatten_keys(d: dict, key_prefix: str, result: dict) -> None:
    for k, v in d.items():
        new_k = f'{key_prefix}.{k}' if key_prefix else k
        if isinstance(v, dict):
            _flatten_keys(v, new_k, result)
        else:
            if result.setdefault(new_k, v) is not v:
                RuntimeError(
                    'Different parts of the dictionary resulted'
                    f' in the same flat key "{new_k}"'
                )


@functools.lru_cache(maxsize=None)
def _load_result_flat(*args, **kwargs) -> dict[str, Any]:
    """Load a result with `load_result` and flatten it to dotted keys.

    The results are memoized without a size limit, so the arguments must be
    hashable, and repeated calls return the same dictionary object.
    """
    flat: dict[str, Any] = {}
    _flatten_keys(load_result(*args, **kwargs), '', flat)
    return flat


def _can_be_collected(path: Path, require_done: bool) -> bool:
    return not require_done or path.joinpath('DONE').exists()


@functools.lru_cache(maxsize=None)
def _collect_experiments_by_pattern(pattern: str, require_done: bool) -> list[Path]:
    """Find the experiment directories matching a glob pattern.

    The pattern is resolved relative to the current working directory.
    The results are memoized without a size limit, so changes on disk
    are not picked up by repeated calls with the same arguments.

    Args:
        pattern: the glob pattern.
        require_done: see `_can_be_collected`.

    Returns:
        The matching directories that can be collected.
    """
    return [
        candidate
        for candidate in Path.cwd().glob(pattern)
        if candidate.is_dir() and _can_be_collected(candidate, require_done)
    ]


def load_results(
    paths: str | Path | list[str | Path] | dict[str, str | Path | list[str | Path]],
    *,
    require_done: bool = True,
    patch: bool = True,
    enrich: bool = True,
    add_dataset_info: bool = False,
    blocklist: None | frozenset[Path] = None,
    verbose: bool = False,
    cache: bool = False,
) -> pd.DataFrame:
    """Load results as a Pandas dataframe.

    NOTE
    When there are many paths matching the provided glob patterns,
    the overhead of pattern matching can become noticeable. In such cases, consider
    explicitly assembling lists of Path objects in vanilla Python loops
    and passing them to this function instead of glob patterns.

    Args:
        paths: <see examples below>
        require_done: if True, only reports of completed runs are loaded
            (i.e. runs whose directory contains the "DONE" file);
            all other runs are silently skipped.
        patch: see the docs for `load_result`.
        enrich: see the docs for `load_result`.
        add_dataset_info: see the docs for `load_result`.
        verbose: if True, a progress bar will be shown.
        cache: if True, some of the computations are cached:
            - The loaded results for collected paths.
            - Paths found with glob patterns.
            So cache=True should be used only when no changes to experiments
            are expected. Calling this function with cache=False bypasses
            the cache, but does not reset it. To reset the cache,
            call `results_cache_clear()`.
        blocklist: paths from this set will be skipped.

    Returns:
        The dataframe with one row per loaded result and one column per
        flattened (dot-separated) key of the result. The rows are sorted by those
        of the columns 'Dataset.train_size', 'Dataset.name', 'Dataset.split',
        'Name' and 'Seed' that are present.

    Raises:
        RuntimeError: if the same path is collected more than once for one name.
        AssertionError: if a collected path does not exist or is not a directory.

    Examples:

    >>> # `Path` is treated as a concrete existing path.
    >>> load_results(Path('exp/mlp/california/evaluation/0'))
    >>> load_results(Path('exp/mlp/california/tuning'))

    >>> # `str` is treated as a glob pattern that matches only directories.
    >>> load_results('exp/mlp/california/evaluation/*')
    >>> # The previous line is equivalent to:
    >>> load_results([
    ...     x
    ...     for x in Path.cwd().glob('exp/mlp/california/evaluation/*')
    ...     if x.is_dir()
    ... ])

    >>> # Many paths and patterns can be provided at once.
    >>> load_results(
    ...     [
    ...         'exp/mlp/california/evaluation/0',
    ...         Path('exp/mlp/adult/evaluation/1'),
    ...         'exp/mlp/house/evaluation/*',
    ...     ]
    ... )

    >>> # If a dictionary is provided, the output dataframe will have
    >>> # a meaningful "Name" column useful for further aggregation and analysis.
    >>> load_results(
    ...     {
    ...         'MLP': 'exp/mlp/**/evaluation/*',
    ...         'XGBoost': [
    ...             'exp/xgboost_/**/evaluation/*',
    ...             '../other_project/exp/xgboost_/**/evaluation/*',
    ...         ]
    ...     }
    ... )

    >>> # Skip irrelevant experiments with blocklist.
    >>> load_results(
    ...     {
    ...         'MLP': 'exp/mlp/**/evaluation/*',
    ...         'TabR': 'exp/baselines/tabr/**/evaluation/*',
    ...     },
    ...     blocklist=frozenset(Path.cwd().glob('exp/baselines/tabr/weather-small/0-evaluation/*'))
    ... )
    """
    # With cache=False, the non-cached implementations are called directly,
    # which neither reads nor updates the caches.
    collect_experiments_by_pattern = (
        _collect_experiments_by_pattern
        if cache
        else _collect_experiments_by_pattern.__wrapped__
    )

    def _collect(source: str | Path | list[str | Path]) -> list[Path]:
        if isinstance(source, str | Path):
            source = [source]
        collected_paths = []
        for x in source:
            if isinstance(x, Path):
                # A `Path` is always a concrete path and never a glob pattern,
                # so a path that cannot be collected is skipped
                # instead of being passed to the pattern matching.
                if _can_be_collected(x, require_done):
                    collected_paths.append(x)
            else:
                collected_paths.extend(collect_experiments_by_pattern(x, require_done))
        return collected_paths

    if verbose:
        print('Collecting experiment paths...')
    named_paths = (
        {'': _collect(paths)}
        if isinstance(paths, str | Path | list)
        else {k: _collect(v) for k, v in paths.items()}
    )

    records = []
    load_result_flat = _load_result_flat if cache else _load_result_flat.__wrapped__
    # The context manager closes the progress bar even if loading fails.
    with tqdm(
        desc='Loading results',
        total=sum(map(len, named_paths.values())),
        disable=not verbose,
    ) as pbar:
        for name, name_paths in named_paths.items():
            if not name_paths:
                message = (
                    f'No results were found for the name "{name}"'
                    if isinstance(paths, dict)
                    else 'No results were found'
                )
                warnings.warn(message)

            # Duplicates are tracked per name: the same experiment
            # is allowed to appear under different names.
            visited = set()
            for path in name_paths:
                if blocklist is not None and path in blocklist:
                    # Skipped paths are included in `total`,
                    # so they must advance the progress bar as well.
                    pbar.update()
                    continue
                if path in visited:
                    raise RuntimeError(
                        f'The experiment {path} was provided more than once'
                        ' (perhaps, it matches more than one pattern)'
                    )
                assert path.exists(), f'The experiment {path} does not exist'
                assert path.is_dir(), f'Bad experiment {path}. Please, see examples'
                records.append(
                    load_result_flat(
                        path,
                        name,
                        patch=patch,
                        enrich=enrich,
                        add_dataset_info=add_dataset_info,
                    )
                )
                visited.add(path)
                pbar.update()

    df = pd.DataFrame.from_records(records)
    sort_by = [
        x
        for x in ['Dataset.train_size', 'Dataset.name', 'Dataset.split', 'Name', 'Seed']
        if x in df.columns
    ]
    if sort_by:
        df = df.sort_values(sort_by)
    return df


def results_cache_clear():
    """Reset all caches used when loading results.

    This clears the memoized dataset info, the memoized results of glob
    pattern matching and the memoized flattened results.
    """
    for cached_function in (
        _load_dataset_info_cached,
        _collect_experiments_by_pattern,
        _load_result_flat,
    ):
        cached_function.cache_clear()


def compute_ranks_(df: pd.DataFrame, mean_column: str, std_column: str) -> pd.DataFrame:
    """Rank the rows by the mean score, treating close scores as ties.

    The rows are sorted by `mean_column` in the descending order (ties are broken
    by `std_column` in the ascending order). The first row gets the rank 1 and
    becomes the "leader". Every following row shares the leader's rank as long as
    its score is lower than the leader's score by at most the leader's standard
    deviation. The first row that falls outside of this margin gets the next rank
    and becomes the new leader.

    Args:
        df: the dataframe with the scores. It is not modified.
        mean_column: the column with the mean scores (higher is ranked first).
        std_column: the column with the standard deviations of the scores.

    Returns:
        The sorted copy of `df` with the additional integer column 'Rank'.
    """
    ordered = df.sort_values([mean_column, std_column], ascending=[False, True])

    rank_values = []
    leader_score = None
    leader_std = None
    for score, std in zip(ordered[mean_column], ordered[std_column]):
        if leader_score is not None and leader_score - score <= leader_std:
            # Within the leader's margin: the leader stays the same.
            rank_values.append(rank_values[-1])
            continue
        rank_values.append(1 if leader_score is None else rank_values[-1] + 1)
        leader_score, leader_std = score, std

    ordered['Rank'] = rank_values
    return ordered


