from __future__ import annotations

import itertools

import numpy as np
from datasets import Dataset, DatasetDict

from mteb.abstasks.AbsTaskClustering import AbsTaskClustering
from mteb.abstasks.AbsTaskClusteringFast import AbsTaskClusteringFast
from mteb.abstasks.MultilingualTask import MultilingualTask
from mteb.abstasks.TaskMetadata import TaskMetadata

# Evaluation languages, keyed by subset code (the same codes that key
# `hf_subset_descriptive_stats` below). Each value is a one-element list
# holding a "<language>-<script>" tag; every subset here uses the `Latn`
# script. Built from an ordered table so the key order stays stable and each
# key gets its own fresh list object.
_LANGUAGES = {
    subset_code: [language_script_tag]
    for subset_code, language_script_tag in (
        ("bs", "bos-Latn"),
        ("ca", "cat-Latn"),
        ("cs", "ces-Latn"),
        ("da", "dan-Latn"),
        ("eu", "eus-Latn"),
        ("gv", "glv-Latn"),
        ("ilo", "ilo-Latn"),
        ("ku", "kur-Latn"),
        ("lv", "lav-Latn"),
        ("min", "min-Latn"),
        ("mt", "mlt-Latn"),
        ("sco", "sco-Latn"),
        ("sq", "sqi-Latn"),
        ("wa", "wln-Latn"),
    )
}


class WikiClusteringP2P(AbsTaskClustering, MultilingualTask):
    superseded_by = "WikiClusteringP2P.v2"
    metadata = TaskMetadata(
        name="WikiClusteringP2P",
        description="Clustering of wikipedia articles inspired by BlubrbsClusteringP2P. Labels are taken from top-level categories of the respective languages (e.g., https://lv.wikipedia.org/wiki/Kategorija:Pamatkategorijas).",
        reference="https://github.com/Rysias/wiki-clustering",
        dataset={
            "path": "ryzzlestrizzle/multi-wiki-clustering-p2p",
            "revision": "d4d92f8f28be71035be6a96bdfd4e200cf62faa8",
        },
        type="Clustering",
        category="p2p",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=_LANGUAGES,
        main_score="v_measure",
        date=("2001-01-15", "2024-04-15"),
        domains=["Encyclopaedic", "Written"],
        task_subtypes=["Thematic clustering"],
        license="cc-by-sa-3.0",
        annotations_creators="derived",
        dialect=[],
        sample_creation="created",
        bibtex_citation=None,  # None exists
        descriptive_stats={
            "n_samples": {"test": 71680},
            "test": {
                "num_samples": 140,
                "average_text_length": 512.0,
                "average_labels_per_text": 512.0,
                "unique_labels": 282,
                "labels": {
                    "Nauke": {"count": 1492},
                    "Dru\u00c5\u00a1tvo": {"count": 504},
                    "Priroda": {"count": 448},
                    "Kultura": {"count": 1042},
                    "Tehnologija": {"count": 671},
                    "Tehnika": {"count": 281},
                    "Geografija": {"count": 431},
                    "Informatika": {"count": 355},
                    "Koncepti": {"count": 83},
                    "Humanisti\u00c4\u008dke_nauke": {"count": 62},
                    "Informacija": {"count": 21},
                    "Historija": {"count": 223},
                    "Matematika": {"count": 74},
                    "Okoli\u00c5\u00a1": {"count": 6},
                    "Jezik": {"count": 15},
                    "Misao": {"count": 28},
                    "Energija": {"count": 16},
                    "Llocs": {"count": 642},
                    "Ci\u00c3\u00a8ncia": {"count": 1844},
                    "Humanitats": {"count": 984},
                    "Tecnologia": {"count": 377},
                    "Biografies": {"count": 406},
                    "Cultura": {"count": 710},
                    "Informaci\u00c3\u00b3": {"count": 137},
                    "Esdeveniments": {"count": 20},
                    "Lid\u00c3\u00a9": {"count": 1559},
                    "Geografie": {"count": 1659},
                    "\u00c4\u008cas": {"count": 88},
                    "Politika": {"count": 818},
                    "V\u00c4\u009bda": {"count": 314},
                    "Technika": {"count": 189},
                    "Informace": {"count": 96},
                    "\u00c5\u00bdivot": {"count": 184},
                    "Vojenstv\u00c3\u00ad": {"count": 34},
                    "Um\u00c4\u009bn\u00c3\u00ad": {"count": 95},
                    "P\u00c5\u0099\u00c3\u00adroda": {"count": 387},
                    "Spole\u00c4\u008dnost": {"count": 319},
                    "Historie": {"count": 391},
                    "Sport": {"count": 57},
                    "Dorozum\u00c3\u00adv\u00c3\u00a1n\u00c3\u00ad": {"count": 142},
                    "Zdravotnictv\u00c3\u00ad": {"count": 94},
                    "Vzd\u00c4\u009bl\u00c3\u00a1v\u00c3\u00a1n\u00c3\u00ad": {
                        "count": 41
                    },
                    "P\u00c5\u0099edm\u00c4\u009bty": {"count": 21},
                    "Pr\u00c3\u00a1vo": {"count": 28},
                    "Natur": {"count": 398},
                    "Kultur": {"count": 911},
                    "Samfund": {"count": 531},
                    "Politik": {"count": 287},
                    "Computersystemer": {"count": 84},
                    "Erhvervsliv": {"count": 164},
                    "Teknik": {"count": 166},
                    "Sproget": {"count": 41},
                    "Humaniora": {"count": 139},
                    "Uddannelse": {"count": 79},
                    "Anvendt_videnskab": {"count": 199},
                    "Kunst": {"count": 412},
                    "Sundhed": {"count": 83},
                    "Videnskab": {"count": 257},
                    "Matematik": {"count": 79},
                    "Geografi": {"count": 671},
                    "Liv": {"count": 286},
                    "Personer": {"count": 322},
                    "Jura": {"count": 35},
                    "Entitateak": {"count": 2486},
                    "Humanitateak": {"count": 861},
                    "Mundua": {"count": 706},
                    "Zientzia_eta_teknologia": {"count": 957},
                    "Entretenimendua": {"count": 110},
                    "Chron-oaylleeaght": {"count": 1334},
                    "\u00c3\u0087heeraghyn": {"count": 470},
                    "Oaylleeaght": {"count": 575},
                    "\u00c3\u0087heer-oaylleeaght": {"count": 190},
                    "Persoonyn": {"count": 249},
                    "Bea": {"count": 207},
                    "Cultoor": {"count": 269},
                    "Glare": {"count": 152},
                    "Dooghys": {"count": 494},
                    "Smooinaght": {"count": 28},
                    "Sheshaght": {"count": 221},
                    "\u00c3\u0087haghnoaylleeaght": {"count": 33},
                    "Strughtoor": {"count": 54},
                    "Politickaght": {"count": 138},
                    "Ronnaghyn_rere_bun-chooish": {"count": 381},
                    "Eirinys": {"count": 80},
                    "Ynsagh": {"count": 16},
                    "Am": {"count": 12},
                    "Leigh": {"count": 9},
                    "Studeyrys-sheelnaue": {"count": 72},
                    "Keirdyn_scoillaragh": {"count": 50},
                    "Shennaghys": {"count": 11},
                    "Dellal": {"count": 23},
                    "Maddaght": {"count": 6},
                    "Meanyn_mooarey": {"count": 7},
                    "Oaylleeaghtyn_feamagh": {"count": 30},
                    "Fysseree": {"count": 7},
                    "\u00c3\u0087hymmyltaght": {"count": 2},
                    "Katutubo": {"count": 135},
                    "Biag": {"count": 933},
                    "Pakasaritaan": {"count": 1302},
                    "Relihion": {"count": 20},
                    "Dagiti_kinatao": {"count": 1405},
                    "Lubong": {"count": 308},
                    "Siensia_ken_teknolohia": {"count": 12},
                    "Gobierno": {"count": 208},
                    "Heograpia": {"count": 323},
                    "Etika": {"count": 12},
                    "Pilosopia": {"count": 26},
                    "Tattao": {"count": 20},
                    "Dagiti_konsepto": {"count": 69},
                    "Panagkukua_ti_tao": {"count": 16},
                    "Pannakaammo": {"count": 29},
                    "Palpaliwa": {"count": 22},
                    "Pagsasao": {"count": 33},
                    "Kagimongan": {"count": 37},
                    "Salun-at": {"count": 9},
                    "Industria": {"count": 5},
                    "Linteg": {"count": 1},
                    "Makan_ken_mainum": {"count": 19},
                    "Dagiti_akademiko_a_disiplina": {"count": 27},
                    "Edukasion": {"count": 9},
                    "Musika": {"count": 1},
                    "Negosio": {"count": 10},
                    "Inhenieria": {"count": 1},
                    "Paspasamak": {"count": 6},
                    "Dagiti_banag": {"count": 2},
                    "Annuroten": {"count": 1},
                    "Enerhia": {"count": 1},
                    "Kes": {"count": 524},
                    "Medya": {"count": 67},
                    "Xwarin_\u00c3\u00bb_vexwarin": {"count": 127},
                    "Civak": {"count": 277},
                    "L\u00c3\u00aakol\u00c3\u00aen\u00c3\u00aan_mirovahiy\u00c3\u00aa": {
                        "count": 269
                    },
                    "Zeman": {"count": 174},
                    "Teknoloj\u00c3\u00ae": {"count": 92},
                    "D\u00c3\u00aes\u00c3\u00aepl\u00c3\u00aen\u00c3\u00aan_akadem\u00c3\u00aek": {
                        "count": 237
                    },
                    "Erdn\u00c3\u00aegar\u00c3\u00ae": {"count": 1078},
                    "D\u00c3\u00aerok": {"count": 436},
                    "Gerd\u00c3\u00bbn": {"count": 42},
                    "Jiyan": {"count": 255},
                    "\u00c5\u009eah\u00c3\u00ae": {"count": 107},
                    "Siyaset": {"count": 126},
                    "\u00c3\u0087and": {"count": 349},
                    "Zan\u00c3\u00aen": {"count": 82},
                    "D\u00c3\u00aen": {"count": 150},
                    "Helwesta_mirovan": {"count": 41},
                    "Zanist": {"count": 138},
                    "Hey\u00c3\u00aen": {"count": 49},
                    "Ziman": {"count": 69},
                    "Xweza": {"count": 129},
                    "Matemat\u00c3\u00aek": {"count": 23},
                    "Agah\u00c3\u00ae": {"count": 17},
                    "Dad": {"count": 9},
                    "\u00c3\u008enternet": {"count": 19},
                    "Hik\u00c3\u00bbmet": {"count": 35},
                    "Konsept": {"count": 10},
                    "Perwerde": {"count": 6},
                    "Werzi\u00c5\u009f": {"count": 20},
                    "Felsefe": {"count": 28},
                    "Et\u00c3\u00aek": {"count": 19},
                    "Wize": {"count": 6},
                    "Abor\u00c3\u00ae": {"count": 27},
                    "Tendurust\u00c3\u00ae": {"count": 38},
                    "Ragihandin": {"count": 5},
                    "Karwer\u00c3\u00ae": {"count": 6},
                    "Endezyar\u00c3\u00ae": {"count": 16},
                    "Le\u00c5\u009fker\u00c3\u00ae": {"count": 18},
                    "Kult\u00c5\u00abra": {"count": 749},
                    "Latvija": {"count": 356},
                    "\u00c4\u00a2eogr\u00c4\u0081fija": {"count": 721},
                    "Zin\u00c4\u0081tne_un_tehnolo\u00c4\u00a3ijas": {"count": 469},
                    "V\u00c4\u0093sture": {"count": 149},
                    "Sabiedr\u00c4\u00abba": {"count": 878},
                    "J\u00c4\u0093dzieni": {"count": 23},
                    "Dz\u00c4\u00abv\u00c4\u00abba": {"count": 274},
                    "Cilv\u00c4\u0093ki": {"count": 428},
                    "Sports": {"count": 166},
                    "M\u00c4\u0081ksla": {"count": 185},
                    "Daba": {"count": 412},
                    "Vesel\u00c4\u00abba": {"count": 124},
                    "Reli\u00c4\u00a3ija": {"count": 86},
                    "Notikumi": {"count": 13},
                    "Makaluak_iduik": {"count": 3986},
                    "Indonesia": {"count": 184},
                    "Budayo": {"count": 399},
                    "Ilimu_pangatauan": {"count": 121},
                    "Makanan": {"count": 12},
                    "Seni": {"count": 32},
                    "Organisasi": {"count": 23},
                    "Karajo": {"count": 30},
                    "Astronomi": {"count": 13},
                    "Sarugo": {"count": 30},
                    "Tokoh": {"count": 26},
                    "Agamo": {"count": 48},
                    "Sijarah": {"count": 5},
                    "Teknologi": {"count": 1},
                    "Ulahrago": {"count": 2},
                    "\u00c4\u00a0eografija": {"count": 1634},
                    "Arti": {"count": 194},
                    "Gvern": {"count": 107},
                    "Reli\u00c4\u00a1jon": {"count": 293},
                    "Dixxiplini_akkademi\u00c4\u008bi": {"count": 139},
                    "Nies": {"count": 270},
                    "So\u00c4\u008bjet\u00c3\u00a0": {"count": 88},
                    "Natura": {"count": 449},
                    "Sa\u00c4\u00a7\u00c4\u00a7a": {"count": 31},
                    "Xjenza": {"count": 667},
                    "Storja": {"count": 167},
                    "Ekonomija": {"count": 199},
                    "Lingwa": {"count": 76},
                    "Filosofija": {"count": 14},
                    "\u00c4\u00a6ajja_ta\\_Kuljum": {"count": 81},
                    "Edukazzjoni": {"count": 30},
                    "Mu\u00c5\u00bcika": {"count": 14},
                    "Komunikazzjoni_umana": {"count": 39},
                    "Spettaklu": {"count": 38},
                    "Kronolo\u00c4\u00a1ija": {"count": 39},
                    "Avvenimenti": {"count": 6},
                    "Li\u00c4\u00a1i": {"count": 19},
                    "Teknolo\u00c4\u00a1ija": {"count": 17},
                    "In\u00c4\u00a1inerija": {"count": 2},
                    "Life": {"count": 621},
                    "Naitur": {"count": 265},
                    "Society": {"count": 446},
                    "Humanities": {"count": 259},
                    "History": {"count": 184},
                    "Airts": {"count": 106},
                    "Technology": {"count": 324},
                    "Fowk": {"count": 208},
                    "Concepts": {"count": 237},
                    "Cultur": {"count": 427},
                    "Environs": {"count": 231},
                    "Warld": {"count": 141},
                    "Politics": {"count": 294},
                    "Eddication": {"count": 42},
                    "Airt": {"count": 18},
                    "Heal": {"count": 70},
                    "Science_an_technology": {"count": 60},
                    "Mathematics": {"count": 36},
                    "Law": {"count": 3},
                    "Tuils": {"count": 7},
                    "Employment": {"count": 14},
                    "Gjeografi": {"count": 586},
                    "Politik\u00c3\u00ab": {"count": 351},
                    "Let\u00c3\u00abrsi": {"count": 67},
                    "Administrat\u00c3\u00ab_publike": {"count": 320},
                    "Shoq\u00c3\u00abri": {"count": 116},
                    "Sporte": {"count": 105},
                    "Shkenc\u00c3\u00ab": {"count": 1109},
                    "Kultur\u00c3\u00ab": {"count": 299},
                    "Arte": {"count": 217},
                    "Persona": {"count": 425},
                    "Histori": {"count": 744},
                    "Mitologji": {"count": 5},
                    "Gjuh\u00c3\u00absi": {"count": 64},
                    "Teknologji": {"count": 84},
                    "Kinematografi": {"count": 72},
                    "Media": {"count": 51},
                    "Sigurime": {"count": 31},
                    "Loj\u00c3\u00abra": {"count": 3},
                    "Fe": {"count": 131},
                    "Bujq\u00c3\u00absi": {"count": 41},
                    "Ngjarje": {"count": 11},
                    "Biografi": {"count": 116},
                    "Matematik\u00c3\u00ab": {"count": 27},
                    "Teknik\u00c3\u00ab": {"count": 26},
                    "Drejt\u00c3\u00absi": {"count": 18},
                    "Organizata": {"count": 27},
                    "Jeta": {"count": 4},
                    "Agronomi": {"count": 3},
                    "Natyr\u00c3\u00ab": {"count": 11},
                    "Sh\u00c3\u00abndeti": {"count": 3},
                    "Shkencat_humane": {"count": 22},
                    "Shp\u00c3\u00abrblime": {"count": 2},
                    "Blegtori": {"count": 10},
                    "L\u00c3\u00abnd\u00c3\u00ab": {"count": 8},
                    "Enciklopedistika": {"count": 6},
                    "Economeye": {"count": 816},
                    "Syinces": {"count": 3653},
                    "Vicaedje_des_djins": {"count": 314},
                    "Creyance": {"count": 310},
                    "Rilom\u00c3\u00aay\u00c3\u00a8s_djins": {"count": 25},
                    "Date": {"count": 2},
                },
                "hf_subset_descriptive_stats": {
                    "bs": {
                        "num_samples": 10,
                        "average_text_length": 512.0,
                        "average_labels_per_text": 512.0,
                        "unique_labels": 17,
                        "labels": {
                            "Nauke": {"count": 1492},
                            "Dru\u00c5\u00a1tvo": {"count": 504},
                            "Priroda": {"count": 448},
                            "Kultura": {"count": 453},
                            "Tehnologija": {"count": 671},
                            "Tehnika": {"count": 281},
                            "Geografija": {"count": 431},
                            "Informatika": {"count": 355},
                            "Koncepti": {"count": 83},
                            "Humanisti\u00c4\u008dke_nauke": {"count": 62},
                            "Informacija": {"count": 21},
                            "Historija": {"count": 223},
                            "Matematika": {"count": 31},
                            "Okoli\u00c5\u00a1": {"count": 6},
                            "Jezik": {"count": 15},
                            "Misao": {"count": 28},
                            "Energija": {"count": 16},
                        },
                    },
                    "ca": {
                        "num_samples": 10,
                        "average_text_length": 512.0,
                        "average_labels_per_text": 512.0,
                        "unique_labels": 8,
                        "labels": {
                            "Llocs": {"count": 642},
                            "Ci\u00c3\u00a8ncia": {"count": 1844},
                            "Humanitats": {"count": 984},
                            "Tecnologia": {"count": 377},
                            "Biografies": {"count": 406},
                            "Cultura": {"count": 710},
                            "Informaci\u00c3\u00b3": {"count": 137},
                            "Esdeveniments": {"count": 20},
                        },
                    },
                    "cs": {
                        "num_samples": 10,
                        "average_text_length": 512.0,
                        "average_labels_per_text": 512.0,
                        "unique_labels": 21,
                        "labels": {
                            "Lid\u00c3\u00a9": {"count": 1559},
                            "Geografie": {"count": 578},
                            "\u00c4\u008cas": {"count": 88},
                            "Politika": {"count": 538},
                            "V\u00c4\u009bda": {"count": 314},
                            "Technika": {"count": 189},
                            "Informace": {"count": 96},
                            "\u00c5\u00bdivot": {"count": 184},
                            "Vojenstv\u00c3\u00ad": {"count": 34},
                            "Um\u00c4\u009bn\u00c3\u00ad": {"count": 95},
                            "P\u00c5\u0099\u00c3\u00adroda": {"count": 387},
                            "Spole\u00c4\u008dnost": {"count": 319},
                            "Historie": {"count": 207},
                            "Sport": {"count": 22},
                            "Kultura": {"count": 150},
                            "Matematika": {"count": 34},
                            "Dorozum\u00c3\u00adv\u00c3\u00a1n\u00c3\u00ad": {
                                "count": 142
                            },
                            "Zdravotnictv\u00c3\u00ad": {"count": 94},
                            "Vzd\u00c4\u009bl\u00c3\u00a1v\u00c3\u00a1n\u00c3\u00ad": {
                                "count": 41
                            },
                            "P\u00c5\u0099edm\u00c4\u009bty": {"count": 21},
                            "Pr\u00c3\u00a1vo": {"count": 28},
                        },
                    },
                    "da": {
                        "num_samples": 10,
                        "average_text_length": 512.0,
                        "average_labels_per_text": 512.0,
                        "unique_labels": 20,
                        "labels": {
                            "Natur": {"count": 398},
                            "Kultur": {"count": 911},
                            "Samfund": {"count": 531},
                            "Politik": {"count": 287},
                            "Computersystemer": {"count": 84},
                            "Erhvervsliv": {"count": 164},
                            "Teknik": {"count": 166},
                            "Sproget": {"count": 41},
                            "Humaniora": {"count": 139},
                            "Uddannelse": {"count": 79},
                            "Anvendt_videnskab": {"count": 199},
                            "Kunst": {"count": 412},
                            "Sundhed": {"count": 83},
                            "Historie": {"count": 184},
                            "Videnskab": {"count": 257},
                            "Matematik": {"count": 79},
                            "Geografi": {"count": 463},
                            "Liv": {"count": 286},
                            "Personer": {"count": 322},
                            "Jura": {"count": 35},
                        },
                    },
                    "eu": {
                        "num_samples": 10,
                        "average_text_length": 512.0,
                        "average_labels_per_text": 512.0,
                        "unique_labels": 5,
                        "labels": {
                            "Entitateak": {"count": 2486},
                            "Humanitateak": {"count": 861},
                            "Mundua": {"count": 706},
                            "Zientzia_eta_teknologia": {"count": 957},
                            "Entretenimendua": {"count": 110},
                        },
                    },
                    "gv": {
                        "num_samples": 10,
                        "average_text_length": 512.0,
                        "average_labels_per_text": 512.0,
                        "unique_labels": 28,
                        "labels": {
                            "Chron-oaylleeaght": {"count": 1334},
                            "\u00c3\u0087heeraghyn": {"count": 470},
                            "Oaylleeaght": {"count": 575},
                            "\u00c3\u0087heer-oaylleeaght": {"count": 190},
                            "Persoonyn": {"count": 249},
                            "Bea": {"count": 207},
                            "Cultoor": {"count": 269},
                            "Glare": {"count": 152},
                            "Dooghys": {"count": 494},
                            "Smooinaght": {"count": 28},
                            "Sheshaght": {"count": 221},
                            "\u00c3\u0087haghnoaylleeaght": {"count": 33},
                            "Strughtoor": {"count": 54},
                            "Politickaght": {"count": 138},
                            "Ronnaghyn_rere_bun-chooish": {"count": 381},
                            "Eirinys": {"count": 80},
                            "Ynsagh": {"count": 16},
                            "Am": {"count": 12},
                            "Leigh": {"count": 9},
                            "Studeyrys-sheelnaue": {"count": 72},
                            "Keirdyn_scoillaragh": {"count": 50},
                            "Shennaghys": {"count": 11},
                            "Dellal": {"count": 23},
                            "Maddaght": {"count": 6},
                            "Meanyn_mooarey": {"count": 7},
                            "Oaylleeaghtyn_feamagh": {"count": 30},
                            "Fysseree": {"count": 7},
                            "\u00c3\u0087hymmyltaght": {"count": 2},
                        },
                    },
                    "ilo": {
                        "num_samples": 10,
                        "average_text_length": 512.0,
                        "average_labels_per_text": 512.0,
                        "unique_labels": 34,
                        "labels": {
                            "Katutubo": {"count": 135},
                            "Biag": {"count": 933},
                            "Pakasaritaan": {"count": 1302},
                            "Kultura": {"count": 72},
                            "Relihion": {"count": 20},
                            "Dagiti_kinatao": {"count": 1405},
                            "Lubong": {"count": 308},
                            "Siensia_ken_teknolohia": {"count": 12},
                            "Gobierno": {"count": 208},
                            "Heograpia": {"count": 323},
                            "Matematika": {"count": 9},
                            "Etika": {"count": 12},
                            "Pilosopia": {"count": 26},
                            "Tattao": {"count": 20},
                            "Dagiti_konsepto": {"count": 69},
                            "Panagkukua_ti_tao": {"count": 16},
                            "Pannakaammo": {"count": 29},
                            "Palpaliwa": {"count": 22},
                            "Pagsasao": {"count": 33},
                            "Kagimongan": {"count": 37},
                            "Salun-at": {"count": 9},
                            "Industria": {"count": 5},
                            "Linteg": {"count": 1},
                            "Makan_ken_mainum": {"count": 19},
                            "Dagiti_akademiko_a_disiplina": {"count": 27},
                            "Edukasion": {"count": 9},
                            "Musika": {"count": 1},
                            "Politika": {"count": 37},
                            "Negosio": {"count": 10},
                            "Inhenieria": {"count": 1},
                            "Paspasamak": {"count": 6},
                            "Dagiti_banag": {"count": 2},
                            "Annuroten": {"count": 1},
                            "Enerhia": {"count": 1},
                        },
                    },
                    "ku": {
                        "num_samples": 10,
                        "average_text_length": 512.0,
                        "average_labels_per_text": 512.0,
                        "unique_labels": 39,
                        "labels": {
                            "Kes": {"count": 524},
                            "Medya": {"count": 67},
                            "Xwarin_\u00c3\u00bb_vexwarin": {"count": 127},
                            "Civak": {"count": 277},
                            "L\u00c3\u00aakol\u00c3\u00aen\u00c3\u00aan_mirovahiy\u00c3\u00aa": {
                                "count": 269
                            },
                            "Zeman": {"count": 174},
                            "Teknoloj\u00c3\u00ae": {"count": 92},
                            "D\u00c3\u00aes\u00c3\u00aepl\u00c3\u00aen\u00c3\u00aan_akadem\u00c3\u00aek": {
                                "count": 237
                            },
                            "Erdn\u00c3\u00aegar\u00c3\u00ae": {"count": 1078},
                            "D\u00c3\u00aerok": {"count": 436},
                            "Gerd\u00c3\u00bbn": {"count": 42},
                            "Jiyan": {"count": 255},
                            "\u00c5\u009eah\u00c3\u00ae": {"count": 107},
                            "Siyaset": {"count": 126},
                            "\u00c3\u0087and": {"count": 349},
                            "Zan\u00c3\u00aen": {"count": 82},
                            "D\u00c3\u00aen": {"count": 150},
                            "Helwesta_mirovan": {"count": 41},
                            "Zanist": {"count": 138},
                            "Hey\u00c3\u00aen": {"count": 49},
                            "Ziman": {"count": 69},
                            "Xweza": {"count": 129},
                            "Matemat\u00c3\u00aek": {"count": 23},
                            "Agah\u00c3\u00ae": {"count": 17},
                            "Dad": {"count": 9},
                            "\u00c3\u008enternet": {"count": 19},
                            "Hik\u00c3\u00bbmet": {"count": 35},
                            "Konsept": {"count": 10},
                            "Perwerde": {"count": 6},
                            "Werzi\u00c5\u009f": {"count": 20},
                            "Felsefe": {"count": 28},
                            "Et\u00c3\u00aek": {"count": 19},
                            "Wize": {"count": 6},
                            "Abor\u00c3\u00ae": {"count": 27},
                            "Tendurust\u00c3\u00ae": {"count": 38},
                            "Ragihandin": {"count": 5},
                            "Karwer\u00c3\u00ae": {"count": 6},
                            "Endezyar\u00c3\u00ae": {"count": 16},
                            "Le\u00c5\u009fker\u00c3\u00ae": {"count": 18},
                        },
                    },
                    "lv": {
                        "num_samples": 10,
                        "average_text_length": 512.0,
                        "average_labels_per_text": 512.0,
                        "unique_labels": 16,
                        "labels": {
                            "Kult\u00c5\u00abra": {"count": 749},
                            "Latvija": {"count": 356},
                            "\u00c4\u00a2eogr\u00c4\u0081fija": {"count": 721},
                            "Zin\u00c4\u0081tne_un_tehnolo\u00c4\u00a3ijas": {
                                "count": 469
                            },
                            "V\u00c4\u0093sture": {"count": 149},
                            "Sabiedr\u00c4\u00abba": {"count": 878},
                            "J\u00c4\u0093dzieni": {"count": 23},
                            "Dz\u00c4\u00abv\u00c4\u00abba": {"count": 274},
                            "Cilv\u00c4\u0093ki": {"count": 428},
                            "Sports": {"count": 120},
                            "M\u00c4\u0081ksla": {"count": 185},
                            "Politika": {"count": 133},
                            "Daba": {"count": 412},
                            "Vesel\u00c4\u00abba": {"count": 124},
                            "Reli\u00c4\u00a3ija": {"count": 86},
                            "Notikumi": {"count": 13},
                        },
                    },
                    "min": {
                        "num_samples": 10,
                        "average_text_length": 512.0,
                        "average_labels_per_text": 512.0,
                        "unique_labels": 16,
                        "labels": {
                            "Makaluak_iduik": {"count": 3986},
                            "Indonesia": {"count": 184},
                            "Budayo": {"count": 399},
                            "Ilimu_pangatauan": {"count": 121},
                            "Geografi": {"count": 208},
                            "Makanan": {"count": 12},
                            "Seni": {"count": 32},
                            "Organisasi": {"count": 23},
                            "Karajo": {"count": 30},
                            "Astronomi": {"count": 13},
                            "Sarugo": {"count": 30},
                            "Tokoh": {"count": 26},
                            "Agamo": {"count": 48},
                            "Sijarah": {"count": 5},
                            "Teknologi": {"count": 1},
                            "Ulahrago": {"count": 2},
                        },
                    },
                    "mt": {
                        "num_samples": 10,
                        "average_text_length": 512.0,
                        "average_labels_per_text": 512.0,
                        "unique_labels": 27,
                        "labels": {
                            "\u00c4\u00a0eografija": {"count": 1634},
                            "Arti": {"count": 194},
                            "Gvern": {"count": 107},
                            "Reli\u00c4\u00a1jon": {"count": 293},
                            "Dixxiplini_akkademi\u00c4\u008bi": {"count": 139},
                            "Nies": {"count": 270},
                            "So\u00c4\u008bjet\u00c3\u00a0": {"count": 88},
                            "Natura": {"count": 449},
                            "Sa\u00c4\u00a7\u00c4\u00a7a": {"count": 31},
                            "Xjenza": {"count": 667},
                            "Storja": {"count": 167},
                            "Ekonomija": {"count": 199},
                            "Kultura": {"count": 367},
                            "Lingwa": {"count": 76},
                            "Filosofija": {"count": 14},
                            "\u00c4\u00a6ajja_ta\\_Kuljum": {"count": 81},
                            "Edukazzjoni": {"count": 30},
                            "Politika": {"count": 110},
                            "Mu\u00c5\u00bcika": {"count": 14},
                            "Komunikazzjoni_umana": {"count": 39},
                            "Spettaklu": {"count": 38},
                            "Kronolo\u00c4\u00a1ija": {"count": 39},
                            "Avvenimenti": {"count": 6},
                            "Li\u00c4\u00a1i": {"count": 19},
                            "Teknolo\u00c4\u00a1ija": {"count": 17},
                            "Sport": {"count": 30},
                            "In\u00c4\u00a1inerija": {"count": 2},
                        },
                    },
                    "sco": {
                        "num_samples": 10,
                        "average_text_length": 512.0,
                        "average_labels_per_text": 512.0,
                        "unique_labels": 23,
                        "labels": {
                            "Life": {"count": 621},
                            "Naitur": {"count": 265},
                            "Geografie": {"count": 1081},
                            "Society": {"count": 446},
                            "Humanities": {"count": 259},
                            "History": {"count": 184},
                            "Airts": {"count": 106},
                            "Technology": {"count": 324},
                            "Fowk": {"count": 208},
                            "Concepts": {"count": 237},
                            "Cultur": {"count": 427},
                            "Environs": {"count": 231},
                            "Warld": {"count": 141},
                            "Politics": {"count": 294},
                            "Eddication": {"count": 42},
                            "Airt": {"count": 18},
                            "Heal": {"count": 70},
                            "Science_an_technology": {"count": 60},
                            "Sports": {"count": 46},
                            "Mathematics": {"count": 36},
                            "Law": {"count": 3},
                            "Tuils": {"count": 7},
                            "Employment": {"count": 14},
                        },
                    },
                    "sq": {
                        "num_samples": 10,
                        "average_text_length": 512.0,
                        "average_labels_per_text": 512.0,
                        "unique_labels": 36,
                        "labels": {
                            "Gjeografi": {"count": 586},
                            "Politik\u00c3\u00ab": {"count": 351},
                            "Let\u00c3\u00abrsi": {"count": 67},
                            "Administrat\u00c3\u00ab_publike": {"count": 320},
                            "Shoq\u00c3\u00abri": {"count": 116},
                            "Sporte": {"count": 105},
                            "Shkenc\u00c3\u00ab": {"count": 1109},
                            "Kultur\u00c3\u00ab": {"count": 299},
                            "Arte": {"count": 217},
                            "Persona": {"count": 425},
                            "Histori": {"count": 744},
                            "Mitologji": {"count": 5},
                            "Gjuh\u00c3\u00absi": {"count": 64},
                            "Teknologji": {"count": 84},
                            "Kinematografi": {"count": 72},
                            "Media": {"count": 51},
                            "Sigurime": {"count": 31},
                            "Loj\u00c3\u00abra": {"count": 3},
                            "Fe": {"count": 131},
                            "Bujq\u00c3\u00absi": {"count": 41},
                            "Ngjarje": {"count": 11},
                            "Biografi": {"count": 116},
                            "Matematik\u00c3\u00ab": {"count": 27},
                            "Teknik\u00c3\u00ab": {"count": 26},
                            "Drejt\u00c3\u00absi": {"count": 18},
                            "Organizata": {"count": 27},
                            "Jeta": {"count": 4},
                            "Sport": {"count": 5},
                            "Agronomi": {"count": 3},
                            "Natyr\u00c3\u00ab": {"count": 11},
                            "Sh\u00c3\u00abndeti": {"count": 3},
                            "Shkencat_humane": {"count": 22},
                            "Shp\u00c3\u00abrblime": {"count": 2},
                            "Blegtori": {"count": 10},
                            "L\u00c3\u00abnd\u00c3\u00ab": {"count": 8},
                            "Enciklopedistika": {"count": 6},
                        },
                    },
                    "wa": {
                        "num_samples": 10,
                        "average_text_length": 512.0,
                        "average_labels_per_text": 512.0,
                        "unique_labels": 6,
                        "labels": {
                            "Economeye": {"count": 816},
                            "Syinces": {"count": 3653},
                            "Vicaedje_des_djins": {"count": 314},
                            "Creyance": {"count": 310},
                            "Rilom\u00c3\u00aay\u00c3\u00a8s_djins": {"count": 25},
                            "Date": {"count": 2},
                        },
                    },
                },
            },
        },
    )


class WikiClusteringFastP2P(AbsTaskClusteringFast, MultilingualTask):
    """Multilingual Wikipedia article clustering task, registered as ``WikiClusteringP2P.v2``.

    Uses the same dataset path and revision as ``WikiClusteringP2P`` (which
    names this task in its ``superseded_by`` attribute), but is built on
    ``AbsTaskClusteringFast`` and subsamples every language subset in
    ``dataset_transform``.
    """

    # Class-level settings read by the AbsTaskClusteringFast base class.
    # NOTE(review): presumably an absolute cap on how many documents get
    # embedded; it equals the n_samples value used in dataset_transform --
    # confirm against the base class.
    max_document_to_embed = 2048
    # NOTE(review): presumably None disables the fraction-based cap so only the
    # absolute limit above applies -- confirm against the base class.
    max_fraction_of_documents_to_embed = None

    metadata = TaskMetadata(
        name="WikiClusteringP2P.v2",
        description="Clustering of wikipedia articles inspired by BlubrbsClusteringP2P. Labels are taken from top-level categories of the respective languages (e.g., https://lv.wikipedia.org/wiki/Kategorija:Pamatkategorijas).",
        reference="https://github.com/Rysias/wiki-clustering",
        dataset={
            "path": "ryzzlestrizzle/multi-wiki-clustering-p2p",
            "revision": "d4d92f8f28be71035be6a96bdfd4e200cf62faa8",
        },
        type="Clustering",
        category="p2p",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=_LANGUAGES,
        main_score="v_measure",
        date=("2001-01-15", "2024-04-15"),
        domains=["Encyclopaedic", "Written"],
        task_subtypes=["Thematic clustering"],
        license="cc-by-sa-3.0",
        annotations_creators="derived",
        dialect=[],
        sample_creation="created",
        bibtex_citation="",  # None exists
        descriptive_stats={
            "n_samples": {"test": 2048},
            "test": {
                "num_samples": 28672,
                "average_text_length": 629.7426409040179,
                "average_labels_per_text": 1.0,
                "unique_labels": 39,
                "labels": {
                    "16": {"count": 541},
                    "3": {"count": 1607},
                    "12": {"count": 846},
                    "0": {"count": 2410},
                    "15": {"count": 878},
                    "11": {"count": 864},
                    "6": {"count": 787},
                    "9": {"count": 654},
                    "14": {"count": 966},
                    "8": {"count": 1389},
                    "2": {"count": 2428},
                    "10": {"count": 839},
                    "1": {"count": 1370},
                    "4": {"count": 2942},
                    "7": {"count": 2514},
                    "5": {"count": 1490},
                    "13": {"count": 918},
                    "19": {"count": 315},
                    "17": {"count": 711},
                    "20": {"count": 345},
                    "18": {"count": 800},
                    "24": {"count": 467},
                    "25": {"count": 928},
                    "21": {"count": 62},
                    "26": {"count": 270},
                    "22": {"count": 186},
                    "23": {"count": 36},
                    "27": {"count": 465},
                    "28": {"count": 62},
                    "36": {"count": 139},
                    "32": {"count": 57},
                    "38": {"count": 43},
                    "30": {"count": 52},
                    "34": {"count": 80},
                    "33": {"count": 75},
                    "35": {"count": 62},
                    "31": {"count": 63},
                    "37": {"count": 8},
                    "29": {"count": 3},
                },
                "hf_subset_descriptive_stats": {
                    "bs": {
                        "num_samples": 2048,
                        "average_text_length": 1046.25732421875,
                        "average_labels_per_text": 1.0,
                        "unique_labels": 17,
                        "labels": {
                            "16": {"count": 268},
                            "3": {"count": 89},
                            "12": {"count": 597},
                            "0": {"count": 202},
                            "15": {"count": 113},
                            "11": {"count": 11},
                            "6": {"count": 142},
                            "9": {"count": 181},
                            "14": {"count": 179},
                            "8": {"count": 33},
                            "2": {"count": 172},
                            "10": {"count": 12},
                            "1": {"count": 7},
                            "4": {"count": 25},
                            "7": {"count": 6},
                            "5": {"count": 9},
                            "13": {"count": 2},
                        },
                    },
                    "ca": {
                        "num_samples": 2048,
                        "average_text_length": 600.73291015625,
                        "average_labels_per_text": 1.0,
                        "unique_labels": 8,
                        "labels": {
                            "6": {"count": 257},
                            "1": {"count": 737},
                            "2": {"count": 284},
                            "4": {"count": 394},
                            "0": {"count": 162},
                            "7": {"count": 151},
                            "5": {"count": 55},
                            "3": {"count": 8},
                        },
                    },
                    "cs": {
                        "num_samples": 2048,
                        "average_text_length": 659.2294921875,
                        "average_labels_per_text": 1.0,
                        "unique_labels": 21,
                        "labels": {
                            "19": {"count": 35},
                            "5": {"count": 624},
                            "17": {"count": 126},
                            "10": {"count": 155},
                            "1": {"count": 231},
                            "7": {"count": 215},
                            "11": {"count": 128},
                            "0": {"count": 57},
                            "13": {"count": 75},
                            "2": {"count": 83},
                            "3": {"count": 38},
                            "9": {"count": 8},
                            "6": {"count": 14},
                            "12": {"count": 9},
                            "16": {"count": 16},
                            "20": {"count": 73},
                            "18": {"count": 38},
                            "4": {"count": 60},
                            "15": {"count": 14},
                            "14": {"count": 38},
                            "8": {"count": 11},
                        },
                    },
                    "da": {
                        "num_samples": 2048,
                        "average_text_length": 767.58935546875,
                        "average_labels_per_text": 1.0,
                        "unique_labels": 20,
                        "labels": {
                            "14": {"count": 212},
                            "4": {"count": 74},
                            "15": {"count": 16},
                            "8": {"count": 165},
                            "13": {"count": 115},
                            "0": {"count": 79},
                            "1": {"count": 34},
                            "9": {"count": 114},
                            "7": {"count": 364},
                            "10": {"count": 32},
                            "17": {"count": 66},
                            "18": {"count": 32},
                            "12": {"count": 129},
                            "11": {"count": 159},
                            "2": {"count": 66},
                            "3": {"count": 185},
                            "19": {"count": 103},
                            "16": {"count": 33},
                            "5": {"count": 56},
                            "6": {"count": 14},
                        },
                    },
                    "eu": {
                        "num_samples": 2048,
                        "average_text_length": 405.16015625,
                        "average_labels_per_text": 1.0,
                        "unique_labels": 5,
                        "labels": {
                            "4": {"count": 383},
                            "0": {"count": 995},
                            "3": {"count": 282},
                            "2": {"count": 344},
                            "1": {"count": 44},
                        },
                    },
                    "gv": {
                        "num_samples": 2048,
                        "average_text_length": 368.01123046875,
                        "average_labels_per_text": 1.0,
                        "unique_labels": 28,
                        "labels": {
                            "6": {"count": 32},
                            "1": {"count": 83},
                            "24": {"count": 13},
                            "17": {"count": 152},
                            "2": {"count": 534},
                            "25": {"count": 76},
                            "5": {"count": 198},
                            "15": {"count": 100},
                            "21": {"count": 22},
                            "26": {"count": 188},
                            "13": {"count": 230},
                            "20": {"count": 11},
                            "3": {"count": 107},
                            "19": {"count": 88},
                            "16": {"count": 55},
                            "22": {"count": 29},
                            "14": {"count": 12},
                            "8": {"count": 61},
                            "0": {"count": 5},
                            "10": {"count": 4},
                            "4": {"count": 9},
                            "23": {"count": 6},
                            "7": {"count": 3},
                            "9": {"count": 20},
                            "18": {"count": 4},
                            "12": {"count": 3},
                            "27": {"count": 1},
                            "11": {"count": 2},
                        },
                    },
                    "ilo": {
                        "num_samples": 2048,
                        "average_text_length": 617.90771484375,
                        "average_labels_per_text": 1.0,
                        "unique_labels": 29,
                        "labels": {
                            "3": {"count": 562},
                            "0": {"count": 373},
                            "18": {"count": 521},
                            "8": {"count": 129},
                            "13": {"count": 123},
                            "11": {"count": 54},
                            "25": {"count": 8},
                            "27": {"count": 5},
                            "17": {"count": 13},
                            "15": {"count": 4},
                            "4": {"count": 28},
                            "7": {"count": 83},
                            "10": {"count": 15},
                            "1": {"count": 11},
                            "24": {"count": 15},
                            "14": {"count": 8},
                            "16": {"count": 4},
                            "19": {"count": 9},
                            "23": {"count": 10},
                            "26": {"count": 4},
                            "28": {"count": 8},
                            "12": {"count": 29},
                            "21": {"count": 12},
                            "6": {"count": 5},
                            "20": {"count": 6},
                            "5": {"count": 4},
                            "22": {"count": 2},
                            "9": {"count": 2},
                            "2": {"count": 1},
                        },
                    },
                    "ku": {
                        "num_samples": 2048,
                        "average_text_length": 421.17333984375,
                        "average_labels_per_text": 1.0,
                        "unique_labels": 39,
                        "labels": {
                            "14": {"count": 14},
                            "36": {"count": 139},
                            "20": {"count": 108},
                            "22": {"count": 27},
                            "15": {"count": 102},
                            "32": {"count": 55},
                            "8": {"count": 431},
                            "17": {"count": 210},
                            "38": {"count": 43},
                            "30": {"count": 51},
                            "4": {"count": 60},
                            "2": {"count": 111},
                            "6": {"count": 95},
                            "34": {"count": 70},
                            "27": {"count": 15},
                            "5": {"count": 174},
                            "26": {"count": 37},
                            "0": {"count": 11},
                            "25": {"count": 50},
                            "16": {"count": 2},
                            "12": {"count": 16},
                            "24": {"count": 2},
                            "11": {"count": 17},
                            "21": {"count": 9},
                            "13": {"count": 20},
                            "1": {"count": 7},
                            "33": {"count": 33},
                            "35": {"count": 28},
                            "10": {"count": 11},
                            "31": {"count": 51},
                            "18": {"count": 4},
                            "3": {"count": 4},
                            "28": {"count": 8},
                            "37": {"count": 8},
                            "23": {"count": 2},
                            "19": {"count": 7},
                            "7": {"count": 6},
                            "9": {"count": 8},
                            "29": {"count": 2},
                        },
                    },
                    "lv": {
                        "num_samples": 2048,
                        "average_text_length": 770.67138671875,
                        "average_labels_per_text": 1.0,
                        "unique_labels": 16,
                        "labels": {
                            "15": {"count": 288},
                            "2": {"count": 110},
                            "6": {"count": 74},
                            "12": {"count": 50},
                            "0": {"count": 171},
                            "14": {"count": 188},
                            "10": {"count": 351},
                            "5": {"count": 142},
                            "4": {"count": 300},
                            "13": {"count": 60},
                            "11": {"count": 48},
                            "1": {"count": 165},
                            "8": {"count": 53},
                            "7": {"count": 5},
                            "3": {"count": 9},
                            "9": {"count": 34},
                        },
                    },
                    "min": {
                        "num_samples": 2048,
                        "average_text_length": 631.74072265625,
                        "average_labels_per_text": 1.0,
                        "unique_labels": 15,
                        "labels": {
                            "7": {"count": 1595},
                            "9": {"count": 9},
                            "4": {"count": 48},
                            "3": {"count": 83},
                            "2": {"count": 160},
                            "0": {"count": 19},
                            "5": {"count": 74},
                            "6": {"count": 12},
                            "10": {"count": 12},
                            "13": {"count": 10},
                            "8": {"count": 5},
                            "11": {"count": 13},
                            "12": {"count": 2},
                            "1": {"count": 5},
                            "14": {"count": 1},
                        },
                    },
                    "mt": {
                        "num_samples": 2048,
                        "average_text_length": 821.22265625,
                        "average_labels_per_text": 1.0,
                        "unique_labels": 27,
                        "labels": {
                            "12": {"count": 8},
                            "10": {"count": 147},
                            "14": {"count": 180},
                            "17": {"count": 117},
                            "25": {"count": 654},
                            "19": {"count": 35},
                            "0": {"count": 77},
                            "3": {"count": 12},
                            "16": {"count": 44},
                            "15": {"count": 108},
                            "24": {"count": 267},
                            "6": {"count": 43},
                            "26": {"count": 32},
                            "4": {"count": 79},
                            "22": {"count": 67},
                            "9": {"count": 16},
                            "8": {"count": 16},
                            "2": {"count": 55},
                            "5": {"count": 6},
                            "11": {"count": 30},
                            "18": {"count": 12},
                            "21": {"count": 12},
                            "20": {"count": 15},
                            "23": {"count": 7},
                            "13": {"count": 6},
                            "7": {"count": 1},
                            "1": {"count": 2},
                        },
                    },
                    "sco": {
                        "num_samples": 2048,
                        "average_text_length": 1065.21044921875,
                        "average_labels_per_text": 1.0,
                        "unique_labels": 23,
                        "labels": {
                            "18": {"count": 178},
                            "6": {"count": 92},
                            "9": {"count": 28},
                            "15": {"count": 106},
                            "8": {"count": 432},
                            "2": {"count": 95},
                            "11": {"count": 104},
                            "1": {"count": 42},
                            "13": {"count": 248},
                            "16": {"count": 118},
                            "20": {"count": 130},
                            "3": {"count": 171},
                            "22": {"count": 57},
                            "7": {"count": 83},
                            "10": {"count": 74},
                            "5": {"count": 6},
                            "4": {"count": 17},
                            "17": {"count": 24},
                            "14": {"count": 14},
                            "0": {"count": 7},
                            "19": {"count": 18},
                            "21": {"count": 3},
                            "12": {"count": 1},
                        },
                    },
                    "sq": {
                        "num_samples": 2048,
                        "average_text_length": 425.486328125,
                        "average_labels_per_text": 1.0,
                        "unique_labels": 36,
                        "labels": {
                            "27": {"count": 444},
                            "9": {"count": 234},
                            "14": {"count": 120},
                            "0": {"count": 128},
                            "15": {"count": 27},
                            "11": {"count": 298},
                            "24": {"count": 170},
                            "28": {"count": 46},
                            "19": {"count": 20},
                            "25": {"count": 140},
                            "3": {"count": 47},
                            "2": {"count": 87},
                            "35": {"count": 34},
                            "8": {"count": 53},
                            "31": {"count": 12},
                            "17": {"count": 3},
                            "23": {"count": 11},
                            "20": {"count": 2},
                            "33": {"count": 42},
                            "10": {"count": 26},
                            "34": {"count": 10},
                            "7": {"count": 2},
                            "13": {"count": 29},
                            "4": {"count": 4},
                            "6": {"count": 7},
                            "26": {"count": 9},
                            "5": {"count": 16},
                            "30": {"count": 1},
                            "21": {"count": 4},
                            "22": {"count": 4},
                            "18": {"count": 11},
                            "32": {"count": 2},
                            "12": {"count": 2},
                            "16": {"count": 1},
                            "1": {"count": 1},
                            "29": {"count": 1},
                        },
                    },
                    "wa": {
                        "num_samples": 2048,
                        "average_text_length": 216.00390625,
                        "average_labels_per_text": 1.0,
                        "unique_labels": 6,
                        "labels": {
                            "5": {"count": 126},
                            "4": {"count": 1461},
                            "0": {"count": 124},
                            "2": {"count": 326},
                            "3": {"count": 10},
                            "1": {"count": 1},
                        },
                    },
                },
            },
        },
    )

    def dataset_transform(self):
        """Flatten, filter and subsample every language subset in place.

        For each language in ``self.hf_subsets`` and each split in
        ``self.metadata.eval_splits``:

        1. The ``labels`` and ``sentences`` columns are flattened by one level
           (each row is an iterable; all rows are concatenated into one flat
           list per column).
        2. Examples whose label occurs exactly once in that split are removed.
        3. The result is stored as a ``Dataset`` with ``labels`` and
           ``sentences`` columns.

        ``self.dataset`` is then replaced by a ``DatasetDict`` of per-language
        ``DatasetDict`` objects, and each language is passed through
        ``self.stratified_subsampling`` with ``n_samples=2048``.

        Returns:
            None. The method mutates ``self.dataset``.
        """
        ds = {}
        for lang in self.hf_subsets:
            lang_dict = {}
            for split in self.metadata.eval_splits:
                # Build the flat lists from scratch for every split so that
                # examples from one split never leak into the next one.
                split_data = self.dataset[lang][split]
                labels = list(itertools.chain.from_iterable(split_data["labels"]))
                sentences = list(
                    itertools.chain.from_iterable(split_data["sentences"])
                )

                # Remove sentences and labels with only 1 label example.
                unique_labels, counts = np.unique(labels, return_counts=True)
                solo_labels = unique_labels[counts == 1]
                is_solo = np.isin(labels, solo_labels)

                split_ds = Dataset.from_dict({"labels": labels, "sentences": sentences})
                if is_solo.any():
                    # Keep only the rows whose label appears more than once.
                    split_ds = split_ds.select(np.nonzero(~is_solo)[0])
                lang_dict[split] = split_ds
            ds[lang] = DatasetDict(lang_dict)
        self.dataset = DatasetDict(ds)

        # Stratified subsampling is applied per language on the filtered data.
        for lang in self.hf_subsets:
            self.dataset[lang] = self.stratified_subsampling(
                self.dataset[lang],
                self.seed,
                self.metadata.eval_splits,
                label="labels",
                n_samples=2048,
            )
