"""
for each line of the dataset, each modality has a % of probability to be present
if there is no modality present, we take one random modality to be here
"""
from random import random, sample

# region parameters
dataset_size = 10_824  # number of dataset lines to simulate
number_of_domains = 5  # modalities are indexed 0 .. number_of_domains - 1
prob_being_here = 0.8  # independent probability that a modality is present
min_number_of_dom = 2  # minimum number of modalities kept on every line
# endregion

# region control variables
supervision_list = []  # one label per simulated line, in generation order
supervision_data = {}  # label -> number of lines that carry this label
number_per_domain = [0] * number_of_domains  # presence count per modality
# endregion

for _ in range(dataset_size):
    # One independent draw per modality; keep the indices that came out present.
    present = [
        idx for idx in range(number_of_domains) if random() < prob_being_here
    ]

    if len(present) < min_number_of_dom:
        # Too few modalities: throw the draw away and pick exactly the minimum
        # number of distinct modalities at random. Sorting keeps the label's
        # indices in ascending order.
        present = sorted(sample(range(number_of_domains), min_number_of_dom))

    for idx in present:
        number_per_domain[idx] += 1

    # Label is the present modality indices joined by "v", e.g. "0v2v3".
    label = "v".join(map(str, present))

    supervision_list.append(label)
    if label in supervision_data:
        supervision_data[label] += 1
    else:
        supervision_data[label] = 1

# The "=" f-string specifier echoes the expression text, so these two
# expressions must stay exactly as written to keep the printed output stable.
print(f"{number_per_domain=}")
print(f"{[i/dataset_size for i in number_per_domain]=}")
print(supervision_data)

# Flat "label,count," listing (trailing comma included), in first-seen order.
supervision_str = "".join(
    f"{label},{count}," for label, count in supervision_data.items()
)
print(supervision_str)
