"""
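For each line of the dataset, each modality has a given probability (in %) of being present.
If no modality ends up present, one modality is picked at random to be present.

This script reads the split file as comma-separated "<domains>,<count>" pairs (domain
indices joined by 'v') and prints, for each domain, the summed count and that sum
divided by the dataset size.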
"""

filename = r'../splits/brats2020_supervision_train_prop_50'
number_of_domains = 5
dataset_size = 10_824


def parse_proportions(content):
    """Split raw split-file text into its non-blank comma-separated tokens.

    Args:
        content: Full text of the split file.

    Returns:
        The list of tokens, in file order, with empty / whitespace-only
        entries removed (e.g. the one produced by a trailing comma or a
        final newline). Tokens themselves are not stripped.
    """
    return [x for x in content.split(',') if x.strip()]


def count_per_domain(tokens, n_domains):
    """Sum the counts attributed to each domain.

    ``tokens`` is a flat sequence of alternating entries
    ``[domains_0, count_0, domains_1, count_1, ...]`` where each
    ``domains_i`` is one or more integer domain indices joined by ``'v'``
    (e.g. ``'0v2v3'``) and ``count_i`` is an integer. Every domain listed
    in ``domains_i`` is credited with ``count_i``.

    Args:
        tokens: Flat list of alternating domain-group / count strings.
        n_domains: Number of domains; valid indices are ``0 .. n_domains-1``.

    Returns:
        A list of length ``n_domains`` holding the summed count per domain.

    Raises:
        ValueError: If ``tokens`` has an odd length (a domain group without
            a count) or a token is not a valid integer.
        IndexError: If a domain index is outside ``0 .. n_domains-1``.
    """
    # A dangling last token means the file is malformed; silently ignoring
    # it would produce plausible-looking but wrong statistics.
    if len(tokens) % 2:
        raise ValueError(
            f'expected an even number of tokens (domain group, count pairs), '
            f'got {len(tokens)}; last token: {tokens[-1]!r}'
        )

    totals = [0] * n_domains
    # Even positions hold the domain groups, odd positions the counts.
    for domain_rpz, raw_count in zip(tokens[::2], tokens[1::2]):
        nb_tokens = int(raw_count)
        list_dom = [int(x) for x in domain_rpz.split('v')]
        for dom in list_dom:
            # Reject negative indices explicitly: plain list indexing would
            # wrap around and credit the wrong domain.
            if not 0 <= dom < n_domains:
                raise IndexError(
                    f'domain index {dom} out of range for '
                    f'{n_domains} domains (group {domain_rpz!r})'
                )
            totals[dom] += nb_tokens
    return totals


if __name__ == '__main__':
    with open(filename, 'r') as file:
        proportions = parse_proportions(file.read())
    print(proportions)

    number_per_domain = count_per_domain(proportions, number_of_domains)

    # Absolute totals first, then each total as a fraction of the dataset.
    print(number_per_domain)
    print([i / dataset_size for i in number_per_domain])
