library(ADRecommender)
suppressMessages(library(tidyverse))
library(scmamp)
library(furrr)
library(here)
library(rsample)
library(microbenchmark)
source(here('paper_scripts', 'utils', 'utils.R'))


# Project helper defined outside this file; presumably configures the
# parallel backend used by later steps -- confirm against its definition.
prepare_parallelization()
# NOTE(review): seed_num is not referenced again in the visible script;
# presumably it is read by helpers loaded elsewhere -- confirm before removing.
seed_num <- c(7777, 7778, 7779, 7780, 7781)

# Load the cross-validation data; its `data$splits` list is indexed later in
# this script. `TRUE` is spelled out because `T` is an ordinary variable that
# can be reassigned, whereas `TRUE` is a reserved word.
dat <- get_data_stratified_kfolds_cv(
  mfs_metaod = FALSE,
  metric = 'pr_auc',
  mfs_scaled = TRUE
)

# For each of the first 30 entries of `dat$data$splits`, fit the 'cfact'
# recommender on that split's training performance and meta-features, then
# record the number of clusters and the size of every cluster. The result has
# one row per split, with `cluster_sizes` stored as a list-column.
# NOTE(review): 30 is hard-coded; presumably the total number of splits --
# confirm against the data loader.
clust_stats <- map_dfr(seq_len(30), function(split_idx) {
  cv_split <- dat$data$splits[[split_idx]]
  perf_train <- get_train_perf_from_split(cv_split)
  mfs_train <- get_train_mfs_from_split(cv_split)

  fitted_rec <- fit(recommender('cfact'), perf_train, mfs_train)

  # One table entry per distinct cluster index; each count is a cluster size.
  size_by_cluster <- table(fitted_rec$cluster_indices)

  tibble(
    n_clusters = length(size_by_cluster),
    cluster_sizes = list(as.numeric(size_by_cluster))
  )
})

# Histogram of the number of clusters per split, one bin per integer value.
p1 <- ggplot(data = clust_stats) +
  geom_histogram(
    mapping = aes(x = n_clusters),
    fill = 'light blue',
    color = 'black',
    binwidth = 1
  ) +
  # The y-axis ticks are fixed by hand at these counts.
  scale_y_continuous(breaks = c(1, 2, 6, 7, 14)) +
  labs(x = 'Number of clusters', y = 'Count') +
  theme_light()

# Histogram of individual cluster sizes: the list-column is unnested so that
# every cluster contributes one row, then binned in steps of 5.
p2 <- ggplot(data = unnest(clust_stats, cluster_sizes)) +
  geom_histogram(
    mapping = aes(x = cluster_sizes),
    binwidth = 5,
    color = 'black',
    fill = 'light blue'
  ) +
  labs(x = 'Cluster Size', y = 'Count') +
  theme_light()

# Save both histograms side by side (a single row) as a 20 cm x 8 cm PDF in
# the paper_scripts/appendices directory resolved by here().
ggsave(
  plot = cowplot::plot_grid(p1, p2, nrow = 1),
  filename = 'cfact_clustering_sizes.pdf',
  path = here('paper_scripts', 'appendices'),
  device = 'pdf',
  width = 20,
  height = 8,
  units = 'cm'
)
