m <- read.table("brains.csv", sep=";", as.is=T, header=T)
info <- read.table("geostat.csv", sep=";", as.is=T, header=T)

m <- merge(m, info, by.x="geom", by.y="name", all.x=T)

animals <- c("Cat1", "Cat2", "Cat3", "CElegans", "Drosophila1","Drosophila2", "Human1", "Human2", "Human6", "Human7", "Human8", "Macaque1", "Macaque2", "Macaque3", "Macaque4", "Mouse2", "Mouse3", "Rat1", "Rat2", "Rat3", "ZebraFinch2")
geometries <- c("BIG-g435", "BIG-sol", "BIG-sol3", "e3b", "g435", "g435b2", "g435ch", "g435d", "g711", "g711ch",  "g711d", "h2m", "h2r", "h2ra", "h3m", "nil", "s3", "sol", "sol3", "subnil", "subsol", "subsol3", "t3", "tree", "treed")

m$newid <- paste(m$who,m$geom)

m$s <- m$n * (m$n-1) / 2

m$entropy <- m$m * log(m$m / m$s) + (m$s - m$m) * log((m$s - m$m) / m$s)
m$nll <- 1 + m$loglik / m$entropy

n <- as.data.frame(cbind(m$newid, m$iteration, m$loglik, m$success, m$mAP, m$meanrank, m$stretch))
colnames(n) <- c("newid", "iteration", "loglik", "success", "map", "mr", "stretch")
n <- n[with(n, order(newid, as.numeric(iteration))),]
n$iteration <- as.numeric(n$iteration)
n$loglik <- (-1)*as.numeric(n$loglik)
n$success <- as.numeric(n$success)
n$map <- as.numeric(n$map)
n$mr <- as.numeric(n$mr) +1
n$stretch <- as.numeric(n$stretch)

# analysis of the lack of autocorrelation

# wide format
a <- reshape(n, idvar = "newid", timevar = "iteration", direction = "wide")

# regime A -- independently try to set T, R, and measure

# ljung-box test -- are there autocorellations?
logliks <- seq(2, ncol(a), by=5)
successes <- seq(3, ncol(a), by=5)
maps <- seq(4, ncol(a), by=5)
mrs <- seq(5, ncol(a), by=5)
stretches <- seq(6, ncol(a), by=5)

measures <- list(logliks, successes, maps, mrs, stretches)

for (measure in measures) {
  pvalues <- c()

  for (i in 1:nrow(a)) {
  pvalues <- c(pvalues, Box.test(ts(t(a[i, measure])), lag = 1, type="Ljung")$p.value)
  }

  # Bonferroni correction
  print(sum(pvalues < 0.01/(length(animals)*length(geometries))))
} 

# visualizations
n <- as.data.frame(cbind(m$newid, m$who, m$geom, m$iteration, m$loglik, m$success, m$mAP, m$meanrank, m$stretch, m$nll))
colnames(n) <- c("newid", "who", "geometry", "iteration", "loglik", "success", "map", "mr", "stretch", "nll")
n <- n[with(n, order(newid, as.numeric(iteration))),]
n$iteration <- as.numeric(n$iteration)
n$loglik <- (-1)*as.numeric(n$loglik)
n$success <- as.numeric(n$success)
n$map <- as.numeric(n$map)
n$mr <- as.numeric(n$mr) +1
n$stretch <- as.numeric(n$stretch)

# convenient for visualization: inv_meanrank <- 1 / meanrank ; inv_stretch <- 1/stretch
n$invmr <- 1/n$mr
n$invstr <- 1/n$stretch


# vioplots

library(ggplot2)
draw_vioplot <- function(dirpath, animals_list, measure) {
  for (anim in animals_list) {
    name = paste(dirpath,anim,"_",measure, "_vioplot.pdf", sep="")
    print(name)
    d2 <- subset(n, n$who==anim & n$iteration>0)
    #print(head(d2))
    pdf(name, 15,7)
    print(ggplot(d2, aes_string(x="geometry", y=measure)) + 
            geom_violin(trim=F) + geom_boxplot(width=0.1))
    dev.off()
  } 
}

# logliks
draw_vioplot("vioplots/loglik/", animals, "loglik")

# successes
draw_vioplot("vioplots/success/", animals, "success")

# maps
draw_vioplot("vioplots/maps/", animals, "map")

# mrs
draw_vioplot("vioplots/meanrank/", animals, "invmr")

# stretches
draw_vioplot("vioplots/stretch/", animals, "invstr")

# discrete vs continuous

# check if normality holds

geoms_test <-  c("g435d", "g435ch", "g711d", "g711ch", "treed", "tree")


normality_test <- function(animals_list, geoms_list, measure) {
  for (anim in animals_list) {
  for (geom in geoms_list) {
    test <- shapiro.test(subset(n, n$who==anim & n$geometry==geom)[[measure]])$p.value
    res <- ifelse(test >= 0.01/126, "Support","Reject")
    print(c(anim, geom, test, res))
   }
  }
}

measures <- c("loglik", "map", "mr", "success", "stretch")
for (i in measures) {
  print(i)
  normality_test(animals, geoms_test, i)
}

# we should not use ttests, the normality assumption is invalid

# auxiliary visuals for intuition
plot(density(subset(n, n$who=="Cat1" & n$geometry=="tree")$stretch), col="red")
lines(density(subset(n, n$who=="Cat1" & n$geometry=="treed")$stretch))

# if there is a difference in favor of continuous we aim at rejecting H0
# we also control for the lack of significant difference

compare_wilcox <- function(pres_better, pres_worse, anim_list)  {
  test <- wilcox.test(pres_better, pres_worse, alternative="greater", exact=F)$p.value
  res <- ifelse(test<0.01/length(anim_list), 1, -1)
  test <- wilcox.test(pres_better, pres_worse, alternative="two.sided", exact=F)$p.value
  res_sig <- ifelse(test<0.01/length(anim_list), 1, 0)
  res <- res*res_sig
  return(res)
}

compare_pairs <- function(df, orig, id_comp, worse_cand, better_cand, animals_list) {
  for (anim in animals_list) {
    second <- subset(orig, orig$who==anim & (orig[[id_comp]]==worse_cand))
    first <- subset(orig, orig$who==anim & (orig[[id_comp]]==better_cand))
    
    res_loglik <- compare_wilcox(first$loglik, second$loglik, animals_list) 
    res_success <- compare_wilcox(first$success, second$success, animals_list)
    res_map <- compare_wilcox(first$map, second$map, animals_list)
    res_mr <- compare_wilcox(first$invmr, second$invmr, animals_list)
    res_stretch <- compare_wilcox(first$invstr, second$invstr, animals_list) 
    
    df <- rbind(df, c(res_loglik, res_map, res_mr, res_success, res_stretch))
  }
  colnames(df) <- c("loglik", "map", "mr", "success", "stretch")
  rownames(df) <- animals_list
  return(df)
}

# g345ch vs g345d

g435comp <- data.frame()
g435comp <- compare_pairs(g435comp, n, "geometry", "g435d", "g435ch", animals)

pdf("figs/g435comp.pdf", 7,7)
heatmap(as.matrix(g435comp), Rowv = NA, Colv = NA, scale="none")
dev.off()
  
# g711ch vs g711d
g711comp <- data.frame()
g711comp <- compare_pairs(g711comp, n, "geometry", "g711d", "g711ch", animals)

pdf("figs/g711comp.pdf", 7,7)
heatmap(as.matrix(g711comp), Rowv = NA, Colv = NA, scale="none")
dev.off()

# tree vs treed
treecomp <- data.frame()
treecomp <- compare_pairs(treecomp, n, "geometry", "treed", "tree", animals)

pdf("figs/treecomp.pdf", 7,7)
heatmap(as.matrix(treecomp), Rowv = NA, Colv = NA, scale="none")
dev.off()

# red -- continuous better, significant difference
# orange -- lack of significant difference
# bright -- significant difference, continuous worse

# compare h2ra against h2r
# test if h2ra is no better than h2r
h2rcomp <- data.frame()
h2rcomp <- compare_pairs(h2rcomp, n, "geometry", "h2ra", "h2r", animals)

pdf("figs/h2rcomp.pdf", 7,7)
heatmap(as.matrix(h2rcomp), Rowv = NA, Colv = NA, scale="none")
dev.off()

# red -- h2r better, significant difference
# orange -- lack of significant difference
# bright -- significant difference, h2r worse

# compare twisted against h2r
twistedcomp <- data.frame()
twistedcomp <- compare_pairs(twistedcomp, n, "geometry", worse_cand = "twist", better_cand =  "h2r", animals)

pdf("twistedcomp.pdf", 7,7)
heatmap(as.matrix(twistedcomp), Rowv = NA, Colv = NA, scale="none")
dev.off()

# analysis of small geometries
geometries_small <- c("e3b", "g435", "g435b2", "g711", "h2m", "h2r", "h3m", "nil", "s3", "sol3", "subnil", "subsol3", "t3", "tree", "twist")


# rankings among the best occurrences

normalize <- function(y) {
  min <- min(as.numeric(y))
  max <- max(as.numeric(y))
  #print(c(min, max))
  sapply(y, function(z) (z-min)/(max-min))
}

translate_geoms <- function(df) {
  df$geom2 <- "E3"
  df$geom2 <- ifelse(df$geom =="g435", "H3", df$geom2)
  df$geom2 <- ifelse(df$geom =="g435b2", "H3*", df$geom2)
  df$geom2 <- ifelse(df$geom =="g711", "H2", df$geom2)
  df$geom2 <- ifelse(df$geom =="h2r", "H2xR", df$geom2)
  #df$geom2 <- ifelse(df$geom =="h2ra", "H2xR(ang)", df$geom2)
  df$geom2 <- ifelse(df$geom =="h2m", "H2&", df$geom2)
  df$geom2 <- ifelse(df$geom =="h3m", "H3&", df$geom2)
  df$geom2 <- ifelse(df$geom =="sol3", "Solv", df$geom2)
  df$geom2 <- ifelse(df$geom =="s3", "S3", df$geom2)
  df$geom2 <- ifelse(df$geom =="nil", "Nil", df$geom2)
  df$geom2 <- ifelse(df$geom =="subsol3", "Solv*", df$geom2)
  df$geom2 <- ifelse(df$geom =="subnil", "Nil*", df$geom2)
  df$geom2 <- ifelse(df$geom =="t3", "E3&", df$geom2)
  df$geom2 <- ifelse(df$geom =="tree", "Tree", df$geom2)
  df$geom2 <- ifelse(df$geom =="twist", "Twist", df$geom2)
  return(df)
}

find_best <- function(df, orig, animals_list, geom_list, measure) {
for (anim in animals_list) {
  for (geom in geom_list) {
    d2 <- subset(orig, orig$who==anim & orig$geometry == geom)
    best <- max(as.numeric(d2[[as.character(measure)]]))
    print(c(anim, geom, best))
    df <- rbind(df,c(anim, geom,best))
    }
}
name <- paste("best_", measure, sep="")
colnames(df) <- c("anim", "geom", name)

# uncomment next three lines for normalized version
#w <- matrix(as.numeric(df[[name]]), nrow=length(geom_list), byrow=F)
#w2 <- apply(w, 2, normalize)
#df[[name]] <- c(w2)

df <- translate_geoms(df)
return(df)
}

draw_best_rankings <- function(filename, df, measure, anim_var, shape_vector, color_vector, title) {
  library(ggplot2)
  pdf(filename, 10,7)
  g <- ggplot(df, aes_string(x=anim_var, y=measure)) +
    geom_point(aes(shape=geom2, color=geom2), size=3)+ scale_shape_manual(values=shape_vector) +
    scale_color_manual(values=color_vector) +
    xlab("graph") + ylab(title) +
    scale_x_discrete(guide = guide_axis(angle = 90)) + theme_minimal(base_size = 15)
  print(g)
  dev.off() 
}

stat_table <- function(df, measure_df, measure, geom_list, statistic) {
  placeholder <- data.frame()
  for (geometry in geom_list) {
    if (statistic == 2) {
      a <- median(subset(measure_df, geom==geometry)$ranking)
    }  else {
      if (statistic == 1) {
        a <- min(subset(measure_df, geom==geometry)$ranking)
     } else {
        a <- max(subset(measure_df, geom==geometry)$ranking)
     }
    }
    placeholder <- rbind(placeholder, c(geometry, a))
  }
  colnames(placeholder) <- c("g", measure)
  df <- merge(df, placeholder, by="g", all.x=T)
  return(df)
}

stat_table2 <- function(df, measure_df, measure, geom_list) {
  placeholder <- data.frame()
  for (geometry in geom_list) {
        a <- round(sum(subset(measure_df, geom==geometry)$ranking >= 11)/length(animals)*100,2)
        b <- round(sum(subset(measure_df, geom==geometry)$ranking <= 5)/length(animals)*100,2)
        placeholder <- rbind(placeholder, c(geometry, a, b))
  }
  name1 <- paste(measure, "_top5", sep="")
  name2 <- paste(measure, "_bottom5", sep="")
  colnames(placeholder) <- c("g", name1, name2)
  df <- merge(df, placeholder, by="g", all.x=T)
  return(df)
}

stats_cv <- function(df, measure_df, animals_list, measure) {
  placeholder <- data.frame()
  for (animal in animals_list) {
    a <- subset(measure_df, anim==animal)[[measure]]
    cv <- round(sd(a)/mean(a)*100,2)
    placeholder <- rbind(placeholder, c(animal, cv))
  }
  colnames(placeholder) <- c("anim", measure)
  df <- merge(df, placeholder, by="anim", all.x=T)
  return(df)
  }
    

tab <- rbind(
  c("E3", 19, "black"),
  c("E3&", 19, "forestgreen"),
  c("H2", 3, "black"),
  c("H2&", 3, "forestgreen"),
  c("H2xR", 4, "black"),
  #c("H2xR(ang)", 4, "blue"),
  c("H3", 1, "black"),
  c("H3*", 13, "black"),
  c("H3&", 1, "forestgreen"),
  c("Nil", 0, "black"),
  c("Nil*", 7, "black"),
  c("S3", 20, "black"),
  c("Solv", 5, "black"),
  c("Solv*", 9, "black"),
  c("Tree", 6, "black"),
  c("Twist",11, "black")
)


shapes <- as.numeric(t(tab) [2,])
colors <- t(tab) [3,]

median_ranks <- as.data.frame(geometries_small) 
colnames(median_ranks) <- "g"
min_ranks <- as.data.frame(geometries_small) 
colnames(min_ranks) <- "g"
max_ranks <- as.data.frame(geometries_small) 
colnames(max_ranks) <- "g"

perc_ranks <- as.data.frame(geometries_small) 
colnames(perc_ranks) <- "g"

cv_comp <- as.data.frame(animals) 
colnames(cv_comp) <- "anim"

bests_logliks <- data.frame()
bests_logliks <- find_best(bests_logliks, n, animals, geometries_small, "nll")
bests_logliks$best_nll <- as.numeric(bests_logliks$best_nll)
draw_best_rankings("figs/best_loglik.pdf", bests_logliks, "best_nll", "anim", shapes, colors, "normalized Log-likelihood")

a <- matrix(bests_logliks$best_nll, 15, length(animals), byrow = F)
#bests_logliks$ranking <- c(apply(a, 2, rank))
# for the opposite ranking
bests_logliks$ranking <- c(apply(a, 2, rank))
draw_best_rankings("figs/best_loglik_rank.pdf", bests_logliks, "ranking", "anim", shapes, colors, "Ranking for Log-likelihood")

min_ranks <- stat_table(min_ranks, bests_logliks, "loglik", geometries_small, 1)
median_ranks <- stat_table(median_ranks, bests_logliks, "loglik", geometries_small, 2)
max_ranks <- stat_table(max_ranks, bests_logliks, "loglik", geometries_small, 3)

perc_ranks <- stat_table2(perc_ranks, bests_logliks, "loglik", geometries_small)
cv_comp <- stats_cv(cv_comp, bests_logliks, animals, "best_nll") 

bests_success <- data.frame()
bests_success <- find_best(bests_success, n, animals, geometries_small, "success")
bests_success$best_success <- as.numeric(bests_success$best_success)
draw_best_rankings("figs/best_success.pdf", bests_success, "best_success", "anim", shapes, colors, "Greedy success rate")
a <- matrix(bests_success$best_success, 15,length(animals), byrow = F)
bests_success$ranking <- c(apply(a, 2, rank))
#bests_success$ranking <- c(apply(-a, 2, rank))
draw_best_rankings("figs/best_success_rank.pdf", bests_success, "ranking", "anim", shapes, colors, "Ranking for greedy success rate")

min_ranks <- stat_table(min_ranks, bests_success, "success", geometries_small, 1)
median_ranks <- stat_table(median_ranks, bests_success, "success", geometries_small, 2)
max_ranks <- stat_table(max_ranks, bests_success, "success", geometries_small, 3)

perc_ranks <- stat_table2(perc_ranks, bests_success, "success", geometries_small)
cv_comp <- stats_cv(cv_comp, bests_success, animals, "best_success") 

bests_maps <- data.frame()
bests_maps <- find_best(bests_maps, n, animals, geometries_small, "map")
bests_maps$best_map <- as.numeric(bests_maps$best_map)
draw_best_rankings("figs/best_map.pdf", bests_maps, "best_map", "anim", shapes, colors, "mAP")
a <- matrix(bests_maps$best_map, 15,length(animals), byrow = F)
bests_maps$ranking <- c(apply(a, 2, rank))
#bests_maps$ranking <- c(apply(-a, 2, rank))
draw_best_rankings("figs/best_map_ranking.pdf", bests_maps, "ranking", "anim", shapes, colors, "Ranking for mAP")

min_ranks <- stat_table(min_ranks, bests_maps, "map", geometries_small,1)
median_ranks <- stat_table(median_ranks, bests_maps, "map", geometries_small,2)
max_ranks <- stat_table(max_ranks, bests_maps, "map", geometries_small,3)

perc_ranks <- stat_table2(perc_ranks, bests_maps, "map", geometries_small)
cv_comp <- stats_cv(cv_comp, bests_maps, animals, "best_map") 

bests_meanranks <- data.frame()
bests_meanranks <- find_best(bests_meanranks, n, animals, geometries_small, "invmr")
bests_meanranks$best_invmr <- as.numeric(bests_meanranks$best_invmr)
draw_best_rankings("figs/best_mr.pdf", bests_meanranks, "best_invmr", "anim", shapes, colors, "1/MeanRank")
a <- matrix(bests_meanranks$best_invmr, 15,length(animals), byrow = F)
bests_meanranks$ranking <- c(apply(a, 2, rank))
#bests_meanranks$ranking <- c(apply(-a, 2, rank))
draw_best_rankings("figs/best_mr_ranking.pdf", bests_meanranks, "ranking", "anim", shapes, colors, "Ranking for 1/MeanRank")

min_ranks <- stat_table(min_ranks, bests_meanranks, "invmr", geometries_small,1)
median_ranks <- stat_table(median_ranks, bests_meanranks, "invmr", geometries_small,2)
max_ranks <- stat_table(max_ranks, bests_meanranks, "invmr", geometries_small,3)

perc_ranks <- stat_table2(perc_ranks, bests_meanranks, "invmr", geometries_small)
cv_comp <- stats_cv(cv_comp, bests_meanranks, animals, "best_invmr") 

bests_stretch <- data.frame()
bests_stretch <- find_best(bests_stretch, n, animals, geometries_small, "invstr")
bests_stretch$best_invstr <- as.numeric(bests_stretch$best_invstr)
draw_best_rankings("figs/best_stretch.pdf", bests_stretch, "best_invstr", "anim", shapes, colors, "1/stretch")
a <- matrix(bests_stretch$best_invstr, 15,length(animals), byrow = F)
#bests_stretch$ranking <- c(apply(a, 2, rank))
bests_stretch$ranking <- c(apply(a, 2, rank))
draw_best_rankings("figs/best_stretch_ranking.pdf", bests_stretch, "ranking", "anim", shapes, colors, "Ranking for 1/stretch")

min_ranks <- stat_table(min_ranks, bests_stretch, "stretch", geometries_small,1)
median_ranks <- stat_table(median_ranks, bests_stretch, "stretch", geometries_small,2)
max_ranks <- stat_table(max_ranks, bests_stretch, "stretch", geometries_small,3)

perc_ranks <- stat_table2(perc_ranks, bests_stretch, "stretch", geometries_small)
cv_comp <- stats_cv(cv_comp, bests_stretch, animals, "best_invstr") 

# condorcet winners/losers

# compute the probabilities
comp_prob_matrix <- function(df, orig, animals_list, geoms_list, measure_id) {
  for (anim in animals_list) {
  d2 <- subset(orig, orig$who==anim & orig$iteration>0 )
  for (geom in geoms_list) {
    for (geom2 in geoms_list) {
      A <- as.numeric(d2[d2$geometry==geom, measure_id])
      B <- as.numeric(d2[d2$geometry==geom2, measure_id])
      prob <- as.numeric(mean(outer(A, B, FUN='>')+0.5*outer(A,B, FUN='==')))
      print(c(anim,geom, geom2,prob))
      df <- rbind(df,c(anim,geom, geom2,prob))
    }
    }
  }
  colnames(df) <- c("anim", "geom","geom2", "prob")
  df$prob <- as.numeric(df$prob)
  return(df)
}

ranking <- function(db, idvar, criterion, qc) {
  library(dplyr)
  db <- db %>% arrange(idvar, criterion) %>%
    group_by(anim) %>%
    mutate(rank = rank(!!qc, ties.method = "last"))
}

condorcet_rankings <- function(df, prob_matrix, animals_list, geoms_list) {
  prob_matrix$result <- ifelse(prob_matrix$prob > 0.5, 1,0)
  prob_matrix$result <- ifelse(prob_matrix$prob < 0.5, -1, prob_matrix$result)
  for (anim2 in animals_list) {
    for (geom3 in geoms_list) {
      out <- sum(as.numeric(subset(prob_matrix, prob_matrix$anim==anim2 & prob_matrix$geom==geom3)$result))
      df <- rbind(df,c(anim2,geom3,out))
    }
  }
  colnames(df) <- c("anim", "geom", "out")
  df$out <- as.numeric(df$out)
  df <- ranking(df, "anim", "out", quo(desc(out)))
  return(df)
}

# loglik
condorcet_probs_loglik <- data.frame()
cond_ranks_loglik <- data.frame()
condorcet_probs_loglik <- comp_prob_matrix(condorcet_probs_loglik, n, animals, geometries_small, 5)
cond_ranks_loglik <- condorcet_rankings(cond_ranks_loglik, condorcet_probs_loglik, animals, geometries_small)

# success
condorcet_probs_success <- data.frame()
condorcet_probs_success <- comp_prob_matrix(condorcet_probs_success, n, animals, geometries_small, 6)
cond_ranks_success <- data.frame()
cond_ranks_success <- condorcet_rankings(cond_ranks_success, condorcet_probs_success, animals, geometries_small)

# map
condorcet_probs_map <- data.frame()
condorcet_probs_map <- comp_prob_matrix(condorcet_probs_map, n, animals, geometries_small, 7)
cond_ranks_map <- data.frame()
cond_ranks_map <- condorcet_rankings(cond_ranks_map, condorcet_probs_map, animals, geometries_small)

# invmr
condorcet_probs_mr <- data.frame()
condorcet_probs_mr <- comp_prob_matrix(condorcet_probs_mr, n, animals, geometries_small, 10)
cond_ranks_mr <- data.frame()
cond_ranks_mr <- condorcet_rankings(cond_ranks_mr, condorcet_probs_mr, animals, geometries_small)

# invstr
condorcet_probs_str <- data.frame()
condorcet_probs_str <- comp_prob_matrix(condorcet_probs_str, n, animals, geometries_small, 10)
cond_ranks_str <- data.frame()
cond_ranks_str <- condorcet_rankings(cond_ranks_str, condorcet_probs_str, animals, geometries_small)


# find the winner and the loser
voting <- function(measure, df_ranks, animals_list) {
  print(c(measure, " winner:"))
  print(subset(df_ranks, as.numeric(df_ranks$rank)==1),n=length(animals))
  print(c(measure, " loser:"))
  for (anim1 in animals_list) {
    x <- subset(df_ranks, df_ranks$anim==anim1)
    y <- as.data.frame(subset(x, x$rank==max(as.numeric(x$rank))))
    colnames(y) <- NULL
    print(y)
  }
}

voting("loglik", cond_ranks_loglik, animals)
voting("success", cond_ranks_success, animals)
voting("map", cond_ranks_map, animals)
voting("invmr", cond_ranks_mr, animals)
voting("invstr", cond_ranks_str, animals)


#for (anim3 in animals) {
#  anim_win <- max(as.numeric(subset(condorcet_winner, condorcet_winner$anim==anim3)$out))
#  print(c(anim3, anim_win, condorcet_winner[condorcet_winner$out == anim_win & condorcet_winner$anim==anim3,2]))
#}

# graph analysis

translate_to_agg <- function(edges_list, geom_dict) {
  edges_list$id <- paste(edges_list$geom, edges_list$geom2, sep="_")
  edges_list <- merge(edges_list, geom_dict, by.x="id", by.y="id1", all.x=T)
  edges_list <- edges_list[,c(1,2,3,4,7)]
  colnames(edges_list) <- c("id", "geom", "geom2", "sum_edges", "max1")
  edges_list <- merge(edges_list, geom_dict, by.x="id", by.y="id2", all.x=T)
  edges_list$max1 <- as.numeric(ifelse(is.na(edges_list$max1)==T, edges_list$max, edges_list$max1))
  edges_list$percent_weight <- round(100*(edges_list$sum_edges/edges_list$max1),0)
  edges_list <- subset(edges_list, edges_list$percent_weight > 50)
  #edges <- subset(edges, edges$percent_weight > 75)
  
  edges_list <- edges_list[, c(2,3,10)]
  return(edges_list)
}

make_edges_list <- function(df, df_probs, nodes_list, geom_list, agg) {
  df_probs$edge <- ifelse(df_probs$prob > 0.5, 1,0)
  df <- subset(df_probs, df_probs$edge==1)
  #str(df)
  #str(subset(df, df$geom2=="h2r"))
  
  library(dplyr)
  
  if (agg==T) {
    df <- merge(df, nodes_list, by.x="geom", by.y="name")
    df <- df[, c(2,18,3,5)]
    colnames(df) <- c("anim", "geom", "geom2","edge")
    df <- merge(df, nodes_list, by.x="geom2", by.y="name")
    df <- df[, c(2,3,17,4)]
    colnames(df) <- c("anim", "geom", "geom2", "edge")
    #str(subset(df, df$geom2=="3_product"))
  }
  
  # Group by sum using dplyr
  df <- df %>% group_by(geom, geom2) %>% 
    summarise(sum_edges = sum(as.numeric(edge)),
              .groups = 'drop') %>% as.data.frame()
  
  #str(subset(df, df$geom2=="3_product"))
  #make weights in percentages
  
  if (agg==T) {
   df <- subset(df, df$geom != df$geom2)
   #str(subset(df, df$geom2=="3_product"))
   df <- translate_to_agg(df, geom_list)
  } else {
    df$sum_edges <- round(100*(df$sum_edges/length(animals)),0)
  }
  
  return(df)
}

info$geom_id <- paste(info$dim, info$geometry, sep="_")


# dict for aggregations
geometrie <- c("2_hyperbolic", "2_tree", "3_euclid", "3_hyperbolic", "3_nil", 
  "3_other", "3_product", "3_solv", "3_sphere")

matches <- c()
for (m1 in geometrie) {
  for (m2 in geometrie) {
    matches <- rbind(matches, c(m1, m2,
      length(animals) * nrow(subset(info, info$geom_id==m1 & info$name %in% geometries_small))
      * nrow(subset(info, info$geom_id==m2 & info$name %in% geometries_small))
    ))
  }
}

matches <- as.data.frame(matches)
matches[,3] <- as.numeric(matches[,3])

colnames(matches) <- c("g1", "g2", "max")
matches$id1 <- paste(matches$g1, matches$g2, sep="_")
matches$id2 <- paste(matches$g2, matches$g1, sep="_")

edges <- data.frame()
edges <- make_edges_list(edges, condorcet_probs_loglik, info, matches, agg=F)

edges <- data.frame()
edges <- make_edges_list(edges, condorcet_probs_loglik, info, matches, agg=T)
write.table(edges, "edges_loglik.csv", row.names=F, sep=";")

edges <- data.frame()
edges <- make_edges_list(edges, condorcet_probs_success, info, matches, agg=T)
write.table(edges, "edges_success.csv", row.names=F, sep=";")

edges <- data.frame()
edges <- make_edges_list(edges, condorcet_probs_map, info, matches, agg=T)
write.table(edges, "edges_map.csv", row.names=F, sep=";")

edges <- data.frame()
edges <- make_edges_list(edges, condorcet_probs_mr, info, matches, agg=T)
write.table(edges, "edges_mr.csv", row.names=F, sep=";")

edges <- data.frame()
edges <- make_edges_list(edges, condorcet_probs_str, info, matches, agg=T)
write.table(edges, "edges_stretch.csv", row.names=F, sep=";")

geometries_small <- as.data.frame(c("e3b", "g435", "g435b2", "g711", "h2m", "h2r", "h3m", "nil", "s3", "sol3", "subnil", "subsol3", "t3", "tree", "twist"))
colnames(geometries_small) <- "geom"
nodes <- merge(geometries_small, info, by.x="geom", by.y="name")
write.table(nodes, "nodes.csv", row.names = F, sep = ";")


# an example visualization

library(igraph)
graph <- graph_from_edgelist(as.matrix(edges[,-3]), directed = T)
E(graph)$weight <- edges$percent_weight
plot(graph, layout=layout_with_fr, edge.label=E(graph)$weight, edge.curved=rep(0.5, ecount(graph)))
plot(graph, layout=layout_in_circle, edge.label=E(graph)$weight)

# regressions on measures
plot(density(n$loglik))

# additional regressions -- abandoned

m$who2 <- "human"
m$who2 <- ifelse(m$who %in% c("Cat1", "Cat2", "Cat3"), "cat", m$who2)
m$who2 <- ifelse(m$who %in% c("CElegans"), "celegans", m$who2)
m$who2 <- ifelse(m$who %in% c("Drosophila1", "Drosophila2"), "drosophila", m$who2)
m$who2 <- ifelse(m$who %in% c("Macaque1", "Macaque2", "Macaque3", "Macaque4"), "macaque", m$who2)
m$who2 <- ifelse(m$who %in% c("Mouse1", "Mouse2", "Mouse3"), "mouse", m$who2)
m$who2 <- ifelse(m$who %in% c("Rat1", "Rat2", "Rat3"), "rat", m$who2)
m$who2 <- ifelse(m$who %in% c("ZebraFinch2"), "zebra", m$who2)

add <- read.table("Rcode/data/parts.csv", sep=";", as.is=T, header=T)
m <- merge(m, add, by.x="who", by.y="name", all.x=T)

x <- subset(m, geom %in% geometries_small)
x <-x[order(as.factor(x$who)),]

x <- x[sample(1:nrow(x)), ]

w <- matrix(x$loglik, nrow=nrow(table(x$who)), byrow=T)
w2 <- apply(w, 1, normalize)
x$log_normalized <- c(w2)



reg <- lm(loglik ~ n + m + I(m/n) + assort + cluster + as.factor(area) + as.factor(node) + as.factor(geometry) + as.factor(dim) +nodes + as.factor(closed) + as.factor(angular) + sagdist01 + sagdist10 + sagdist50,x)
summary(reg)

reg <- lm(I(100*log_normalized) ~ n + m + I(m/n) + assort + cluster + as.factor(area) + as.factor(node) + as.factor(geom) + as.factor(dim) +nodes + as.factor(closed) + as.factor(angular) + sagdist01 + sagdist10 + sagdist50,x)
summary(reg)

reg <- lm(I(100*log_normalized) ~ as.factor(geom) + as.factor(dim) +nodes + as.factor(closed) + as.factor(angular) + sagdist01 + sagdist10 + sagdist50,x)
summary(reg)

reg <- lm(I(100*log_normalized) ~ as.factor(geom) + nodes + sagdist01 + sagdist10 + sagdist50 ,x)
summary(reg)

library(broom)
print(tidy(coeftest(reg, vcov = vcov(reg, type="HC1"))), n=25)

x$fitted <- predict(reg) 

a <- subset(x, x$fitted>100000)

library(lmtest)
resettest(reg, power=2:3, type=c("fitted"))  

library(car)
plot(reg, which = 1)

par(mfrow=c(1,2))
plot(density(reg$residuals))
qqPlot(reg$residuals)

par(mfrow=c(1,2))
plot(reg$residuals)
plot(ts(reg$residuals))


par(mfrow=c(1,1))
plot(reg, which=5)

reg <- lm(loglik ~ n + m + I(m/n) + assort + as.factor(geometry) + as.factor(dim) +nodes + as.factor(closed) + as.factor(angular) + sagdist01 + sagdist10 + sagdist50,subset(x, x$who2=="human"))
summary(reg)

# analysis of bigs

geoms_comp <- c("g435", "g435b2", "h2r", "sol3", "twist")

for (geom in geoms_comp) {
  name <- paste("BIG", geom, sep="-")
  print(name)
  df <- data.frame()
  df <- compare_pairs(df, n, "geometry", geom, name, animals)
  name2 <- paste("figs/", name, "comp", ".pdf", sep="") 
  pdf(name2, 7,7)
  heatmap(as.matrix(df), Rowv = NA, Colv = NA, scale="none")
  dev.off()
}

prepare_ranking <- function(df, cand, measure, anim_list, geom_list) {
  df <- find_best(df, cand, anim_list, geom_list, measure)
  name <- paste("best", measure, sep="_")
  df[[name]] <- as.numeric(df[[name]])
  a <- matrix(df[[name]], length(geom_list), length(anim_list), byrow = F)
  df$ranking <- c(apply(a, 2, rank))
  
  df$id <- paste(df$anim, df$geom, sep="")
  return(df)
}

compute_cohen <- function(class1, class2, measure, anim_list, geom_list) {
  bests_cand1 <- data.frame()
  bests_cand1 <- prepare_ranking(bests_cand1, class1, measure, anim_list, geom_list)
  bests_cand2 <- data.frame()
  bests_cand2 <- prepare_ranking(bests_cand2, class2, measure, anim_list, geom_list)
  bests_cand1 <- merge(bests_cand1, bests_cand2, by="id", all.x=T)
  library(psych)
  print(cohen.kappa(bests_cand1[,c(6,11)]))
}

compute_cohen2 <- function(class1, class2, measure, anim_list, geom_list1, geom_list2) {
  bests_cand1 <- data.frame()
  bests_cand1 <- prepare_ranking(bests_cand1, class1, measure, anim_list, geom_list1)
  bests_cand2 <- data.frame()
  bests_cand2 <- prepare_ranking(bests_cand2, class2, measure, anim_list, geom_list2)
  library(stringr)
  bests_cand2$id <- str_remove(bests_cand2$id, "BIG-")
  bests_cand1 <- merge(bests_cand1, bests_cand2, by="id", all.x=T)
  library(psych)
  print(cohen.kappa(bests_cand1[,c(6,11)]))
}

geoms_big <- c("BIG-g435", "BIG-g435b2", "BIG-h2r", "BIG-sol3", "BIG-twist")

# check if ranking of bigs is different than the rankings of shorts (original data)

compute_cohen2(n, n, "nll", animals, geoms_comp, geoms_big)
compute_cohen2(n, n, "map", animals, geoms_comp, geoms_big)
compute_cohen2(n, n, "invmr", animals, geoms_comp, geoms_big)
compute_cohen2(n, n, "success", animals, geoms_comp, geoms_big)
compute_cohen2(n, n, "invstr", animals, geoms_comp, geoms_big)

# analysis of long SA

x <- read.table("Rcode/data/brain-v5L.csv", sep=";", as.is=T, header=T)
x <- merge(x, info, by.x="geom", by.y="name", all.x=T)

x$s <- x$n * (x$n-1) / 2

x$entropy <- x$m * log(x$m / x$s) + (x$s - x$m) * log((x$s - x$m) / x$s)
x$nll <- 1 + x$loglik / x$entropy

library("ggplot2")
library(directlabels)

draw_iter <- function(d, xvar, yvar, colvar, groupvar) {
  ggplot(data= d, aes_string(x = xvar, y = yvar, col=colvar)) + geom_line() + 
    facet_wrap(groupvar) +
    geom_dl(aes_string(label = colvar), method = list(dl.combine("first.points", "last.points")), cex = 0.8) 
}  

for (anim in animals) {
  name = paste("iter/comps/",anim,"_iter.pdf", sep="")
  print(name)
  d2 <- subset(x, x$who==anim)
  pdf(name, 15,7)
  print(draw_iter(d2, "as.numeric(iteration)", "as.numeric(loglik)", "as.character(geometry)", "who"))
  dev.off()
  #name = paste("iter/MAP/",anim,"_mAP_iter_zoom.pdf", sep="")
  #pdf(name, 15,7)
  #d3 <- subset(d2, as.numeric(d2$iteration) >=50)
  #print(draw_iter(d3, "as.numeric(iteration)", "as.numeric(mAP)", "as.character(geometry)", "who"))
  #dev.off()
}

# for the comparison with m
x$newid <- paste(x$who,x$geom)
n1 <- as.data.frame(cbind(x$newid, x$who, x$geom, x$iteration, x$loglik, x$success, x$mAP, x$meanrank, x$stretch, x$nll))
colnames(n1) <- c("newid", "who", "geometry", "iteration", "loglik", "success", "map", "mr", "stretch", "nll")
n1 <- n1[with(n1, order(newid, as.numeric(iteration))),]
n1$iteration <- as.numeric(n1$iteration)
n1$loglik <- (-1)*as.numeric(n1$loglik)
n1$success <- as.numeric(n1$success)
n1$map <- as.numeric(n1$map)
n1$mr <- as.numeric(n1$mr) +1
n1$stretch <- as.numeric(n1$stretch)
n1$nll <- as.numeric(n1$nll)

# convenient for visualization: inv_meanrank <- 1 / meanrank ; inv_stretch <- 1/stretch
n1$invmr <- 1/n1$mr
n1$invstr <- 1/n1$stretch

#n1 <- subset(n1, n1$iteration <30)
n1$style <- paste(n1$geometry, "10x", sep = "")

geom_10x <- c("BIG-g435", "BIG-h2r", "BIG-sol3", "BIG-twist", "e3b", "g435", "g435b2", "g711" , "sol3", "subsol3", "twist")

a <- subset(n, n$geometry %in% geom_10x)
a$style <- a$geometry

n1 <- rbind(n1, a)

# 10x vs normal
# if there is a difference in favor of continuous we aim at rejecting H0
# we also control for the lack of significant difference


animals_short <- c("Cat1", "Cat2", "Cat3", "CElegans", "Drosophila1","Drosophila2", "Human1", "Human2", "Human6", "Human7", "Human8", "Macaque1", "Macaque2", "Macaque3", "Macaque4", "Mouse2", "Mouse3", "Rat1", "Rat2", "Rat3","ZebraFinch2")


for (geoms in geom_10x) {
  geom_long <- paste(geoms, "10x", sep="")
  print(geom_long)
  geometry_name <- paste(geoms, "comp", sep="")
  geometry_name <- data.frame()
  geometry_name <- compare_pairs(geometry_name, n1, "style", geoms, geom_long, animals_short)

  name <- paste("~/comps/", geom_long, ".pdf", sep="") 
  pdf(name, 7,7)
  heatmap(as.matrix(geometry_name), Rowv = NA, Colv = NA, scale="none")
  dev.off()
}

for (anim in animals) {
  name = paste("iter/comps/",anim,"_iter.pdf", sep="")
  print(name)
  d2 <- subset(n1, n1$who==anim)
  #print(head(d2))
  pdf(name, 15,7)
  print(draw_iter(d2, "as.numeric(iteration)", "as.numeric(success)", "as.character(style)", "who"))
  dev.off()
  #name = paste("iter/MAP/",anim,"_mAP_iter_zoom.pdf", sep="")
  #pdf(name, 15,7)
  #d3 <- subset(d2, as.numeric(d2$iteration) >=50)
  #print(draw_iter(d3, "as.numeric(iteration)", "as.numeric(mAP)", "as.character(geometry)", "who"))
  #dev.off()
}

library(stringr)
n1 <- subset(n1, str_detect(n1$style, "10x")==T)

geoms_notbig <- c("e3b", "g435", "g435b2", "g711" , "sol3", "subsol3", "twist")
n2 <- subset(n, n$geometry %in% geoms_notbig)

# check if the ranking of the small geometries differ if long vs not-long
compute_cohen(n2, n1, "nll", animals_short, geoms_notbig)
compute_cohen(n2, n1, "map", animals_short, geoms_notbig)
compute_cohen(n2, n1, "invmr", animals_short, geoms_notbig)
compute_cohen(n2, n1, "success", animals_short, geoms_notbig)
compute_cohen(n2, n1, "invstr", animals_short, geoms_notbig)

# check if the ranking of the small geometries vs big geometries differ if long
compute_cohen2(n1, n1, "nll", animals_short, geoms_comp, geoms_big)
compute_cohen2(n1, n1, "map", animals_short, geoms_comp, geoms_big)
compute_cohen2(n1, n1, "invmr", animals_short, geoms_comp, geoms_big)
compute_cohen2(n1, n1, "success", animals_short, geoms_comp, geoms_big)
compute_cohen2(n1, n1, "invstr", animals_short, geoms_comp, geoms_big)

# check if the ranking of big geometries differ if long
compute_cohen(n, n1, "nll", animals_short, geoms_big)
compute_cohen(n, n1, "map", animals_short, geoms_big)
compute_cohen(n, n1, "invmr", animals_short, geoms_big)
compute_cohen(n, n1, "success", animals_short, geoms_big)
compute_cohen(n, n1, "invstr", animals_short, geoms_big)

# check if the ranking of short small differ than big long
compute_cohen2(n, n1, "nll", animals_short, geoms_comp, geoms_big)
compute_cohen2(n, n1, "map", animals_short, geoms_comp, geoms_big)
compute_cohen2(n, n1, "invmr", animals_short, geoms_comp, geoms_big)
compute_cohen2(n, n1, "success", animals_short, geoms_comp, geoms_big)
compute_cohen2(n, n1, "invstr", animals_short, geoms_comp, geoms_big)

# check if the ranking of big geometries differ with regime in long
n_type3 <- as.data.frame(cbind(x$newid, x$who, x$geom, x$iteration, x$loglik, x$success, x$mAP, x$meanrank, x$stretch, x$nll))
colnames(n_type3) <- c("newid", "who", "geometry", "iteration", "loglik", "success", "map", "mr", "stretch", "nll")
n_type3 <- n_type3[with(n_type3, order(newid, as.numeric(iteration))),]
n_type3$iteration <- as.numeric(n_type3$iteration)
n_type3$loglik <- (-1)*as.numeric(n_type3$loglik)
n_type3$success <- as.numeric(n_type3$success)
n_type3$map <- as.numeric(n_type3$map)
n_type3$mr <- as.numeric(n_type3$mr) +1
n_type3$stretch <- as.numeric(n_type3$stretch)
n_type3$nll <- as.numeric(n_type3$nll)

# convenient for visualization: inv_meanrank <- 1 / meanrank ; inv_stretch <- 1/stretch
n_type3$invmr <- 1/n_type3$mr
n_type3$invstr <- 1/n_type3$stretch

n_type3 <- subset(n_type3, n_type3$iteration >=60)
compute_cohen(n, n_type3, "nll", animals_short, geoms_big)
compute_cohen(n, n_type3, "map", animals_short, geoms_big)
compute_cohen(n, n_type3, "invmr", animals_short, geoms_big)
compute_cohen(n, n_type3, "success", animals_short, geoms_big)
compute_cohen(n, n_type3, "invstr", animals_short, geoms_big)

compute_cohen(n, n_type3, "nll", animals_short, geoms_comp)
compute_cohen(n, n_type3, "map", animals_short, geoms_comp)
compute_cohen(n, n_type3, "invmr", animals_short, geoms_comp)
compute_cohen(n, n_type3, "success", animals_short, geoms_comp)
compute_cohen(n, n_type3, "invstr", animals_short, geoms_comp)


# check the rankings based on distributions (copeland)
# small vs small on long

compute_cohen3 <- function(class1, class2, anim_list, geom_list1, geom_list2, measure_id) {
  condorcet_probs_class1 <- data.frame()
  cond_ranks_class1 <- data.frame()
  condorcet_probs_class2 <- data.frame()
  cond_ranks_class2 <- data.frame()
  
  condorcet_probs_class1 <- comp_prob_matrix(condorcet_probs_class1, class1, anim_list, geom_list1, measure_id)
  cond_ranks_class1 <- condorcet_rankings(cond_ranks_class1, condorcet_probs_class1, anim_list, geom_list1)
  condorcet_probs_class2 <- comp_prob_matrix(condorcet_probs_class2, class2, anim_list, geom_list2, measure_id)
  cond_ranks_class2 <- condorcet_rankings(cond_ranks_class2, condorcet_probs_class2, anim_list, geom_list2)
  
  cond_ranks_class1$id <- paste(cond_ranks_class1$anim, cond_ranks_class1$geom, sep="")
  cond_ranks_class2$id <- paste(cond_ranks_class2$anim, cond_ranks_class2$geom, sep="")
  
  cond_ranks_class1 <- merge(cond_ranks_class1, cond_ranks_class2, by="id", all.x=T)
  library(psych)
  print(cohen.kappa(cond_ranks_class1[,c(5,9)]))
}

#loglik
compute_cohen3(n, n1, animals_short, geoms_comp, geoms_comp, 5)
#map
compute_cohen3(n, n1, animals_short, geoms_comp, geoms_comp, 7)
#mr
compute_cohen3(n, n1, animals_short, geoms_comp, geoms_comp, 8)
#success
compute_cohen3(n, n1, animals_short, geoms_comp, geoms_comp, 6)
#stretch
compute_cohen3(n, n1, animals_short, geoms_comp, geoms_comp, 9)

# big vs big
#loglik
compute_cohen3(n, n1, animals_short, geoms_big, geoms_big, 5)
#map
compute_cohen3(n, n1, animals_short, geoms_big, geoms_big, 7)
#mr
compute_cohen3(n, n1, animals_short, geoms_big, geoms_big, 8)
#success
compute_cohen3(n, n1, animals_short, geoms_big, geoms_big, 6)
#stretch
compute_cohen3(n, n1, animals_short, geoms_big, geoms_big, 9)
  
compute_cohen4 <- function(class1, class2, anim_list, geom_list1, geom_list2, measure_id) {
  condorcet_probs_class1 <- data.frame()
  cond_ranks_class1 <- data.frame()
  condorcet_probs_class2 <- data.frame()
  cond_ranks_class2 <- data.frame()
  
  condorcet_probs_class1 <- comp_prob_matrix(condorcet_probs_class1, class1, anim_list, geom_list1, measure_id)
  cond_ranks_class1 <- condorcet_rankings(cond_ranks_class1, condorcet_probs_class1, anim_list, geom_list1)
  condorcet_probs_class2 <- comp_prob_matrix(condorcet_probs_class2, class2, anim_list, geom_list2, measure_id)
  cond_ranks_class2 <- condorcet_rankings(cond_ranks_class2, condorcet_probs_class2, anim_list, geom_list2)
  
  cond_ranks_class1$id <- paste(cond_ranks_class1$anim, cond_ranks_class1$geom, sep="")
  cond_ranks_class2$id <- paste(cond_ranks_class2$anim, cond_ranks_class2$geom, sep="")
  cond_ranks_class2$id <- str_remove(cond_ranks_class2$id, "BIG-")
  
  cond_ranks_class1 <- merge(cond_ranks_class1, cond_ranks_class2, by="id", all.x=T)
  library(psych)
  print(cohen.kappa(cond_ranks_class1[,c(5,9)]))
}

# small vs big 
#loglik
compute_cohen4(n, n, animals_short, geoms_comp, geoms_big, 5)
#map
compute_cohen4(n, n, animals_short, geoms_comp, geoms_big, 7)
#mr
compute_cohen4(n, n, animals_short, geoms_comp, geoms_big, 8)
#success
compute_cohen4(n, n, animals_short, geoms_comp, geoms_big, 6)
#stretch
compute_cohen4(n, n, animals_short, geoms_comp, geoms_big, 9)

# small vs big on long
#loglik
compute_cohen4(n1, n1, animals_short, geoms_comp, geoms_big, 5)
#map
compute_cohen4(n1, n1, animals_short, geoms_comp, geoms_big, 7)
#mr
compute_cohen4(n1, n1, animals_short, geoms_comp, geoms_big, 8)
#success
compute_cohen4(n1, n1, animals_short, geoms_comp, geoms_big, 6)
#stretch
compute_cohen4(n1, n1, animals_short, geoms_comp, geoms_big, 9)

# small not-long vs big-long
#loglik
compute_cohen4(n, n1, animals_short, geoms_comp, geoms_big, 5)
#map
compute_cohen4(n, n1, animals_short, geoms_comp, geoms_big, 7)
#mr
compute_cohen4(n, n1, animals_short, geoms_comp, geoms_big, 8)
#success
compute_cohen4(n, n1, animals_short, geoms_comp, geoms_big, 6)
#stretch
compute_cohen4(n, n1, animals_short, geoms_comp, geoms_big, 9)

# various regimes for R and T

n1$newid <- paste(n1$who,n1$geometry, sep="^")
n1 <- subset(n1, n1$who!="Drosophila1" & n1$who!="Drosophila2")
#n12 <- as.data.frame(cbind(n1$newid, n1$loglik, n1$iteration))
#n12 <- as.data.frame(cbind(n1$newid, n1$map, n1$iteration))
#n12 <- as.data.frame(cbind(n1$newid, n1$invmr, n1$iteration))
#n12 <- as.data.frame(cbind(n1$newid, n1$success, n1$iteration))
n12 <- as.data.frame(cbind(n1$newid, n1$invstr, n1$iteration))
#colnames(n12) <- c("newid", "loglik", "iteration")
#colnames(n12) <- c("newid", "map", "iteration")
#colnames(n12) <- c("newid", "invmr", "iteration")
#colnames(n12) <- c("newid", "success", "iteration")
colnames(n12) <- c("newid", "invstr", "iteration")

a <- reshape(n12, idvar = "newid", timevar = "iteration", direction = "wide")

# penalty for logliks
for (i in 1:nrow(a)) {
  d <- as.numeric(a[i,3:90])
  # change in mean
  ansvar <- cpt.mean(d,penalty="Manual",pen.value="5*log(n)",method="BinSeg",Q=2,class=FALSE)
  print(c(as.numeric(ansvar), a[i,1]))
}

# penalty for others
for (i in 1:nrow(a)) {
  d <- as.numeric(a[i,3:90])
  # change in mean
  ansvar <- cpt.mean(d,penalty="Manual",method="BinSeg", Q=2,class=FALSE)
  print(paste(ansvar, a[i,1], sep=";"))
}

#unfortunately, manual work to prepare the file based on the log from the above loop
s <- read.table("~/regime.csv", sep=";", as.is=T, header = T)

# an example visualization of the results
pdf("~/checkpoint1.pdf", 15,7)
plot(density(s$nll.first, na.rm = T), main="Density plots for the first changepoint")
lines(density(s$map.first), col="red")
lines(density(s$imr.first), col="green")
lines(density(s$suc.first), col="blue")
lines(density(s$str.first), col="orange")
abline(v=30)
legend("topright", legend=c("NLL", "MAP", "IMR", "SUC", "IST"),
       col=c("black", "red", "green", "blue", "orange"), lty=1)
dev.off()

pdf("~/checkpoint2.pdf", 15,7)
plot(density(s$nll.second, na.rm = T), main="Density plots for the second changepoint")
lines(density(s$map.second), col="red")
lines(density(s$imr.second), col="green")
lines(density(s$suc.second), col="blue")
lines(density(s$str.second), col="orange")
abline(v=60)
legend("topright", legend=c("NLL", "MAP", "IMR", "SUC", "IST"),
       col=c("black", "red", "green", "blue", "orange"), lty=1)
dev.off()

# the impact of change in regime
x <- merge(x, add, by.x="who", by.y="name", all.x=T)

x$loglik <- (-1)*as.numeric(x$loglik)
x$success <- as.numeric(x$success)
x$mAP <- as.numeric(x$mAP)
x$mr <- as.numeric(x$meanrank) +1
x$stretch <- as.numeric(x$stretch)
x$nll <- as.numeric(x$nll)

# convenient for visualization: inv_meanrank <- 1 / meanrank ; inv_stretch <- 1/stretch
x$invmr <- 1/x$mr
x$invstr <- 1/x$stretch

x$regime <- 1
x$regime <- ifelse(x$iteration>29 & x$iteration<60,2,x$regime)
x$regime <- ifelse(x$iteration>59,3,x$regime)

reg <- lm(I(100*nll) ~ n + m + I(m/n) + assort + cluster + as.factor(area) + as.factor(node) + as.factor(geometry) +nodes  +as.factor(regime),x)
reg <- lm(I(100*mAP) ~ n + m + I(m/n) + assort + cluster + as.factor(area) + as.factor(node) + as.factor(geometry) +nodes   +as.factor(regime),x)
reg <- lm(I(100*invmr) ~ n + m + I(m/n) + assort + cluster + as.factor(area) + as.factor(node) + as.factor(geometry) +nodes   +as.factor(regime),x)
reg <- lm(I(100*invstr) ~ n + m + I(m/n) + assort + cluster + as.factor(area) + as.factor(node) + as.factor(geometry) +nodes   +as.factor(regime),x)
reg <- lm(I(100*success) ~ n + m + I(m/n) + assort + cluster + as.factor(area) + as.factor(node) + as.factor(geometry) +nodes   +as.factor(regime),x)

summary(reg)

# testing autocorrelation in regimes

# wide format (currently missing data in Drosophila...)
n1 <- subset(n1, n1$who!="Drosophila1" & n1$who!="Drosophila2")
nx <- n1[,-c(2,3,5,8,9,13)]
# uncomment either of those to check for autocorrelation in a specific regime
#nx <- subset(nx, nx$iteration > 29 & nx$iteration<60)
#nx <- subset(nx, nx$iteration > 60)
a <- reshape(nx, idvar = "newid", timevar = "iteration", direction = "wide")

# ljung-box test -- are there autocorrelations?
logliks <- seq(4, ncol(a), by=5)
successes <- seq(2, ncol(a), by=5)
maps <- seq(3, ncol(a), by=5)
mrs <- seq(5, ncol(a), by=5)
stretches <- seq(6, ncol(a), by=5)

measures <- list(logliks, successes, maps, mrs, stretches)

for (measure in measures) {
  pvalues <- c()
  
  for (i in 1:nrow(a)) {
    pvalues <- c(pvalues, Box.test(ts(t(a[i, measure])), lag = 1, type="Ljung")$p.value)
  }
  
  # Bonferroni correction
  print(sum(pvalues < 0.01/(length(animals)*length(geometries))))
} 
