### mix of discrete and kernel density estimator for marginal distributions
### applicable if data continuous but has some realisations with significant prob mass

### helper funs

# scores for continuous data of mixed case
cont_scores_mix <- function(indata_dense,
                            grid_dense,
                            offset,
                            n_train){

  # save order of data to reverse later
  indata_order <- order(indata_dense)
  indata_dense <- indata_dense[indata_order]

  n_grid <- length(grid_dense)
  b <- n_train
  thresh <- b^-1

  # calculate outlier scores of data
  sample_scores <- rep(0, length(indata_dense))
  pos <- 1
  temp <-1
  for(i in 1:length(indata_dense)){
    temp <- Position(function(x) x > indata_dense[i], grid_dense[pos:n_grid])
    if(is.na(temp)){
      sample_scores[i:length(sample_scores)] <- 0
      break
    } else {
      temp2 <- (1-offset)*((temp+pos-2) / n_grid)
      if(temp2 < thresh){
        sample_scores[i] <- 1
      } else {
        sample_scores[i] <- -log(temp2, base = b)
      }
    }
    pos <- pos + temp - 1
  }

  return(sample_scores[order(indata_order)])
}

### calculate sample score for mix of discrete and continuous data
discr_scores <- function(indata,
                         fit_info){

  # unpack fit_info
  d_tab <- fit_info[["d_tab"]]
  n_train <- fit_info[["n_train"]]

  # allocate vector to contain the sample scores
  sample_scores <- rep(0, length(indata))

  # extract overall percentage of discrete data
  offset <- d_tab[dim(d_tab)[1], 3]

  # find entries in indata which are discrete and calculate score
  c_entries <- c()
  for(i in 1:length(indata)){
    temp_j <- Position(function(x) x == indata[i], d_tab[, 1])
    if(!is.na(temp_j)){
      temp_k <- (d_tab[temp_j, 3]+1-offset)
      if(temp_k < 1/n_train){
        sample_scores[i] <- 1
      } else {
        # choose as base for log n_train, s.t. an event which occurs only once
        # receives a score of 1
        sample_scores[i] <- -log(temp_k, base = n_train)
      }
    } else {
      # first option: there are only discrete values
      # if not: gets overridden in the following
      sample_scores[i] <- 1
      c_entries <- c(c_entries, i)
    }
  }

  if(!is.null(fit_info[["fit"]]) && length(c_entries) > 0){
    # calculate densities of continuous-like data
    indata_dense <- kde1d::dkde1d(indata[c_entries], fit_info[["fit"]])

    sample_scores[c_entries] <- cont_scores_mix(indata_dense = indata_dense,
                                    grid_dense = fit_info[["grid_dense"]],
                                        offset = offset,
                                       n_train = n_train)

  }

  return(sample_scores)
}

### end helper_funs


# fit marginal outlier score function in non-continuous case
marg_discr_fit <- function(indata,
                           discr_thresh,
                           calc_fit,
                           grid_size){
  # convert indata to factor
  indata_tab <- as.matrix(table(factor(indata, levels = unique(indata))))
  indata_tab <- cbind(as.double(row.names(indata_tab)),
                      indata_tab[, 1]/length(indata))


  # fist: find out whether mixed or completely discrete
  if(min(indata_tab[, 2]) > 1/length(indata)){
    full_discr <- T
  } else {
    # percentage of data which is realized only once
    t1 <- length(which(indata_tab[, 2] == 1/length(indata)))/length(indata)
    if(t1 < 0.1){
      full_discr <- T
    } else {
      full_discr <- F
    }
  }

  # mixed case
  if(!full_discr){
    ### for discrete-like data
    # find points which have probability mass > discr_thresh
    d_tab <- matrix(indata_tab[indata_tab[, 2] >= discr_thresh, ], ncol = 2)

    # there must be at least two unique observations left, or none
    if((dim(indata_tab)[1] - dim(d_tab)[1]) == 1){
      d_tab <- indata_tab
      fit <- NULL
      grid_dense <- c()
    } else {

      # find entries in indata which are above threshold
      d_entries <- which(indata %in% d_tab[, 1])

      # check if there is any continuous-like data
      if(length(d_entries) != length(indata)){

        # fit kernel to continuous-like data
        fit <- kde1d::kde1d(indata[setdiff(1:length(indata), d_entries)])

        # determine evaluation grid
        c_prop <- 1 - (length(d_entries)/length(indata))
        new_size <- max(ceiling(c_prop*grid_size), 10)
        grid_dense <- grid_dense_maker(grid_size = new_size,
                                       fit = fit)

        # check if fit to continuous-like data valid
        # if fit not valid, treat as completely discrete
        if(length(grid_dense) == 0){
          d_tab <- indata_tab
          fit <- NULL
          grid_dense <- c()
        }
      }
    }
  } else {
    fit <- NULL
    grid_dense <- c()
    d_tab <- indata_tab
  } #end if(!full_discr)
  rm(indata_tab)

  # fully discrete or mixed failed

  # order (remaining) discrete observations
  d_tab <- matrix(d_tab[order(d_tab[, 2], decreasing = F), ], ncol = 2)
  d_tab <- cbind(d_tab, cumsum(d_tab[, 2]))

  # gather fit info to return
  fit_info <- list("d_flag" = TRUE,
                   "grid_dense" = grid_dense,
                   "fit" = fit,
                   "d_tab" = d_tab,
                   "n_train" = length(indata))

  # calculate sample scores if requested
  if(calc_fit){
    fit_info[["sample_scores"]] <- discr_scores(indata = indata,
                                    fit_info = fit_info)
  } else {
    fit_info[["sample_scores"]] <- c()
  }

  return(fit_info)
}


