# Estimators

anderson_rubin_test <- function(X, I, Y, beta, rank){

  n <- nrow(X)
  d <- ncol(X)
  
  P <- I %*% solve(t(I) %*% I, t(I))
  res <- matrix(Y - X %*% beta, ncol=1)

  tstat <- (t(res) %*% P %*% res)/(t(res) %*% (diag(n) - P) %*% res)*(n-rank)/rank
  return(pf(tstat, df1=rank, df2=n-rank, lower.tail=FALSE))
}


spaceIV_estimator_inner <- function(X, I, Y, size=2, use_liml=FALSE){
  n <- nrow(X)
  m <- ncol(I)
  d <- ncol(X)

  # normalize
  covIX <- cov(I, X)
  covIY <- cov(I, Y)
  
  # size cannot be larger than min(ncol(X), ncol(I))
  size <- min(c(size, d, ncol(I)))

  # all subsets
  subsets <- combn(1:d, size, simplify=FALSE)
  loss <- rep(NA, length(subsets))
  ## Pre-compute P_I
  if(use_liml){
    P_I <- I%*%solve(t(I)%*%I)%*%t(I)
  }
  for(i in 1:length(subsets)){
    S <- subsets[[i]]
    beta_hat <- matrix(0, nrow=d, ncol=1)
    if(use_liml){
      beta_hat[S] <- liml(X[,S,drop=F], Y, P_I, n)
      R  <- matrix(Y - X[,S,drop=F] %*% beta_hat[S], ncol=1)
      loss[i] <- (t(R) %*% P_I %*% R)/(t(R) %*% (diag(n) - P_I) %*% R)*(n-m)/m
    }
    else{
      beta_hat[S] <- coefficients(lm.fit(covIX[,S,drop=F], covIY))
      loss[i] <- sum((covIX %*% beta_hat - covIY)^2)
    }
  }

  # recompute beta for minimal loss
  S <- subsets[[which.min(loss)]]
  beta_hat <- matrix(0, nrow=d, ncol=1)  
  if(use_liml){
    beta_hat[S] <- liml(X[,S,drop=F], Y, P_I, n)
  }
  else{
    beta_hat[S] <- coefficients(lm.fit(covIX[,S,drop=F], covIY))
  }
  
  return(beta_hat)
}


spaceIV_estimator <- function(X, I, Y, max_size=2,
                               alpha=0.05, use_liml=FALSE){
  n <- nrow(X)
  m <- ncol(I)
  d <- ncol(X)

  # run cross-validation fits
  pval_mat <- matrix(NA, B, max_size)
  size <- 0
  accepted <- FALSE
  while(size < max_size & !accepted){
    size <- size + 1
    beta_hat <- spaceIV_estimator_inner(X, I, Y, size=size, use_liml=use_liml)
    # perform test
    pval <- anderson_rubin_test(X, I, Y, beta_hat, m)
    accepted <- pval >= alpha
  }

  return(list(beta_hat=beta_hat,
              pval=pval))
}


l0_OLS_estimator_inner <- function(X, Y, size=2){
  n <- nrow(X)
  d <- ncol(X)

  # size cannot be larger than min(ncol(X), ncol(I))
  size <- min(c(size, d, ncol(I)))

  # all subsets
  subsets <- combn(1:d, size, simplify=FALSE)
  loss <- rep(NA, length(subsets))
  for(i in 1:length(subsets)){
    S <- subsets[[i]]
    beta_hat <- matrix(0, nrow=d, ncol=1)
    beta_hat[S] <- coefficients(lm.fit(X[,S,drop=F], Y))
    loss[i] <- sum((X %*% beta_hat - Y)^2)
  }

  # recompute beta for minimal loss
  S <- subsets[[which.min(loss)]]
  beta_hat <- matrix(0, nrow=d, ncol=1)
  modfit <- lm(Y ~ -1 + X[,S,drop=F])
  beta_hat[S] <- coefficients(modfit)

  return(list(beta_hat=beta_hat,
              modfit=modfit))
}

l0_OLS_estimator <- function(X, Y, max_size=2){
  n <- nrow(X)
  d <- ncol(X)

  # iterate over sparsity
  pval_mat <- matrix(NA, B, max_size)
  size <- 0
  improve <- TRUE
  aic <- Inf
  while(size < max_size & improve){
    size <- size + 1
    bestfit <- l0_OLS_estimator_inner(X, Y, size=size)
    # compute AIC
    aic_tmp <- AIC(bestfit$modfit)
    if(aic_tmp < aic){
      aic <- aic_tmp
    }
    else{
      improve <- FALSE
    }
  }
  return(list(beta_hat=bestfit$beta_hat,
              aic=aic))
}
