#include <Rcpp.h>
#include <RcppEigen.h>
#include <RcppParallel.h>
#include <algorithm>
#include <chrono>
#include <ctime>
#include <limits>
#include <numeric>
#include <vector>


using namespace Rcpp;
using namespace RcppParallel;
using namespace Eigen;
#include <unistd.h>
#include<iostream>
#include <sys/unistd.h>
// #define gettid() syscall(__NR_gettid)

// Plain containers of doubles: Vec is a flat sequence, Mat is a vector of Vec.
using Vec = std::vector<double>;
using Mat = std::vector<Vec>;


// Copies a data frame into a nested vector indexed as out[row][column].
// Every column is read through a NumericVector, so it must be convertible
// to one.
Mat dataframe_to_mat(DataFrame df) {
  const int row_count = df.nrows();
  const int col_count = df.size();

  Mat out(row_count, Vec(col_count));
  for (int c = 0; c < col_count; ++c) {
    NumericVector column = df[c];
    int r = 0;
    for (Vec& row : out) {
      row[c] = column[r];
      ++r;
    }
  }
  return out;
}
// Copies an R numeric matrix, element by element, into a newly allocated
// Eigen::MatrixXd with the same number of rows and columns.
Eigen::MatrixXd mat_to_Eimatrix(NumericMatrix df) {
  const int row_count = df.nrow();
  const int col_count = df.ncol();

  Eigen::MatrixXd out(row_count, col_count);
  int c = 0;
  while (c < col_count) {
    for (int r = 0; r != row_count; ++r) {
      out(r, c) = df(r, c);
    }
    ++c;
  }
  return out;
}
// Gaussian product-kernel weights for the rows of a data frame.
//
// With n rows and d columns, each column j gets the bandwidth
//   b[j] = 1.06 * sd_j * n^(-1 / (4 + d)),
// where sd_j is the sample standard deviation (n - 1 in the denominator).
// The weight returned for row i is
//   exp(-0.5 * mean_j((x[i][j] / b[j])^2)) / ((2 * pi)^(d / 2) * prod_j b[j]).
// Rows are scaled as given; they are not centred first.
//
// The variance is computed in two passes (mean first, then squared
// deviations) so that it cannot turn negative through cancellation.
// With a single row the sample variance is 0 / 0 and the weights are NaN.
//
// @param x_df Data frame whose columns can all be read as numeric vectors.
// @return One weight per row of x_df; an empty vector when x_df has no rows.
// [[Rcpp::export]]
Vec kernels(DataFrame x_df) {
  const Mat x = dataframe_to_mat(x_df);
  if (x.empty()) {
    // Nothing to weight; also avoids reading x[0] below.
    return Vec();
  }

  const int n = static_cast<int>(x.size());
  const int d = static_cast<int>(x[0].size());

  // Per-column bandwidths and their product.
  Vec b(d, 0.0);
  double b_prod = 1.0;
  for (int j = 0; j < d; ++j) {
    double sum = 0.0;
    for (int i = 0; i < n; ++i) {
      sum += x[i][j];
    }
    const double mean = sum / n;

    double sq_dev = 0.0;
    for (int i = 0; i < n; ++i) {
      const double dev = x[i][j] - mean;
      sq_dev += dev * dev;
    }

    b[j] = 1.06 * std::sqrt(sq_dev / (n - 1)) * std::pow(n, -1.0 / (4.0 + d));
    b_prod *= b[j];
  }

  // Normalising factor shared by every row.
  const double constant = 1.0 / (std::pow(2 * M_PI, d / 2.0) * b_prod);

  Vec q(n, 0.0);
  for (int i = 0; i < n; ++i) {
    double xx = 0.0;
    for (int j = 0; j < d; ++j) {
      const double scaled = x[i][j] / b[j];
      xx += scaled * scaled / d;  // averaged over the d columns
    }
    q[i] = constant * std::exp(-xx / 2.0);
  }
  return q;
}


// Eigen version of the Gaussian product-kernel weights.
//
// Takes an n x d matrix (KernelWorker passes the differences between one
// query point and every observation) and returns one weight per row:
//   exp(-0.5 * mean_j((x(i, j) / b[j])^2)) / ((2 * pi)^(d / 2) * prod_j b[j]),
// where b[j] = 1.06 * sample sd of column j * n^(-1 / (4 + d)).
VectorXd kernels_(const MatrixXd& x) {
  const int row_count = x.rows();
  const int dim = x.cols();

  // Column-wise bandwidths.
  const VectorXd col_means = x.colwise().mean();
  VectorXd bandwidth(dim);
  for (int c = 0; c < dim; ++c) {
    VectorXd column = x.col(c);
    double sample_var = (column.array() - col_means[c]).square().sum() / (row_count - 1);
    bandwidth[c] = 1.06 * std::sqrt(sample_var) * std::pow(row_count, -1.0 / (4.0 + dim));
  }

  // Normalising factor: 1 / ((2 * pi)^(d / 2) * product of the bandwidths).
  const double bandwidth_product = bandwidth.prod();
  const double norm = 1.0 / (std::pow(2 * M_PI, dim / 2.0) * bandwidth_product);

  // Row-wise sum of the squared, bandwidth-scaled entries divided by d.
  MatrixXd scaled = x.array().rowwise() / bandwidth.transpose().array();
  MatrixXd scaled_sq = scaled.array().square() / dim;
  VectorXd weights = scaled_sq.rowwise().sum();

  weights = (-(weights / 2.0).array()).exp() * norm;
  return weights;
}


struct KernelWorker : public Worker {
  const NumericMatrix& x_df;               
  const NumericMatrix& kernel_X_df;       
  const NumericVector& y_df;               
  std::vector<Vec>& results;  
  Vec& y_estimates;          
  std::vector<std::string>& log;  
  

  KernelWorker(const NumericMatrix& x_df, const NumericMatrix& kernel_X_df, const NumericVector& y_df,
               std::vector<Vec>& results, Vec& y_estimates, std::vector<std::string>& log)//
    : x_df(x_df), kernel_X_df(kernel_X_df), y_df(y_df), results(results), y_estimates(y_estimates), log(log) {}//
  

  void operator()(std::size_t begin, std::size_t end) {

    auto start = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());

    int nrows = x_df.nrow();
    int ncols = x_df.ncol();
    
    Eigen::Map<const Eigen::MatrixXd>  x(Eigen::Map<const Eigen::MatrixXd>(x_df.begin(), nrows, ncols));
    // Eigen::Map<const Eigen::MatrixXd> x = as<Eigen::Map<const Eigen::MatrixXd> >(x_df);
    // MatrixXd kernel_X = mat_to_Eimatrix(kernel_X_df);
    nrows = kernel_X_df.nrow();
    ncols = kernel_X_df.ncol();
    
    Eigen::Map<const Eigen::MatrixXd>  kernel_X(Eigen::Map<const Eigen::MatrixXd>(kernel_X_df.begin(), nrows, ncols));
    
    Eigen::Map<const Eigen::VectorXd> y(y_df.begin(), y_df.size());
    
    for (std::size_t i = begin; i < end; ++i) {
      MatrixXd X_x = kernel_X.row(i).replicate(x.rows(), 1) - x;
      VectorXd K = kernels_(X_x);
      
      double Ksum = K.sum();
      double k_y = 0.0;
      if (Ksum != 0) {
        K /= Ksum;
        k_y = K.dot(y);
      }
      
      // local_y_estimates[i - begin] = k_y;
      // local_results[i - begin] = kernel_X.row(i);
      y_estimates[i] = k_y;
      // results[i] = kernel_X.row(i);
      for (std::size_t j = 0; j < x.cols(); ++j) {
        results[i][j] = kernel_X_df(i, j);
      }
      
    }
    

  }
};

// Kernel-weighted estimate of y at each row of kernel_X_df, evaluated in
// parallel with KernelWorker.
//
// @param x_df        Numeric matrix of observations, one row each.
// @param y_df        Responses; must have one element per row of x_df.
// @param kernel_X_df Query points; must have as many columns as x_df.
// @return Data frame built from an unnamed list holding the columns of
//         kernel_X_df followed by one column with the estimates.
// Raises an R error when the dimensions above do not agree. The check is
// done here, before any worker thread is started.
// [[Rcpp::export]]
DataFrame get_pre_y(NumericMatrix x_df, NumericVector y_df, NumericMatrix kernel_X_df) {
  if (x_df.ncol() != kernel_X_df.ncol()) {
    stop("x_df and kernel_X_df must have the same number of columns.");
  }
  if (y_df.size() != x_df.nrow()) {
    stop("y_df must have one element per row of x_df.");
  }

  const std::size_t kernel_X_nrows = kernel_X_df.nrow();
  const std::size_t kernel_X_ncols = kernel_X_df.ncol();

  // Thread-shared output buffers, sized up front so workers never resize them.
  std::vector<Vec> results(kernel_X_nrows, Vec(kernel_X_ncols));
  Vec y_estimates(kernel_X_nrows);
  std::vector<std::string> log(kernel_X_nrows);

  KernelWorker worker(x_df, kernel_X_df, y_df, results, y_estimates, log);
  parallelFor(0, kernel_X_nrows, worker);

  // One list slot per query column plus a final slot for the estimates.
  const R_xlen_t n_cols = static_cast<R_xlen_t>(kernel_X_ncols);
  List result_list(n_cols + 1);
  for (R_xlen_t j = 0; j < n_cols; ++j) {
    NumericVector col(kernel_X_nrows);
    for (std::size_t i = 0; i < kernel_X_nrows; ++i) {
      col[i] = results[i][j];
    }
    result_list[j] = col;
  }
  result_list[n_cols] = wrap(y_estimates);

  return DataFrame(result_list);
}


// Returns a zero-filled vector with one entry per row of kernel_X_df.
//
// x_df and y_df are accepted but never read, and no estimate is computed.
// NOTE(review): this looks like a placeholder for a serial counterpart of
// get_pre_y; confirm before relying on the returned values.
// [[Rcpp::export]]
Vec get_pre_y_(NumericMatrix x_df, NumericVector y_df, NumericMatrix kernel_X_df) {
  const std::size_t kernel_X_nrows = kernel_X_df.nrow();
  return Vec(kernel_X_nrows);
}

// Expands a data frame with columns "Y", "X1" and "X2" into all n * n
// pairings of a covariate row with a response value.
//
// Output row i * n + j holds X1[i], X2[i] and Y[j], where n is the length of
// column "Y".
//
// @param data Data frame containing numeric columns "Y", "X1" and "X2".
// @return Data frame with columns "X1", "X2" and "Y" and n * n rows.
// Raises an R error when n * n does not fit in an int, instead of
// overflowing the row count.
// [[Rcpp::export]]
DataFrame generate_kx(DataFrame data) {
  NumericVector Y = data["Y"];
  NumericVector X1 = data["X1"];
  NumericVector X2 = data["X2"];

  const int n = Y.size();
  if (n > 0 && n > std::numeric_limits<int>::max() / n) {
    stop("generate_kx: the n * n output would exceed the maximum supported number of rows.");
  }
  const int total_size = n * n;

  NumericVector result_X1(total_size);
  NumericVector result_X2(total_size);
  NumericVector result_Y(total_size);

  // Block i repeats the i-th covariates n times next to a full copy of Y.
  for (int i = 0; i < n; ++i) {
    const int offset = i * n;
    std::fill_n(result_X1.begin() + offset, n, X1[i]);
    std::fill_n(result_X2.begin() + offset, n, X2[i]);
    std::copy(Y.begin(), Y.end(), result_Y.begin() + offset);
  }

  DataFrame kx = DataFrame::create(_["X1"] = result_X1, _["X2"] = result_X2, _["Y"] = result_Y);
  return kx;
}

// Expands a data frame with columns "Y", "Age" and "Eduyear" into all n * n
// pairings of a covariate row with a response value.
//
// Output row i * n + j holds Age[i], Eduyear[i] and Y[j], where n is the
// length of column "Y".
//
// @param data Data frame containing numeric columns "Y", "Age" and "Eduyear".
// @return Data frame with columns "Age", "Eduyear" and "Y" and n * n rows.
// Raises an R error when n * n does not fit in an int, instead of
// overflowing the row count.
// [[Rcpp::export]]
DataFrame generate_kx1(DataFrame data) {
  NumericVector Y = data["Y"];
  NumericVector Age = data["Age"];
  NumericVector Eduyear = data["Eduyear"];

  const int n = Y.size();
  if (n > 0 && n > std::numeric_limits<int>::max() / n) {
    stop("generate_kx1: the n * n output would exceed the maximum supported number of rows.");
  }
  const int total_size = n * n;

  NumericVector result_Age(total_size);
  NumericVector result_Eduyear(total_size);
  NumericVector result_Y(total_size);

  // Block i repeats the i-th covariates n times next to a full copy of Y.
  for (int i = 0; i < n; ++i) {
    const int offset = i * n;
    std::fill_n(result_Age.begin() + offset, n, Age[i]);
    std::fill_n(result_Eduyear.begin() + offset, n, Eduyear[i]);
    std::copy(Y.begin(), Y.end(), result_Y.begin() + offset);
  }

  DataFrame kx = DataFrame::create(_["Age"] = result_Age, _["Eduyear"] = result_Eduyear, _["Y"] = result_Y);
  return kx;
}

// Element-wise (o1 * o2) / o3 for three numeric vectors of equal length.
// Raises an R error when the lengths differ or when o3 contains a zero.
// [[Rcpp::export]]
NumericVector vector_operation(NumericVector o1, NumericVector o2, NumericVector o3) {
  const bool same_length = (o1.size() == o2.size()) && (o2.size() == o3.size());
  if (!same_length) {
    stop("All vectors must have the same length.");
  }

  const bool has_zero_divisor = is_true(any(o3 == 0));
  if (has_zero_divisor) {
    stop("Division by zero encountered in o3.");
  }

  NumericVector quotient = (o1 * o2) / o3;
  return quotient;
}

// Processes the rows of ii in consecutive groups of nrow_data rows.
//
// For row k of a group (0-based within the group) two values are produced:
//   (ii(row, 0) * ii(row, 1)) / pr_A_S_1_given_X[k]
//   (ii(row, 0) * ii(row, 2)) / pr_A_S_0_given_X[k]
// Results of all groups are concatenated in group order. Rows left over
// after the last complete group are ignored. A progress line is printed to
// Rcout after every 200 groups.
//
// @param ii               Numeric matrix with at least three columns.
// @param pr_A_S_1_given_X Divisors for the first result; only the first
//                         nrow_data elements are used.
// @param pr_A_S_0_given_X Divisors for the second result; only the first
//                         nrow_data elements are used.
// @param nrow_data        Number of rows per group; must be positive.
// @return List with elements "pr_Y_given_A_star_1_X_all" and
//         "pr_Y_given_A_star_0_X_all", each of length
//         (ii.nrow() / nrow_data) * nrow_data.
// Raises an R error for a non-positive nrow_data and, when at least one
// complete group exists, for too few columns in ii or too few divisors.
// [[Rcpp::export]]
List fast_vector_operations(NumericMatrix ii, 
                            NumericVector pr_A_S_1_given_X, 
                            NumericVector pr_A_S_0_given_X, 
                            int nrow_data) {
  if (nrow_data <= 0) {
    stop("nrow_data must be a positive integer.");
  }

  const int groups = ii.nrow() / nrow_data;
  if (groups > 0) {
    if (ii.ncol() < 3) {
      stop("ii must have at least three columns.");
    }
    if (pr_A_S_1_given_X.size() < nrow_data || pr_A_S_0_given_X.size() < nrow_data) {
      stop("pr_A_S_1_given_X and pr_A_S_0_given_X must have at least nrow_data elements.");
    }
  }

  // groups * nrow_data never exceeds ii.nrow(), so it fits in an int.
  const int total = groups * nrow_data;
  NumericVector result_1(total);
  NumericVector result_2(total);

  int out = 0;
  for (int j = 0; j < groups; ++j) {
    const int start_idx = j * nrow_data;
    for (int k = 0; k < nrow_data; ++k, ++out) {
      const int row = start_idx + k;
      const double o_1 = ii(row, 0);
      result_1[out] = (o_1 * ii(row, 1)) / pr_A_S_1_given_X[k];
      result_2[out] = (o_1 * ii(row, 2)) / pr_A_S_0_given_X[k];
    }

    if ((j + 1) % 200 == 0) {
      // std::endl is kept on purpose so progress shows up immediately.
      Rcpp::Rcout << (j + 1) << " groups processed" << std::endl;
    }
  }

  return List::create(
    Named("pr_Y_given_A_star_1_X_all") = result_1,
    Named("pr_Y_given_A_star_0_X_all") = result_2
  );
}