
#ifndef CLUPIG_H
#define CLUPIG_H

#include "header.h"

/**
 * Clustering object configured by a set of public tuning parameters and
 * driven through the fit / *_dnp / *_knn entry points declared below.
 *
 * Typical visible usage: construct with the data size, call set_params()
 * (which also derives the internal FHT dimension), then call one of the
 * fitting routines. Results are exposed through `labels` and `n_clusters`.
 *
 * NOTE(review): the class defines a destructor but no copy/move operations;
 * copies are therefore member-wise. Confirm this is intended.
 */
class clupig {

private:
    int n_rotate_ = 3;

    // FHT dimension (a power of two). Derived in set_params(); stays 0 until
    // set_params() has been called, so it is never read uninitialized.
    int fhtDim_ = 0;

    /**
     * Smallest power of two that is >= n.
     *
     * Integer-only replacement for `1 << int(ceil(log2(n)))`, which has
     * undefined behaviour for n <= 0 (log2 yields -inf/NaN) and overflows for
     * n > 2^30. Returns 1 for n <= 1 and saturates at 2^30.
     */
    static int next_pow2_(int n){
        const int largest = 1 << 30; // largest power of two representable in int
        int pow2 = 1;
        while (pow2 < n && pow2 < largest)
            pow2 <<= 1;
        return pow2;
    }

protected:

    int n_points;   // number of points, fixed at construction
    int n_features; // number of features per point, fixed at construction

public:

    int n_proj = 1024;
    int top_s = 5;
    int top_m = 50;

    // Used on DNP to cut off neighbors that are too far away when inserting into the priority queue; default is false.
    // This is to reduce the number of neighbors to be extended, and improve efficiency.
    // If it is true, then we only extend neighbors to min(neighborSize, c * minPts),
    // where neighborSize is the current number of neighbors found, and c is a constant (default 1).
    bool propagation_cutoff = false;

    // Minimum cluster size for cluster expansion - we always expand a cluster if it does not have min_cluster_size points.
    // When it has enough points, we only expand if dist(Xi, Xj) < kNN_dist(Xi) + kNN_dist(Xj),
    // to ensure the cluster spreads slowly from dense regions to sparse regions.
    // This is to avoid large clusters that cover most points.
    int min_cluster_size = 50;

    // Top projection vectors for each layer, should be >= top_s.
    // This parameter is not very important, as data is often clustered in a few directions so the majority of buckets are dense (> top_m).
    // After inserting a point into top_p^2 buckets, top_m plays the key role as we only keep top_m points in each bucket.
    // Hence, setting a larger top_p will not help much but will slow down the process.
    int top_p = top_s;

    int n_threads = 8;

    float ker_sigma = 1.0; // Only used for Fourier features of L1 and L2
    int ker_n_features = 1024; // Only used for Fourier features of L1 and L2

    float ker_intervalSampling = 0.4; // Only used for interval sampling of Chi2 and JS

    bool verbose = false;
    string output;

    string distance = "Cosine";
    int seed = -1; // -1 is random seed

private:

    // 1 layer random sign for FHT
    boost::dynamic_bitset<> bitHD_;

    // 2 layers of random sign for FHT
    boost::dynamic_bitset<> bitHD1_;
    boost::dynamic_bitset<> bitHD2_;

    // 1 layer random Gaussian
    MatrixXf matrix_G_;

    // 2 layers random Gaussian
    MatrixXf matrix_G1_;
    MatrixXf matrix_G2_;

    // Random Gaussian matrix for Fourier embedding on L1 and L2.
    // Chi^2 and JS do not need it as their embeddings are deterministic.
    MatrixXf matrix_R_;

    MatrixXi matrix_top_s_; // For each point (each col), keep topK closest/furthest random vectors
    MatrixXi matrix_top_m_; // For each random vector (each col), keep topM closest/furthest points

    // Data structures of DNP
    vector< vector< pair<int, float> > > vec2D_NeighborDist_; // vector of approx neighborhoods and their distances

public:

    RowMajorMatrixXf matrix_X; // public as we will have to load data into it (when data is big)

    // Clustering's output
    IVector labels;
    int n_clusters = 0;

    // vector<IVector> indices_;
    // vector<FVector> distances_;
    // vector<int> flat_indices_;
    // vector<float> flat_distances_;
    // vector<int> flat_offset_;
    // MatrixXi matrix_flat_indices_; // for fast access to flat indices
    // MatrixXf matrix_flat_distances_;
    //    const Eigen::Matrix<int, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>& get_indices() const {
    //        return matrix_flat_indices_;
    //    }
    //    const Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>& get_distances() const {
    //        return matrix_flat_distances_;
    //    }

    /**
     * @param n number of points
     * @param d number of features per point
     */
    clupig(int n, int d)
        : n_points(n), n_features(d)
    {
    }

    /**
     * Sets all tuning parameters in one call and derives the FHT dimension.
     *
     * @param numProj     stored in n_proj
     * @param s           stored in top_s
     * @param m           stored in top_m
     * @param p           stored in top_p
     * @param dist        stored in distance; the value "Cosine" is special-cased (see below)
     * @param kDim        stored in ker_n_features, unless dist is "Cosine"
     * @param kSigma      stored in ker_sigma
     * @param kSam        stored in ker_intervalSampling
     * @param ver         stored in verbose
     * @param numThreads  forwarded to set_threads() (<= 0 selects the OpenMP maximum)
     * @param randomSeed  stored in seed (-1 is random seed)
     * @param filename    stored in output
     *
     * For "Cosine" the embedding dimension is forced to n_features, so kDim is
     * ignored. In every case fhtDim_ becomes the smallest power of two that is
     * >= max(n_proj, ker_n_features).
     */
    void set_params(int numProj = 1024, int s = 5, int m = 50, int p = 5, string dist = "Cosine",
                   int kDim = 1024, float kSigma = 1.0, float kSam = 0.4,
                   bool ver = false, int numThreads = 8, int randomSeed = -1, string filename = ""){
        n_proj = numProj;
        top_s = s;
        top_m = m;
        top_p = p;
        distance = dist;
        ker_n_features = kDim;
        ker_sigma = kSigma;
        ker_intervalSampling = kSam;
        verbose = ver;

        set_threads(numThreads);

        seed = randomSeed;
        output = filename;

        // NOTE(review): this resets min_cluster_size to its default and thereby
        // discards any earlier set_min_cluster_size() call - confirm intended.
        min_cluster_size = 50;

        // Cosine works on the raw features; the other distances use a kernel
        // embedding of ker_n_features dimensions.
        if (distance == "Cosine")
            ker_n_features = n_features;

        // The FHT must cover both the projection count and the (embedded) input dimension.
        const int required_dim = (n_proj <= ker_n_features) ? ker_n_features : n_proj;
        fhtDim_ = next_pow2_(required_dim);
    }



    /**
     * Drops the clustering output and all internal index structures.
     * matrix_X and the tuning parameters are left untouched.
     */
    void clear(){

        n_clusters = 0;
        labels.clear();

        // for (int n = 0; n < n_points; ++n)
        // {
        //     indices_[n].clear();
        //     distances_[n].clear();
        //     vec2D_NeighborDist[n].clear();
        // }
        //
        // flat_indices_.clear();
        // flat_distances_.clear();
        // flat_offset_.clear();
        // matrix_flat_indices_.resize(0, 0);
        // matrix_flat_distances_.resize(0, 0);
        // indices_.clear();
        // distances_.clear();

        vec2D_NeighborDist_.clear(); // vector of approx neighborhoods and their distances

        bitHD_.clear();
        bitHD1_.clear();
        bitHD2_.clear();

        matrix_G_.resize(0, 0);
        matrix_G1_.resize(0, 0);
        matrix_G2_.resize(0, 0);

        matrix_R_.resize(0, 0); // Random matrix for Fourier features (L1, L2)

        matrix_top_s_.resize(0, 0); // For each point (each col), keep topK closest/furthest random vectors
        matrix_top_m_.resize(0, 0); // For each random vector (each col), keep topM closest/furthest points

    }

    // Empties the data matrix and then everything clear() covers.
    ~clupig(){
        matrix_X.resize(0, 0);
        clear();
    }


    // Simple setters for individual parameters.
    // Note: none of them recompute fhtDim_; only set_params() does.
    void set_top_m(int m){ top_m = m; }
    void set_top_s(int s){ top_s = s; }
    void set_top_p(int p){ top_p = p; }
    // Takes a float for backward compatibility; the value is truncated toward zero.
    void set_min_cluster_size(float s){ min_cluster_size = static_cast<int>(s); }
    void set_proj(int p){ n_proj = p; }
    void set_propagation_cutoff(bool b){ propagation_cutoff = b; }

    // Sets the thread count; any t <= 0 selects omp_get_max_threads().
    void set_threads(int t)
    {
        n_threads = (t > 0) ? t : omp_get_max_threads();
    }

    // Placeholder for released version: DNP, DBSCAN, LPA...
    void fit(const Ref<const RowMajorMatrixXf> &, const string& , int , float);
    void fit_from_file(const string&, const string& , const string& , const string& , int , float);
    void fit_from_knn(const Ref<const RowMajorMatrixXi> & , const Ref<const RowMajorMatrixXf> & , const string& , const string& , int , float);

    // BF symmetric kNN construction and DNP
    void brute_knn_dnp(const Ref<const RowMajorMatrixXf> &, int, float=1.0); // will add

    // 1 layer for small data sets
    void ceos_minmax_dnp(const Ref<const RowMajorMatrixXf> &, int, float=1.0);
    void ceos1_dnp_from_file(const string&, int, float=1.0); // need to load data from file

    // 2 layers for million-point datasets
    void ceos2_dnp(const Ref<const RowMajorMatrixXf> &, int, float=2.0); // 2 layers
    void ceos2_dnp_from_file(const string&, int, float = 2.0); // 2 layers

    // TODO: density_propagation_from_knn_graph(knn_indices, knn_distances, density_threshold, graph_type='symmetric')
    // DNP with precomputed kNN
    void dnp_from_knn(const Ref<const RowMajorMatrixXi> & , const Ref<const RowMajorMatrixXf> & , int , float );

    // Find approx kNN using CEOs2
    tuple<MatrixXi, MatrixXf> brute_knn_from_file(const string&, int);
    tuple<MatrixXi, MatrixXf> ceos2_knn_from_file(const string&, int);

    // For testing
    void ceos_minmax_dnp_multi_k(const Ref<const RowMajorMatrixXf> & , int , int, float=1.0);
    void ceos_minmax_dnp_multi_k_from_file(const string& , int , int, float=2.0 );
    void ceos2_dnp_multi_k_from_file(const string& , int , int, float=2.0 );



private:

    // 1 layer using top-s closest and farthest vectors (2 * top-s * top-m distances per point)
    void fht_index_minmax_();
    void gauss_index_minmax_();

    // 1 layer using top-s closest among 2D random vector (top-s * top-m distances per point)
    void fht_index1_();

    // 2 layers
    void fht_index2_();
    void gauss_index2_();

    // bruteforce symmetric kNNG construction
    void bf_sym_Gk_(int);
    void ceos_minmax_sym_Gsm_(); // this will be used with fht_index0 and gauss_index0 as they use (2 * top-s) x n_point matTopS
    void ceos_sym_Gsm_(); // This will be used with fht_index1, fht_index2 and gauss_index2 as they use top-s x n_point matTopS
    void dnp_(int, float=2.0);

    tuple<MatrixXi, MatrixXf> ceos2_kNN_(int);
    tuple<MatrixXi, MatrixXf> ceos2_kNN_temp_(int);

};


#endif // CLUPIG_H
