#include "clupig.h"
#include "utilities.h"
#include <queue>
#include <algorithm>
#include <iterator>
#include <fstream>

/**
 *
 * @param: MATRIX_X : Eigen matrix X
 * @param knn_alg : Algorithm for approximate kNN
 * @param graph_type : Type of graph (default symmetric kNN)
 * @param propagation_alg : Propagation algorithm: DNP or DBSCAN
 * @param k
 * @param c
 */
void clupig::fit(const Ref<const RowMajorMatrixXf> & MATRIX_X, const string& knn_alg, int k, float c)
{
    // Step 1: Copy data, check support distance
    if (verbose)
    {
        cout << "k: " << k << endl;

        cout << "n_points: " << n_points << endl;
        cout << "n_features: " << n_features << endl;
        cout << "n_proj: " << n_proj << endl;
        cout << "top_s: " << top_s << endl;
        cout << "top_m: " << top_m << endl;
        cout << "top_p: " << top_p << endl;

        cout << "distance: " << distance << endl;
        cout << "kernel features: " << ker_n_features << endl;
        cout << "sigma: " << ker_sigma << endl;
        cout << "interval sampling: " << ker_intervalSampling << endl;
        cout << "n_threads: " << n_threads << endl;
    }

    // omp_set_dynamic(0);     // Explicitly disable dynamic teams
    omp_set_num_threads(n_threads);

    chrono::steady_clock::time_point begin, local_begin;
    begin = chrono::steady_clock::now();
    matrix_X = MATRIX_X;
    transformData(matrix_X, distance);

    if (verbose)
        cout << "Copy data and check supporting distance time = " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - begin).count() << "[ms]" << endl;

    // Step 2: kNN graph construction
    begin = chrono::steady_clock::now();
    if (knn_alg == "brute") {
        local_begin = chrono::steady_clock::now();
        bf_sym_Gk_(k);

        if (verbose)
            cout << "Bruteforce kNN graph construction time = " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - local_begin).count() << "[ms]" << endl;

    }
    else if (knn_alg == "ceos_minmax") {
        local_begin = chrono::steady_clock::now();
        fht_index_minmax_();
        if (verbose)
            cout << "Build index time = " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - local_begin).count() << "[ms]" << endl;

        local_begin = chrono::steady_clock::now();
        ceos_minmax_sym_Gsm_();
        if (verbose)
            cout << "Find neighborhoods and distance time = " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - local_begin).count() << "[ms]" << endl;
    }
    else { // default is ceo2
        local_begin = chrono::steady_clock::now();
        fht_index2_();
        if (verbose)
            cout << "Build index time = " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - local_begin).count() << "[ms]" << endl;

        // Find core point
        local_begin = chrono::steady_clock::now();
        ceos_sym_Gsm_();
        if (verbose)
            cout << "Find neighborhoods and distance time = " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - local_begin).count() << "[ms]" << endl;
    }

    // Step 3: Propagation
    begin = chrono::steady_clock::now();
    dnp_(k, c);
    if (verbose)
        cout << "Run DNP time = " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - begin).count() << "[ms]" << endl;
}

/**
 * Placeholder for fitting directly from a data set stored on disk.
 * The body is currently empty: calling it has no effect and none of the arguments are read.
 *
 * @param dataset : filename containing dataset of n x d (unused for now)
 * @param knn_alg : Algorithm for approximate kNN (unused for now)
 * @param graph_type : Type of graph (default symmetric kNN) (unused for now)
 * @param propagation_alg : Propagation algorithm: DNP or DBSCAN (unused for now)
 * @param k : unused for now
 * @param c : unused for now
 */
void clupig::fit_from_file(const string& dataset, const string& knn_alg, const string& graph_type, const string& propagation_alg, int k, float c)
{
    // Empty body: this entry point performs no work yet.

}

/**
 * Placeholder for fitting from a precomputed kNN result.
 * The body is currently empty: calling it has no effect and none of the arguments are read.
 *
 * @param matIndices : integer matrix of neighbour indices (unused for now)
 * @param matDistances : float matrix of neighbour distances (unused for now)
 * @param graph_type : unused for now
 * @param propagation_alg : unused for now
 * @param k : unused for now
 * @param c : unused for now
 */
void clupig::fit_from_knn(const Ref<const RowMajorMatrixXi> & matIndices, const Ref<const RowMajorMatrixXf> & matDistances,
    const string& graph_type, const string& propagation_alg, const int k, const float c)
{
    // Empty body: this entry point performs no work yet.

}


/**
 * Compute exact kNN with a brute-force scan, using the distance configured on clupig.
 *
 * The data set is read with loadbinData(), which receives n_points, n_features and matrix_X
 * as arguments. Every point is then compared against every other point (n^2 distance
 * computations, parallelised over the outer loop with OpenMP).
 *
 * In verbose mode the two result matrices are additionally dumped to
 * "<distance>_k_<k>_indices.bin" and "<distance>_k_<k>_distances.bin" in the working directory.
 *
 * @param dataset : path handed to loadbinData()
 * @param k : number of neighbours kept per point; expected to be >= 1
 * @return tuple (indices, distances), both of size n_points x k. Row n lists the neighbours of
 *         point n starting with the closest one. If a point has fewer than k candidates
 *         (k >= n_points), the unused trailing entries keep their initial value -1.
 */
tuple<MatrixXi, MatrixXf> clupig::brute_knn_from_file(const string& dataset, const int k)
{
    if (verbose)
    {
        cout << "n_points: " << n_points << endl;
        cout << "n_features: " << n_features << endl;
        cout << "n_proj: " << n_proj << endl;

        cout << "top_s: " << top_s << endl;
        cout << "top_m: " << top_m << endl;
        cout << "top_p: " << top_p << endl;
        cout << "distance: " << distance << endl;
        cout << "kernel features: " << ker_n_features << endl;
        cout << "sigma: " << ker_sigma << endl;
        cout << "interval sampling: " << ker_intervalSampling << endl;
        cout << "n_threads: " << n_threads << endl;
    }

    // omp_set_dynamic(0);     // Explicitly disable dynamic teams
    omp_set_num_threads(n_threads);

    chrono::steady_clock::time_point begin = chrono::steady_clock::now();

    // loadtxtData(dataset, sVDC::distance, sVDC::n_points, sVDC::n_features, sVDC::matrix_X);
    loadbinData(dataset, distance, n_points, n_features, matrix_X);

    if (verbose)
        cout << "Loading data time = " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - begin).count() << "[ms]" << endl;

    begin = chrono::steady_clock::now();

    // -1 marks "no neighbour stored in this slot"
    MatrixXi matrix_indices_ = -Eigen::Matrix<int, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>::Ones(n_points, k);
    MatrixXf matrix_distances_ = -Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>::Ones(n_points, k);

    // Simple parallel that executes n^2 distance computations
#pragma omp parallel for
    for (int n1 = 0; n1 < n_points; ++n1)
    {
        VectorXf vecXn = matrix_X.row(n1);

        // Bounded queue of the k best candidates; its top is treated as the current farthest one
        priority_queue <IFPair, vector<IFPair>> vectorMaxQue_TopK;

        for (int n2 = 0; n2 < n_points; ++n2)
        {
            if (n2 == n1)
                continue;

            float dist = computeDist(vecXn,  matrix_X.row(n2), distance);

            if (static_cast<int>(vectorMaxQue_TopK.size()) < k)
                vectorMaxQue_TopK.emplace(n2, dist);

            else if (dist < vectorMaxQue_TopK.top().m_fValue)
            {
                vectorMaxQue_TopK.pop();
                vectorMaxQue_TopK.emplace(n2, dist);
            }
        }

        // The queue yields the farthest candidate first, so fill the row backwards.
        // Start from the number of candidates actually found (not from k - 1) so that the
        // closest neighbour always lands in column 0 and any -1 fillers stay at the end.
        int k_idx = static_cast<int>(vectorMaxQue_TopK.size()) - 1;
        while (!vectorMaxQue_TopK.empty())
        {
            const IFPair candidate = vectorMaxQue_TopK.top(); // pointIdx, dist
            vectorMaxQue_TopK.pop();

            matrix_indices_(n1, k_idx) = candidate.m_iIndex;
            matrix_distances_(n1, k_idx) = candidate.m_fValue;

            k_idx--;
        }
    }


    if (verbose) {
        cout << "Bruteforce computation time = "
             << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - begin).count() << "[ms]"
             << endl;

        // Write binary
        // NOTE(review): the dump only happens in verbose mode, and data() is written in the
        // storage order of MatrixXi / MatrixXf - confirm both are what the readers expect.
        string filename = distance + "_k_" + int2str(k) + "_indices.bin";
        std::ofstream outIdx(filename, std::ios::binary);
        outIdx.write(reinterpret_cast<const char*>(matrix_indices_.data()), matrix_indices_.size() * sizeof(int));
        outIdx.close();

        filename = distance + "_k_" + int2str(k) + "_distances.bin";
        std::ofstream outDist(filename, std::ios::binary);
        outDist.write(reinterpret_cast<const char*>(matrix_distances_.data()), matrix_distances_.size() * sizeof(float));
        outDist.close();

    }

    return {matrix_indices_, matrix_distances_};
}

/**
 * Finding weighted symmetric G_k using bruteforce method after loading data into matrix_X
 * - Store them in vec2D_NeighborDist (varied size)
 * Note that points in dense areas might have more than k neighbors
 *
 * @param k = k in kNN
 */
void clupig::bf_sym_Gk_(const int k)
{
    vec2D_NeighborDist_ = vector< vector< pair<int, float> > > (n_points, vector< pair<int, float> >());

//    vector<IVector> vec_kNN = vector<IVector> (sVDC::n_points, IVector(minPts, -1)); // kNN matrix, each row is a point, each column is a neighbor index
//    vector<FVector> vec_kNNDist = vector<FVector> (sVDC::n_points, FVector(minPts, 0.0)); // kNN matrix, each row is a point, each column is a neighbor index

    vector<priority_queue <IFPair, vector<IFPair>> > vectorMaxQue_TopK(n_points);

    // Simple parallel that executes n^2 distance computations
#pragma omp parallel for
    for (int n1 = 0; n1 < n_points; ++n1)
    {
        VectorXf vecXn = matrix_X.row(n1);

        for (int n2 = 0; n2 < n_points; ++n2)
        {
            if (n2 == n1)
                continue;

            float dist = computeDist(vecXn,  matrix_X.row(n2), distance);

            if ((int)vectorMaxQue_TopK[n1].size() < k)
                vectorMaxQue_TopK[n1].emplace(n2, dist);

            else if (dist < vectorMaxQue_TopK[n1].top().m_fValue)
            {
                vectorMaxQue_TopK[n1].pop();
                vectorMaxQue_TopK[n1].emplace(n2, dist);
            }
        }
    }


    // 16K locks is good for million-point data set though it is not good for small data sets.
    constexpr size_t NUM_LOCKS = 16384;
    vector<omp_lock_t> locks(NUM_LOCKS); // NUM_LOCK = 16K locks. As lock ~ 4 byte, so 64 KB
    // Initialize locks
// #pragma omp parallel for
    for (size_t i = 0; i < NUM_LOCKS; i++) {
        omp_init_lock(&locks[i]);
    }

#pragma omp parallel for
    for (int n = 0; n < n_points; ++n)
    {
        // Dequeue and insert into both sides: kNN and Reverse kNN
        // Note: Maintaining undirected graph is very important for label propagation as we will check kNN's labels for each new node

        // int k_idx = k - 1;
        while (!vectorMaxQue_TopK[n].empty())
        {
            IFPair pair = vectorMaxQue_TopK[n].top(); // pointIdx, dist
            vectorMaxQue_TopK[n].pop();

            // To store the bf computation
//            vec_kNN[n][k_idx] = pair.m_iIndex; // store the index of the point in kNN vector
//            vec_kNNDist[n][k_idx] = pair.m_fValue; // store the distance of the point in kNNDist vector

            // k_idx--;

            // Note: If using omp_set_lock, it must be used on both close and far case
            // Mixing up with #pragma omp critical causes bug
            omp_set_lock(&locks[n % NUM_LOCKS]);
            vec2D_NeighborDist_[n].emplace_back(pair.m_iIndex, pair.m_fValue);
            omp_unset_lock(&locks[n % NUM_LOCKS]);

            omp_set_lock(&locks[pair.m_iIndex % NUM_LOCKS]);
            vec2D_NeighborDist_[pair.m_iIndex].emplace_back(n, pair.m_fValue); // so vector is much better than map()
            omp_unset_lock(&locks[pair.m_iIndex % NUM_LOCKS]);
        }
    }

    // Sorting vec2D_NeighborDist[n] by distance and remove duplicates x1: x2, x3 and x2: x1, x3 then x1: x2, x3, x2 (!)
#pragma omp parallel for
    for (int n = 0; n < n_points; ++n) {

        // Step 1: Sort by value (float)
        std::sort(vec2D_NeighborDist_[n].begin(), vec2D_NeighborDist_[n].end(), [](const auto& a, const auto& b) {
            // Compare based on the float value first
                    if (a.second != b.second) {
                        return a.second < b.second; // Sort by float in ascending order
                    }
                    // If float values are equal, compare based on the int value
                    return a.first < b.first; // Sort by int in ascending order
        });

        // Step 2: Linear scan and merge duplicates
        std::vector<std::pair<int, float>> dedup;
        // dedup.reserve(sVDC::vec2D_NeighborDist[n].size());  // optional optimization

        for (auto & ifpair : vec2D_NeighborDist_[n]) {
            if (dedup.empty() || ifpair.first != dedup.back().first) {
                dedup.push_back(ifpair);
            } else {
                // Keep max value (can switch to min or average) if found duplicates
                dedup.back().second = min(dedup.back().second, ifpair.second);
            }
        }

        vec2D_NeighborDist_[n] = dedup;
    }



    // Destroy locks
// #pragma omp parallel for
    for (size_t i = 0; i < NUM_LOCKS; i++) {
        omp_destroy_lock(&locks[i]);
    }

}

/**
 * This function constructs 2 layer CEOs using two different sets R and S of random vectors, creating [2D]^2 buckets
 * We use Gaussian matrix G1 and G2 to store two sets of random vectors, and consider Ri + Sj as the composite random vector
 * For each point, we find its top-s closest random vectors
 * For each random vector, we find its top-m close/far points.
 *
 * However, it is time-consuming to find top-m points for each random vector as we have [2D]^2 random vectors.
 * Hence, we use the heuristic (see Falconn++) that only consider points hashed into the closest top-p random vectors,
 * and hence hashed into (top_p)^2 buckets.
 * Since the data is often distributed in a non-uniform manner,
 * there will be several empty buckets (safe as no points close to them), and some buckets will have > top-m points (safe as we only keep top-m closest to the composite vectors).
 *
 * Algorithm:
 * - We process in parallel for each point Xi, compute its projection value on each random vector Ri and Sj
 * - For each Xi, we keep topK / topP closest random vectors for each set. TopK for querying later, topP for hashing into buckets
 * - We convert D random vectors to 2D random vectors by considering the sign so no need furthest vectors
 * - Note that each bucket can received parallel update from multiple points, so we need locks to ensure no conflict.
 * Also, the matrix top_m might contain several pointIdx = -1, which indicate the bucket is empty or has less than top-m points.
 *
 * Data structure:
 * - We store the information in a matrix_top_s of size top_s x n_points
 * - We store the information in a matrix_top_m of size top_m x n_points (init as -1)
 *
 */
void clupig::gauss_index2_()
{
    /** Param for embedding L1 and L2 **/
    int iFourierEmbed_D = ker_n_features / 2; // This is becase we need cos() and sin()

    // See: https://github.com/hichamjanati/srf/blob/master/RFF-I.ipynb
    if (distance == "L1")
        matrix_R_ = cauchyGenerator(iFourierEmbed_D, n_features, 0, 1.0 / ker_sigma, seed); // K(x, y) = exp(-gamma * L1_dist(X, y))) where gamma = 1/sigma
    else if (distance == "L2")
        matrix_R_ = gaussGenerator(iFourierEmbed_D, n_features, 0, 1.0 / ker_sigma, seed); // std = 1/sigma, K(x, y) = exp(-gamma * L2_dist^2(X, y))) where gamma = 1/2 sigma^2

    /** This vector contains (2D)^2 minQue, one queue for each random vector **/
    // Each minQue is a bucket, and each point is hashed into K buckets
    // However, we cannot check all combination (2D)^2, so we greedily check K^2 buckets
    int numBuckets = 4 * n_proj * n_proj;
    int num2D = 2 * n_proj;
    vector<priority_queue< IFPair, vector<IFPair>, greater<> >> vectorMinQue_TopM(numBuckets);

    // Note: Since we do not want to consider (2D)^2 choices of pairs of random vectors as it costs n(2D)^2 (i.e. CEOs idea)
    // Particularly, for each random vector, we have to keep O(n) projection values so that we can have a good estimate
    // and then aggregate to form the top-M points for each pairs of random vectors.
    // Note: We use the heuristic that considers (2D)^2 as number of buckets, and only consider points hashed into this bucket (i.e. Falconn++ idea)
    // We hash a point into (top-P)^2 buckets, and then we only consider the top-M points in each bucket

    // Note: Only work for random seeds
    matrix_G1_ = gaussGenerator(n_proj, n_features, 0.0, 1.0, seed);
    matrix_G2_ = gaussGenerator(n_proj, n_features, 0.0, 1.0, seed);

    /** Param for index **/
    matrix_top_s_ = MatrixXi::Zero(top_s, n_points); // the first top_s is for close, the second top_s is for far away

    // Note: If NUM_LOCKS is large, we might not have enough stack memory if using array
    // if D = 128 = 2^7, then numBuckets = 2^16 = 65536. We aim at 256 KB memory for locks
    // 16K locks is good for million-point data set though it is not good for small data sets.
    constexpr size_t NUM_LOCKS = 16384;
    vector<omp_lock_t> locks(NUM_LOCKS); // NUM_LOCK = 16K locks = only 256 KB
// #pragma omp parallel for
    for (size_t i = 0; i < NUM_LOCKS; i++) {
        omp_init_lock(&locks[i]);
    }


    /**
    Parallel for each the point Xi: (1) Compute and store dot product, and (2) Extract top-k close/far random vectors
    **/
#pragma omp parallel for
    for (int n = 0; n < n_points; ++n) {
        /**
        Random embedding
        **/
        VectorXf vecX = matrix_X.row(n);
        VectorXf vecEmbed = VectorXf::Zero(ker_n_features); // sOptics::ker_n_features >= D

        /// must ensure ker_n_features = n_features on Cosine
        if (distance == "Cosine")
            vecEmbed.segment(0, n_features) = vecX;
        else if ((distance == "L1") || (distance == "L2"))
        {
            VectorXf vecProject = matrix_R_ * vecX;
            vecEmbed.segment(0, iFourierEmbed_D) = vecProject.array().cos();
            vecEmbed.segment(iFourierEmbed_D, iFourierEmbed_D) = vecProject.array().sin(); // start from iEmbbed, copy iEmbed elements
        }
        else if (distance == "Chi2")
            embedChi2(vecX, vecEmbed, ker_n_features, n_features, ker_intervalSampling);
        else if (distance == "JS")
            embedJS(vecX, vecEmbed, ker_n_features, n_features, ker_intervalSampling);

        /**
        Random projection
        **/

        VectorXf rotatedX1 = matrix_G1_ * vecEmbed; // vecRotation is of size n_proj
        VectorXf rotatedX2 = matrix_G2_ * vecEmbed; // vecRotation is of size n_proj

        // cout << "We finish random rotating" << endl;

        // This queue is used for finding top-k max hash values and hash index for iProbes on each layer
        priority_queue< IFPair, vector<IFPair>, greater<> > minQueTopS1, minQueTopP1; // 1st layer
        priority_queue< IFPair, vector<IFPair>, greater<> > minQueTopS2, minQueTopP2; // 2nd layer

        /**
        We use a priority queue to keep top-max abs projection for each repeat
        Always ensure fhtDim >= n_proj
        **/
        for (int r = 0; r < n_proj; ++r)
        {
            // 1st rotation
            int iSign = sgn(rotatedX1(r));
            float fAbsHashValue = iSign * rotatedX1(r);

            int Ri_2D = r; // index of random vector in [2D] after consider the sign
            if (iSign < 0)
                // iBucketIndex |= 1UL << log2Project; // set bit at position log2(D)
                    Ri_2D += n_proj; // Be aware the case that n_proj is not 2^(log2Proj)

            // top_s
            if ((int)minQueTopS1.size() < top_s)
                minQueTopS1.emplace(Ri_2D, fAbsHashValue); // emplace is push without creating temp data
            else if (fAbsHashValue > minQueTopS1.top().m_fValue)
            {
                minQueTopS1.pop();
                minQueTopS1.emplace(Ri_2D, fAbsHashValue); // No need IFPair()
            }
            // TopP-Falconn++
            if ((int)minQueTopP1.size() < top_p)
                minQueTopP1.emplace(Ri_2D, fAbsHashValue); // emplace is push without creating temp data
            else if (fAbsHashValue > minQueTopP1.top().m_fValue)
            {
                minQueTopP1.pop();
                minQueTopP1.emplace(Ri_2D, fAbsHashValue); // No need IFPair()
            }

            // 2nd rotation
            iSign = sgn(rotatedX2(r));
            fAbsHashValue = iSign * rotatedX2(r);

            Ri_2D = r;
            if (iSign < 0)
                // iBucketIndex |= 1UL << log2Project; // set bit at position log2(D)
                    Ri_2D += n_proj; // set bit at position log2(D)

            // top_s
            if ((int)minQueTopS2.size() < top_s)
                minQueTopS2.emplace(Ri_2D, fAbsHashValue);
            else if (fAbsHashValue > minQueTopS2.top().m_fValue)
            {
                minQueTopS2.pop();
                minQueTopS2.emplace(Ri_2D, fAbsHashValue);
            }
            // TopP-Falconn++ (top-P random vector closest to Xn)
            if ((int)minQueTopP2.size() < top_p)
                minQueTopP2.emplace(Ri_2D, fAbsHashValue);
            else if (fAbsHashValue > minQueTopP2.top().m_fValue)
            {
                minQueTopP2.pop();
                minQueTopP2.emplace(Ri_2D, fAbsHashValue);
            }
        }

        // Convert to vector
        vector<IFPair> vec_topS1(top_s), vec_topS2(top_s);
        vector<IFPair> vec_topP1(top_p), vec_topP2(top_p);

        // top_s
        for (int s = top_s - 1; s >= 0; --s)
        {
            vec_topS1[s] = minQueTopS1.top();
            minQueTopS1.pop();

            vec_topS2[s] = minQueTopS2.top();
            minQueTopS2.pop();
        }
        // TopP-Falconn++
        for (int p = top_p - 1; p >= 0; --p)
        {
            vec_topP1[p] = minQueTopP1.top();
            minQueTopP1.pop();

            vec_topP2[p] = minQueTopP2.top();
            minQueTopP2.pop();
        }

        // cout << "We finish extracting top-K on 2 layers." << endl;

        /**
        Use minQue to find the top-k over 2 layers via sum of 2 estimators
        vec1 and vec2 are already sorted, and has length of sOptics::topK
        Note: Heuristic: We consider top-k * top-k pairs for Top-K, and top-p * top-p pairs for Top-M
        Note: We cannot check all combinations due to significant cost
        **/
        priority_queue<IFPair, vector<IFPair>, greater<>> minQueTopS;

        // top_s
        for (const auto& ifPair1: vec_topS1)
        {
            int Ri_2D_1st = ifPair1.m_iIndex;
            float fAbsHashValue1 = ifPair1.m_fValue;

            for (const auto& ifPair2: vec_topS2)
            {
                int R2_2D_2nd = ifPair2.m_iIndex;
                float fAbsSumHash = ifPair2.m_fValue + fAbsHashValue1; // sum of 2 estimators

                //We have 2D * 2D buckets (i.e. random vectors)
                int iBucketIndex = Ri_2D_1st * num2D + R2_2D_2nd; // (totally we have 2D * 2D buckets)

                // assert(iBucketIndex < vectorMinQue_TopM.size());

                // Push all points into the bucket
                if ((int)minQueTopS.size() < top_s)
                    minQueTopS.emplace(iBucketIndex, fAbsSumHash);
                else if (fAbsSumHash > minQueTopS.top().m_fValue)
                {
                    minQueTopS.pop();
                    minQueTopS.emplace(iBucketIndex, fAbsSumHash);
                }
            }
        }

        /** Extract the random vector idx (in the form r1 * 2D + r2) for the point idx n **/
        int s = top_s - 1;
        // MinQue has the size TopK
        while (!minQueTopS.empty())
        {
            IFPair ifPair = minQueTopS.top(); // index is bucketID, value is sumAbsHash
            minQueTopS.pop();
            matrix_top_s_(s, n) = ifPair.m_iIndex;
            s--;
        }

        // TopM for Falconn++
        for (const auto& ifPair1: vec_topP1)
        {
            int Ri_2D_1st = ifPair1.m_iIndex;
            float fAbsHashValue1 = ifPair1.m_fValue;

            for (const auto& ifPair2: vec_topP2)
            {
                int R2_2D_2nd = ifPair2.m_iIndex;
                float fAbsSumHash = ifPair2.m_fValue + fAbsHashValue1; // sum of 2 estimators

                //We have 2D * 2D buckets (i.e. random vectors)
                int iBucketIndex = Ri_2D_1st * num2D + R2_2D_2nd; // (totally we have 2D * 2D buckets)

                // assert(iBucketIndex < vectorMinQue_TopM.size());

                // Push all points into the bucket
                omp_set_lock(&locks[iBucketIndex % NUM_LOCKS]);

                if ((int)vectorMinQue_TopM[iBucketIndex].size() < top_m)
                    vectorMinQue_TopM[iBucketIndex].emplace(n, fAbsSumHash);
                else if (fAbsSumHash > vectorMinQue_TopM[iBucketIndex].top().m_fValue)
                {
                    vectorMinQue_TopM[iBucketIndex].pop();
                    vectorMinQue_TopM[iBucketIndex].emplace(n, fAbsSumHash);
                }

                omp_unset_lock(&locks[iBucketIndex % NUM_LOCKS]);
            }
        }

    }

    // Destroy locks for Falconn++
// #pragma omp parallel for
    for (size_t i = 0; i < NUM_LOCKS; ++i) {
        omp_destroy_lock(&locks[i]);
    }

    /**
    For each random vector, extract top-m closest data points
    **/
    matrix_top_m_ = -MatrixXi::Ones(top_m, numBuckets);

    // Extract top-M for each bucketIdx - Falconn++
#pragma omp parallel for
    for (int b = 0; b < numBuckets; ++b)
    {
        int m = (int)vectorMinQue_TopM[b].size();
        assert(m <= top_m);

        while (!vectorMinQue_TopM[b].empty())
        {
            matrix_top_m_(m - 1, b) = vectorMinQue_TopM[b].top().m_iIndex;
            vectorMinQue_TopM[b].pop();
            m--;
        }
    }
}

/**
 * This function constructs 2 layer CEOs using two different sets R and S of random vectors, creating [2D]^2 buckets
 * We use FHT with HD1 and HD2 to simulate these two sets of random vectors, and consider Ri + Sj as the composite random vector
 * For each point, we find its top-s closest random vectors
 * For each random vector, we find its top-m close/far points.
 *
 * However, it is time-consuming to find top-m points for each random vector as we have [2D]^2 random vectors.
 * Hence, we use the heuristic (see Falconn++) that only consider points hashed into the closest top-p random vectors,
 * and hence hashed into (top_p)^2 buckets.
 * Since the data is often distributed in a non-uniform manner,
 * there will be several empty buckets (safe as no points close to them), and some buckets will have > top-m points (safe as we only keep top-m closest to the composite vectors).
 *
 * Algorithm:
 * - We process in parallel for each point Xi, compute its projection value on each random vector Ri and Sj
 * - For each Xi, we keep topK / topP closest random vectors for each set. TopK for querying later, topP for hashing into buckets
 * - We convert D random vectors to 2D random vectors by considering the sign so no need furthest vectors
 * - Note that each bucket can receive parallel updates from multiple points, so we need locks to ensure no conflict.
 * Also, the matrix top_m might contain several pointIdx = -1, which indicate the bucket is empty or has less than top-m points.
 *
 * Data structure:
 * - We store the information in a matrix_top_s of size top_s x n_points
 * - We store the information in a matrix_top_m of size top_m x n_points (init as -1)
 *
 */
void clupig::fht_index2_()
{
    /** Param for embedding L1 and L2 **/
    int iFourierEmbed_D = ker_n_features / 2; // This is becase we need cos() and sin()

    // See: https://github.com/hichamjanati/srf/blob/master/RFF-I.ipynb
    if (distance == "L1")
        matrix_R_ = cauchyGenerator(iFourierEmbed_D, n_features, 0, 1.0 / ker_sigma, seed); // K(x, y) = exp(-gamma * L1_dist(X, y))) where gamma = 1/sigma
    else if (distance == "L2")
        matrix_R_ = gaussGenerator(iFourierEmbed_D, n_features, 0, 1.0 / ker_sigma, seed); // std = 1/sigma, K(x, y) = exp(-gamma * L2_dist^2(X, y))) where gamma = 1/2 sigma^2

    /** This vector contains (2D)^2 minQue, one queue for each random vector **/
    // Each minQue is a bucket, and each point is hashed into K buckets
    // However, we cannot check all combination (2D)^2, so we greedily check K^2 buckets
    int numBuckets = 4 * n_proj * n_proj;
    int num2D = 2 * n_proj;
    vector<priority_queue< IFPair, vector<IFPair>, greater<> >> vectorMinQue_TopM(numBuckets);

    // Note: Since we do not want to consider (2D)^2 choices of pairs of random vectors as it costs n(2D)^2 (i.e. CEOs idea)
    // Particularly, for each random vector, we have to keep O(n) projection values so that we can have a good estimate
    // and then aggregate to form the top-M points for each pairs of random vectors.
    // Note: We use the heuristic that considers (2D)^2 as number of buckets, and only consider points hashed into this bucket (i.e. Falconn++ idea)
    // We hash a point into (top-P)^2 buckets, and then we only consider the top-M points in each bucket

    int log2Project = log2(fhtDim_);
    bitHD3Generator2(fhtDim_ * n_rotate_, seed, bitHD1_, bitHD2_);

    /** Param for index **/
    matrix_top_s_ = MatrixXi::Zero(top_s, n_points); // the first topK is for close, the second topK is for far away

    // Note: If NUM_LOCKS is large, we might not have enough stack memory if using array
    // if D = 128 = 2^7, then numBuckets = 2^16 = 65536. We aim at 256 KB memory for locks
    // 16K locks is good for million-point data set though it is not good for small data sets.
    constexpr size_t NUM_LOCKS = 16384;
    vector<omp_lock_t> locks(NUM_LOCKS); // NUM_LOCK = 16K locks = only 256 KB
    // Initialize locks
// #pragma omp parallel for
    for (size_t i = 0; i < NUM_LOCKS; i++) {
        omp_init_lock(&locks[i]);
    }


    /**
    Parallel for each the point Xi: (1) Compute and store dot product, and (2) Extract top-k close/far random vectors
    **/
#pragma omp parallel for
    for (int n = 0; n < n_points; ++n) {
        /**
        Random embedding
        **/
        VectorXf vecX = matrix_X.row(n);
        VectorXf vecEmbed = VectorXf::Zero(ker_n_features); // sOptics::ker_n_features >= D

        /// must ensure ker_n_features = n_features on Cosine
        if (distance == "Cosine")
            vecEmbed.segment(0, n_features) = vecX;
        else if ((distance == "L1") || (distance == "L2"))
        {
            VectorXf vecProject = matrix_R_ * vecX;
            vecEmbed.segment(0, iFourierEmbed_D) = vecProject.array().cos();
            vecEmbed.segment(iFourierEmbed_D, iFourierEmbed_D) = vecProject.array().sin(); // start from iEmbbed, copy iEmbed elements
        }
        else if (distance == "Chi2")
            embedChi2(vecX, vecEmbed, ker_n_features, n_features, ker_intervalSampling);
        else if (distance == "JS")
            embedJS(vecX, vecEmbed, ker_n_features, n_features, ker_intervalSampling);

        /**
        Random projection
        **/

        VectorXf rotatedX1 = VectorXf::Zero(fhtDim_); // NUM_PROJECT > PARAM_KERNEL_EMBED_D
        rotatedX1.segment(0, ker_n_features) = vecEmbed;

        VectorXf rotatedX2 = rotatedX1;

        for (int r = 0; r < n_rotate_; ++r)
        {
            // Component-wise multiplication with a random sign
            for (int d = 0; d < fhtDim_; ++d)
            {
                rotatedX1(d) *= (2 * static_cast<float>(bitHD1_[r * fhtDim_ + d]) - 1);
                rotatedX2(d) *= (2 * static_cast<float>(bitHD2_[r * fhtDim_ + d]) - 1);
            }

            // Multiple with Hadamard matrix by calling FWHT transform
            fht_float(rotatedX1.data(), log2Project);
            fht_float(rotatedX2.data(), log2Project);
        }

        // cout << "We finish random rotating" << endl;

        // This queue is used for finding top-k max hash values and hash index for iProbes on each layer
        priority_queue< IFPair, vector<IFPair>, greater<> > minQueTopS1, minQueTopP1; // 1st layer
        priority_queue< IFPair, vector<IFPair>, greater<> > minQueTopS2, minQueTopP2; // 2nd layer

        /**
        We use a priority queue to keep top-max abs projection for each repeat
        Always ensure fhtDim >= n_proj
        **/
        for (int r = 0; r < n_proj; ++r)
        {
            // 1st rotation
            int iSign = sgn(rotatedX1(r));
            float fAbsHashValue = iSign * rotatedX1(r);

            int Ri_2D = r; // index of random vector in [2D] after consider the sign
            if (iSign < 0)
                // iBucketIndex |= 1UL << log2Project; // set bit at position log2(D)
                    Ri_2D += n_proj; // Be aware the case that n_proj is not 2^(log2Proj)

            // TopK
            if ((int)minQueTopS1.size() < top_s)
                minQueTopS1.emplace(Ri_2D, fAbsHashValue); // emplace is push without creating temp data
            else if (fAbsHashValue > minQueTopS1.top().m_fValue)
            {
                minQueTopS1.pop();
                minQueTopS1.emplace(Ri_2D, fAbsHashValue); // No need IFPair()
            }
            // TopP-Falconn++
            if ((int)minQueTopP1.size() < top_p)
                minQueTopP1.emplace(Ri_2D, fAbsHashValue); // emplace is push without creating temp data
            else if (fAbsHashValue > minQueTopP1.top().m_fValue)
            {
                minQueTopP1.pop();
                minQueTopP1.emplace(Ri_2D, fAbsHashValue); // No need IFPair()
            }

            // 2nd rotation
            iSign = sgn(rotatedX2(r));
            fAbsHashValue = iSign * rotatedX2(r);

            Ri_2D = r;
            if (iSign < 0)
                // iBucketIndex |= 1UL << log2Project; // set bit at position log2(D)
                    Ri_2D += n_proj; // set bit at position log2(D)

            // TopK
            if ((int)minQueTopS2.size() < top_s)
                minQueTopS2.emplace(Ri_2D, fAbsHashValue);
            else if (fAbsHashValue > minQueTopS2.top().m_fValue)
            {
                minQueTopS2.pop();
                minQueTopS2.emplace(Ri_2D, fAbsHashValue);
            }
            // TopP-Falconn++ (top-P random vector closest to Xn)
            if ((int)minQueTopP2.size() < top_p)
                minQueTopP2.emplace(Ri_2D, fAbsHashValue);
            else if (fAbsHashValue > minQueTopP2.top().m_fValue)
            {
                minQueTopP2.pop();
                minQueTopP2.emplace(Ri_2D, fAbsHashValue);
            }
        }

        // Convert to vector
        vector<IFPair> vec_topS1(top_s), vec_topS2(top_s);
        vector<IFPair> vec_topP1(top_p), vec_topP2(top_p);

        // TopS
        for (int s = top_s - 1; s >= 0; --s)
        {
            vec_topS1[s] = minQueTopS1.top();
            minQueTopS1.pop();

            vec_topS2[s] = minQueTopS2.top();
            minQueTopS2.pop();
        }
        // TopP-Falconn++
        for (int p = top_p - 1; p >= 0; --p)
        {
            vec_topP1[p] = minQueTopP1.top();
            minQueTopP1.pop();

            vec_topP2[p] = minQueTopP2.top();
            minQueTopP2.pop();
        }

        // cout << "We finish extracting top-K on 2 layers." << endl;

        /**
        Use minQue to find the top-k over 2 layers via sum of 2 estimators
        vec1 and vec2 are already sorted, and has length of sOptics::topK
        Note: Heuristic: We consider top-k * top-k pairs for Top-K, and top-p * top-p pairs for Top-M
        Note: We cannot check all combinations due to significant cost
        **/
        priority_queue<IFPair, vector<IFPair>, greater<>> minQueTopS;

        // TopK
        for (const auto& ifPair1: vec_topS1)
        {
            int Ri_2D_1st = ifPair1.m_iIndex;
            float fAbsHashValue1 = ifPair1.m_fValue;

            for (const auto& ifPair2: vec_topS2)
            {
                int R2_2D_2nd = ifPair2.m_iIndex;
                float fAbsSumHash = ifPair2.m_fValue + fAbsHashValue1; // sum of 2 estimators

                //We have 2D * 2D buckets (i.e. random vectors)
                int iBucketIndex = Ri_2D_1st * num2D + R2_2D_2nd; // (totally we have 2D * 2D buckets)

                // assert(iBucketIndex < vectorMinQue_TopM.size());

                // Push all points into the bucket
                if ((int)minQueTopS.size() < top_s)
                    minQueTopS.emplace(iBucketIndex, fAbsSumHash);
                else if (fAbsSumHash > minQueTopS.top().m_fValue)
                {
                    minQueTopS.pop();
                    minQueTopS.emplace(iBucketIndex, fAbsSumHash);
                }
            }
        }

        /** Extract the random vector idx (in the form r1 * 2D + r2) for the point idx n **/
        int s = top_s - 1;
        // MinQue has the size TopK
        while (!minQueTopS.empty())
        {
            IFPair ifPair = minQueTopS.top(); // index is bucketID, value is sumAbsHash
            minQueTopS.pop();
            matrix_top_s_(s, n) = ifPair.m_iIndex;
            s--;
        }

        // TopM for Falconn++
        for (const auto& ifPair1: vec_topP1)
        {
            int Ri_2D_1st = ifPair1.m_iIndex;
            float fAbsHashValue1 = ifPair1.m_fValue;

            for (const auto& ifPair2: vec_topP2)
            {
                int R2_2D_2nd = ifPair2.m_iIndex;
                float fAbsSumHash = ifPair2.m_fValue + fAbsHashValue1; // sum of 2 estimators

                //We have 2D * 2D buckets (i.e. random vectors)
                int iBucketIndex = Ri_2D_1st * num2D + R2_2D_2nd; // (totally we have 2D * 2D buckets)

                // assert(iBucketIndex < vectorMinQue_TopM.size());

                // Push all points into the bucket
                omp_set_lock(&locks[iBucketIndex % NUM_LOCKS]);

                if ((int)vectorMinQue_TopM[iBucketIndex].size() < top_m)
                    vectorMinQue_TopM[iBucketIndex].emplace(n, fAbsSumHash);

                else if (fAbsSumHash > vectorMinQue_TopM[iBucketIndex].top().m_fValue)
                {
                    vectorMinQue_TopM[iBucketIndex].pop();
                    vectorMinQue_TopM[iBucketIndex].emplace(n, fAbsSumHash);
                }

                omp_unset_lock(&locks[iBucketIndex % NUM_LOCKS]);
            }
        }

    }

    // Destroy locks for Falconn++
// #pragma omp parallel for
    for (size_t i = 0; i < NUM_LOCKS; ++i) {
        omp_destroy_lock(&locks[i]);
    }

    /**
    For each random vector, extract top-m closest data points
    **/
    matrix_top_m_ = -MatrixXi::Ones(top_m, numBuckets);

    // Extract top-M for each bucketIdx - Falconn++
#pragma omp parallel for
    for (int b = 0; b < numBuckets; ++b)
    {
        int m = (int)vectorMinQue_TopM[b].size();
        // ASSERT_RELEASE(m <= sVDC::top_m, "Not enough topM");

        while (!vectorMinQue_TopM[b].empty())
        {
            matrix_top_m_(m - 1, b) = vectorMinQue_TopM[b].top().m_iIndex;
            vectorMinQue_TopM[b].pop();
            m--;
        }
    }
}

/**
 * Finding approx weighted symmetric G_sm using CEOs approach with both closest and furthest random vectors
 * - Need to call fht_index0() or gauss_index0() first to construct the index
 * - Store them in vec2D_NeighborDist (varied size)
 * Note that points in dense areas might have more than 2sm neighbors, since each vector always has top-m points
 */
void clupig::ceos_minmax_sym_Gsm_()
{
    vec2D_NeighborDist_ = vector< vector< pair<int, float> > > (n_points, vector< pair<int, float> >());

    // 16K locks is good for million-point data set though it is not good for small data sets.
    size_t NUM_LOCKS = 16384;
    vector<omp_lock_t> locks(NUM_LOCKS); // NUM_LOCK = 16K locks = only 256 KB

    // Initialize locks
// #pragma omp parallel for
    for (size_t i = 0; i < NUM_LOCKS; i++) {
        omp_init_lock(&locks[i]);
    }

    chrono::steady_clock::time_point begin;
    begin = chrono::steady_clock::now();

#pragma omp parallel for
    for (int n = 0; n < n_points; ++n)
    {
        VectorXf vecXn = matrix_X.row(n);

        VectorXi vecTopS = matrix_top_s_.col(n); // size 2s: first S is close, last S is far

        boost::dynamic_bitset<> approxNeighbor(n_points);

        for (int s = 0; s < top_s; ++s)
        {
            // Closest
            int Ri = vecTopS(s);
            for (int i = 0; i < top_m; ++i)
            {
                // Compute distance between Xn and Xi
                int iPointIdx = matrix_top_m_(i, Ri);

                if (iPointIdx == n)
                    continue;

                if (!approxNeighbor[iPointIdx]) // cannot find
                {
                    approxNeighbor[iPointIdx] = true;
                    float fDist = computeDist(vecXn, matrix_X.row(iPointIdx), distance);

                    // Note: If using omp_set_lock, it must be used on both close and far case
                    // Mixing up with #pragma omp critical causes bug
                    omp_set_lock(&locks[n % NUM_LOCKS]);
                    vec2D_NeighborDist_[n].emplace_back(iPointIdx, fDist); // duplicate at most twice
                    omp_unset_lock(&locks[n % NUM_LOCKS]);

                    omp_set_lock(&locks[iPointIdx % NUM_LOCKS]);
                    vec2D_NeighborDist_[iPointIdx].emplace_back(n, fDist); // so vector is much better than map()
                    omp_unset_lock(&locks[iPointIdx % NUM_LOCKS]);

                }
            }


            // Far
            Ri = vecTopS(s + top_s);
            for (int i = 0; i < top_m; ++i)
            {
                // Compute distance between Xn and Xi
                int iPointIdx = matrix_top_m_(i + top_m, Ri);

                if (iPointIdx == n)
                    continue;

                if (!approxNeighbor[iPointIdx]) // cannot find
                {
                    approxNeighbor[iPointIdx] = true;

                    float fDist = computeDist(vecXn, matrix_X.row(iPointIdx), distance);


                    omp_set_lock(&locks[n % NUM_LOCKS]); // size_t lock_id = idx % NUM_LOCKS;
                    vec2D_NeighborDist_[n].emplace_back(iPointIdx, fDist); // duplicate at most twice
                    omp_unset_lock(&locks[n % NUM_LOCKS]);

                    omp_set_lock(&locks[iPointIdx % NUM_LOCKS]);
                    vec2D_NeighborDist_[iPointIdx].emplace_back(n, fDist); // so vector is much better than map()
                    omp_unset_lock(&locks[iPointIdx % NUM_LOCKS]);

                }
            }
        }
    }

    // Destroy locks
// #pragma omp parallel for
    for (size_t i = 0; i < NUM_LOCKS; i++) {
        omp_destroy_lock(&locks[i]);
    }

    // Note: Should not clear if running testcase
    matrix_X.resize(0, 0);
    matrix_R_.resize(0, 0);
    matrix_top_s_.resize(0, 0); // For each point (each col), keep topK closest/furthest random vectors
    matrix_top_m_.resize(0, 0); // For each random vector (each col), keep topM closest/furthest points

    if (verbose)
    {
        cout << "Finish computing distance. " << endl;
        cout << "Distance computation time = " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - begin).count() << "[ms]" << endl;
    }

    begin = chrono::steady_clock::now();

#pragma omp parallel for // reduction(+:counter1, counter2)
    for (int n = 0; n < n_points; ++n)
    {
        // TODO: Replace unorder_map by vector for cache friendly (see the code below)
        unordered_map<int, float> mapNeighborhood(vec2D_NeighborDist_[n].begin(), vec2D_NeighborDist_[n].end());

        vec2D_NeighborDist_[n].clear();
        vec2D_NeighborDist_[n].insert(vec2D_NeighborDist_[n].end(), mapNeighborhood.begin(), mapNeighborhood.end());
        mapNeighborhood.clear();

        // We need to sort it for DNP
        sort(vec2D_NeighborDist_[n].begin(), vec2D_NeighborDist_[n].end(), [](const pair<int, float>& a, const pair<int, float>& b)
        { return a.second < b.second; });

        // // Step 1: Sort by value (float)
        // std::sort(sVDC::vec2D_NeighborDist[n].begin(), sVDC::vec2D_NeighborDist[n].end(), [](const auto& a, const auto& b) {
        //     // Compare based on the float value first
        //             if (a.second != b.second) {
        //                 return a.second < b.second; // Sort by float in ascending order
        //             }
        //             // If float values are equal, compare based on the int value
        //             return a.first < b.first; // Sort by int in ascending order
        // });
        //
        // // Step 2: Linear scan and merge duplicates
        // std::vector<std::pair<int, float>> dedup;
        // // dedup.reserve(vec2D_NeighborDist[n].size());  // optional optimization
        //
        // for (size_t i = 0; i < sVDC::vec2D_NeighborDist[n].size(); ++i) {
        //     if (dedup.empty() || sVDC::vec2D_NeighborDist[n][i].first != dedup.back().first) {
        //         dedup.push_back(sVDC::vec2D_NeighborDist[n][i]);
        //     } else {
        //         // Keep max value (can switch to min or average)
        //         dedup.back().second = min(dedup.back().second, sVDC::vec2D_NeighborDist[n][i].second);
        //     }
        // }
        //
        // sVDC::vec2D_NeighborDist[n] = dedup;

        // Note: We can keep neighborhoods within beta * minPts-dist to reduce the memory complexity
        // However, it might affect the quality of clustering
        // int minPts = 20; // hard code
        // float minPts_dist = vec2D_NeighborDist[n][minPts - 1].second;
        // for (size_t i = minPts; i < vec2D_NeighborDist[n].size(); ++i)
        // {
        //     if (vec2D_NeighborDist[n][i].second > 2 * minPts_dist)
        //     {
        //         // counter1 += (vec2D_NeighborDist[n].size() - i);
        //         sVDC::vec2D_NeighborDist[n].erase(sVDC::vec2D_NeighborDist[n].begin() + i,sVDC::vec2D_NeighborDist[n].end());
        //         break;
        //     }
        // }

    }

    if (verbose)
    {
        cout << "Finish organizing data structure. " << endl;
        cout << "Organizing structure time = " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - begin).count() << "[ms]" << endl;    // cout << "Points outside (1 + alpha) kNN: " << counter1 << endl;

        float avgSize = 0.0;
        for (int n = 0; n < n_points; ++n)
        {
            avgSize += vec2D_NeighborDist_[n].size();
        }

        avgSize /= n_points;
        cout << "Avg size = " << avgSize << endl;
    }

}

/**
 * Build the approximate weighted symmetric neighbor graph G_sm with the CEOs approach.
 *
 * For every point n, the top_s random vectors in column n of matrix_top_s_ are probed; for each
 * of them up to top_m candidate points are read from matrix_top_m_ (a negative id ends the column).
 * The distance to every new candidate is computed once and stored in the lists of both end points.
 *
 * - The index must be built first by fht_index1(), fht_index2(), gauss_index1() or gauss_index2()
 *   (the original note listed fht_index2() twice; gauss_index2() is presumably what was meant).
 * - Result: vec2D_NeighborDist_[n] holds (neighbor id, distance) pairs sorted by
 *   (distance, id) with adjacent duplicate ids removed. Lists have varied size.
 * - Note that points in dense areas might have more than 2sm neighbors, since each vector
 *   always has top-m points.
 *
 * Side effect: matrix_R_, matrix_top_s_ and matrix_top_m_ are resized to 0 x 0. matrix_X is kept
 * (original note: if X is cleared, we cannot repeat several times with minPts).
 */
void clupig::ceos_sym_Gsm_()
{
    vec2D_NeighborDist_ = vector< vector< pair<int, float> > > (n_points, vector< pair<int, float> >());

    chrono::steady_clock::time_point begin;
    begin = chrono::steady_clock::now();

    // Striped locks: the list of point j is always guarded by locks[j % NUM_LOCKS].
    // Note: If NUM_LOCKS is large, we might not have enough stack memory if using array
    // 16K locks is good for million-point data set though it is not good for small data sets.
    constexpr size_t NUM_LOCKS = 16384;
    vector<omp_lock_t> locks(NUM_LOCKS); // NUM_LOCK = 16K locks = only 256 KB
    for (size_t i = 0; i < NUM_LOCKS; i++) {
        omp_init_lock(&locks[i]);
    }

#pragma omp parallel for
    for (int n = 0; n < n_points; ++n)
    {
        VectorXf vecXn = matrix_X.row(n);

        VectorXi vecTopS = matrix_top_s_.col(n);

        // Marks the candidates already handled while scanning from point n
        boost::dynamic_bitset<> approxNeighbor(n_points);

        // For each random vectors
        for (int s = 0; s < top_s; ++s)
        {
            int Ri = vecTopS(s); // Ri in [2D] or [(2D)^2]

            for (int i = 0; i < top_m; ++i)
            {
                int iPointIdx = matrix_top_m_(i, Ri);

                assert (iPointIdx < n_points);

                // A negative id means the rest of this column is unused
                if (iPointIdx <= -1)
                    break;

                if (iPointIdx == n)
                    continue;

                if (!approxNeighbor[iPointIdx]) // cannot find
                {
                    approxNeighbor[iPointIdx] = true;

                    float fDist = computeDist(vecXn, matrix_X.row(iPointIdx), distance);

                    // Each list is updated under its own striped lock (instead of one global critical section)
                    omp_set_lock(&locks[n % NUM_LOCKS]);
                    vec2D_NeighborDist_[n].emplace_back(iPointIdx, fDist); // duplicate at most twice
                    omp_unset_lock(&locks[n % NUM_LOCKS]);

                    omp_set_lock(&locks[iPointIdx % NUM_LOCKS]);
                    vec2D_NeighborDist_[iPointIdx].emplace_back(n, fDist); // so vector is much better than map()
                    omp_unset_lock(&locks[iPointIdx % NUM_LOCKS]);
                }
            }
        }
    }

    // Destroy locks
    for (size_t i = 0; i < NUM_LOCKS; i++) {
        omp_destroy_lock(&locks[i]);
    }

    // Note: Should not clear when running test-case
    // matrix_X is intentionally kept: if X is cleared, we cannot repeat several times with minPts

    matrix_R_.resize(0, 0);
    matrix_top_s_.resize(0, 0); // For each point (each col), keep topK closest/furthest random vectors
    matrix_top_m_.resize(0, 0); // For each random vector (each col), keep topM closest/furthest points

    if (verbose)
    {
        cout << "Finish computing distance. " << endl;
        cout << "Distance computation time = " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - begin).count() << "[ms]" << endl;
    }

    begin = chrono::steady_clock::now();

#pragma omp parallel for
    for (int n = 0; n < n_points; ++n)
    {
        auto& neighbors = vec2D_NeighborDist_[n];

        // Step 1: Sort by distance (ascending), ties broken by neighbor id (ascending)
        std::sort(neighbors.begin(), neighbors.end(), [](const auto& a, const auto& b) {
            if (a.second != b.second) {
                return a.second < b.second;
            }
            return a.first < b.first;
        });

        // Step 2: Drop adjacent entries that repeat the same neighbor id, in place.
        // The first entry of each run is kept; because of the ascending sort it carries the
        // smallest distance of the run, which is what the former min() merge produced.
        // Assumption (as before): both copies of an edge carry the same distance, so that they
        // end up next to each other after the sort -- presumably computeDist is symmetric.
        neighbors.erase(
            std::unique(neighbors.begin(), neighbors.end(),
                        [](const auto& a, const auto& b) { return a.first == b.first; }),
            neighbors.end());

        // Note: We can keep neighborhoods within (1 + beta) minPts-dist to reduce the memory complexity
        // However, for 2 layer (i.e. Falconn++), the role of beta is not very important since the bucket size is governed by topP probing
        // Often, the bucket size is smaller than m.
    }

    if (verbose)
    {
        cout << "Finish organizing data structure. " << endl;
        cout << "Organizing structure time = " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - begin).count() << "[ms]" << endl;
        float avgSize = 0.0;
        for (int n = 0; n < n_points; ++n)
        {
            avgSize += vec2D_NeighborDist_[n].size();
        }

        avgSize /= n_points;
        cout << "Avg size = " << avgSize << endl;
    }

}

/** Compute approximate kNN using CEOs2, and output indices and distances as two n_points x k matrices
 * (intended to be equivalent to the Faiss output). They are also written to binary files if verbose = True.
 *
 * Note: This implementation is not efficient as it stores approx O(top_s * top_m) distances per point
 * into vec2D_NeighborDist_, and sorts them afterwards.
 * Need to call fht_index2() or gauss_index2() before this function.
 *
 * Side effects: matrix_X, matrix_R_, matrix_top_s_ and matrix_top_m_ are resized to 0 x 0;
 * vec2D_NeighborDist_[n] is cleared for every point unless verbose is set.
 *
 * NOTE(review): the matrices are built from RowMajor constants but stored as MatrixXi / MatrixXf;
 * unless those typedefs are row-major in this project, data() (used for the binary dump) follows
 * the storage order of MatrixXi / MatrixXf, not necessarily row-major -- confirm against the reader.
 *
 * @param k number of neighbors to report per point
 * @return (indices, distances); row n lists the neighbors of point n by increasing distance.
 *         Unused slots keep index -1 and distance POS_INF.
 */
tuple<MatrixXi, MatrixXf> clupig::ceos2_kNN_(const int k)
{
    vec2D_NeighborDist_ = vector< vector< pair<int, float> > > (n_points, vector< pair<int, float> >());

    MatrixXi matrix_indices_ = Eigen::Matrix<int, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>::Constant(n_points, k, -1);
    MatrixXf matrix_distances_ = Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>::Constant(n_points, k, POS_INF);

    chrono::steady_clock::time_point begin;
    begin = chrono::steady_clock::now();

    // Striped locks: the list of point j is always guarded by locks[j % NUM_LOCKS].
    // Note: If NUM_LOCKS is large, we might not have enough stack memory if using array
    // 16K locks is good for million-point data set though it is not good for small data sets.
    constexpr size_t NUM_LOCKS = 16384;
    vector<omp_lock_t> locks(NUM_LOCKS); // NUM_LOCK = 16K locks = only 256 KB
    for (size_t i = 0; i < NUM_LOCKS; i++) {
        omp_init_lock(&locks[i]);
    }

#pragma omp parallel for
    for (int n = 0; n < n_points; ++n)
    {
        VectorXf vecXn = matrix_X.row(n);

        VectorXi vecTopS = matrix_top_s_.col(n);

        // Marks the candidates already handled while scanning from point n
        boost::dynamic_bitset<> approxNeighbor(n_points);

        // For each random vectors
        for (int s = 0; s < top_s; ++s)
        {
            int Ri = vecTopS(s); // Ri in [2D] or [(2D)^2]

            for (int i = 0; i < top_m; ++i)
            {
                int iPointIdx = matrix_top_m_(i, Ri);

                assert (iPointIdx < n_points);

                // A negative id means the rest of this column is unused
                if (iPointIdx <= -1)
                    break;

                if (iPointIdx == n)
                    continue;

                if (!approxNeighbor[iPointIdx]) // cannot find
                {
                    approxNeighbor[iPointIdx] = true;

                    float fDist = computeDist(vecXn, matrix_X.row(iPointIdx), distance);

                    omp_set_lock(&locks[n % NUM_LOCKS]);
                    vec2D_NeighborDist_[n].emplace_back(iPointIdx, fDist); // duplicate at most twice
                    omp_unset_lock(&locks[n % NUM_LOCKS]);

                    omp_set_lock(&locks[iPointIdx % NUM_LOCKS]);
                    vec2D_NeighborDist_[iPointIdx].emplace_back(n, fDist); // so vector is much better than map()
                    omp_unset_lock(&locks[iPointIdx % NUM_LOCKS]);
                }
            }
        }
    }

    // Destroy locks
    for (size_t i = 0; i < NUM_LOCKS; i++) {
        omp_destroy_lock(&locks[i]);
    }

    // Note: Should not clear when running testcase
    matrix_X.resize(0, 0);
    matrix_R_.resize(0, 0);
    matrix_top_s_.resize(0, 0); // For each point (each col), keep topK closest/furthest random vectors
    matrix_top_m_.resize(0, 0); // For each random vector (each col), keep topM closest/furthest points

    if (verbose)
    {
        cout << "Finish computing distance. " << endl;
        cout << "Distance computation time = " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - begin).count() << "[ms]" << endl;
    }

    begin = chrono::steady_clock::now();

#pragma omp parallel for
    for (int n = 0; n < n_points; ++n)
    {
        auto& candidates = vec2D_NeighborDist_[n];

        // Step 1: Sort by distance (ascending), ties broken by neighbor id (ascending)
        std::sort(candidates.begin(), candidates.end(), [](const auto& a, const auto& b) {
            if (a.second != b.second) {
                return a.second < b.second;
            }
            return a.first < b.first;
        });

        // Step 2: Stream the first k distinct neighbors straight into row n of the outputs.
        // An entry whose id equals the previously kept id is an adjacent duplicate and is skipped;
        // the kept (first) entry has the smallest distance of the run because of the ascending sort,
        // which is what the former dedup-with-min() produced. Scanning stops once k slots are filled.
        // Each iteration writes only row n, so the loop is safe to run in parallel.
        int filled = 0;
        int lastKeptIdx = -1;
        for (const auto& candidate : candidates)
        {
            if (filled == k)
                break;

            if (filled > 0 && candidate.first == lastKeptIdx)
                continue;

            matrix_indices_(n, filled) = candidate.first;
            matrix_distances_(n, filled) = candidate.second;
            lastKeptIdx = candidate.first;
            ++filled;
        }

        // In verbose mode the sorted list (duplicates included) is kept for the size statistic below
        if (! verbose)
            candidates.clear();
    }

    if (verbose)
    {
        cout << "Finish organizing data structure. " << endl;
        cout << "Organizing structure time = " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - begin).count() << "[ms]" << endl;
        float avgSize = 0.0;
        for (int n = 0; n < n_points; ++n)
        {
            avgSize += vec2D_NeighborDist_[n].size();
        }

        avgSize /= n_points;
        cout << "Avg size = " << avgSize << endl;

        // Write binary dumps of both matrices.
        // NOTE(review): k and top_s are concatenated without a separator in the file name
        // (e.g. k = 1, top_s = 10 and k = 11, top_s = 0 give the same name); a "_s_" tag looks
        // intended. Left unchanged here because external readers may rely on the current names.
        string filename = distance + "_k_" + int2str(k) + int2str(top_s) +
                          "_m_" + int2str(top_m) + "_p_" + int2str(top_p) + "_indices";

        std::ofstream outIdx(filename, std::ios::binary);
        outIdx.write(reinterpret_cast<const char*>(matrix_indices_.data()), matrix_indices_.size() * sizeof(int));
        outIdx.close();

        filename = distance + "_k_" + int2str(k) + int2str(top_s) +
                "_m_" + int2str(top_m) + "_p_" + int2str(top_p) + "_distances";

        std::ofstream outDist(filename, std::ios::binary);
        outDist.write(reinterpret_cast<const char*>(matrix_distances_.data()), matrix_distances_.size() * sizeof(float));
        outDist.close();

    }

    // Move the results into the tuple: a braced list of lvalues would deep-copy both matrices
    return {std::move(matrix_indices_), std::move(matrix_distances_)};
}

/**
 * Compute an approximate kNN graph with CEOs2 and return it as (indices, distances) matrices,
 * one row per point (the same layout Faiss returns).
 * Per the original note, fht_index2() or gauss_index2() must have been called before this function,
 * since it reads matrix_top_s_ and matrix_top_m_.
 *
 * Algorithm:
 * - For each point Xn, take its top-s closest random vectors (column n of matrix_top_s_) and, for each of them,
 *   the top-m candidate points (a column of matrix_top_m_).
 * - Compute d(Xn, Xj) once per distinct candidate Xj and offer the pair to the kNN rows of BOTH Xn and Xj.
 * - A row is updated by replacing its current worst (max-distance) slot, found with Eigen's maxCoeff().
 * - A candidate already stored anywhere in the row is skipped. The same pair (n, j) is offered again when
 *   point j is processed, so checking only the worst slot would let duplicates fill the row.
 * - Finally each row is ordered with sort_by_distance().
 *
 * Side effect: matrix_X, matrix_R_, matrix_top_s_ and matrix_top_m_ are released (resized to 0 x 0).
 *
 * NOTE(review): the results are declared as MatrixXi / MatrixXf; whether their storage is row-major
 * depends on those typedefs - confirm against the header.
 *
 * @param k number of neighbors kept per point (number of columns of both outputs)
 * @return tuple (indices, distances), both n_points x k; slots never filled keep index -1 and distance POS_INF
 */
tuple<MatrixXi, MatrixXf> clupig::ceos2_kNN_temp_(const int k)
{
    MatrixXi matrix_indices_ = Eigen::Matrix<int, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>::Constant(n_points, k, -1);
    MatrixXf matrix_distances_ = Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>::Constant(n_points, k, POS_INF);

    chrono::steady_clock::time_point begin = chrono::steady_clock::now();

    // Note: If NUM_LOCKS is large, we might not have enough stack memory if using array
    // 16K locks is good for million-point data set though it is not good for small data sets.
    constexpr size_t NUM_LOCKS = 16384;
    vector<omp_lock_t> locks(NUM_LOCKS);
    for (auto& lock : locks) {
        omp_init_lock(&lock);
    }

    // Offer `candidate` (at distance `dist`) to the kNN row of point `row`.
    // The row is protected by a striped lock because any thread may update any row.
    const auto offerNeighbor = [&](const int row, const int candidate, const float dist)
    {
        omp_lock_t* const rowLock = &locks[row % NUM_LOCKS];
        omp_set_lock(rowLock);

        int worstSlot;
        const float worstDist = matrix_distances_.row(row).maxCoeff(&worstSlot);

        // Replace the worst slot only if the candidate is closer AND not already stored in this row
        if (worstDist > dist && !(matrix_indices_.row(row).array() == candidate).any()) {
            matrix_distances_(row, worstSlot) = dist;
            matrix_indices_(row, worstSlot) = candidate;
        }

        omp_unset_lock(rowLock);
    };

#pragma omp parallel for
    for (int n = 0; n < n_points; ++n)
    {
        VectorXf vecXn = matrix_X.row(n);
        VectorXi vecTopS = matrix_top_s_.col(n);

        // Candidates already evaluated for Xn, so each distance is computed at most once per n
        boost::dynamic_bitset<> approxNeighbor(n_points);

        // For each of the top-s random vectors of Xn
        for (int s = 0; s < top_s; ++s)
        {
            const int Ri = vecTopS(s); // Ri in [2D] or [(2D)^2]

            for (int i = 0; i < top_m; ++i)
            {
                const int iPointIdx = matrix_top_m_(i, Ri);

                assert (iPointIdx < n_points);

                // A negative index ends the candidate list of this random vector
                if (iPointIdx <= -1)
                    break;

                if (iPointIdx == n || approxNeighbor[iPointIdx])
                    continue;

                approxNeighbor[iPointIdx] = true;

                const float fDist = computeDist(vecXn, matrix_X.row(iPointIdx), distance);

                // The pair is symmetric: update the kNN set of Xn and of the candidate
                offerNeighbor(n, iPointIdx, fDist);
                offerNeighbor(iPointIdx, n, fDist);
            }
        }
    }

    for (auto& lock : locks) {
        omp_destroy_lock(&lock);
    }

    // Note: Should not clear when running testcase
    matrix_X.resize(0, 0);
    matrix_R_.resize(0, 0);
    matrix_top_s_.resize(0, 0); // For each point (each col), keep topK closest/furthest random vectors
    matrix_top_m_.resize(0, 0); // For each random vector (each col), keep topM closest/furthest points

    // Order every row by distance; rows are independent so this is done in parallel
#pragma omp parallel for
    for (int n = 0; n < n_points; ++n)
    {
        Eigen::VectorXf row_dist = matrix_distances_.row(n);
        Eigen::VectorXi row_idx  = matrix_indices_.row(n);

        sort_by_distance(row_dist, row_idx);

        // Write back to matrix (row() expects a row vector)
        matrix_distances_.row(n) = row_dist.transpose();
        matrix_indices_.row(n)   = row_idx.transpose();
    }

    if (verbose)
    {
        cout << "Finish computing distance. " << endl;
        cout << "Distance computation time = " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - begin).count() << "[ms]" << endl;
    }

    return {matrix_indices_, matrix_distances_};
}

/**
 * Execute density-aware neighborhood propagation
 * We form cluster by propagating labels from the highest density point to its neighbors
 * The priority queue is sorted by the distance to the connected higher density point and the kNN-distance of the new point
 *
 * Novelty:
 * - This is similar to Optics, but we use the highest density point to form initial seeds
 * - We keep track minConnectedDist to store the best-so-far distance between a point to the higher density point,
 * this will reduce the size of priority queue and ensure points are connected by the shortest distance to the higher density point
 *
 *
 * @param k: govern the density estimation
 * @param c: govern the size of neighborhood to check the label consistency to decide whether to add the new point to the cluster
 *
 */
void clupig::dnp_(const int k, const float c)
{
    if (verbose) {

        float avgSize = 0.0;

        // Counting points with empty neighborhoods, less than minPts, less than c*minPts
        int counter0 = 0, counter1 = 0, counter2 = 0;

        for (int n = 0; n < n_points; ++n)
        {
            auto const neighborSize = static_cast<float>(vec2D_NeighborDist_[n].size());

            if (neighborSize <= 0) {
                counter0++;
            }
            if (neighborSize < k) {
                counter1++;
            }
            if (neighborSize < c * k) {
                counter2++;
            }

            avgSize += neighborSize;
        }

        avgSize /= n_points;

        cout << "Avg size = " << avgSize << endl;
        cout << "Number of points with empty neighborhoods: " << counter0 << endl;
        cout << "Number of points with less than " << k << " neighbors: " << counter1 << endl;
        cout << "Number of points with less than " << c * k << " neighbors: " << counter2 << endl;
    }

    labels = IVector(n_points, -1);

    boost::dynamic_bitset<> processSet(n_points);
    FVector minConnectedDist(n_points, POS_INF); // assign best_so_far distance

    int ck = static_cast<int>(round(c * k));
    FVector vec_density(n_points, 0.0);
    IVector sortedIndex_density = IVector(n_points, -1);

    // Note: if using avg kNN dist, then it might be useful with omp parallel for
// #pragma omp parallel for
    for (int n = 0; n < n_points; ++n)
    {
        // init index from 0 to n
        sortedIndex_density[n] = n;

        // This is for the case that some points do not have enough minPts neighbors
        // In this case, we use the size of neighborhood as density estimate
        // This might be true since points in the dense region should share similar closest random vectors
        // And we want to start the cluster from dense regions
        vec_density[n] = static_cast<float>(vec2D_NeighborDist_[n].size());

        // We can use kNN dist. If not enough k neighbors, density = 0 (default)
        // if ( (int)vec2D_NeighborDist[n].size() >= k )
        // {
        //     float density_dist = vec2D_NeighborDist[n][k - 1].second; // minPts-1, since index starts from 0
        //
        //     if (density_dist > 0.0) // we might use [k - 1]
        //         vec_density[n] = 1.0 / density_dist;
        //     else
        //         vec_density[n] = 1.0 / EPSILON; // avoid division by zero
        // }

        // avg kNN-dist
        // for (int i = 0; k < k; ++k)
        // {
        //     float dist = vec2D_NeighborDist[n][k].second; // second: distance
        //     if (dist > 0) // we might use [k - 1]
        //         vec_density[n] += (dist / k);
        // }
        // vec_density[n] = vec_density[n] != 0.0 ? 1.0 / vec_density[n] : 0.0; // avoid division by zero
    }

    sort(sortedIndex_density.begin(), sortedIndex_density.end(),
        [&](int i1, int i2){
            return vec_density[i1] > vec_density[i2];
        }
    );

    // Note: We still need to use density to compute the average density for cluster
    // Note: Since distance range is too large, compare to 1/dist \in [0, 1]
    // Note: The sensitivity of the cluster quality is much better with density, compared to distance

    // Store cluster size
    vector<int> vecClusters;

    // Starting with cluster Id = -1
    int clusterId = -1;

    // Start from the highest density point idx
    for (const auto& topDens_Idx : sortedIndex_density)
    {
        // If it is already processed, then skip and go to next point
        if (processSet[topDens_Idx])
            continue;

        processSet[topDens_Idx] = true;

        // increase cluster Id
        clusterId = clusterId + 1;
        labels[topDens_Idx] = clusterId;
        vecClusters.emplace_back(1); // vecClusters contains cluster size for each cluster id

        // Min PQ has 3 values: (1) Xi, (2) Predecessor Idx, (3) weight
        Min_PQ_Triple seedSet;

        const auto& Xi_neighborhood = vec2D_NeighborDist_[topDens_Idx];

        // For all Xj is neighbor of core Xi, insert into the PQ with its predecessor Xi
        // We use sVDC::neighbor_cutoff to control the size of neighborhood to insert into PQ
        int Xi_neighborSize = static_cast<int>(Xi_neighborhood.size());
        if (propagation_cutoff)
            Xi_neighborSize = min(Xi_neighborSize, ck);

//        for (const auto & point : Xi_neighborhood)
        for (auto it = Xi_neighborhood.begin();
                 it != Xi_neighborhood.begin() + Xi_neighborSize;
                 ++it)
        {
            const auto& point = *it;

            int Xj = point.first; // first: idx, second: dist

            // if (Xj < 0 || Xj >= n_points)
            // {
            //     cout << "Bug in Xj: " << Xj << endl;
            //     continue;
            // }

            // only update if it is not processed
            if (processSet[Xj])
                continue;

            // Note: We might want to add more parameters to control the running time
            // This is for the case (2km + additional points) neighbors are too large and cover points are not on similar density, i.e. dist(Xi, Xj) >> kNN(Xi)
            // We will pick the first top-minPts points, then the rest depends on dist(Xi, Xj) < (1 +- alpha) kNN(Xi)
            // Since Xi_neighbor is sorted, so we should break
            // if (vec_density[topDens_Idx] * point.second > 1 + sVDC::alpha)
            //     break;

            // Simulate Density-Peak, keep min connected distance with higher density points
            // This will reduce the size of PQ, improving running time
            if (minConnectedDist[Xj] < point.second) // point.second= dist(Xi, Xj)
                continue;

            // Heuristic to reduce PQ size: only add to PQ for smaller connected dist(Xi, Xj)
            // This idea is similar to Optics, i.e. keeping the minimum reachability dist so far
            minConnectedDist[Xj] = point.second;

            // Xi_neighborhood[j].second = dist(Xi, Xj)
            // float weight = (Xi_neighborhood[j].second + sOptics::vec_CoreDist[Xj]) / 2;
            // There are border/noise points which do not have enough k neighbors. If so, we use d(Xi, Xj) as weight
            // This will help such border/noise points to be absorbed by the cluster formed by processed core points
            float weight = 0.0;
            if ((int)vec2D_NeighborDist_[Xj].size() < k)
                weight = point.second;
            else
                weight = (point.second + vec2D_NeighborDist_[Xj][k - 1].second) / 2;

            // Sorted by weight, but store extra information, i.e. highest-index = connected core point,
            // to form cluster
            seedSet.emplace(Xj, topDens_Idx, weight); // point idx, predecessor idx, weight

        }

        // Processing PQ for label propagation
        while (!seedSet.empty())
        {
            int Xj = seedSet.top().m_iIndex; // consider the new point which is connected by the highest density point
            int Xi = seedSet.top().m_iPred;

            // Compute d(Xi, Xj) to decide whether to add Xj to the cluster of Xi (see the condition clusterSize > sVDC::min_cluster_size)
            float distXiXj = 0.0;
            if ((int)vec2D_NeighborDist_[Xj].size() < k)
                distXiXj = seedSet.top().m_fValue;
            else
                distXiXj = seedSet.top().m_fValue * 2 - vec2D_NeighborDist_[Xj][k - 1].second; // dist(Xi, Xj) = (weight * 2 - kNN(Xj))

            seedSet.pop();

            if (processSet[Xj])
                continue;

            processSet[Xj] = true; // set processed

            int predLabel = labels[Xi];

            // if (predLabel < 0 || predLabel >= clusterId + 1)
            // {
            //     cout << "Bug in predLabel: " << predLabel << endl;
            //     continue;
            // }

            int clusterSize = vecClusters[predLabel];

            bool bExpandCluster = true;

            // If clusterSize < 50, then always propagate labels to its neighbors
            // Note: This is important to control the local expansion, e.g. not spreading too far away points
            if (clusterSize > min_cluster_size)
            {
                // Note: We should remove beta as we prefer less parameter to tune
                size_t t1 = min((size_t)k, vec2D_NeighborDist_[Xi].size());
                size_t t2 = min((size_t)k, vec2D_NeighborDist_[Xj].size());

                // if (t1 == 0 || t2 == 0)
                //     cout << "Bug in distXiXj: " << t1 << " " << t2 << endl;

                // If Xi and Xj are too far away, then we do not expand the cluster
                // This is to control the noise of approx neighborhoods returned by ANNS solvers
                // If Xj belongs to Xi's cluster, it should be connected via another point Xk, i.e.
                if ( distXiXj > (vec2D_NeighborDist_[Xi][t1 - 1].second + vec2D_NeighborDist_[Xj][t2 - 1].second))
                    bExpandCluster = false;
            }

            if ( bExpandCluster )
            {
                const auto& Xj_neighborhood = vec2D_NeighborDist_[Xj];
                int Xj_neighborSize = static_cast<int>(Xj_neighborhood.size());

                // vector<pair<int, float>> top_KNN(Xj_neighborhood.begin(), Xj_neighborhood.begin() + min(ck, Xj_neighborSize));
                // if (top_KNN.empty())
                //     cout << "Bug in Xj_neighborhood: " << top_KNN.size() << endl;

                // Note: Check one of minPts neighbors has label as the predecessor
                // as we want to spread cluster info via min reachability-dist
                bool hasLabel = false;

                // for (const auto& p : top_KNN)
                for (auto it = Xj_neighborhood.begin(); it != Xj_neighborhood.begin() + min(ck, Xj_neighborSize);++it)
                {
                    if (labels[it->first] == predLabel)
                    {
                        hasLabel = true;
                        break;
                    }
                }

                // All kNN points do not have predecessor label, create new cluster
                if ( !hasLabel )
                {
                    clusterId = clusterId + 1;
                    labels[Xj] = clusterId;
                    vecClusters.emplace_back(1);
                }
                else // Use the predecessor's label
                {
                    labels[Xj] = predLabel; // label of predecessor
                    vecClusters[predLabel] += 1;
                }

                // Now we extend the seedSet with Xj_neighborhood
                // Case 1: If Xj starts the new cluster, we tend to process the points around Xj in the new cluster
                // It this is the case, then we might process border points from previous cluster
                // This is why we keep predecessors' label to connect border points to previous cluster.
                // Case 2: If Xj is connected to the old cluster, we also extend PQ with Xj's neighbors
                if (propagation_cutoff)
                    Xj_neighborSize = min(Xj_neighborSize, ck);

//                for (auto & p : Xj_neighborhood)
                for (auto it = Xj_neighborhood.begin(); it != Xj_neighborhood.begin() + Xj_neighborSize;++it)
                {
                    const auto& p = *it;

                    int Xk = p.first; // first: point idx, second: dist

                    // only update if it is not processed
                    if (processSet[Xk])
                        continue;

                    // Note: This condition is nice to reduce PQ since we aim at finding min reachability distance
                    if (minConnectedDist[Xk] < p.second)
                        continue;

                    // Heuristic to reduce PQ size: only add to PQ for smaller connected dist(Xi, Xj)
                    minConnectedDist[Xk] = p.second;

                    float weight = 0.0;
                    if ((int)vec2D_NeighborDist_[Xk].size() < k)
                        weight = p.second;
                    else
                        weight = (p.second + vec2D_NeighborDist_[Xk][k - 1].second) / 2;

                    seedSet.emplace(Xk, Xj, weight);

                }
            }
            else
            {
                // Note: If we reset PQ for new cluster Xj, then we mis-classify the border point from previous cluster
                // The cluster quality is significantly decreased
                processSet[Xj] = false;

            }
        }
    }
}

/**
 * Wrapper that runs dnp_() on a kNN graph given in matrix form (e.g. built externally by an ANNS solver).
 *
 * Algorithm:
 * - Build the symmetric kNN graph in vec2D_NeighborDist_: every edge (n, j) read from row n is also added to row j.
 * - For each point, drop repeated neighbors (keeping the smallest distance reported for that neighbor)
 *   and order the list by ascending distance, ties broken by ascending index.
 * - Call dnp_(k, c).
 *
 * Entries whose index is outside [0, n_points) or equal to the row's own index are ignored.
 * Side effect: n_points is set to the number of rows of matIndices.
 *
 * @param matIndices: RowMajor matrix of indices, each row is the kNN indices for a point
 * @param matDistances: RowMajor matrix of distances, each row is the kNN distances for a point
 *                      (must be at least as large as matIndices in both dimensions)
 * @param k: govern the density estimation
 * @param c: govern the size of neighborhood to check the label consistency to decide whether to add the new point to the cluster
 *
 */
void clupig::dnp_from_knn(const Ref<const RowMajorMatrixXi> & matIndices, const Ref<const RowMajorMatrixXf> & matDistances, const int k, const float c)
{
    // Every (row, col) read from matIndices is also read from matDistances
    assert(matDistances.rows() >= matIndices.rows() && matDistances.cols() >= matIndices.cols());

    n_points = matIndices.rows();
    int n_neighbors = matIndices.cols();

    vec2D_NeighborDist_ = vector< vector< pair<int, float> > > (n_points, vector< pair<int, float> >());

    // Note: If NUM_LOCKS is large, we might not have enough stack memory if using array
    // 16K locks is good for million-point data set though it is not good for small data sets.
    constexpr size_t NUM_LOCKS = 16384;
    vector<omp_lock_t> locks(NUM_LOCKS);
    for (size_t i = 0; i < NUM_LOCKS; i++) {
        omp_init_lock(&locks[i]);
    }

    // Symmetrize: any thread may append to any row, so each append is guarded by the row's striped lock
#pragma omp parallel for
    for (int n = 0; n < n_points; n++ ) {
        for (int i = 0; i < n_neighbors; ++i) {
            int iPointIdx = matIndices(n, i);
            float fDist = matDistances(n, i);

            // Skip if the point is not in the range of [0, n_points), or is the point itself
            if (iPointIdx < 0 || iPointIdx >= n_points || iPointIdx == n)
                continue;

            omp_set_lock(&locks[n % NUM_LOCKS]);
            vec2D_NeighborDist_[n].emplace_back(iPointIdx, fDist);
            omp_unset_lock(&locks[n % NUM_LOCKS]);

            omp_set_lock(&locks[iPointIdx % NUM_LOCKS]);
            vec2D_NeighborDist_[iPointIdx].emplace_back(n, fDist);
            omp_unset_lock(&locks[iPointIdx % NUM_LOCKS]);
        }
    }

    for (size_t i = 0; i < NUM_LOCKS; i++) {
        omp_destroy_lock(&locks[i]);
    }

    // De-duplicate and sort each neighbor list; lists are independent so this is done in parallel
#pragma omp parallel for
    for (int n = 0; n < n_points; ++n) {
        auto& neighbors = vec2D_NeighborDist_[n];

        // Step 1: group entries of the same neighbor together, smallest distance first.
        // Sorting by distance first would only merge duplicates that happen to be adjacent, which fails
        // when the two reports of one edge carry slightly different distances.
        std::sort(neighbors.begin(), neighbors.end(), [](const auto& a, const auto& b) {
            if (a.first != b.first) {
                return a.first < b.first;
            }
            return a.second < b.second;
        });

        // Step 2: keep the first entry of each group, i.e. the minimum distance per neighbor
        neighbors.erase(std::unique(neighbors.begin(), neighbors.end(),
                                    [](const auto& a, const auto& b) { return a.first == b.first; }),
                        neighbors.end());

        // Step 3: final order expected by dnp_(): ascending distance, then ascending index
        std::sort(neighbors.begin(), neighbors.end(), [](const auto& a, const auto& b) {
            if (a.second != b.second) {
                return a.second < b.second;
            }
            return a.first < b.first;
        });
    }

    if (verbose)
    {
        // Neighborhood statistics: empty lists, lists shorter than k, lists shorter than c * k
        float avgSize = 0.0;
        int counter0 = 0, counter1 = 0, counter2 = 0;
        for (int n = 0; n < n_points; ++n) {
            if (vec2D_NeighborDist_[n].empty()) {
                counter0++;
            }
            if ((int)vec2D_NeighborDist_[n].size() < k) {
                counter1++;
            }
            if (vec2D_NeighborDist_[n].size() < c * k) {
                counter2++;
            }

            avgSize += vec2D_NeighborDist_[n].size();
        }

        avgSize /= n_points;

        cout << "Avg size = " << avgSize << endl;
        cout << "Number of points with empty neighborhoods: " << counter0 << endl;
        cout << "Number of points with less than " << k << " neighbors: " << counter1 << endl;
        cout << "Number of points with less than " << c * k << " neighbors: " << counter2 << endl;
    }

    dnp_(k, c);
}

/**
 * We use brute-force kNN to construct sym Gk exactly, and run (c, k)-DNP to form clusters
 * Note that points in dense region might have degree > k
 *
 * Data structure:
 * - Output of bf_sym_kNN(k): store the weighted symmetric minPts-NN graph
 * Note that the degree of each point might be larger than k, especially for points in dense region
 * - Output of run_DNP(): labels_: store the cluster labels
 *
 * @param MATRIX_X
 * @param k
 * @param c
 */
void clupig::brute_knn_dnp(const Ref<const RowMajorMatrixXf> & MATRIX_X, const int k, const float c)
{
    if (verbose)
    {
        cout << "k: " << k << endl;

        cout << "n_points: " << n_points << endl;
        cout << "n_features: " << n_features << endl;

        cout << "n_proj: " << n_proj << endl;
        cout << "top_s: " << top_s << endl;
        cout << "top_m: " << top_m << endl;
        cout << "top_p: " << top_p << endl;

        cout << "distance: " << distance << endl;
        cout << "kernel features: " << ker_n_features << endl;
        cout << "sigma: " << ker_sigma << endl;
        cout << "interval sampling: " << ker_intervalSampling << endl;
        cout << "n_threads: " << n_threads << endl;

    }

    // omp_set_dynamic(0);     // Explicitly disable dynamic teams
    omp_set_num_threads(n_threads);

    chrono::steady_clock::time_point begin = chrono::steady_clock::now();

    matrix_X = MATRIX_X; //copy data
    transformData(matrix_X, distance); // transform data to support the distance

    if (verbose)
        cout << "Check X supporting distance time = " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - begin).count() << "[ms]" << endl;

    begin = chrono::steady_clock::now();
    bf_sym_Gk_(k);

    if (verbose)
        cout << "Find neighborhoods and distance time = " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - begin).count() << "[ms]" << endl;

    begin = chrono::steady_clock::now();
    dnp_(k, c);

    if (verbose)
        cout << "Form bfVDC time = " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - begin).count() << "[ms]" << endl;

}

/**
 * We use 2 layer CEOs to construct weighted symmetric kNN G_sm, and run (c, k)-DNP to form clusters
 * - We call fht_index2() that use FHT to simulate Gaussian matrix,
 * and considers both close and far random vectors for each point.
 * Alternatively, we can use gauss_index2() that uses Gaussian matrix directly
 * - We call approx_sym_kNN() to find the weighted symmetric kNN graph
 * - Hence we need top_s * top_m distance computation for each point
 *
 * Data structure:
 * - Output of fht_index2(): vec2D_NeighborDist: store the approx weighted symmetric minPts-NN graph
 * Note that the degree of each point might be larger than k, especially for points in dense region
 * - Output of run_DNP(): labels_: store the cluster labels
 *
 * @param MATRIX_X
 * @param k
 * @param c
 */
void clupig::ceos2_dnp(const Ref<const RowMajorMatrixXf> & MATRIX_X, const int k, const float c)
{
    if (verbose)
    {
        cout << "k: " << k << endl;

        cout << "n_points: " << n_points << endl;
        cout << "n_features: " << n_features << endl;
        cout << "n_proj: " << n_proj << endl;
        cout << "top_s: " << top_s << endl;
        cout << "top_m: " << top_m << endl;
        cout << "top_p: " << top_p << endl;

        cout << "distance: " << distance << endl;
        cout << "kernel features: " << ker_n_features << endl;
        cout << "sigma: " << ker_sigma << endl;
        cout << "interval sampling: " << ker_intervalSampling << endl;
        cout << "n_threads: " << n_threads << endl;
    }

    // omp_set_dynamic(0);     // Explicitly disable dynamic teams
    omp_set_num_threads(n_threads);

    chrono::steady_clock::time_point begin;

    begin = chrono::steady_clock::now();
    matrix_X = MATRIX_X;
    transformData(matrix_X, distance);

    if (verbose)
        cout << "Copy data and check supporting distance time = " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - begin).count() << "[ms]" << endl;

    begin = chrono::steady_clock::now();
    fht_index2_();
    // gauss_index2_();
    if (verbose)
        cout << "Build index time = " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - begin).count() << "[ms]" << endl;

    // Find core point
    begin = chrono::steady_clock::now();
    ceos_sym_Gsm_();
    if (verbose)
        cout << "Find neighborhoods and distance time = " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - begin).count() << "[ms]" << endl;

    begin = chrono::steady_clock::now();
    dnp_(k, c);
    if (verbose)
        cout << "Form sVDC2 time = " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - begin).count() << "[ms]" << endl;
}

/**
 * We use 2 layer CEOs to construct weighted symmetric kNN G_sm, and run (c, k)-DNP to form clusters
 * - We call fht_index2() that use FHT to simulate Gaussian matrix,
 * and considers both close and far random vectors for each point.
 * Alternatively, we can use gauss_index2() that uses Gaussian matrix directly
 * - We call approx_sym_kNN() to find the weighted symmetric kNN graph
 * - Hence we need top_s * top_m distance computation for each point
 *
 * Data structure:
 * - Output of fht_index2(): vec2D_NeighborDist: store the approx weighted symmetric minPts-NN graph
 * Note that the degree of each point might be larger than k, especially for points in dense region
 * - Output of run_DNP(): labels_: store the cluster labels
 *
 * @param dataset: file path of the dataset
 * @param k
 * @param c
 */
void clupig::ceos2_dnp_from_file(const string& dataset, const int k, const float c)
{
    if (verbose)
    {
        cout << "k: " << k << endl;

        cout << "n_points: " << n_points << endl;
        cout << "n_features: " << n_features << endl;
        cout << "n_proj: " << n_proj << endl;


        cout << "top_s: " << top_s << endl;
        cout << "top_m: " << top_m << endl;
        cout << "top_p: " << top_p << endl;
        cout << "distance: " << distance << endl;
        cout << "kernel features: " << ker_n_features << endl;
        cout << "sigma: " << ker_sigma << endl;
        cout << "interval sampling: " << ker_intervalSampling << endl;
        cout << "n_threads: " << n_threads << endl;
    }

    // omp_set_dynamic(0);     // Explicitly disable dynamic teams
    omp_set_num_threads(n_threads);

    chrono::steady_clock::time_point begin, start;

    begin = chrono::steady_clock::now();
    // loadtxtData(dataset, sVDC::distance, sVDC::n_points, sVDC::n_features, sVDC::matrix_X);
    loadbinData(dataset, distance, n_points, n_features, matrix_X);
    if (verbose)
        cout << "Loading data and check distance support time = " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - begin).count() << "[ms]" << endl;

    start = chrono::steady_clock::now();

    begin = chrono::steady_clock::now();
    fht_index2_();
    if (verbose)
        cout << "Build index time = " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - begin).count() << "[ms]" << endl;

    // Find core point
    begin = chrono::steady_clock::now();
    ceos_sym_Gsm_();
    if (verbose)
        cout << "Find neighborhoods and distance time = " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - begin).count() << "[ms]" << endl;

    begin = chrono::steady_clock::now();
    dnp_(k, c);
    if (verbose)
        cout << "Form sVDC time = " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - begin).count() << "[ms]" << endl;

    cout << "Form sVDC2 time (excluding loading data) = " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - start).count() << "[ms]" << endl;

}

/**
 * Approximate kNN with 2-layer CEOs on a data set read from a file.
 *
 * Steps, in order:
 * - loadbinData(dataset, distance, n_points, n_features, matrix_X): load the data into matrix_X
 * - fht_index2_(): build the index (per the original notes, using FHT to simulate a Gaussian matrix)
 * - ceos2_kNN_(k): run the search and return its result unchanged
 *   (per the original notes, top_s * top_m distance computations per point)
 *
 * When verbose is set, the parameters and the load/build times are printed to stdout.
 *
 * @param dataset: file path of the dataset
 * @param k : forwarded to ceos2_kNN_()
 * @return (indices, distances) as produced by ceos2_kNN_(k)
 */
tuple<MatrixXi, MatrixXf> clupig::ceos2_knn_from_file(const string& dataset, const int k)
{
    // Milliseconds elapsed between `since` and now
    const auto elapsedMs = [](const chrono::steady_clock::time_point & since)
    {
        return chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - since).count();
    };

    if (verbose)
    {
        cout << "k: " << k << endl
             << "n_points: " << n_points << endl
             << "n_features: " << n_features << endl
             << "n_proj: " << n_proj << endl
             << "top_s: " << top_s << endl
             << "top_m: " << top_m << endl
             << "top_p: " << top_p << endl
             << "distance: " << distance << endl
             << "kernel features: " << ker_n_features << endl
             << "sigma: " << ker_sigma << endl
             << "interval sampling: " << ker_intervalSampling << endl
             << "n_threads: " << n_threads << endl;
    }

    omp_set_num_threads(n_threads);

    // Stage 1: load the data set
    chrono::steady_clock::time_point tic = chrono::steady_clock::now();
    loadbinData(dataset, distance, n_points, n_features, matrix_X);
    if (verbose)
        cout << "Loading data and check distance support time = " << elapsedMs(tic) << "[ms]" << endl;

    // Stage 2: build the CEOs index
    tic = chrono::steady_clock::now();
    fht_index2_();
    if (verbose)
        cout << "Build CEOs2 time = " << elapsedMs(tic) << "[ms]" << endl;

    // Stage 3: the search itself
    return ceos2_kNN_(k);
}

/**
 * We use CEOs-minmax to construct weighted symmetric kNN G_sm, and run (c, k)-DNP to form clusters for several k
 *
 * @param MATRIX_X
 * @param base_k: starting k
 * @param range_k: step size of k
 * @param c
 */
void clupig::ceos_minmax_dnp_multi_k(const Ref<const RowMajorMatrixXf> & MATRIX_X, const int base_k, const int range_k, const float c)
{
    cout << "base k: " << base_k << endl;
    cout << "range k: " << range_k << endl;
    cout << "c: " << c << endl;

    cout << "n_points: " << n_points << endl;
    cout << "n_features: " << n_features << endl;
    cout << "n_proj: " << n_proj << endl;
    cout << "top_s: " << top_s << endl;
    cout << "top_m: " << top_m << endl;
    cout << "distance: " << distance << endl;
    cout << "kernel features: " << ker_n_features << endl;
    cout << "sigma: " << ker_sigma << endl;
    cout << "interval sampling: " << ker_intervalSampling << endl;
    cout << "n_threads: " << n_threads << endl;

    omp_set_num_threads(n_threads);

    matrix_X = MATRIX_X;
    transformData(matrix_X, distance);
    verbose = true; // set true since we want to test

    chrono::steady_clock::time_point begin;

    begin = chrono::steady_clock::now();
    fht_index_minmax_();
    cout << "Build index time = " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - begin).count() << "[ms]" << endl;

    begin = chrono::steady_clock::now();
    ceos_minmax_sym_Gsm_();
    cout << "Find neighborhoods and distance time = " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - begin).count() << "[ms]" << endl;

    // Try several k
    for (int i = 0; i < 10; ++i)
    {
        int new_k = base_k + i * range_k;

        cout << "k: " << new_k << endl;

        begin = chrono::steady_clock::now();
        dnp_(new_k, c);
        cout << "Form clusters time = " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - begin).count() << "[ms]" << endl;

            string sFileName = output + + "_" + distance +
                               "_k_" + int2str(new_k) +
                               "_KerFeatures_" + int2str(ker_n_features) +
                               "_NumProj_" + int2str(n_proj) +
                               "_TopM_" + int2str(top_m) +
                               "_TopS_" + int2str(top_s);

            outputLabels(labels, sFileName);


    }
}

/**
 * We use CEOs-minmax to construct weighted symmetric kNN G_sm, and run (c, k)-DNP to form clusters for several k
 *
 * @param MATRIX_X
 * @param base_k: starting k
 * @param range_k: step size of k
 * @param c
 */
void clupig::ceos_minmax_dnp_multi_k_from_file(const string& dataset, const int base_k, const int range_k, const float c)
{
    cout << "base k: " << base_k << endl;
    cout << "range k: " << range_k << endl;
    cout << "c: " << c << endl;

    cout << "n_points: " << n_points << endl;
    cout << "n_features: " << n_features << endl;
    cout << "n_proj: " << n_proj << endl;
    cout << "top_s: " << top_s << endl;
    cout << "top_m: " << top_m << endl;
    cout << "top_p: " << top_p << endl;
    cout << "distance: " << distance << endl;
    cout << "kernel features: " << ker_n_features << endl;
    cout << "sigma: " << ker_sigma << endl;
    cout << "interval sampling: " << ker_intervalSampling << endl;
    cout << "n_threads: " << n_threads << endl;

    // omp_set_dynamic(0);     // Explicitly disable dynamic teams
    omp_set_num_threads(n_threads);

    verbose = true; // set true since we want to test

    chrono::steady_clock::time_point begin;

    begin = chrono::steady_clock::now();
    loadbinData(dataset, distance, n_points, n_features, matrix_X);
    cout << "Loading data and check distance support time = " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - begin).count() << "[ms]" << endl;

    begin = chrono::steady_clock::now();
    fht_index_minmax_();
    cout << "Build index time = " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - begin).count() << "[ms]" << endl;

    begin = chrono::steady_clock::now();
    ceos_minmax_sym_Gsm_();
    cout << "Find neighborhoods and distance time = " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - begin).count() << "[ms]" << endl;

    // Try several minPts
    for (int i = 0; i < 10; ++i)
    {
        int new_k = base_k + i * range_k;

        cout << "k: " << new_k << endl;

        begin = chrono::steady_clock::now();
        dnp_(new_k, c);
        cout << "Form clusters time = " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - begin).count() << "[ms]" << endl;

            string sFileName = output + + "_" + distance +
                               "_k_" + int2str(new_k) +
                               "_KerFeatures_" + int2str(ker_n_features) +
                               "_NumProj_" + int2str(n_proj) +
                               "_TopM_" + int2str(top_m) +
                               "_TopS_" + int2str(top_s);

            outputLabels(labels, sFileName);


    }
}

/**
 * We use CEOs2 to construct weighted symmetric kNN G_sm, and run (c, k)-DNP to form clusters for several k
 *
 * @param MATRIX_X
 * @param base_k: starting k
 * @param range_k: step size of k
 * @param c
 */
void clupig::ceos2_dnp_multi_k_from_file(const string& dataset, const int base_k, const int range_k, const float c)
{
    cout << "base k: " << base_k << endl;
    cout << "range k: " << range_k << endl;
    cout << "c: " << c << endl;

    cout << "n_points: " << n_points << endl;
    cout << "n_features: " << n_features << endl;
    cout << "n_proj: " << n_proj << endl;
    cout << "top_s: " << top_s << endl;
    cout << "top_m: " << top_m << endl;
    cout << "top_p: " << top_p << endl;
    cout << "distance: " << distance << endl;
    cout << "kernel features: " << ker_n_features << endl;
    cout << "sigma: " << ker_sigma << endl;
    cout << "interval sampling: " << ker_intervalSampling << endl;
    cout << "n_threads: " << n_threads << endl;

    // omp_set_dynamic(0);     // Explicitly disable dynamic teams
    omp_set_num_threads(n_threads);

    verbose = true; // set true since we want to test

    chrono::steady_clock::time_point begin = chrono::steady_clock::now();

    // loadtxtData(dataset, sVDC::distance, sVDC::n_points, sVDC::n_features, sVDC::matrix_X);
    loadbinData(dataset, distance, n_points, n_features, matrix_X);
    cout << "Loading data and check distance support time = " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - begin).count() << "[ms]" << endl;

    begin = chrono::steady_clock::now();
    fht_index2_();
    // gauss_parIndex2();
    cout << "Build index time = " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - begin).count() << "[ms]" << endl;

    begin = chrono::steady_clock::now();
    ceos_sym_Gsm_();
    cout << "Find neighborhoods and distance time = " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - begin).count() << "[ms]" << endl;

    // Try several minPts
    for (int i = 0; i < 10; ++i)
    {
        int new_k = base_k + i * range_k;

        cout << "k: " << new_k << endl;

        begin = chrono::steady_clock::now();
        dnp_(new_k, c);
        cout << "Form clusters time = " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - begin).count() << "[ms]" << endl;

        string sFileName = output + + "_" + distance +
                           "_k_" + int2str(new_k) +
                           "_KerFeatures_" + int2str(ker_n_features) +
                           "_NumProj_" + int2str(n_proj) +
                           "_TopM_" + int2str(top_m) +
                           "_TopS_" + int2str(top_s) +
                           "_TopP_" + int2str(top_p);

        outputLabels(labels, sFileName);

    }
}












/**
 * This function construct 1 layer CEOs using Gaussian matrix, aiming for supporting small data set
 * For each point, we find its top-s closest and farthest random vectors
 * For each random vector, we find its top-m close/far points. There are always top-m close and top-m far points in case of 1 layer
 *
 * Data structure:
 * - We store the information in a matrix_top_s of size 2*top_s x n_points, where the first top_s is for close, the second top_s is for far away
 *
 * Algorithm:
 * - We use vectorMinQue_CloseTopM and vectorMinQue_FarTopM of size D (n_proj) to keep track top-m close/far points for each random vector
 * - We parallel for each point Xi, compute its projection value on each random vector Ri and update vecMinQue_CloseTopM and vecMinQue_FarTopM
 * - While computing projection value, we also keep track top-s close/far random vectors for each point Xi using minCloseTopK and minFarTopK
 *
 */
void clupig::gauss_index_minmax_()
{
    /** Param for embedding L1 and L2 **/
    int iFourierEmbed_D = ker_n_features / 2; // This is becase we need cos() and sin()

    // See: https://github.com/hichamjanati/srf/blob/master/RFF-I.ipynb
    if (distance == "L1")
        matrix_R_ = cauchyGenerator(iFourierEmbed_D, n_features, 0, 1.0 / ker_sigma, seed); // K(x, y) = exp(-gamma * L1_dist(X, y))) where gamma = 1/sigma
    else if (distance == "L2")
        matrix_R_ = gaussGenerator(iFourierEmbed_D, n_features, 0, 1.0 / ker_sigma, seed); // std = 1/sigma, K(x, y) = exp(-gamma * L2_dist^2(X, y))) where gamma = 1/2 sigma^2

    /** Param for random projection via FHT **/
    // MatrixXf MATRIX_FHT = MatrixXf::Zero(sOptics::n_proj, sOptics::n_points);
    vector<priority_queue< IFPair, vector<IFPair>, greater<> >> vectorMinQue_CloseTopM(n_proj);
    vector<priority_queue< IFPair, vector<IFPair>, greater<> >> vectorMinQue_FarTopM(n_proj);

    matrix_G_ = gaussGenerator(n_proj, n_features, 0.0, 1.0, seed);

    /** Param for index **/
    matrix_top_s_ = MatrixXi::Zero(2 * top_s, n_points); // the first top_s is for close, the second top_s is for far away

    // omp_lock_t locks[sVDC::n_proj]; // stack memory
    vector<omp_lock_t> locks(n_proj); // heap memory

    // Initialize all locks
    for (int d = 0; d < n_proj; ++d) {
        omp_init_lock(&locks[d]);
    }


    /**
    Parallel for each the point Xi: (1) Compute and store dot product, and (2) Extract top-k close/far random vectors
    **/
#pragma omp parallel for
    for (int n = 0; n < n_points; ++n)
    {
        /**
        Random embedding
        TODO: create buildKernelFeatures and random projection as a new function since sOptics-1NN also use it
        **/
        VectorXf vecX = matrix_X.row(n);
        VectorXf vecEmbed = VectorXf::Zero(ker_n_features); // sOptics::ker_n_features >= D

        // NOTE: must ensure ker_n_features = n_features on Cosine
        if (distance == "Cosine")
            vecEmbed.segment(0, n_features) = vecX;
        else if ((distance == "L1") || (distance == "L2"))
        {
            VectorXf vecProject = matrix_R_ * vecX;
            vecEmbed.segment(0, iFourierEmbed_D) = vecProject.array().cos();
            vecEmbed.segment(iFourierEmbed_D, iFourierEmbed_D) = vecProject.array().sin(); // start from iEmbbed, copy iEmbed elements
        }
        else if (distance == "Chi2")
            embedChi2(vecX, vecEmbed, ker_n_features, n_features, ker_intervalSampling);
        else if (distance == "JS")
            embedJS(vecX, vecEmbed, ker_n_features, n_features, ker_intervalSampling);

        /**
        Random projection
        **/

        VectorXf vecRotation = matrix_G_ * vecEmbed; // vecRotation is of size n_proj

        // Store projection matrix for faster parallel, and no need to scale since we keep top-k and top-MinPts
        // MATRIX_FHT.col(n) = vecRotation.segment(0, sOptics::n_proj); // only get up to #n_proj

        /**
        Extract top-k closes and furtherest random vectors
        **/

        Min_PQ_Pair minCloseTopS, minFarTopS;

        for (int d = 0; d < n_proj; ++d)
        {
            float fValue = vecRotation(d); // take the value up to n_proj

            /**
            1) For each random vector Ri, get top-MinPts closest index and top-MinPts furthest index
            2) For each point Xi, get top-K closest random vector and top-K furthest random vector
            **/

            // Note: Different point can change the same random vector in parallel
            omp_set_lock(&locks[d]);
            // #pragma omp critical
            // {

            // Close case
            if ((int)vectorMinQue_CloseTopM[d].size() < top_m)
                vectorMinQue_CloseTopM[d].emplace(n, fValue);

            else if (fValue > vectorMinQue_CloseTopM[d].top().m_fValue)
            {
                vectorMinQue_CloseTopM[d].pop();
                vectorMinQue_CloseTopM[d].emplace(n, fValue);
            }

            // Far case
            if ((int)vectorMinQue_FarTopM[d].size() < top_m)
                vectorMinQue_FarTopM[d].emplace(n, -fValue);

            else if (-fValue > vectorMinQue_FarTopM[d].top().m_fValue)
            {
                vectorMinQue_FarTopM[d].pop();
                vectorMinQue_FarTopM[d].emplace(n, -fValue);
            }

            // }
            omp_unset_lock(&locks[d]);

            // (1) Close: Using priority queue to find top-k closest vectors for each point
            if ((int)minCloseTopS.size() < top_s)
                minCloseTopS.emplace(d, fValue);
            else
            {
                if (fValue > minCloseTopS.top().m_fValue)
                {
                    minCloseTopS.pop();
                    minCloseTopS.emplace(d, fValue);
                }
            }

            // (2) Far: Using priority queue to find top-k furthest vectors
            if ((int)minFarTopS.size() < top_s)
                minFarTopS.emplace(d, -fValue);
            else
            {
                if (-fValue > minFarTopS.top().m_fValue)
                {
                    minFarTopS.pop();
                    minFarTopS.emplace(d, -fValue);
                }
            }
        }

        // Get (sorted by projection value) top-k closest and furthest vector for each point
        for (int s = top_s - 1; s >= 0; --s)
        {
            matrix_top_s_(s, n) = minCloseTopS.top().m_iIndex;
            minCloseTopS.pop();

            matrix_top_s_(s + top_s, n) = minFarTopS.top().m_iIndex;
            minFarTopS.pop();
        }

    }

    // Destroy all locks
    for (int d = 0; d < n_proj; ++d) {
        omp_destroy_lock(&locks[d]);
    }

    /**
    For each random vector, extract top-m close/far data points
    **/
    matrix_top_m_ = -MatrixXi::Ones(2 * top_m, n_proj); // the first topM is for close, the second topM is for far away

    // omp_set_dynamic(0);     // Explicitly disable dynamic teams
//    omp_set_num_threads(sOptics::n_threads); // TODO: do we need to set it for each for call?

    /**
    Parallel for each random vector, getting 2*top-m as close and far candidates
    **/
#pragma omp parallel for
    for (int d = 0; d < n_proj; ++d)
    {
        // sort(begin(matProject.col(d)), end(matProject.col(d)), [](float lhs, float rhs){return rhs > lhs});

        // Min_PQ_Pair minPQ_Close;
        // Min_PQ_Pair minPQ_Far;

        // VectorXf vecProject = MATRIX_FHT.row(d); // it must be row since D x N
        //
        // for (int n = 0; n < sOptics::n_points; ++n)
        // {
        //     float fValue = vecProject(n);
        //
        //     // Close
        //     if ((int)minPQ_Close.size() < sOptics::topM)
        //         minPQ_Close.emplace(n, fValue);
        //     else
        //     {
        //         if (fValue > minPQ_Close.top().m_fValue)
        //         {
        //             minPQ_Close.pop();
        //             minPQ_Close.emplace(n, fValue);
        //         }
        //     }
        //
        //     // Far
        //     if ((int)minPQ_Far.size() < sOptics::topM)
        //         minPQ_Far.emplace(n, -fValue);
        //     else
        //     {
        //         if (-fValue > minPQ_Far.top().m_fValue)
        //         {
        //             minPQ_Far.pop();
        //             minPQ_Far.emplace(n, -fValue);
        //         }
        //     }
        // }
        //
        // for (int m = sOptics::topM - 1; m >= 0; --m)
        // {
        //     // Close
        //     sOptics::matrix_topM(m, d) = minPQ_Close.top().m_iIndex;
        //     minPQ_Close.pop();
        //
        //     // Far
        //     sOptics::matrix_topM(m + sOptics::topM, d) = minPQ_Far.top().m_iIndex;
        //     minPQ_Far.pop();
        // }

        for (int m = top_m - 1; m >= 0; --m)
        {
            // Close
            matrix_top_m_(m, d) = vectorMinQue_CloseTopM[d].top().m_iIndex;
            vectorMinQue_CloseTopM[d].pop();

            // Far
            matrix_top_m_(m + top_m, d) = vectorMinQue_FarTopM[d].top().m_iIndex;
            vectorMinQue_FarTopM[d].pop();
        }

    }
}

/**
 * This function constructs 1 layer CEOs using FHT, aiming for supporting small data set
 * For each point, we find its top-s closest and farthest random vectors
 * For each random vector, we find its top-m close/far points. There are always top-m close and top-m far points in case of 1 layer
 *
 * Data structure:
 * - We store the information in a matrix_top_s of size 2*top_s x n_points, where the first top_s is for close, the second top_s is for far away
 *
 * Algorithm:
 * - We use vectorMinQue_CloseTopM and vectorMinQue_FarTopM of size D (n_proj) to keep track top-m close/far points for each random vector
 * - We parallel for each point Xi, compute its projection value on each random vector Ri and update vecMinQue_CloseTopM and vecMinQue_FarTopM
 * - While computing projection value, we also keep track top-s close/far random vectors for each point Xi using minCloseTopK and minFarTopK
 *
 */
void clupig::fht_index_minmax_()
{
    /** Param for embedding L1 and L2 **/
    int iFourierEmbed_D = ker_n_features / 2; // We need cos() and sin()

    // See: https://github.com/hichamjanati/srf/blob/master/RFF-I.ipynb
    if (distance == "L1")
        matrix_R_ = cauchyGenerator(iFourierEmbed_D, n_features, 0, 1.0 / ker_sigma, seed); // K(x, y) = exp(-gamma * L1_dist(X, y))) where gamma = 1/sigma
    else if (distance == "L2")
        matrix_R_ = gaussGenerator(iFourierEmbed_D, n_features, 0, 1.0 / ker_sigma, seed); // std = 1/sigma, K(x, y) = exp(-gamma * L2_dist^2(X, y))) where gamma = 1/2 sigma^2

    /** Param for random projection via FHT **/
    // MatrixXf MATRIX_FHT = MatrixXf::Zero(sOptics::n_proj, sOptics::n_points);
    vector<priority_queue< IFPair, vector<IFPair>, greater<> >> vectorMinQue_CloseTopM(n_proj);
    vector<priority_queue< IFPair, vector<IFPair>, greater<> >> vectorMinQue_FarTopM(n_proj);

    int log2Project = log2(fhtDim_);
    bitHD3Generator(fhtDim_ * n_rotate_, seed, bitHD_);

    /** Param for index **/
    matrix_top_s_ = MatrixXi::Zero(2 * top_s, n_points); // the first top-s is for close, the second top-s is for far away

    // omp_lock_t locks[sVDC::n_proj]; // stack memory
    vector<omp_lock_t> locks(n_proj); // heap memory

    // Initialize all locks
// #pragma omp parallel for
    for (int d = 0; d < n_proj; ++d) {
        omp_init_lock(&locks[d]);
    }

    /**
    Parallel for each the point Xi: (1) Compute and store dot product, and (2) Extract top-k close/far random vectors
    **/
#pragma omp parallel for
    for (int n = 0; n < n_points; ++n)
    {
        /**
        Random embedding
        TODO: create buildKernelFeatures and random projection as a new function since sOptics-1NN also use it
        **/
        VectorXf vecX = matrix_X.row(n);
        VectorXf vecEmbed = VectorXf::Zero(ker_n_features); // sOptics::ker_n_features >= D

        // NOTE: must ensure ker_n_features = n_features on Cosine
        if (distance == "Cosine")
            vecEmbed.segment(0, n_features) = vecX;
        else if ((distance == "L1") || (distance == "L2"))
        {
            VectorXf vecProject = matrix_R_ * vecX;
            vecEmbed.segment(0, iFourierEmbed_D) = vecProject.array().cos();
            vecEmbed.segment(iFourierEmbed_D, iFourierEmbed_D) = vecProject.array().sin(); // start from iEmbbed, copy iEmbed elements
        }
        else if (distance == "Chi2")
            embedChi2(vecX, vecEmbed, ker_n_features, n_features, ker_intervalSampling);
        else if (distance == "JS")
            embedJS(vecX, vecEmbed, ker_n_features, n_features, ker_intervalSampling);

        /**
        Random projection
        **/

        VectorXf vecRotation = VectorXf::Zero(fhtDim_); // NUM_PROJECT > PARAM_KERNEL_EMBED_D
        vecRotation.segment(0, ker_n_features) = vecEmbed;

        for (int r = 0; r < n_rotate_; ++r)
        {
            // Component-wise multiplication with a random sign
            for (int d = 0; d < fhtDim_; ++d)
            {
                vecRotation(d) *= (2 * static_cast<float>(bitHD_[r * fhtDim_ + d]) - 1);
            }

            // Multiple with Hadamard matrix by calling FWHT transform
            fht_float(vecRotation.data(), log2Project);
            // FWHT(vecRotation);
        }


        /**
        Extract top-k closes and furtherest random vectors
        **/
        Min_PQ_Pair minCloseTopS;
        Min_PQ_Pair minFarTopS;

        for (int d = 0; d < n_proj; ++d)
        {
            float fValue = vecRotation(d); // take the value up to n_proj

            /**
            1) For each random vector Ri, get top-MinPts closest index and top-MinPts furthest index
            2) For each point Xi, get top-K closest random vector and top-K furthest random vector
            **/

            // Note: Different point can change the same random vector in parallel
            omp_set_lock(&locks[d]);

            // Close case
            if ((int)vectorMinQue_CloseTopM[d].size() < top_m)
                vectorMinQue_CloseTopM[d].emplace(n, fValue);

            else if (fValue > vectorMinQue_CloseTopM[d].top().m_fValue)
            {
                vectorMinQue_CloseTopM[d].pop();
                vectorMinQue_CloseTopM[d].emplace(n, fValue);
            }

            // Far case
            if ((int)vectorMinQue_FarTopM[d].size() < top_m)
                vectorMinQue_FarTopM[d].emplace(n, -fValue);

            else if (-fValue > vectorMinQue_FarTopM[d].top().m_fValue)
            {
                vectorMinQue_FarTopM[d].pop();
                vectorMinQue_FarTopM[d].emplace(n, -fValue);
            }

            omp_unset_lock(&locks[d]);

            // (1) Close: Using priority queue to find top-k closest vectors for each point
            if ((int)minCloseTopS.size() < top_s)
                minCloseTopS.emplace(d, fValue);
            else
            {
                if (fValue > minCloseTopS.top().m_fValue)
                {
                    minCloseTopS.pop();
                    minCloseTopS.emplace(d, fValue);
                }
            }

            // (2) Far: Using priority queue to find top-k furthest vectors
            if ((int)minFarTopS.size() < top_s)
                minFarTopS.emplace(d, -fValue);
            else
            {
                if (-fValue > minFarTopS.top().m_fValue)
                {
                    minFarTopS.pop();
                    minFarTopS.emplace(d, -fValue);
                }
            }
        }

        ASSERT_RELEASE((int)minCloseTopS.size() == top_s, "Not enough topK");
        ASSERT_RELEASE((int)minFarTopS.size() == top_s, "Not enough topK");

        // Get (sorted by projection value) top-k closest and furthest vector for each point
        for (int s = top_s - 1; s >= 0; --s)
        {
            matrix_top_s_(s, n) = minCloseTopS.top().m_iIndex;
            minCloseTopS.pop();

            matrix_top_s_(s + top_s, n) = minFarTopS.top().m_iIndex;
            minFarTopS.pop();
        }

    }

    // Destroy all locks
    for (int d = 0; d < n_proj; ++d) {
        omp_destroy_lock(&locks[d]);
    }

    /**
    For each random vector, extract top-m close/far data points
    **/
    matrix_top_m_ = -MatrixXi::Ones(2 * top_m, n_proj); // the first topM is for close, the second topM is for far away

    /**
    Parallel for each random vector, getting 2*top-m as close and far candidates
    **/
#pragma omp parallel for
    for (int d = 0; d < n_proj; ++d)
    {
        ASSERT_RELEASE((int)vectorMinQue_CloseTopM[d].size() == top_m, "Not enough topM");
        ASSERT_RELEASE((int)vectorMinQue_FarTopM[d].size() == top_m, "Not enough topM");

        for (int m = top_m - 1; m >= 0; --m)
        {
            // Close
            matrix_top_m_(m, d) = vectorMinQue_CloseTopM[d].top().m_iIndex;
            vectorMinQue_CloseTopM[d].pop();

            // Far
            matrix_top_m_(m + top_m, d) = vectorMinQue_FarTopM[d].top().m_iIndex;
            vectorMinQue_FarTopM[d].pop();
        }

    }
}

/**
 * Builds the 1-layer CEOs index using FHT, aiming at small data sets.
 *
 * The D (n_proj) random vectors are treated as 2D signed vectors: index r stands for +Ri and
 * index r + n_proj stands for -Ri. For a point Xi and a random vector Ri, a negative projection
 * sign selects -Ri, otherwise +Ri; the candidate is scored by the absolute projection value.
 *
 * Data structure:
 * - matrix_top_s_ (top_s x n_points): for each point, the [2D] indices of its top-s closest signed
 *   random vectors; row 0 holds the largest score
 * - matrix_top_m_ (top_m x 2D): for each signed random vector, the indices of its top-m closest
 *   points; row 0 holds the largest score. Since every point contributes to only one of +Ri / -Ri,
 *   a signed vector may collect fewer than top-m points; the unused trailing rows keep the value -1.
 *
 * Algorithm:
 * - We use vectorMinQue_TopM of size 2D (2 * n_proj) to keep track of the top-m closest points for each signed random vector
 * - We parallelize over the points Xi: compute the projection value on each random vector Ri and update vectorMinQue_TopM
 * - While computing the projection values, we also keep track of the top-s closest signed random vectors
 *   for each point Xi using minQueTopS
 *
 * Precondition (not checked here): top_s <= n_proj, otherwise minQueTopS holds fewer than top_s entries.
 */
void clupig::fht_index1_()
{
    /** Param for embedding L1 and L2 **/
    int iFourierEmbed_D = ker_n_features / 2; // This is because we need cos() and sin()

    // See: https://github.com/hichamjanati/srf/blob/master/RFF-I.ipynb
    if (distance == "L1")
        matrix_R_ = cauchyGenerator(iFourierEmbed_D, n_features, 0, 1.0 / ker_sigma, seed); // K(x, y) = exp(-gamma * L1_dist(X, y))) where gamma = 1/sigma
    else if (distance == "L2")
        matrix_R_ = gaussGenerator(iFourierEmbed_D, n_features, 0, 1.0 / ker_sigma, seed); // std = 1/sigma, K(x, y) = exp(-gamma * L2_dist^2(X, y))) where gamma = 1/2 sigma^2

    /** Param for random projection via FHT **/
    int num2D = 2 * n_proj;
    vector<priority_queue< IFPair, vector<IFPair>, greater<> >> vectorMinQue_TopM(num2D);

    int log2Project = log2(fhtDim_);
    bitHD3Generator(fhtDim_ * n_rotate_, seed, bitHD_);

    /** Param for index **/
    matrix_top_s_ = MatrixXi::Zero(top_s, n_points);

    // One lock per signed random vector; different points may update the same heap in parallel
    vector<omp_lock_t> locks(num2D);
    for (int d = 0; d < num2D; ++d) {
        omp_init_lock(&locks[d]);
    }

    /**
    Parallel for each point Xi: (1) compute the projections, (2) extract its top-s closest signed
    random vectors, and (3) offer Xi to the top-m heap of every signed vector it projects onto
    **/
#pragma omp parallel for
    for (int n = 0; n < n_points; ++n)
    {
        /**
        Random embedding
        **/
        VectorXf vecX = matrix_X.row(n);
        VectorXf vecEmbed = VectorXf::Zero(ker_n_features);

        /// Must ensure ker_n_features = n_features on Cosine
        if (distance == "Cosine")
            vecEmbed.segment(0, n_features) = vecX;
        else if ((distance == "L1") || (distance == "L2"))
        {
            VectorXf vecProject = matrix_R_ * vecX;
            vecEmbed.segment(0, iFourierEmbed_D) = vecProject.array().cos();
            vecEmbed.segment(iFourierEmbed_D, iFourierEmbed_D) = vecProject.array().sin(); // second half, iFourierEmbed_D elements
        }
        else if (distance == "Chi2")
            embedChi2(vecX, vecEmbed, ker_n_features, n_features, ker_intervalSampling);
        else if (distance == "JS")
            embedJS(vecX, vecEmbed, ker_n_features, n_features, ker_intervalSampling);

        /**
        Random projection: zero-pad the embedding to fhtDim_, then apply n_rotate_ rounds of
        (random sign flip, Hadamard transform)
        **/
        VectorXf vecRotation = VectorXf::Zero(fhtDim_);
        vecRotation.segment(0, ker_n_features) = vecEmbed;

        for (int r = 0; r < n_rotate_; ++r)
        {
            // Component-wise multiplication with a random sign (bit mapped to -1 / +1)
            for (int d = 0; d < fhtDim_; ++d)
            {
                vecRotation(d) *= (2 * static_cast<float>(bitHD_[r * fhtDim_ + d]) - 1);
            }

            // Multiply with the Hadamard matrix by calling the FWHT transform
            fht_float(vecRotation.data(), log2Project);
        }

        /** Extract the top-s closest signed vectors for this point and
         * offer this point to the top-m heap of each of those signed vectors.
         * The vector index lives in [2D], so there are no separate "far" candidates.
         **/
        Min_PQ_Pair minQueTopS;

        for (int r = 0; r < n_proj; ++r)
        {
            int iSign = sgn(vecRotation(r));
            float fAbsHashValue = iSign * vecRotation(r);

            // Index of the random vector in [2D] after considering the sign.
            // An offset of n_proj (rather than setting bit log2Project) is used because
            // n_proj need not be a power of two.
            int Ri_2D = r;
            if (iSign < 0)
                Ri_2D += n_proj;

            // Top-s for the point (thread-local heap, no lock needed)
            if ((int)minQueTopS.size() < top_s)
                minQueTopS.emplace(Ri_2D, fAbsHashValue);
            else if (fAbsHashValue > minQueTopS.top().m_fValue)
            {
                minQueTopS.pop();
                minQueTopS.emplace(Ri_2D, fAbsHashValue);
            }

            // Top-m for the signed vector (shared heap, guarded by its lock)
            omp_set_lock(&locks[Ri_2D]);

            if ((int)vectorMinQue_TopM[Ri_2D].size() < top_m)
                vectorMinQue_TopM[Ri_2D].emplace(n, fAbsHashValue);

            else if (fAbsHashValue > vectorMinQue_TopM[Ri_2D].top().m_fValue)
            {
                vectorMinQue_TopM[Ri_2D].pop();
                vectorMinQue_TopM[Ri_2D].emplace(n, fAbsHashValue);
            }

            omp_unset_lock(&locks[Ri_2D]);
        }

        // Get the top-s closest random vectors for each point, written bottom-up so that
        // row 0 holds the largest score. Relies on the precondition top_s <= n_proj.
        for (int s = top_s - 1; s >= 0; --s)
        {
            matrix_top_s_(s, n) = minQueTopS.top().m_iIndex;
            minQueTopS.pop();
        }

    }

    // Destroy locks
    for (int d = 0; d < num2D; d++) {
        omp_destroy_lock(&locks[d]);
    }

    // Extract the top-m points for each random vector in [2D]
    matrix_top_m_ = -MatrixXi::Ones(top_m, num2D); // default -1

#pragma omp parallel for
    for (int d = 0; d < num2D; ++d)
    {
        // A signed vector only receives the points whose projection has its sign, so its heap
        // may hold fewer than top_m entries (possibly none). Drain only what was collected:
        // calling top()/pop() on an empty priority_queue is undefined behaviour.
        // Rows [nKept, top_m) keep the -1 default.
        // NOTE(review): confirm that ceos_sym_Gsm_ skips negative entries of matrix_top_m_.
        const int nKept = static_cast<int>(vectorMinQue_TopM[d].size());

        for (int m = nKept - 1; m >= 0; --m)
        {
            matrix_top_m_(m, d) = vectorMinQue_TopM[d].top().m_iIndex;
            vectorMinQue_TopM[d].pop();
        }
    }
}


/**
 * We use 1 layer CEOs to construct weighted symmetric kNN, and run (c, k)-DNP to form clusters
 * - We call fht_index0() that use FHT to simulate Gaussian matrix,
 * and considers both close and far random vectors for each point.
 * Alternatively, we can use gauss_index0() that uses Gaussian matrix directly
 * - We call approx_sym_kNN0() to find the weighted symmetric kNN graph
 * - Hence we need 2 * top_s * top_m distance computation for each point
 *
 * If you want to use 1 layer CEOs with only closest random vectors, please call fht_index1() and approx_sym_kNN()
 * But note that fht_index1() use top-s * top-m distance computation for each point
 *
 * Data structure:
 * - Output of fht_index0(): vec2D_NeighborDist: store the approx weighted symmetric minPts-NN graph
 * Note that the degree of each point might be larger than k, especially for points in dense region
 * - Output of run_DNP(): labels_: store the cluster labels
 *
 * @param MATRIX_X
 * @param k
 * @param c
 */
void clupig::ceos_minmax_dnp(const Ref<const RowMajorMatrixXf> & MATRIX_X, const int k, const float c)
{
    if (verbose)
    {
        cout << "k: " << k << endl;

        cout << "n_points: " << n_points << endl;
        cout << "n_features: " << n_features << endl;
        cout << "n_proj: " << n_proj << endl;
        cout << "top_s: " << top_s << endl;
        cout << "top_m: " << top_m << endl;
        cout << "top_p: " << top_p << endl;
        cout << "distance: " << distance << endl;
        cout << "kernel features: " << ker_n_features << endl;
        cout << "sigma: " << ker_sigma << endl;
        cout << "interval sampling: " << ker_intervalSampling << endl;
        cout << "n_threads: " << n_threads << endl;

    }

    // omp_set_dynamic(0);     // Explicitly disable dynamic teams
    omp_set_num_threads(n_threads);

    chrono::steady_clock::time_point begin;

    begin = chrono::steady_clock::now();
    matrix_X = MATRIX_X;
    transformData(matrix_X, distance);

    if (verbose)
        cout << "Copy data and check distance support time = " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - begin).count() << "[ms]" << endl;

    begin = chrono::steady_clock::now();
    fht_index_minmax_();
    // gauss_index_minmax();
    // fht_index1();

    if (verbose)
        cout << "Build index time = " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - begin).count() << "[ms]" << endl;

    // Find core point
    begin = chrono::steady_clock::now();
    ceos_minmax_sym_Gsm_();
    // sVDC::approx_sym_kNN();

    if (verbose)
        cout << "Find neighborhoods and distance time = " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - begin).count() << "[ms]" << endl;

    begin = chrono::steady_clock::now();
    dnp_(k, c);

    if (verbose)
        cout << "Form sVDC time = " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - begin).count() << "[ms]" << endl;
}

/**
 * We use 1 layer CEOs to construct weighted symmetric kNN, and run (c, k)-DNP to form clusters
 * - We call fht_index0() that use FHT to simulate Gaussian matrix,
 * and considers both close and far random vectors for each point.
 * Alternatively, we can use gauss_index0() that uses Gaussian matrix directly
 * - We call approx_sym_kNN0() to find the weighted symmetric kNN graph
 * - Hence we need 2 * top_s * top_m distance computation for each point
 *
 * If you want to use 1 layer CEOs with only close random vectors, please call fht_index1() and approx_sym_kNN()
 * But note that fht_index1() use top-s * top-m distance computation for each point
 *
 * Data structure:
 * - Output of fht_index0(): vec2D_NeighborDist: store the approx weighted symmetric minPts-NN graph
 * Note that the degree of each point might be larger than k, especially for points in dense region
 * - Output of run_DNP(): labels_: store the cluster labels
 *
 * @param dataset: file path of the dataset
 * @param k
 * @param c
 */
void clupig::ceos1_dnp_from_file(const string& dataset, const int k, const float c)
{
    if (verbose)
    {
        cout << "k: " << k << endl;

        cout << "n_points: " << n_points << endl;
        cout << "n_features: " << n_features << endl;
        cout << "n_proj: " << n_proj << endl;

        cout << "top_s: " << top_s << endl;
        cout << "top_m: " << top_m << endl;
        cout << "distance: " << distance << endl;
        cout << "kernel features: " << ker_n_features << endl;
        cout << "sigma: " << ker_sigma << endl;
        cout << "interval sampling: " << ker_intervalSampling << endl;
        cout << "n_threads: " << n_threads << endl;
    }

    // omp_set_dynamic(0);     // Explicitly disable dynamic teams
    omp_set_num_threads(n_threads);

    chrono::steady_clock::time_point begin, start;

    begin = chrono::steady_clock::now();
    loadbinData(dataset, distance, n_points, n_features, matrix_X);
    if (verbose)
        cout << "Loading data time = " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - begin).count() << "[ms]" << endl;

    start = chrono::steady_clock::now();

    begin = chrono::steady_clock::now();
    fht_index1_();
    if (verbose)
        cout << "Build index time = " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - begin).count() << "[ms]" << endl;

    // Find core point
    begin = chrono::steady_clock::now();
    ceos_sym_Gsm_();
    if (verbose)
        cout << "Find neighborhoods and distance time = " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - begin).count() << "[ms]" << endl;

    begin = chrono::steady_clock::now();
    dnp_(k, c);
    if (verbose)
        cout << "Form optics time = " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - begin).count() << "[ms]" << endl;

    cout << "Form sVDC time (excluding loading data) = " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - start).count() << "[ms]" << endl;

}

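 /**
 * Early implementation of (c, k)-DNP. The code below is commented out and kept for reference only.
 *
 * @param minPts : neighborhood size; the distance to the minPts-th neighbor is used as the density estimate
 * @param c : round(c * minPts) nearest neighbors are checked for the predecessor's label
 */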
// void sVDC::run_DNP_test(const int minPts, const float c)
// {
//     // Truncate neighborhood to only minPts points
// // #pragma omp parallel for
// //     for (int n = 0; n < sVDC::n_points; ++n)
// //     {
// //         if (sVDC::vec2D_NeighborDist[n].size() > minPts)
// //             sVDC::vec2D_NeighborDist[n].resize(minPts);
// //     }
//
//     // sVDC::predecessors_ = IVector(sVDC::n_points, -1);   //noise = -1
//     sVDC::labels_ = IVector(sVDC::n_points, -1);
//
//     int density_K = minPts; //min(10, minPts);
//
//     boost::dynamic_bitset<> processSet(sVDC::n_points);
//     FVector minConnectedDist(sVDC::n_points, POS_INF); // assign max distance
//     FVector vec_density(sVDC::n_points, 0.0);
//     IVector sortedIndex_density = IVector(sVDC::n_points, -1);
//
// #pragma omp parallel for
//     for (int n = 0; n < sVDC::n_points; ++n)
//     {
//         // init index from 0 to n
//         sortedIndex_density[n] = n;
//
//         // Note: Using minPts-dist is quite important.
//         // If X1 has smallest minpts-dist, any X2 in kNN(X1) should have larger minPts-dist
//         // Hence, X1 will be in kNN(X2) and we can always connect X2 from X1
//         // In case there is not enough minPts points in the neighborhood
//
//         float density_dist = sVDC::vec2D_NeighborDist[n][density_K - 1].second; // minPts-1, since index starts from 0
//
//         if (density_dist > 0) // we might use [minPts - 1]
//             vec_density[n] = 1.0 / density_dist;
//
//         // Get avg top-k to have better density estimate
//         // for (int i = 0; i < minPts; ++i)
//         // {
//         //     float dist = sOptics::vec2D_NeighborDist[n][i].second; // second: distance
//         //     if (dist > 0) // we might use [minPts - 1]
//         //         vec_density[n] += (dist / minPts);
//         // }
//         // vec_density[n] = vec_density[n] != 0.0 ? 1.0 / vec_density[n] : 0.0; // avoid division by zero
//     }
//
//     sort(sortedIndex_density.begin(), sortedIndex_density.end(),
//         [&](int i1, int i2){
//             return vec_density[i1] > vec_density[i2];
//         }
//     );
//
//     // Note: We still need to use density to compute the average density for cluster
//     // Note: Since distance range is too large, compare to 1/dist \in [0, 1]
//     // Note: The sensitivity of the cluster quality is much better with density, compared to distance
//
//     // Per-cluster tuple: (sum of densities, sum of squared densities, cluster size)
//     vector< std::tuple<float, float, int> > vecClusters;
//
//     // Starting with cluster Id = -1
//     int clusterId = -1;
//
//     // Start from the highest density point idx
//     for (const auto& topDens_Idx : sortedIndex_density)
//     {
//         // If it is already processed, then skip and go to next point
//         if (processSet[topDens_Idx])
//             continue;
//
//         processSet[topDens_Idx] = true;
//
//         // increase cluster Id
//         clusterId = clusterId + 1;
//         sVDC::labels_[topDens_Idx] = clusterId;
//         // cout << clusterId << "th cluster from the highest density points: " << endl;
//
//         // Min PQ has 3 values: (1) Xi, (2) Predecessor Idx, (3) weight
//         Min_PQ_Triple seedSet;
//
//         vector< pair<int, float> > Xi_neighborhood = sVDC::vec2D_NeighborDist[topDens_Idx];
//
//         // For all Xj is neighbor of core Xi, insert into the PQ with its predecessor Xi
//         for (const auto & point : Xi_neighborhood)
//         {
//
//             int Xj = point.first; // first: idx, second: dist
//
//             // only update if it is not processed
//             if (processSet[Xj])
//                 continue;
//
//             // Note: we can add more parameter to control the running time
//             // This is for the case (2km + additional points) neighbors are too large and cover points are not on similar density, i.e. dist(Xi, Xj) >> kNN(Xi)
//             // We will pick the first top-minPts points, then the rest depends on dist(Xi, Xj) < (1 +- alpha) kNN(Xi)
//             // Since Xi_neighbor is sorted, so we should break
//             // if (vec_density[topDens_Idx] * point.second > 1 + sOptics::alpha)
//             //     break;
//
//             // Simulate Density-Peak, keep min connected distance with higher density points
//             // This will reduce the size of PQ, improving running time
//             if (minConnectedDist[Xj] < point.second) // point.second= dist(Xi, Xj)
//                 continue;
//
//             // Heuristic to reduce PQ size: only add to PQ for smaller connected dist(Xi, Xj)
//             // This idea is similar to Optics, i.e. keeping the minimum reachability dist so far
//             minConnectedDist[Xj] = point.second;
//
//
//             // Xi_neighborhood[j].second = dist(Xi, Xj)
//             // float weight = (Xi_neighborhood[j].second + sOptics::vec_CoreDist[Xj]) / 2;
//             float weight = (point.second + sVDC::vec2D_NeighborDist[Xj][density_K - 1].second) / 2;
//
//             // Sorted by weight, but store extra information, i.e. highest-index = connected core point,
//             // to form cluster
//             seedSet.emplace(Xj, topDens_Idx, weight); // point idx, predecessor idx, weight
//
//         }
//
//         // Cluster info: avgDen, Mn, size
//         vecClusters.emplace_back(vec_density[topDens_Idx], vec_density[topDens_Idx] * vec_density[topDens_Idx] , 1);
//
//         // Processing PQ for label propagation
//         while (!seedSet.empty())
//         {
//             int Xj = seedSet.top().m_iIndex; // consider the new point which is connected by the highest density point
//             int Xi = seedSet.top().m_iPred;
//
//             float distXiXj = seedSet.top().m_fValue * 2 - sVDC::vec2D_NeighborDist[Xj][density_K - 1].second; // dist(Xi, Xj) = (weight * 2 - kNN(Xj))
//
//             seedSet.pop();
//
//             if (processSet[Xj])
//                 continue;
//
//             processSet[Xj] = true; // set processed
//
//             int predLabel = sVDC::labels_[Xi];
//
//             // TODO: Welford’s algorithm for streaming Variance Computation
//             float oldSum = get<0>(vecClusters[predLabel]);
//             float oldSumSquare = get<1>(vecClusters[predLabel]);
//             int clusterSize = get<2>(vecClusters[predLabel]);
//
//             float oldMean = oldSum / clusterSize;
//             float oldStd = 0.0;
//             float temp = oldSumSquare / clusterSize - oldMean * oldMean;
//             if (temp > 0.0)
//                 oldStd = sqrt(temp);
//
//             float newMean = (oldSum + vec_density[Xj]) / (clusterSize + 1.0);
//
//             bool bExpandCluster = true;
//             // Cluster size > 50 to ensure the std estimation is correct
//             // If clusterSize < 50, then always propagate labels to its neighbors
//             if (clusterSize > sVDC::min_cluster_size)
//             {
//                 // Note: We should remove beta as we prefer less parameter to tune
//                 size_t t1 = min((size_t)minPts, sVDC::vec2D_NeighborDist[Xi].size());
//                 size_t t2 = min((size_t)minPts, sVDC::vec2D_NeighborDist[Xj].size());
//                 float alpha = 2.0;
//                 if ( /* newMean > (oldMean + (1.0 + alpha) * oldStd) || newMean < (oldMean - (1.0 + alpha) * oldStd) || */
//                     distXiXj > (sVDC::vec2D_NeighborDist[Xi][t1 - 1].second + sVDC::vec2D_NeighborDist[Xj][t2 - 1].second)
//                     )
//                     bExpandCluster = false;
//             }
//
//
//             // Condition 1: Check density-ratio between the current cluster and new cluster is within 1 +/- alpha
//             // If it does not hold, unprocess the point
//             if ( bExpandCluster )
//             {
//                 vector< pair<int, float> > Xj_neighborhood = sVDC::vec2D_NeighborDist[Xj];
//                 vector<pair<int, float>> top_KNN(Xj_neighborhood.begin(), Xj_neighborhood.begin() +
//                     min(static_cast<int>(round(c * minPts)), static_cast<int>(Xj_neighborhood.size())));
//
//                 // Note: Check one of minPts neighbors has label as the predecessor
//                 // as we want to spread cluster info via min reachability-dist
//                 bool hasLabel = false;
//
//                 for (const auto& p : top_KNN)
//                 {
//                     if (labels_[p.first] == predLabel)
//                     {
//                         hasLabel = true;
//                         break;
//                     }
//                 }
//
//                 // All kNN points do not have predecessor label, create new cluster
//                 if ( !hasLabel )
//                 {
//                     clusterId = clusterId + 1;
//                     // // cout << "all negative one" << endl;
//                     sVDC::labels_[Xj] = clusterId;
//                     vecClusters.emplace_back(vec_density[Xj], vec_density[Xj] * vec_density[Xj], 1);
//                 }
//                 else // Use the predecessor's label
//                 {
//                     sVDC::labels_[Xj] = predLabel; // label of predecessor
//
//                     // Update the cluster of predecessor's label
//                     get<0>(vecClusters[predLabel]) = oldSum + vec_density[Xj];
//                     get<1>(vecClusters[predLabel]) = oldSumSquare + vec_density[Xj] * vec_density[Xj];
//                     get<2>(vecClusters[predLabel]) += 1;
//                 }
//
//                 // Now we extend the seedSet with Xj_neighborhood
//                 // Case 1: If Xj starts the new cluster, we tend to process the points around Xj in the new cluster
//                 // If this is the case, then we might process border points from previous cluster
//                 // This is why we keep predecessors' label to connect border points to previous cluster.
//                 // Case 2: If Xj is connected to the old cluster, we also extend PQ with Xj's neighbors
//
//                 for (auto & p : Xj_neighborhood)
//                 {
//                     int Xk = p.first; // first: point idx, second: dist
//
//                     // only update if it is not processed
//                     if (processSet[Xk])
//                         continue;
//
//                     // Note: This condition is nice to reduce PQ since we aim at finding min reachability distance
//                     if (minConnectedDist[Xk] < p.second)
//                         continue;
//
//                     // Heuristic to reduce PQ size: only add to PQ for smaller connected dist(Xi, Xj)
//                     minConnectedDist[Xk] = p.second;
//
//                     // float weight = (p.second + sOptics::vec_CoreDist[Xk]) / 2;
//                     float weight = (p.second + sVDC::vec2D_NeighborDist[Xk][density_K - 1].second) / 2;
//                     seedSet.emplace(Xk, Xj, weight);
//
//                 }
//             }
//             else
//             {
//                 // Note: If we reset PQ for new cluster Xj, then we mis-classify the border point from previous cluster
//                 // The cluster quality is significantly decreased
//                 processSet[Xj] = false;
//
//             }
//         }
//     }
// }
