/*!
 * This file contains a document listing class, which implements
 * strategy GREEDY in the article:
 * J. S. Culpepper, G. Navarro, S. J. Puglisi and A. Turpin:
 * ,,Top-k Ranked Document Search in General Text Databases''
 * Proceedings Part II of the 18th Annual European Symposium on
 * Algorithms (ESA 2010)
 */
#ifndef DOCUMENT_LISING_GREEDY_INCLUDED
#define DOCUMENT_LISING_GREEDY_INCLUDED

#include <fstream>
#include <list>
#include <string>
#include <utility>
#include <vector>

#include <sdsl/rmq_support.hpp>
#include <sdsl/suffix_arrays.hpp>

#include "doc_list_index.hpp"

using std::vector;

namespace sdsl
{

template <class t_csa = csa_wt<wt_huff<rrr_vector<63>>, 1000000, 1000000>,
          class t_wtd = wt_int<bit_vector, rank_support_v5<1>, select_support_scan<1>, select_support_scan<0>>,
          typename t_csa::char_type t_doc_delim = 1>
class doc_list_index_greedy
{
public:
    using size_type = typename t_wtd::size_type;
    using value_type = typename t_wtd::value_type;

    typedef t_csa csa_type;
    typedef t_wtd wtd_type;
    typedef std::vector<std::pair<size_type, size_type>> list_type;
    typedef doc_list_tag index_category;

    enum
    {
        WIDTH = t_csa::alphabet_category::WIDTH
    };

    class result : public list_type
    {
    private:
        size_type m_sp, m_ep;

    public:
        // Number of occurrences
        size_type count()
        {
            return m_ep - m_sp + 1;
        }

        // Constructors for an empty result and for a result in the interval [sp, ep]:
        result(size_type sp, size_type ep, list_type && l) : list_type(l), m_sp(sp), m_ep(ep)
        {}
        result() : m_sp(1), m_ep(0)
        {}
        result(size_type sp, size_type ep) : m_sp(sp), m_ep(ep)
        {}
        result & operator=(result const & res)
        {
            if (this != &res)
            {
                list_type::operator=(res);
                m_sp = res.m_sp;
                m_ep = res.m_ep;
            }
            return *this;
        }
    };

    struct wt_range_t
    {
        using node_type = typename wtd_type::node_type;

        node_type v;
        range_type r;

        size_t size() const
        {
            return r[1] - r[0] + 1;
        }

        bool operator<(wt_range_t const & x) const
        {
            if (x.size() != size())
                return size() < x.size();
            return v.sym > x.v.sym;
        }

        wt_range_t()
        {}
        wt_range_t(node_type const & _v, range_type const & _r) : v(_v), r(_r)
        {}
    };

protected:
    size_type m_doc_cnt; // number of documents in the collection
    csa_type m_csa_full; // CSA built from the collection text
    wtd_type m_wtd;      // wtd build from the collection text

public:
    //! Default constructor
    doc_list_index_greedy()
    {}

    doc_list_index_greedy(std::string file_name, sdsl::cache_config & cconfig, uint8_t num_bytes)
    {
        construct(m_csa_full, file_name, cconfig, num_bytes);

        char const * KEY_TEXT = key_text_trait<WIDTH>::KEY_TEXT;
        std::string text_file = cache_file_name(KEY_TEXT, cconfig);

        bit_vector doc_border;
        construct_doc_border(text_file, doc_border);
        bit_vector::rank_1_type doc_border_rank(&doc_border);
        m_doc_cnt = doc_border_rank(doc_border.size());

        int_vector_buffer<0> sa_buf(cache_file_name(conf::KEY_SA, cconfig));
        {
            int_vector<> D;
            construct_D_array(sa_buf, doc_border_rank, m_doc_cnt, D);
            std::string d_file = cache_file_name("DARRAY", cconfig);
            store_to_file(D, d_file);
            util::clear(D);
            construct(m_wtd, d_file);
            sdsl::remove(d_file);
        }
    }

    size_type doc_cnt() const
    {
        return m_wtd.sigma - 1; // subtract one, since zero does not count
    }

    size_type word_cnt() const
    {
        return m_wtd.size() - doc_cnt();
    }

    size_type serialize(std::ostream & out, structure_tree_node * v = NULL, std::string name = "") const
    {
        structure_tree_node * child = structure_tree::add_child(v, name, util::class_name(*this));
        size_type written_bytes = 0;
        written_bytes += write_member(m_doc_cnt, out, child, "doc_cnt");
        written_bytes += m_csa_full.serialize(out, child, "csa_full");
        written_bytes += m_wtd.serialize(out, child, "wtd");
        structure_tree::add_size(child, written_bytes);
        return written_bytes;
    }

    void load(std::istream & in)
    {
        read_member(m_doc_cnt, in);
        m_csa_full.load(in);
        m_wtd.load(in);
    }

    //! Search for the k documents which contain the search term most frequent
    template <class t_pat_iter>
    size_type search(t_pat_iter begin, t_pat_iter end, result & res, size_t k) const
    {
        size_type sp = 1, ep = 0;
        if (0 == backward_search(m_csa_full, 0, m_csa_full.size() - 1, begin, end, sp, ep))
        {
            res = result();
            return 0;
        }
        else
        {
            auto tmp_res = topk_greedy(sp, ep, k);
            res = result(sp, ep, std::move(tmp_res));
            return ep - sp + 1;
        }
    }

private:
    //! Construct the doc_border bitvector by streaming the text file
    void construct_doc_border(std::string const & text_file, bit_vector & doc_border)
    {
        int_vector_buffer<WIDTH> text_buf(text_file);
        doc_border = bit_vector(text_buf.size(), 0);
        for (size_type i = 0; i < text_buf.size(); ++i)
        {
            if (t_doc_delim == text_buf[i])
            {
                doc_border[i] = 1;
            }
        }
    }

    void construct_D_array(int_vector_buffer<0> & sa_buf,
                           bit_vector::rank_1_type & doc_border_rank,
                           const size_type doc_cnt,
                           int_vector<> & D)
    {
        D = int_vector<>(sa_buf.size(), 0, bits::hi(doc_cnt + 1) + 1);
        for (size_type i = 0; i < sa_buf.size(); ++i)
        {
            uint64_t d = doc_border_rank(sa_buf[i] + 1);
            D[i] = d;
        }
    }

    //! Returns the top k most frequent documents in D[lb..rb]
    /*!
     *  \param lb  Left array border in D.
     *  \param rb  Right array border in D.
     *  \param k   The number of documents to return.
     *  \returns the top-k items in ascending order.
     */
    std::vector<std::pair<value_type, size_type>> topk_greedy(size_type lb, size_type rb, size_type k) const
    {
        std::vector<std::pair<value_type, size_type>> results;
        std::priority_queue<wt_range_t> heap;

        heap.emplace(wt_range_t(m_wtd.root(), {lb, rb}));

        while (!heap.empty())
        {
            wt_range_t e = heap.top();
            heap.pop();
            if (m_wtd.is_leaf(e.v))
            {
                results.emplace_back(e.v.sym, e.size());
                if (results.size() == k)
                {
                    break;
                }
                continue;
            }

            auto child = m_wtd.expand(e.v);
            auto child_ranges = m_wtd.expand(e.v, e.r);
            auto left_range = std::get<0>(child_ranges);
            auto right_range = std::get<1>(child_ranges);

            if (!empty(left_range))
            {
                heap.emplace(wt_range_t(std::get<0>(child), left_range));
            }
            if (!empty(right_range))
            {
                heap.emplace(wt_range_t(std::get<1>(child), right_range));
            }
        }
        return results;
    };
};

} // namespace sdsl

#endif
