#ifndef QUL_H
#define QUL_H

#include "Env.h"
#include <vector>
#include <tuple>

class Qlearning_genul {
private:
    // Shorthand for the nested-vector types used by this class. These are
    // plain aliases, so every signature below names exactly the same types
    // as the fully spelled-out std::vector forms.
    using Float1D = std::vector<float>;
    using Float2D = std::vector<Float1D>;
    using Float3D = std::vector<Float2D>;
    using Int3D = std::vector<std::vector<std::vector<int>>>;

public:
    /**
     * @brief Builds a ULCB Q-learning learner bound to an MDP environment.
     * @param mdp Reference to the MDP environment (declared as a reference
     *            member in this class).
     * @param c Exploration constant for the UCB bonus.
     * @param total_episodes Total number of episodes the learning process runs.
     */
    Qlearning_genul(FiniteStateFiniteActionMDP& mdp, float c, int total_episodes);

    /**
     * @brief Executes the main ULCB learning loop.
     * @return A tuple whose elements are, in order:
     *         - Float1D: the optimal value function (V*).
     *         - Float3D: the optimal Q-function (Q*).
     *         - Float1D: the value function of the policy from the last episode.
     *         - Float3D: the final learned upper-bound Q-function.
     *         - Float1D: the raw regret (gap) recorded for each episode.
     */
    std::tuple<Float1D, Float3D, Float1D, Float3D, Float1D> learn();

private:
    // ---- Internal helpers (defined in the implementation file) ----

    // Returns a 3-D float array paired with an int.
    // NOTE(review): presumably the collected rewards plus a count or index;
    // confirm against the definition.
    std::pair<Float3D, int> run_episode();

    // Returns a 3-D float array.
    // NOTE(review): presumably encodes the selected actions/policy; confirm
    // against the definition.
    Float3D choose_action();

    // Takes a 3-D float array of rewards by const reference; returns nothing.
    void update_Q(const Float3D& rewards);

    // ---- Configuration captured at construction ----
    FiniteStateFiniteActionMDP& mdp;
    float c;
    int total_episodes;

    // ---- Upper and lower bounds on the V- and Q-functions ----
    Float2D V_func;
    Float3D V_next;
    Float2D V_func_low;
    Float3D V_next_low;
    Float3D global_Q;
    Float3D global_Q_low;

    // ---- Visit counters ----
    Int3D N; // total visits
    Int3D n; // visits within the current episode

    // ---- Actions still valid, i.e. not yet eliminated ----
    Int3D A_valid;

    // ---- Regret bookkeeping ----
    Float1D regret;
    Float1D raw_gap;
};

#endif // QUL_H