#ifndef RAMB_H
#define RAMB_H

#include "Env.h"
#include <vector>
#include <tuple>

class Qlearning_gen_AMBR {
    // Shorthand for the nested-vector table types used throughout this class.
    // These are plain aliases: every declaration below names exactly the same
    // type as the fully spelled-out std::vector nesting would.
    using FloatVec  = std::vector<float>;
    using FloatGrid = std::vector<FloatVec>;
    using FloatCube = std::vector<FloatGrid>;
    using IntVec    = std::vector<int>;
    using IntGrid   = std::vector<IntVec>;
    using IntCube   = std::vector<IntGrid>;

public:
    /**
     * @brief Creates an AMB Q-learning learner for the given environment.
     *
     * The class holds a reference member of the environment type, so the
     * environment presumably has to outlive this object (the binding itself
     * happens in the out-of-line constructor definition).
     *
     * @param mdp            The MDP environment to learn on.
     * @param c              Exploration constant used for the UCB bonus.
     * @param total_episodes Number of episodes the learning process runs for.
     */
    Qlearning_gen_AMBR(FiniteStateFiniteActionMDP& mdp, float c, int total_episodes);

    /**
     * @brief Executes the main AMB learning loop.
     *
     * @return A five-element tuple, in this order:
     *         -# std::vector<float>: the optimal value function (V*).
     *         -# three-level nested std::vector<float>: the optimal Q-function (Q*).
     *         -# std::vector<float>: the value function of the policy from the
     *            last episode.
     *         -# three-level nested std::vector<float>: the final learned
     *            upper-bound Q-function (QU).
     *         -# std::vector<float>: the raw regret (gap) recorded for each episode.
     */
    std::tuple<FloatVec, FloatCube, FloatVec, FloatCube, FloatVec> learn();

private:
    // ---- Configuration (mirrors the constructor parameters) ----
    FiniteStateFiniteActionMDP& mdp;  // environment, held by reference (not owned)
    float c;                          // exploration constant for the UCB bonus
    int total_episodes;               // number of episodes to run

    // ---- Upper and lower bounds on the V and Q functions ----
    // NOTE(review): the index meaning of each dimension is not visible in this
    // header; confirm against the implementation file.
    FloatGrid VU;  // upper bound on V
    FloatGrid VL;  // lower bound on V
    FloatCube QU;  // upper bound on Q
    FloatCube QL;  // lower bound on Q

    // ---- Visit counters ----
    IntCube N;  // total visits
    IntCube n;  // visits in the current episode

    // ---- Action and state bookkeeping ----
    IntCube A_valid;  // set of valid actions
    IntGrid G;        // decided states (0 = undecided, 1 = decided)

    // ---- Trajectory of the episode being processed ----
    IntVec episode_state;
    IntVec episode_action;

    // ---- Regret tracking ----
    FloatVec regret;
    FloatVec raw_gap;

    // ---- Private helpers (defined out of line) ----

    // Returns a float table paired with an int. The table has the same type as
    // the `rewards` argument of update_QAMB().
    // NOTE(review): the meaning of the int is not shown in this header.
    std::pair<FloatCube, int> run_episode();

    // Returns a three-level float table.
    // NOTE(review): its contents are defined by the implementation file.
    FloatCube choose_action();

    // Takes a step index and returns an int.
    // NOTE(review): presumably a state index looked up via G; confirm the
    // "nothing found" convention in the implementation.
    int first_undecided_state(int step);

    // Consumes a read-only rewards table.
    // NOTE(review): presumably refreshes the Q/V bound tables; confirm.
    void update_QAMB(const FloatCube& rewards);
};

#endif // RAMB_H