function [M_arr,Q_arr] = qmi(opts)
% QMI  Tabular Q-learning coupled with a running state-distribution estimate.
%
%   [M_arr, Q_arr] = qmi(opts) runs opts.epochs independent epochs. Each epoch
%   performs K outer iterations of T sampled transitions. Every transition
%   updates one entry of the Q table (using the reward r(s,a,M_fixed)) and the
%   state-distribution estimate M (exponential average of visited states).
%   After each outer iteration the "fixed" copies M_fixed and Q_fixed are
%   refreshed from M and Q.
%
%   Required fields of opts:
%     S, A         number of rows / columns of the Q table.
%     gamma        multiplier on the next-state max Q-value in the update
%                  target.
%     policy       'on' or 'off' (case-insensitive). 'off' samples actions
%                  from Q_fixed, 'on' samples them from Q.
%     r            function handle called as r(s, a, M_fixed).
%     get_softmax  function handle called as get_softmax(q_row, temperature);
%                  its return value is used as the action index.
%     filter       matrix indexed as filter(s,a); entries equal to 1 mark the
%                  admissible actions of state s.
%     soft         probability with which the next state is drawn uniformly
%                  from the admissible indices of the current state instead
%                  of being set to the chosen action.
%     m_opt, softmax
%                  must be present (kept for backward compatibility) but are
%                  not read by this function.
%
%   Optional fields of opts (default):
%     epochs (10), K (48), T (20)
%     FP    (true)   if true, M_fixed is the running average
%                    (1-1/k)*M_fixed + (1/k)*M; otherwise M_fixed = M.
%     OMD   (false)  if true, Q_fixed is the running average
%                    (1-1/k)*Q_fixed + (1/k)*Q; otherwise Q_fixed = Q.
%     temp  (1e-3)   temperature passed to get_softmax.
%     GLIE  (false)  if true, the temperature is multiplied by k*l.
%     Q0             initial Q table; if absent, admissible entries are drawn
%                    from rand*1e-2 and all other entries are -Inf.
%
%   Fields del, s0, M0, step and kappa may be supplied but are ignored: the
%   initial distribution is regenerated randomly at the start of every epoch
%   and the learning rates are fixed constants.
%
%   Outputs:
%     M_arr   S-by-(K+1)-by-epochs; slice 1 is the initial M, slice k+1 is M
%             after outer iteration k.
%     Q_arr   S-by-A-by-(K+1)-by-epochs; same slicing for Q.
%
%   NOTE(review): the chosen action index is used directly as the next state
%   index, so this presumably expects A == S - confirm against callers.
%   NOTE(review): normpdf requires the Statistics and Machine Learning
%   Toolbox (or an equivalent package).

% --- Required fields: fail fast, before any work is done -------------------
if ~isfield(opts, 'S') error('Missing S!'); end
if ~isfield(opts, 'A') error('Missing A!'); end
if ~isfield(opts, 'gamma') error('Missing gamma!'); end
if ~isfield(opts, 'policy') error('Missing policy!'); end
if ~isfield(opts, 'm_opt') error('Missing m_opt!'); end
if ~isfield(opts, 'r') error('Missing r!'); end
if ~isfield(opts, 'softmax') error('Missing softmax!'); end
if ~isfield(opts, 'get_softmax') error('Missing get_softmax!'); end
if ~isfield(opts, 'filter') error('Missing filter!'); end
% 'soft' is read inside the sampling loop; check it here so a missing value
% is reported clearly instead of as a field-access error mid-run.
if ~isfield(opts, 'soft') error('Missing soft!'); end

% Any other policy value would leave the behaviour-policy row undefined.
if ~any(strcmpi(opts.policy, {'on', 'off'}))
    error('Invalid policy! Expected ''on'' or ''off''.');
end

% --- Optional fields --------------------------------------------------------
if ~isfield(opts, 'epochs') opts.epochs = 10; end
if ~isfield(opts, 'K') opts.K = 48; end
if ~isfield(opts, 'T') opts.T = 20; end
if ~isfield(opts, 'FP') opts.FP = true; end
if ~isfield(opts, 'OMD') opts.OMD = false; end
if ~isfield(opts, 'temp') opts.temp = 1e-3; end
if ~isfield(opts, 'GLIE') opts.GLIE = false; end

S = opts.S; A = opts.A; gamma = opts.gamma;
epochs = opts.epochs;
K = opts.K; T = opts.T;
r = opts.r;
get_softmax = opts.get_softmax;
filter = opts.filter;
policy = opts.policy; FP = opts.FP; OMD = opts.OMD;
temp = opts.temp;
GLIE = opts.GLIE;
soft = opts.soft;
off_policy = strcmpi(policy, 'off');

% Constant learning rates for the Q update and the M update.
alpha = 1e-3;
beta = 1e-3;

% Slice 1 stores the initial values and slice k+1 the values after outer
% iteration k, hence K+1 slices per epoch.
M_arr = zeros(S, K+1, epochs);
Q_arr = -inf(S, A, K+1, epochs);

x = linspace(-1,1,S);

for e = 1:epochs
    fprintf('epoch: %d\n', e)

    % Initial Q table: user supplied, or small random values on admissible
    % entries and -Inf elsewhere.
    if isfield(opts, 'Q0')
        Q0 = opts.Q0;
    else
        Q0 = -inf(S,A);
        Q0(filter == 1) = rand(size(Q0(filter == 1))) * 1e-2;
    end
    Q = Q0;
    Q_fixed = Q0;

    % Initial distribution: sum of two Gaussian bumps on [-1,1], circularly
    % shifted by a random offset and normalised to sum to one.
    M0 = normpdf(x,0.5,1e-1)' + normpdf(x,-0.5,1e-1)';
    M0 = circshift(M0, randi(S));
    M0 = M0 ./ sum(M0);
    M = M0;
    M_fixed = M0;

    s1 = randi(S);              % random initial state

    M_arr(:,1,e) = M;
    Q_arr(:,:,1,e) = Q;

    for k = 1:K
        for l = 1:T
            % Sample an action for the current state from the behaviour
            % policy, restricted to admissible actions.
            s = s1;
            if off_policy
                filtered_Q = Q_fixed(s, :); % Q_fixed as behavior policy
            else
                filtered_Q = Q(s, :);       % Q as behavior policy
            end
            filtered_Q(filter(s,:) ~= 1) = -inf;
            if GLIE
                temp_mult = k*l;
            else
                temp_mult = 1;
            end
            a = get_softmax(filtered_Q, temp * temp_mult);

            % With probability 'soft' the next state is drawn uniformly from
            % the admissible indices of s; otherwise it is the chosen action.
            if rand < soft
                candidates = find(filter(s,:) == 1);
                s1 = candidates(randi(length(candidates)));
            else
                s1 = a;
            end

            % Update Q towards reward plus gamma times the best admissible
            % Q-value at the next state.
            target = r(s,a,M_fixed) + gamma * max(Q(s1,filter(s1,:) == 1));
            Q(s,a) = (1-alpha) * Q(s,a) + alpha * target;

            % Update M: decay all mass, then add beta at the visited state.
            M = (1-beta) * M;
            M(s1) = M(s1) + beta * 1;
        end

        % Refresh the distribution used in the reward.
        if FP
            M_fixed = (1-1/k)*M_fixed + 1/k * M;
        else
            M_fixed = M;
        end

        % Refresh the Q table used as off-policy behaviour policy.
        if OMD
            Q_fixed = (1-1/k)*Q_fixed + 1/k * Q;
        else
            Q_fixed = Q;
        end

        % Log
        M_arr(:,k+1,e) = M;
        Q_arr(:,:,k+1,e) = Q;
    end
end
end
