% Part I: prepare the dataset
%
% Loads the pre-processed Bank Marketing data and exposes it under the
% names used by the rest of this script:
%   A,  b  - features and labels used to construct the constraint
%   Ac, bc - features and labels used to construct the objective
% Labels are recoded so that every value below 0.5 becomes -1.

% Earlier data source, kept for reference only (no longer loaded):
% load('BankMarketing.mat'); 
% BankMarketing.mat contains double matrix A with size 27459*53, 
% double vector b with size 27459*1,
% double matrix Ac with size 13729*53 
% and double vector bc with size 13729*1.          

% In BankMarketing.mat the first column of A and Ac is not the sensitive
% variable, so the data are loaded from the following files instead.
load('X.mat'); % double matrix X with size 32950*54
load('X_t.mat'); % double matrix X_test with size 8238*54
load('label.mat'); % double array label with size 1*32950
load('label_t.mat'); % double array test_l with size 1*8238

% Rename to the variables used below; labels are transposed into columns.
A = X;
Ac = X_test;
b = label';
bc = test_l';
% Sanity checks against the raw CSV (disabled; recorded results follow each
% check). The first confirms that the labels match the raw column y == 'yes'.
% The second shows that column 1 is NOT the indicator 25 < age < 60.
% NOTE(review): what column 1 actually encodes is not established in this
% script - confirm against the preprocessing code.
% M = readtable('C:\Program Files\MATLAB\R2021a\toolbox\libsvm-3.3\bank-additional-full.csv', VariableNamingRule='preserve');
% isequal( sort([label'; test_l']), sort( [strcmp(M.y, 'yes')] ) )
% ans = logical 1
% isequal( sort([A(:,1); Ac(:,1)]), sort( M.age > 25 & M.age < 60) )
% ans = logical 0
% Recode b and bc: every label below 0.5 is set to -1, other values are
% left unchanged (presumably the labels arrive as 0/1 - verify).
b( b < 0.5 ) = -1;
bc( bc < 0.5 ) = -1;

% Data provenance:
% On the page https://archive.ics.uci.edu/ml/datasets/bank+marketing#, 
% download bank.zip and bank-additional.zip. 
% Unzip bank-additional-full.csv and bank-additional-names.txt from bank-additional.zip;
% the raw data are in bank-additional-full.csv with 41188 instances.
% [A, b] and [Ac, bc] are a split of these 41188 instances.

% The raw data have 21 features; preprocessing expands them to 54 features.

% i) [A, b] is used to construct the constraint.
%    This section records its dimensions, the two sensitive groups and
%    the class counts. The numbers in the comments are the values
%    recorded for this dataset.

% n = number of instances (32950), d = feature dimension (54)
[n, d] = size(A);

% Number of distinct class labels in b; the labels are 1 and -1, so 2.
numclass = numel(unique(b));

% Sensitive variable, abbreviated sv: the first feature column.
% In the code below the group named "male" is sv == 0 and the group named
% "female" is sv == 1.
% NOTE(review): "male"/"female" are only group names here; which attribute
% column 1 really encodes is not established in this script - confirm.
sv = A(:, 1);

% Row indexes in [n] of each group.
idmale = find(sv == 0);
idfemale = find(sv == 1);

% Group sizes.
nummale = numel(idmale);     % "male" group (sv == 0): 1571
numfemale = numel(idfemale); % "female" group (sv == 1): 31379

% Class sizes.
numpos = nnz(b == 1);  % label  1: 3720
numneg = nnz(b == -1); % label -1: 29230

% Group-by-class sizes.
numposmale = nnz(b == 1 & sv == 0);    % "male",   label  1: 531
numnegmale = nnz(b == -1 & sv == 0);   % "male",   label -1: 1040
numposfemale = nnz(b == 1 & sv == 1);  % "female", label  1: 3189
numnegfemale = nnz(b == -1 & sv == 1); % "female", label -1: 28190

% ii) [Ac, bc] is used to construct the objective.
%     This section records its dimensions, the two sensitive groups and
%     the class counts. The numbers in the comments are the values
%     recorded for this dataset.

% nc = number of instances (8238), dc = feature dimension (54)
[nc, dc] = size(Ac);

% Number of distinct class labels in bc; the labels are 1 and -1, so 2.
numclassc = numel(unique(bc));

% Sensitive variable (sv) of the objective data: the first feature column.
% It defines the two groups compared for fairness. In the code below the
% group named "male" is svc == 0 and the group named "female" is svc == 1.
% NOTE(review): "male"/"female" are only group names here; which attribute
% column 1 really encodes is not established in this script - confirm.
svc = Ac(:, 1);

% Row indexes in [nc] of each group.
idmalec = find(svc == 0);
idfemalec = find(svc == 1);

% Group sizes.
nummalec = numel(idmalec);     % "male" group (svc == 0): 407
numfemalec = numel(idfemalec); % "female" group (svc == 1): 7831

% Class sizes.
numposc = nnz(bc == 1);  % label  1: 920
numnegc = nnz(bc == -1); % label -1: 7318

% Group-by-class sizes.
% The "male"/label 1 count is 139, as implied by the other recorded
% totals: 407 - 268 = 139 and 920 - 781 = 139.
numposmalec = nnz(bc == 1 & svc == 0);    % "male",   label  1: 139
numnegmalec = nnz(bc == -1 & svc == 0);   % "male",   label -1: 268
numposfemalec = nnz(bc == 1 & svc == 1);  % "female", label  1: 781
numnegfemalec = nnz(bc == -1 & svc == 1); % "female", label -1: 7050

% Part II: initialize w^(0) by hingeloss_minimization
%
% Runs the unconstrained hinge-loss solver, then derives the constants
% used by the constrained solvers in Part III. After the script call this
% section reads w and obj from the workspace.

hingeloss_minimization % apply subgradient method to minimize the hinge loss and get w

% Report the hinge-loss value left in obj by the script above.
fprintf('Average hinge loss is L*=%f\n', obj);

delta = 0.001 * obj; % tuning parameter to balance fairness and efficiency
C = obj + delta; % constant term in the constraint function

% rho: one quarter of the sum, over the two sensitive groups of the
% objective data, of the mean squared Euclidean row norm of Ac.
rho = ( mean(vecnorm(Ac(idmalec,:),2,2).^2) + mean(vecnorm(Ac(idfemalec,:),2,2).^2) ) / 4;
% rho_hat: rho bounded below by 1, times a multiplier (currently 1).
rho_hat = max(rho, 1) * 1; % select from {1, 1.5, 2}

% M: the larger of
%   - the mean Euclidean row norm of Ac, and
%   - sqrt of the largest-magnitude eigenvalue of (b.*A)' * (b.*A),
%     divided by n. Note that (A').*b' is the transpose of b.*A.
% NOTE(review): presumably a bound used for step sizes by the solver
% scripts - confirm in those scripts.
M = max( mean(vecnorm(Ac,2,2)), sqrt( eigs( ((A').*b') * (b.*A), 1 ) ) / n  );

% the result of hingeloss_minimization is the initialization for all methods,
% which is certainly feasible since we add delta in the constraint.
w_init = w; 
% D_X: ten times the Euclidean norm of the initial point.
% NOTE(review): presumably a domain-size estimate for the solvers - confirm.
D_X = 10 * norm(w_init,2);

% Part III: solve the constrained problem by four methods
%
% Each solver below is a script run in the current workspace. After each
% call the variable w is copied into a method-specific variable, because
% the next call is followed by another read of w.
% NOTE(review): the solver scripts are not visible here. Presumably each
% one starts from w_init and uses C, rho_hat, M and D_X - verify, in
% particular that none of them starts from the w left by the previous one.

% apply the SSG method by constant step-size
% to solve the constrained problem
SSG_1_deterministic_convex_minimization
w_SSG_1_result = w;

% apply the SSG method by dynamic step-size
% to solve the constrained problem
SSG_2_deterministic_convex_minimization
w_SSG_2_result = w;

% apply the IPP-SSG method to solve the constrained problem
IPP_SSG_deterministic_convex_minimization 
w_IPP_SSG_result = w;

% apply the IPP-ConEx method to solve the constrained problem
IPP_ConEx_deterministic_convex_minimization 
w_IPP_ConEx_result = w;

% Part IV: visualize the results

% Dataset name as a string.
% NOTE(review): presumably read by the visualization script as the figure
% title - confirm.
title_name = "BankMarketing";
% The visualization script is currently disabled; uncomment to run it.
% deterministic_convex_visualization