%% Load Noisy X
clear all
close all

% X.mat supplies the matrix X; every entry of X becomes one node of the model
load('X.mat');
[nRows,nCols] = size(X);
nNodes = nRows*nCols;
nStates = 2;
nInstances = 100;

% Labels: 1+X cast to int32, flattened to a row, repeated once per instance
y = repmat(reshape(int32(1+X),[1 nNodes]),[nInstances 1 1]);

% Features: nInstances-by-1-by-nNodes copies of X, each perturbed by
% independent Gaussian noise with standard deviation 1/2
X = repmat(reshape(X,1,1,nNodes),[nInstances 1 1]);
X = X + randn(size(X))/2;

% Preview the first four noisy instances as grayscale images
figure;
for k = 1:4
	noisyImg = reshape(X(k,1,:),nRows,nCols);
	subplot(2,2,k);
	imagesc(noisyImg);
	colormap('gray');
end
suptitle('Examples of Noisy Xs');

%% Make edgeStruct

% Adjacency from latticeAdjMatrix for the nRows-by-nCols grid, wrapped into
% the edge structure used by the UGM_* calls in the rest of the script
adj = latticeAdjMatrix(nRows,nCols);
edgeStruct = UGM_makeEdgeStruct(adj,nStates);
nEdges = edgeStruct.nEdges;

%% Make Xnode, Xedge, infoStruct, initialize weights

% Flags handed to UGM_standardizeCols and UGM_makeCRFmaps below
tied = 1;
ising = 1;

% Node features: a constant bias feature followed by the standardized noisy X
biasFeature = ones(nInstances,1,nNodes);
Xnode = [biasFeature UGM_standardizeCols(X,tied)];
nNodeFeatures = size(Xnode,2);

% Edge features derived from Xnode and the edge list
sharedFeatures = [1 0];
Xedge = UGM_makeEdgeFeatures(Xnode,edgeStruct.edgeEnds,sharedFeatures);

% Parameter maps and the initial parameter vector
[nodeMap,edgeMap,w] = UGM_makeCRFmaps(Xnode,Xedge,edgeStruct,ising,tied);
nParams = length(w);

%% Evaluate with random parameters

% Draw every parameter from a standard normal, then show the second column
% of the loopy BP node beliefs for the first four instances
w = randn(nParams,1);
figure;
for k = 1:4
	[nodePot,edgePot] = UGM_CRF_makePotentials(w,Xnode,Xedge,nodeMap,edgeMap,edgeStruct,k);
	nodeBel = UGM_Infer_LBP(nodePot,edgePot,edgeStruct);
	subplot(2,2,k);
	imagesc(reshape(nodeBel(:,2),nRows,nCols));
	colormap('gray');
end
suptitle('Loopy BP node marginals with random parameters');
fprintf('(paused)\n');
pause

%% Train with Stochastic gradient descent
% Runs maxIter*nInstances single-example updates starting from w = 0,
% using loopy BP as the inference routine inside UGM_CRF_NLL.
maxIter = 3;
stepSize = 1e-4;
fAvg = 0;
w = zeros(nParams,1);
for iter = 1:maxIter*nInstances
	% Pick one training instance uniformly at random
	i = ceil(rand*nInstances);
	[f,g] = UGM_CRF_NLL(w,Xnode(i,:,:),Xedge(i,:,:),y(i,:),nodeMap,edgeMap,edgeStruct,@UGM_Infer_LBP);

	% Gradient step, then fold f into the running mean of the objective
	w = w - stepSize*g;
	fAvg = (1/iter)*f + ((iter-1)/iter)*fAvg;

	fprintf('Iter = %d of %d (fAvg = %f)\n',iter,maxIter*nInstances,fAvg);
end

% Second column of the loopy BP node beliefs for the first four instances,
% computed with the parameters w currently in the workspace
figure;
for k = 1:4
	[nodePot,edgePot] = UGM_CRF_makePotentials(w,Xnode,Xedge,nodeMap,edgeMap,edgeStruct,k);
	nodeBel = UGM_Infer_LBP(nodePot,edgePot,edgeStruct);
	belImg = reshape(nodeBel(:,2),nRows,nCols);
	subplot(2,2,k);
	imagesc(belImg);
	colormap('gray');
end
suptitle('Loopy BP node marginals with truncated SGD parameters');
fprintf('(paused)\n');
pause

% Loopy BP decoding of the first four instances with the current w
figure;
for k = 1:4
	[nodePot,edgePot] = UGM_CRF_makePotentials(w,Xnode,Xedge,nodeMap,edgeMap,edgeStruct,k);
	yMAP = UGM_Decode_LBP(nodePot,edgePot,edgeStruct);
	decodedImg = reshape(yMAP,nRows,nCols);
	subplot(2,2,k);
	imagesc(decodedImg);
	colormap('gray');
end
suptitle('Loopy BP decoding with SGD parameters (Loopy-CRF Objective)');
fprintf('(paused)\n');
pause

%% Train with Projected Stochastic gradient descent to ensure parameters are associative
% Same single-example SGD as before, but after every step the edge
% parameters are clamped to be non-negative.

fprintf('Making non-negative features\n');
% Make Xedge
sharedFeatures = [1 0];
Xedge = UGM_makeEdgeFeaturesInvAbsDif(Xnode,edgeStruct.edgeEnds,sharedFeatures);

% Make nodeMap, edgeMap, initial parameter vector
[nodeMap,edgeMap,w] = UGM_makeCRFmaps(Xnode,Xedge,edgeStruct,ising,tied);
nParams = length(w);

stepSize = 1e-4;
w = zeros(nParams,1);
fAvg = 0;
% First parameter index after the largest index referenced by nodeMap.
% Assumes edge parameters are numbered after all node parameters, as this
% computation implies - confirm against UGM_makeCRFmaps.
edgeParamStart = max(nodeMap(:))+1;
for iter = 1:maxIter*nInstances
	% Compute NLL and Gradient for random training example
	i = ceil(rand*nInstances);
	[f,g] = UGM_CRF_NLL(w,Xnode(i,:,:),Xedge(i,:,:),y(i,:),nodeMap,edgeMap,edgeStruct,@UGM_Infer_LBP);

	% Update estimate of function value and parameters
	fAvg = (1/iter)*f + ((iter-1)/iter)*fAvg;

	w = w - stepSize*g;
	% Projection step: clamp from edgeParamStart onward rather than from a
	% hard-coded index, so it stays correct if the node feature count changes
	w(edgeParamStart:end) = max(w(edgeParamStart:end),0);

	fprintf('Iter = %d of %d (fAvg = %f)\n',iter,maxIter*nInstances,fAvg);
end

% Graph-cut decoding of the first four instances with the current w
figure;
for k = 1:4
	[nodePot,edgePot] = UGM_CRF_makePotentials(w,Xnode,Xedge,nodeMap,edgeMap,edgeStruct,k);
	yMAP = UGM_Decode_GraphCut(nodePot,edgePot,edgeStruct);
	decodedImg = reshape(yMAP,nRows,nCols);
	subplot(2,2,k);
	imagesc(decodedImg);
	colormap('gray');
end
suptitle('Graph cut decoding with projected SGD parameters (Loopy-CRF Objective)');
fprintf('(paused)\n');
pause

%% Train Max-margin Markov network with projected stochastic subgradient descent 
% to ensure parameters are associative and use exact decoding via graph
% cuts

stepSize = 1e-4;
w = zeros(nParams,1);
fAvg = 0;
% First parameter index after the largest index referenced by nodeMap.
% Assumes edge parameters are numbered after all node parameters, as this
% computation implies - confirm against UGM_makeCRFmaps.
edgeParamStart = max(nodeMap(:))+1;
for iter = 1:maxIter*nInstances
	% Evaluate UGM_M3N_Obj (objective value f and subgradient sg) on a
	% random training example, decoding with graph cuts
	i = ceil(rand*nInstances);
	[f,sg] = UGM_M3N_Obj(w,Xnode(i,:,:),Xedge(i,:,:),y(i,:),nodeMap,edgeMap,edgeStruct,@UGM_Decode_GraphCut);

	% Update estimate of function value and parameters
	fAvg = (1/iter)*f + ((iter-1)/iter)*fAvg;

	w = w - stepSize*sg;
	% Projection step: clamp from edgeParamStart onward rather than from a
	% hard-coded index, so it stays correct if the node feature count changes
	w(edgeParamStart:end) = max(w(edgeParamStart:end),0);

	fprintf('Iter = %d of %d (fAvg = %f)\n',iter,maxIter*nInstances,fAvg);
end

% Graph-cut decoding of the first four instances with the current w
figure;
for k = 1:4
	[nodePot,edgePot] = UGM_CRF_makePotentials(w,Xnode,Xedge,nodeMap,edgeMap,edgeStruct,k);
	yMAP = UGM_Decode_GraphCut(nodePot,edgePot,edgeStruct);
	decodedImg = reshape(yMAP,nRows,nCols);
	subplot(2,2,k);
	imagesc(decodedImg);
	colormap('gray');
end
suptitle('Graph cut decoding with projected SSGD parameters (Max-Margin Objective)');
fprintf('(paused)\n');
pause

%% Train with Stochastic gradient descent using Contrastive Divergence
% Single-example SGD where UGM_CRF_NLL is driven by UGM_Infer_Sample with a
% Gibbs sampler that is started at the training labels y(i,:).

% Go back to original features
sharedFeatures = [1 0];
Xedge = UGM_makeEdgeFeatures(Xnode,edgeStruct.edgeEnds,sharedFeatures);
[nodeMap,edgeMap,w] = UGM_makeCRFmaps(Xnode,Xedge,edgeStruct,ising,tied);
nParams = length(w);

stepSize = 1e-4;
burnIn = 0;
% Set maxIter = 1 on a private copy instead of on edgeStruct itself.
% edgeStruct is also passed to UGM_Infer_LBP by the plotting code that
% follows, and overwriting its maxIter would carry over into that call
% (presumably limiting LBP to a single iteration - confirm against
% UGM_Infer_LBP).
cdEdgeStruct = edgeStruct;
cdEdgeStruct.maxIter = 1;
w = zeros(nParams,1);
for iter = 1:maxIter*nInstances
	% Compute NLL and Gradient for random training example
	i = ceil(rand*nInstances);
	[f,g] = UGM_CRF_NLL(w,Xnode(i,:,:),Xedge(i,:,:),y(i,:),nodeMap,edgeMap,cdEdgeStruct,@UGM_Infer_Sample,@UGM_Sample_Gibbs,burnIn,y(i,:));

	% Update parameters (f is not tracked in this section)
	w = w - stepSize*g;

	fprintf('Iter = %d of %d\n',iter,maxIter*nInstances);
end

% Second column of the loopy BP node beliefs for the first four instances,
% computed with the parameters w currently in the workspace
figure;
for k = 1:4
	[nodePot,edgePot] = UGM_CRF_makePotentials(w,Xnode,Xedge,nodeMap,edgeMap,edgeStruct,k);
	nodeBel = UGM_Infer_LBP(nodePot,edgePot,edgeStruct);
	belImg = reshape(nodeBel(:,2),nRows,nCols);
	subplot(2,2,k);
	imagesc(belImg);
	colormap('gray');
end
suptitle('Loopy BP node marginals with contrastive divergence');
fprintf('(paused)\n');
pause

%% Train with Stochastic gradient descent using stochastic maximum likelihood
% Single-example SGD via UGM_CRF_NLL_MCMC. yMemory starts as a copy of y;
% row i is passed in and overwritten by the third output on every visit to
% instance i.
stepSize = 1e-4;
burnIn = 0;
% Set maxIter = 1 on a private copy instead of on edgeStruct itself.
% edgeStruct is also passed to UGM_Infer_LBP by the plotting code that
% follows, and overwriting its maxIter would carry over into that call
% (presumably limiting LBP to a single iteration - confirm against
% UGM_Infer_LBP).
smlEdgeStruct = edgeStruct;
smlEdgeStruct.maxIter = 1;
w = zeros(nParams,1);
yMemory = y;
for iter = 1:maxIter*nInstances
	% Compute NLL and Gradient for random training example
	i = ceil(rand*nInstances);
	[f,g,yMemory(i,:)] = UGM_CRF_NLL_MCMC(w,Xnode(i,:,:),Xedge(i,:,:),y(i,:),nodeMap,edgeMap,smlEdgeStruct,@UGM_Sample_Gibbs,burnIn,yMemory(i,:));

	% Update parameters (f is not tracked in this section)
	w = w - stepSize*g;

	fprintf('Iter = %d of %d\n',iter,maxIter*nInstances);
end

% Second column of the loopy BP node beliefs for the first four instances,
% computed with the parameters w currently in the workspace
figure;
for k = 1:4
	[nodePot,edgePot] = UGM_CRF_makePotentials(w,Xnode,Xedge,nodeMap,edgeMap,edgeStruct,k);
	nodeBel = UGM_Infer_LBP(nodePot,edgePot,edgeStruct);
	belImg = reshape(nodeBel(:,2),nRows,nCols);
	subplot(2,2,k);
	imagesc(belImg);
	colormap('gray');
end
suptitle('Loopy BP node marginals with SML');