# Install packages not preinstalled in the notebook environment
# (humanize and psutil are needed by mem_report() below)
!pip install GPUtil humanize psutil
import os
import sys
import json
import pickle
import argparse
import importlib as ipb

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim.lr_scheduler as lr_scheduler

import GPUtil
import psutil
import humanize
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
class Logger(object):
def __init__(self, runs, info=None):
self.info = info
self.results = [[] for _ in range(runs)]
def pickle(self, key_save):
f = open(key_save, 'wb')
pickle.dump(self, f, pickle.HIGHEST_PROTOCOL)
f.close()
def unpickle(self, key_save):
with open(key_save, 'rb') as f:
return pickle.load(f)
def add_result(self, run, result):
assert len(result) == 2
assert run >= 0 and run < len(self.results)
self.results[run].append(result)
def print_statistics(self, run=None):
if run is not None:
result = 100 * torch.tensor(self.results[run])
argmax = result[:, 0].argmax().item()
print(f'Run {run + 1:02d}:')
print(f'Highest Train: {result[:, 0].max():.2f}')
print(f'Highest Test: {result[:, 1].max():.2f}')
# Same as highest train, as we have no validation data
print(f' Final Train: {result[argmax, 0]:.2f}')
print(f' Final Test: {result[argmax, 1]:.2f}')
else:
result = 100 * torch.tensor(self.results)
best_results = []
for r in result:
train = r[:, 0].max().item()
test = r[:, 1].max().item()
train2 = r[r[:, 0].argmax(), 0].item()
test2 = r[r[:, 0].argmax(), 1].item()
best_results.append((train, test, train2, test2))
best_result = torch.tensor(best_results)
print(f'All runs:')
r = best_result[:, 0]
print(f'Highest Train: {r.mean():.2f} ± {r.std():.2f}')
r = best_result[:, 1]
print(f'Highest Test: {r.mean():.2f} ± {r.std():.2f}')
r = best_result[:, 2]
print(f' Final Train: {r.mean():.2f} ± {r.std():.2f}')
r = best_result[:, 3]
print(f' Final Test: {r.mean():.2f} ± {r.std():.2f}')
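For reference, a minimal usage sketch of Logger (the accuracy values and file name below are hypothetical; each result is a [train_accuracy, test_accuracy] pair):
# Minimal usage sketch for Logger; values are illustrative only
demo_logger = Logger(runs=2)
demo_logger.add_result(0, [0.90, 0.88])  # run 0: [train_acc, test_acc]
demo_logger.add_result(0, [0.95, 0.91])
demo_logger.add_result(1, [0.89, 0.87])
demo_logger.add_result(1, [0.94, 0.90])
demo_logger.print_statistics(run=0)      # statistics for a single run
demo_logger.print_statistics()           # mean ± std across runs
demo_logger.pickle('demo_logger.pkl')    # persist to disk for later reuse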
def test(model, train_loader, test_loader):
model.eval()
loader = {0: train_loader, 1:test_loader}
accuracies = []
with torch.no_grad():
for data_loader in loader.values():
correct = 0
total = 0
for i, (images, labels) in enumerate(data_loader):
images = images.reshape(-1, input_size).to(device)
labels = labels.to(device)
outputs = model(images)
_, predicted = torch.max(outputs.data, 1)
total += labels.size(0)
correct += (predicted == labels).sum().item()
accuracies.append(correct/total)
return accuracies # train_accu & test accu
def train(model, images, labels):
model.train()
out = model(images)
cost = nn.CrossEntropyLoss()
loss = cost(out, labels)
# loss = F.mse_loss(out.float(), F.one_hot(labels).float())/2
loss.backward()
return loss.item()
def train_SVI(model, images, labels):
#### New lines for SVI ####
    # NOTE: the resets below are necessary; otherwise model.layers_Xtilde and model.layers_grad keep growing as epochs increase
model.layers_Xtilde = []
model.layers_grad = []
model.on_training = True
#### End #####
model.train()
out = model(images)
cost = nn.CrossEntropyLoss()
loss = cost(out, labels)
# loss = F.mse_loss(out.float(), F.one_hot(labels).float())/2
loss.backward(retain_graph=True)
#### New lines for SVI ####
for Xlplus1, Xlplus1grad in zip(model.layers_Xtilde, model.layers_grad):
Xlplus1grad = Xlplus1grad.grad.detach().to(device)
loss_tmp = (Xlplus1*Xlplus1grad).sum()
loss_tmp.backward(retain_graph=True) # To get update direction by MVI
model.on_training = False # To avoid additional .retain_grad()
#### End #####
return loss.item()
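The key point in train_SVI is that each pre-activation Xtilde_{l+1} is computed from a detached copy of the layer input, so backpropagating the surrogate loss (Xtilde_{l+1} * dL/dX_{l+1}).sum() reproduces that layer's parameter gradient while sending nothing to earlier layers. A small stand-alone sanity check of this fact (hypothetical names, not part of the pipeline):
# Sanity check: the surrogate backward matches the layer's ordinary gradient
# and sends nothing to earlier layers (represented here by the input x)
lin = nn.Linear(4, 3)
x = torch.randn(5, 4, requires_grad=True)      # stands in for the previous layer's output
y = torch.randint(0, 3, (5,))
out = lin(x)
out.retain_grad()                              # keep dL/dX_{l+1} after backward
nn.CrossEntropyLoss()(out, y).backward()
grad_ref = lin.weight.grad.clone()             # ordinary dL/dW
x_grad_ref = x.grad.clone()                    # gradient reaching the "earlier layer"
lin.weight.grad = None                         # reset before the surrogate pass
x_tmp = x.detach().clone()                     # detached input, as in FCnet_SVI.forward
loss_tmp = (lin(x_tmp) * out.grad.detach()).sum()
loss_tmp.backward()
print(torch.allclose(lin.weight.grad, grad_ref))  # True: same per-layer gradient
print(torch.equal(x.grad, x_grad_ref))            # True: nothing new reached x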
class FCnet(torch.nn.Module):
def __init__(self, in_channels, hidden_channels, out_channels, num_layers,
dropout):
super(FCnet, self).__init__()
self.convs = torch.nn.ModuleList()
self.convs.append(nn.Linear(in_channels, hidden_channels))
self.bns = torch.nn.ModuleList()
self.bns.append(torch.nn.BatchNorm1d(hidden_channels))
for _ in range(num_layers - 2):
self.convs.append(nn.Linear(hidden_channels, hidden_channels))
self.bns.append(torch.nn.BatchNorm1d(hidden_channels))
self.convs.append(nn.Linear(hidden_channels, out_channels))
self.dropout = dropout
def forward(self, x):
for i, conv in enumerate(self.convs):
if i > 0:
x = self.bns[i-1](x)
x = conv(x)
x = F.relu(x)
if i == len(self.convs[:-1])-1:
x = F.dropout(x, p=self.dropout, training=self.training)
return x.softmax(dim=-1)
class FCnet_SVI(torch.nn.Module):
def __init__(self, in_channels, hidden_channels, out_channels, num_layers,
dropout):
super(FCnet_SVI, self).__init__()
self.convs = torch.nn.ModuleList()
self.convs.append(nn.Linear(in_channels, hidden_channels))
self.bns = torch.nn.ModuleList()
self.bns.append(torch.nn.BatchNorm1d(hidden_channels))
for _ in range(num_layers - 2):
self.convs.append(nn.Linear(hidden_channels, hidden_channels))
self.bns.append(torch.nn.BatchNorm1d(hidden_channels))
self.convs.append(nn.Linear(hidden_channels, out_channels))
self.dropout = dropout
#### New lines for SVI ####
        # layers_with_SVI[i] == True means SVI is applied at layer i
        self.layers_with_SVI = [True for i in range(len(self.convs))]
        # TODO: could make this a constructor argument, since not every layer needs SVI
# Append pre-activation \tilde{X}_{l+1}, ONLY at layers i where self.layers_with_SVI[i] == True
self.layers_Xtilde = []
# Append the grad of L w.r.t. X_{l+1}, ONLY at layers i where self.layers_with_SVI[i] == True
self.layers_grad = []
self.on_training = True
#### End #####
def forward(self, x):
for i, conv in enumerate(self.convs):
if i > 0:
x = self.bns[i-1](x)
#### New lines for SVI ####
            # Detach the input here; otherwise backprop through the surrogate loss would also reach earlier layers (undesirable)
if self.layers_with_SVI[i] and self.on_training:
x_tmp = x.detach().clone().to(device)
self.layers_Xtilde.append(conv(x_tmp))
#### End #####
x = conv(x)
x = F.relu(x)
if i == len(self.convs[:-1])-1:
x = F.dropout(x, p=self.dropout, training=self.training)
#### New lines for SVI ####
if self.layers_with_SVI[i] and self.on_training:
x.retain_grad() # To get the gradient with respect to output
self.layers_grad.append(x)
#### End #####
return x.softmax(dim=-1)
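Before the full experiment loop, here is a hedged, self-contained sketch (dummy data, illustrative shapes and hyperparameters) of how FCnet_SVI and train_SVI are meant to be used together for one optimization step; note that train_SVI resets layers_Xtilde and layers_grad on every call:
# One SVI step on dummy data (illustrative only; the real training loop is below)
_model = FCnet_SVI(in_channels=32, hidden_channels=16, out_channels=10,
                   num_layers=3, dropout=0.0).to(device)
_opt = torch.optim.SGD(_model.parameters(), lr=0.01)
_x = torch.randn(8, 32).to(device)         # batch of 8 flattened inputs
_y = torch.randint(0, 10, (8,)).to(device)
_opt.zero_grad()
_loss = train_SVI(_model, _x, _y)          # original backward + per-layer surrogate backwards
_opt.step()                                # apply the accumulated update direction
print(f'Dummy SVI step, loss = {_loss:.4f}')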
def mem_report():
if device.type == 'cuda':
GPUs = GPUtil.getGPUs()
for i, gpu in enumerate(GPUs):
print('GPU {:d} ... Mem Free: {:.0f}MB / {:.0f}MB | Utilization {:3.0f}%'.format(
i, gpu.memoryFree, gpu.memoryTotal, gpu.memoryUtil*100))
else:
print("CPU RAM Free: "
+ humanize.naturalsize(psutil.virtual_memory().available))
import torchvision
import torchvision.transforms as transforms
def subset_data(data,frac=1):
# Randomly subset a fraction of data from total data
np.random.seed(1103)
idx = np.random.choice(len(data),int(frac*len(data)),replace=False)
return torch.utils.data.Subset(data,idx)
data_fixed = 'CIFAR10_batched' # 'MNIST_batched' or 'CIFAR10_batched'
if 'MNIST' in data_fixed:
train_dataset0 = torchvision.datasets.MNIST(root = './data',
train = True,
transform = transforms.Compose([
transforms.Resize((32,32)),
transforms.ToTensor(),
transforms.Normalize(mean = (0.1307,), std = (0.3081,))]),
download = True)
test_dataset0 = torchvision.datasets.MNIST(root = './data',
train = False,
transform = transforms.Compose([
transforms.Resize((32,32)),
transforms.ToTensor(),
transforms.Normalize(mean = (0.1325,), std = (0.3105,))]),
download=True)
num_classes = 10
in_channels = 1
else:
# NOTE, CIFAR10 has color channels, so input size = 3*32*32 with FC net
    train_dataset0 = torchvision.datasets.CIFAR10(root='./data',
                                                  train=True,
                                                  transform=transforms.ToTensor(),
                                                  download=True)
    test_dataset0 = torchvision.datasets.CIFAR10(root='./data',
                                                 train=False,
                                                 transform=transforms.ToTensor(),
                                                 download=True)
num_classes = 10
in_channels = 3
input_size = torch.prod(torch.tensor(train_dataset0[0][0].shape)).item()
frac = 0.1 if 'MNIST' in data_fixed else 0.2
frac = 1  # Override: use the full dataset (comment out this line to subsample)
train_dataset = subset_data(train_dataset0,frac=frac)
test_dataset = subset_data(test_dataset0,frac=frac)
print(len(train_dataset))
print(len(test_dataset))
# Data loaders (i.e., the split into mini-batches) are created below
Start testing
SVI_ls = [True, False] # If False, use ordinary SGD or Adam
optim_ls = ['SGD','Adam']
lr, hidden_channels = 0.001, 512
FC_only = True  # Use a fully connected network if True; otherwise use LeNet
dataname = data_fixed
if FC_only == False:
dataname = data_fixed + 'LeNet_'
# Default batch_size uses ALL training data, i.e., full-batch training (no mini-batches)
batch_size = int(len(train_dataset))
num_runs = 3
num_epochs, num_log_steps = 100, 10
if 'batch' in dataname:
batch_size = 64 # powers of 2 to take advantage of GPU
    # Evaluate every 'num_log_steps' effective iterations (mini-batch updates); chosen to be close to the GCD of the total effective iterations on MNIST and CIFAR-10 at this fixed batch size
num_epochs, num_log_steps = 10, 85
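For bookkeeping (an illustrative sketch only): one "effective iteration" here is one mini-batch update, so the totals quoted with the plots below are on the order given by the dataset size, batch size, and number of epochs.
import math
iters_per_epoch = math.ceil(len(train_dataset) / batch_size)  # mini-batch updates per epoch
print(f'~{iters_per_epoch} iterations/epoch, '
      f'~{iters_per_epoch * num_epochs} effective iterations in total, '
      f'evaluating every {num_log_steps} iterations')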
if __name__ == "__main__":
Final_result = {} # For quick check at the end of training
for use_SVI in SVI_ls:
for optim_name in optim_ls:
result_dict = {'SVI-SGD': [],
'SVI-Adam': [], 'SGD': [], 'Adam': []}
parser = argparse.ArgumentParser(
description=dataname)
parser.add_argument('--log_steps', type=int, default=num_log_steps)
parser.add_argument('--num_layers', type=int, default=4)
parser.add_argument('--dropout', type=float, default=0.25)
parser.add_argument('--lr', type=float, default=lr)
parser.add_argument('--momentum', type=float, default=0.95)
parser.add_argument(
'--epochs', type=int, default=num_epochs) # Change to 100
parser.add_argument(
'--batch_size', type=int, default=batch_size)
parser.add_argument('--runs', type=int,
default=num_runs)
parser.add_argument('--SVI', type=bool, default=use_SVI)
parser.add_argument('--FC', type=bool, default=FC_only)
parser.add_argument(
'--optimizer', type=str, default=optim_name)
            parser.add_argument('-f')  # Absorb the '-f <kernel file>' argument that Jupyter passes
args = parser.parse_args()
args.hidden_channels = hidden_channels
print(args)
# Get data loader from dataset
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
batch_size=batch_size,
shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
batch_size=batch_size,
shuffle=False)
logger = Logger(args.runs, args)
results_over_runs = {}
for run in range(args.runs):
accu_at_run = []
                # Instantiate a new model for each run
torch.manual_seed(1103+run)
if args.SVI:
if args.FC:
model = FCnet_SVI(input_size, args.hidden_channels,
num_classes, args.num_layers, args.dropout).to(device)
else:
model = LeNet5_SVI(in_channels, num_classes).to(device)
else:
if args.FC:
model = FCnet(input_size, args.hidden_channels,
num_classes, args.num_layers, args.dropout).to(device)
else:
model = LeNet5(in_channels, num_classes).to(device)
if args.optimizer == 'SGD':
optimizer = torch.optim.SGD(
model.parameters(), lr=args.lr, momentum=args.momentum, nesterov=True)
else:
optimizer = torch.optim.Adam(
model.parameters(), lr=args.lr)
for epoch in range(1, 1 + args.epochs):
if device.type == 'cuda':
# Useful to avoid GPU allocation excess
torch.cuda.empty_cache()
print(f"LR is {optimizer.param_groups[0]['lr']}")
for i, (images, labels) in enumerate(train_loader):
optimizer.zero_grad()
if args.FC:
images = images.reshape(-1, input_size).to(device)
images = images.to(device)
labels = labels.to(device)
if args.SVI:
print(
f'SVI-{args.optimizer} training at batch {i}, epoch {epoch}')
loss = train_SVI(model, images, labels)
else:
print(
f'{args.optimizer} training at batch {i}, epoch {epoch}')
loss = train(model, images, labels)
optimizer.step()
mem_report()
# Test at each iteration
if i % args.log_steps == 0:
if i == 0 and epoch > 1:
continue
print('Testing')
# Do so because training data has too many images, and evaluation thus takes too long.
train_loader_sub = torch.utils.data.DataLoader(dataset=subset_data(train_dataset0,frac=0.1),
batch_size=batch_size,
shuffle=True)
result = test(model, train_loader_sub, test_loader)
logger.add_result(run, result)
train_acc, test_acc = result
accu_at_run += [[train_acc, test_acc]]
print(f'Run: {run + 1:02d}, '
f'Epoch: {epoch:02d}, '
f'Loss: {loss:.4f}, '
f'Train: {100 * train_acc:.2f}%,'
f'Test: {100 * test_acc:.2f}%')
# # Test at each epoch
# if epoch % args.log_steps == 0:
# print('Testing')
# result = test(model, train_loader, test_loader)
# logger.add_result(run, result)
# train_acc, test_acc = result
# accu_at_run += [[train_acc, test_acc]]
# print(f'Run: {run + 1:02d}, '
# f'Epoch: {epoch:02d}, '
# f'Loss: {loss:.4f}, '
# f'Train: {100 * train_acc:.2f}%,'
# f'Test: {100 * test_acc:.2f}%')
                # np.array(accu_at_run) would give an (num_evals x 2) matrix, but NumPy arrays are not JSON-serializable, so the list version is saved instead
results_over_runs[f'lr={args.lr}@Run{run+1}'] = accu_at_run
logger.print_statistics(run)
logger.print_statistics()
# Save results
key = f'SVI-{optim_name}' if use_SVI else optim_name
key_save = f'SVI-{optim_name}-{args.num_layers}layers-{args.hidden_channels}nodes-{args.lr}LR' if use_SVI else f'{optim_name}-{args.num_layers}layers-{args.hidden_channels}nodes-{args.lr}LR'
key_save = dataname + key_save
            # Save the logger to file so results can be reloaded later without retraining
logger.pickle(key_save)
result_dict[key].append(results_over_runs)
# Final train and test accuracy
Final_result[key] = accu_at_run[-1]
with open(f"{key_save}.json", "w") as outfile:
json.dump(result_dict, outfile)
Final_result # Check prelim result at the latest run by all methods
Note: as long as the results have been saved to file, the cells below can be run directly without retraining.
def get_mean(result_dict, stop_epoch, return_full=False):
    # stop_epoch: number of logged evaluation points to include in the statistics
full_res = [np.array(i)[:stop_epoch] for i in result_dict.values()]
if return_full:
full_mean = np.round(100*np.mean(full_res, axis=0), 2)
full_std = np.round(100*np.std(full_res, axis=0), 2)
return [full_mean, full_std] # For plot
else:
        idx = [np.argmax(i[:, 1]) for i in full_res]  # Index at which test accuracy is highest, per run
vals = [i[j] for i, j in zip(full_res, idx)]
means = np.round(100*np.mean(vals, axis=0), decimals=2)
stds = np.round(100*np.std(vals, axis=0), decimals=2)
return [means, stds] # For table
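A small illustrative call to get_mean on a toy result_dict (the accuracy values are made up), showing its two return modes:
# Toy example for get_mean; accuracies are in [0, 1] as stored by the training loop
_toy = {'lr=0.001@Run1': [[0.90, 0.88], [0.95, 0.91]],
        'lr=0.001@Run2': [[0.89, 0.87], [0.94, 0.92]]}
print(get_mean(_toy, stop_epoch=2, return_full=False))  # [mean, std] at each run's best-test step
print(get_mean(_toy, stop_epoch=2, return_full=True))   # per-step [mean, std] curves for plotting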
def plot_correct_split(key_SVI, key_SVI_Adam, key_SGD, key_Adam, model_architect, lr0, full_epoch=True, return_early=False, frac = 0.2):
'''
full_epoch: Decides if we plot the entire trajectory or just the initial segments
'''
result_dict = json.load(open(f'{key_SVI}.json'))
results_over_runs_SVI = result_dict['SVI-SGD'][0]
result_dict = json.load(open(f'{key_SVI_Adam}.json'))
results_over_runs_SVI_Adam = result_dict['SVI-Adam'][0]
result_dict = json.load(open(f'{key_SGD}.json'))
results_over_runs_SGD = result_dict['SGD'][0]
result_dict = json.load(open(f'{key_Adam}.json'))
results_over_runs_Adam = result_dict['Adam'][0]
tot_epoch = len(results_over_runs_SVI[f'lr={lr0[1:]}@Run1'])
stop_epoch = tot_epoch if full_epoch else int(frac*tot_epoch)
if return_early:
results_SVI = get_mean(results_over_runs_SVI, stop_epoch, return_full=False)
results_SVI_Adam = get_mean(results_over_runs_SVI_Adam, stop_epoch, return_full=False)
results_SGD = get_mean(results_over_runs_SGD, stop_epoch, return_full=False)
results_Adam = get_mean(results_over_runs_Adam, stop_epoch, return_full=False)
# NOTE, they include train test mean and std.
res_mean = np.concatenate(
[val[0] for val in [results_SVI, results_SGD, results_SVI_Adam, results_Adam]])
res_std = np.concatenate(
[val[1] for val in [results_SVI, results_SGD, results_SVI_Adam, results_Adam]])
return [res_mean, res_std]
else:
results_SVI = get_mean(results_over_runs_SVI, stop_epoch, return_full=True)
results_SVI_Adam = get_mean(results_over_runs_SVI_Adam, stop_epoch, return_full=True)
results_SGD = get_mean(results_over_runs_SGD, stop_epoch, return_full=True)
results_Adam = get_mean(results_over_runs_Adam, stop_epoch, return_full=True)
res_orig = [results_over_runs_SVI, results_over_runs_SGD,
results_over_runs_SVI_Adam, results_over_runs_Adam]
mpl.rcParams['font.size'] = 20
mpl.rcParams['axes.titlesize'] = 20
fig, ax = plt.subplots(1, 4, figsize=(36, 4), sharey=True, sharex=True)
mtd_dict = {0: 'SVI', 1: 'SGD', 2: 'SVI-Adam', 3: 'Adam'}
type_dict = {0: 'Train', 1: 'Test'}
print(f'LR={lr0[1:]} up to {stop_epoch*num_log_steps} effective iterations')
for i, result in enumerate([results_SVI, results_SGD, results_SVI_Adam, results_Adam]):
res_mean, res_std = result
res_mean_final, res_std_final = get_mean(res_orig[i], stop_epoch, return_full=False)
xtick = np.arange(stop_epoch)*num_log_steps
for j in range(len(type_dict)): # Train, Test
ax[i].plot(xtick,res_mean[:, j], label=type_dict[j])
ax[i].fill_between(xtick, res_mean[:, j]-res_std[:, j],
res_mean[:, j]+res_std[:, j], alpha=0.3)
if j == 0:
print(mtd_dict[i])
ax[i].set_title(
f'{mtd_dict[i]} Final Train {res_mean_final[0]}'+r'$\pm$'+f'{res_std_final[0]}'+f', Test: {res_mean_final[1]}'+r'$\pm$'+f'{res_std_final[1]}')
ax[i].legend(loc='lower right', ncol=3)
ax[i].grid(True)
fig.tight_layout()
full_e = '' if full_epoch else '_sub_epoch'
fig.savefig(f'{dataname}SVI_SGD_Adam_vs_SGD_Adam{model_architect}{lr0}LR{full_e}.pdf',
dpi=300, bbox_inches='tight', pad_inches=0)
plt.show()
# NOTE: I tried to "pause_SVI" after a certain number of epochs, but the results were not as good.
model_architect = '-4layers-512nodes'
lrs=['-0.001']
# NOTE: the first 8 columns hold means and the remaining 8 hold standard deviations.
full_result = np.zeros((len(lrs), 16))
# records results over initial training epochs.
full_result_sub = np.zeros((len(lrs), 16))
dataname = 'MNIST_batched'
frac = 0.05 # frac controls % of initial convergence
if FC_only == False:
dataname = dataname + 'LeNet_'
for i, lr0 in enumerate(lrs):
key_SVI = f'{dataname}SVI-SGD{model_architect}{lr0}LR'
key_SVI_Adam = f'{dataname}SVI-Adam{model_architect}{lr0}LR'
key_SGD = f'{dataname}SGD{model_architect}{lr0}LR'
key_Adam = f'{dataname}Adam{model_architect}{lr0}LR'
full_result[i] = np.concatenate(plot_correct_split(key_SVI, key_SVI_Adam, key_SGD, key_Adam,
model_architect, lr0, full_epoch=True, return_early=True))
full_result_sub[i] = np.concatenate(plot_correct_split(key_SVI, key_SVI_Adam, key_SGD, key_Adam,
model_architect, lr0, full_epoch=False, return_early=True, frac = frac))
full_tmp = np.zeros((len(lrs), 8), dtype=object)
full_tmp_sub = np.zeros((len(lrs), 8), dtype=object)
full_result = np.round(full_result,1)
full_result_sub = np.round(full_result_sub,1)
for i in range(len(lrs)):
for j in range(8):
full_tmp[i, j] = f'{full_result[i, j]} ({full_result[i, 8+j]})'
full_tmp_sub[i, j] = f'{full_result_sub[i, j]} ({full_result_sub[i, 8+j]})'
full_result = full_tmp
full_result_sub = full_tmp_sub
mtds = ['SVI', 'SGD', 'SVI-Adam', 'Adam']
types = ['Train', 'Test']  # Renamed from `type` to avoid shadowing the built-in
tuples = list(zip(*[np.repeat(mtds, len(types)), np.tile(types, len(mtds))]))
index = pd.MultiIndex.from_tuples(tuples)
lr_names = [lr[1:] for lr in lrs]
print('Result over all epochs')
full_result = pd.DataFrame(full_result, index=lr_names, columns=index) # Index are LRs
print(full_result.round(1).to_latex())
full_result
Result over all epochs (MNIST, accuracy % as mean (std) over runs):

| LR    | SVI Train  | SVI Test   | SGD Train  | SGD Test   | SVI-Adam Train | SVI-Adam Test | Adam Train | Adam Test  |
|-------|------------|------------|------------|------------|----------------|---------------|------------|------------|
| 0.001 | 99.1 (0.1) | 98.1 (0.1) | 99.1 (0.1) | 98.2 (0.0) | 97.0 (0.0)     | 96.6 (0.1)    | 97.9 (0.1) | 97.5 (0.0) |
print(f'Result over initial {100*frac}% epochs')
full_result_sub = pd.DataFrame(full_result_sub, index=lr_names, columns=index) # Index are LRs
print(full_result_sub.round(1).to_latex())
full_result_sub
Result over initial 5.0% epochs (MNIST, accuracy % as mean (std) over runs):

| LR    | SVI Train  | SVI Test   | SGD Train  | SGD Test   | SVI-Adam Train | SVI-Adam Test | Adam Train | Adam Test  |
|-------|------------|------------|------------|------------|----------------|---------------|------------|------------|
| 0.001 | 93.5 (0.1) | 93.7 (0.1) | 92.3 (0.2) | 92.6 (0.2) | 92.4 (0.3)     | 92.4 (0.3)    | 92.6 (0.3) | 92.9 (0.2) |
# Then make detailed plots. ONLY at non-FC model (FC=False) and no change in train size (change_ratio=False).
# See Figure 1 of rebuttal
model_architect='-4layers-512nodes'
lrs=['-0.001']
for lr0 in lrs:
key_SVI = f'{dataname}SVI-SGD{model_architect}{lr0}LR'
key_SVI_Adam = f'{dataname}SVI-Adam{model_architect}{lr0}LR'
key_SGD = f'{dataname}SGD{model_architect}{lr0}LR'
key_Adam = f'{dataname}Adam{model_architect}{lr0}LR'
plot_correct_split(key_SVI, key_SVI_Adam, key_SGD, key_Adam, model_architect, lr0, full_epoch=True)
plot_correct_split(key_SVI, key_SVI_Adam, key_SGD, key_Adam, model_architect, lr0, full_epoch=False, frac = frac)
Figure output (MNIST, LR=0.001): train/test accuracy curves for SVI, SGD, SVI-Adam, and Adam, over the full run (up to 9435 effective iterations) and over the initial segment (up to 425 effective iterations).
# NOTE: I tried to "pause_SVI" after a certain number of epochs, but the results were not as good.
model_architect = '-4layers-512nodes'
lrs=['-0.001']
# NOTE: the first 8 columns hold means and the remaining 8 hold standard deviations.
full_result = np.zeros((len(lrs), 16))
# records results over initial training epochs.
full_result_sub = np.zeros((len(lrs), 16))
dataname = 'CIFAR10_batched'
frac = 0.05 # frac controls % of initial convergence
if FC_only == False:
dataname = dataname + 'LeNet_'
for i, lr0 in enumerate(lrs):
key_SVI = f'{dataname}SVI-SGD{model_architect}{lr0}LR'
key_SVI_Adam = f'{dataname}SVI-Adam{model_architect}{lr0}LR'
key_SGD = f'{dataname}SGD{model_architect}{lr0}LR'
key_Adam = f'{dataname}Adam{model_architect}{lr0}LR'
full_result[i] = np.concatenate(plot_correct_split(key_SVI, key_SVI_Adam, key_SGD, key_Adam,
model_architect, lr0, full_epoch=True, return_early=True))
full_result_sub[i] = np.concatenate(plot_correct_split(key_SVI, key_SVI_Adam, key_SGD, key_Adam,
model_architect, lr0, full_epoch=False, return_early=True, frac = frac))
full_tmp = np.zeros((len(lrs), 8), dtype=object)
full_tmp_sub = np.zeros((len(lrs), 8), dtype=object)
full_result = np.round(full_result,1)
full_result_sub = np.round(full_result_sub,1)
for i in range(len(lrs)):
for j in range(8):
full_tmp[i, j] = f'{full_result[i, j]} ({full_result[i, 8+j]})'
full_tmp_sub[i, j] = f'{full_result_sub[i, j]} ({full_result_sub[i, 8+j]})'
full_result = full_tmp
full_result_sub = full_tmp_sub
mtds = ['SVI', 'SGD', 'SVI-Adam', 'Adam']
types = ['Train', 'Test']  # Renamed from `type` to avoid shadowing the built-in
tuples = list(zip(*[np.repeat(mtds, len(types)), np.tile(types, len(mtds))]))
index = pd.MultiIndex.from_tuples(tuples)
lr_names = [lr[1:] for lr in lrs]
print('Result over all epochs')
full_result = pd.DataFrame(full_result, index=lr_names, columns=index) # Index are LRs
print(full_result.round(1).to_latex())
full_result
Result over all epochs (CIFAR-10, accuracy % as mean (std) over runs):

| LR    | SVI Train  | SVI Test   | SGD Train  | SGD Test   | SVI-Adam Train | SVI-Adam Test | Adam Train | Adam Test  |
|-------|------------|------------|------------|------------|----------------|---------------|------------|------------|
| 0.001 | 55.8 (0.8) | 50.0 (0.0) | 55.5 (0.7) | 49.8 (0.2) | 48.8 (1.2)     | 46.5 (0.6)    | 48.4 (0.6) | 45.8 (0.2) |
print(f'Result over initial {100*frac}% epochs')
full_result_sub = pd.DataFrame(full_result_sub, index=lr_names, columns=index) # Index are LRs
print(full_result_sub.round(1).to_latex())
full_result_sub
Result over initial 5.0% epochs (CIFAR-10, accuracy % as mean (std) over runs):

| LR    | SVI Train  | SVI Test   | SGD Train  | SGD Test   | SVI-Adam Train | SVI-Adam Test | Adam Train | Adam Test  |
|-------|------------|------------|------------|------------|----------------|---------------|------------|------------|
| 0.001 | 36.9 (0.6) | 37.7 (0.1) | 34.6 (0.4) | 35.5 (0.6) | 34.5 (0.9)     | 34.6 (0.7)    | 33.2 (1.2) | 33.5 (1.8) |
# Then make detailed plots. ONLY at non-FC model (FC=False) and no change in train size (change_ratio=False).
# See Figure 1 of rebuttal
model_architect='-4layers-512nodes'
lrs=['-0.001']
for lr0 in lrs:
key_SVI = f'{dataname}SVI-SGD{model_architect}{lr0}LR'
key_SVI_Adam = f'{dataname}SVI-Adam{model_architect}{lr0}LR'
key_SGD = f'{dataname}SGD{model_architect}{lr0}LR'
key_Adam = f'{dataname}Adam{model_architect}{lr0}LR'
plot_correct_split(key_SVI, key_SVI_Adam, key_SGD, key_Adam, model_architect, lr0, full_epoch=True)
plot_correct_split(key_SVI, key_SVI_Adam, key_SGD, key_Adam, model_architect, lr0, full_epoch=False, frac = frac)
Figure output (CIFAR-10, LR=0.001): train/test accuracy curves for SVI, SGD, SVI-Adam, and Adam, over the full run (up to 7735 effective iterations) and over the initial segment (up to 340 effective iterations).