import os
os.environ['CUDA_VISIBLE_DEVICES'] = ''
import tensorflow as tf
import numpy as np

runs = 5


def sample_class_scores(mean, var, n_samples=500):
    """Return Monte Carlo averaged Gaussian draws for every example.

    For each row of ``mean``/``var`` this draws ``n_samples`` values per
    class from independent normal distributions and averages them.

    Args:
        mean: 2-D array ``(n_examples, n_classes)`` of predictive means.
        var: array of the same shape holding predictive *variances*.
        n_samples: number of draws averaged per example.

    Returns:
        Array of shape ``(n_examples, n_classes)`` with the averaged draws.
    """
    # np.random.normal expects a standard deviation as its second argument,
    # so the variance has to be converted before sampling.
    std = np.sqrt(var)
    # Sampling row by row keeps the temporary buffer at
    # (n_samples, n_classes) instead of materialising every draw at once.
    return np.array([
        np.random.normal(mu, sigma, size=(n_samples, mu.shape[0])).mean(axis=0)
        for mu, sigma in zip(mean, std)
    ])


def mean_log_likelihood(scores, labels):
    """Return the mean log of the score assigned to each true label.

    Entries whose logarithm is NaN (i.e. the sampled score was negative)
    are dropped before averaging; ``-inf`` values from a zero score are kept.

    Args:
        scores: 2-D array ``(n_examples, n_classes)``.
        labels: integer class labels, any shape that ravels to ``n_examples``.
    """
    picked = scores[np.arange(len(scores)), np.ravel(labels)]
    log_scores = np.log(picked)
    return log_scores[~np.isnan(log_scores)].mean()


def main():
    """Evaluate the saved predictions of every run and print summary stats.

    Reads ``predictions/<run>/<class>_mean.npy`` and ``..._var.npy`` for the
    ten classes of each run, prints accuracy and log-likelihood per run, then
    the mean and standard deviation of both metrics across runs.
    """
    # The test labels do not depend on the run, so load them only once.
    (_, _), (_, y_test) = tf.keras.datasets.mnist.load_data()
    labels = y_test.ravel()

    all_accuracies, all_ll = [], []
    for run in range(runs):
        # Assumes each file holds a 1-D array with one entry per test
        # example, so stacking and transposing gives (n_examples, 10).
        P = np.array(
            [np.load(f'predictions/{run:d}/{i:d}_mean.npy') for i in range(10)]
        ).T
        V = np.array(
            [np.load(f'predictions/{run:d}/{i:d}_var.npy') for i in range(10)]
        ).T
        if P.shape != V.shape or P.shape[0] != len(labels):
            raise ValueError(
                f'run {run}: predictions of shape {P.shape} / {V.shape} do not '
                f'match {len(labels)} test labels'
            )

        y_pred = sample_class_scores(P, V)
        ll = mean_log_likelihood(y_pred, labels)

        # Accuracy uses the predictive means directly, not the sampled scores.
        accuracy = np.mean(np.argmax(P, axis=1) == labels) * 100
        all_ll.append(ll)
        all_accuracies.append(accuracy)
        print(f'Test accuracy={accuracy:.2f} - LL={ll:.2f}')

    print(np.mean(all_accuracies), np.std(all_accuracies))
    print(np.mean(all_ll), np.std(all_ll))


if __name__ == '__main__':
    main()
