import torch
import numpy as np
import torch.nn as nn
import math

# Experiment configuration (module-level names kept as in the original script).
size = 128  # length of the signal vector and side of each square weight matrix
N = 1000    # number of independent random trials
L = 10      # number of stacked layers per trial


def simulate(size, num_trials, num_layers, noise_scale=10.0):
    """Measure how a random signal's mean/std evolve through stacked random layers.

    For each trial a Bernoulli(0.5) vector of length ``size`` is drawn and
    pushed through ``num_layers`` freshly sampled ``size x size`` layers.  In
    every layer the weight matrix is initialised with
    ``nn.init.kaiming_uniform_(..., nonlinearity='sigmoid')``, perturbed with
    Gumbel noise scaled by ``noise_scale``, row-softmaxed, and multiplied with
    the current signal.  The mean and standard deviation of the resulting
    vector are recorded after every layer.

    Args:
        size: Length of the signal vector / side of the weight matrix.
        num_trials: Number of independent trials to run.
        num_layers: Number of layers applied per trial.
        noise_scale: Multiplier applied to the Gumbel noise before it is
            added to the weights (the original script hard-coded 10).

    Returns:
        A pair ``(means, stds)`` of float64 NumPy arrays, each of shape
        ``(num_layers, num_trials)``; entry ``[i, n]`` is the statistic of
        the signal after layer ``i`` in trial ``n``.
    """
    means = np.empty((num_layers, num_trials))
    stds = np.empty((num_layers, num_trials))

    # Only forward statistics are collected, so no autograd bookkeeping is needed.
    with torch.no_grad():
        for n in range(num_trials):
            x = torch.bernoulli(torch.full((size,), 0.5))
            # x = torch.randn(size)

            for i in range(num_layers):
                # Standard-normal baseline; the active init below overwrites
                # it in place.  It is kept so that commenting out every init
                # still leaves `a` with well-defined values.
                a = torch.randn((size, size))
                # a = torch.Tensor(size, size).uniform_(-1, 1)

                # --- weight initialisation: enable exactly one ---
                # nn.init.kaiming_uniform_(a, nonlinearity='tanh')
                nn.init.kaiming_uniform_(a, nonlinearity='sigmoid')
                # nn.init.kaiming_normal_(a, nonlinearity='tanh')
                # nn.init.kaiming_normal_(a, nonlinearity='sigmoid')
                # torch.nn.init.xavier_normal_(a, gain=nn.init.calculate_gain('sigmoid'))
                # torch.nn.init.xavier_uniform_(a, gain=nn.init.calculate_gain('sigmoid'))
                # nn.init.uniform_(a, a=0.0001, b=1)
                # nn.init.orthogonal_(a, gain=nn.init.calculate_gain('sigmoid'))
                # a = a / (a.softmax(-1) @ x).std()

                # --- layer forward pass: enable exactly one ---
                # x = (a @ x).clamp_min(0.)
                # x = torch.tanh(a @ x)
                # x = torch.sigmoid(a @ x)
                # x = a.softmax(-1) @ x
                # -log(E) with E ~ Exponential(1) is a standard Gumbel sample.
                gumbel = -torch.empty_like(
                    a, memory_format=torch.legacy_contiguous_format
                ).exponential_().log()
                x = (gumbel * noise_scale + a).softmax(-1) @ x

                # Store plain Python floats rather than 0-d tensors so the
                # later NumPy reductions operate on a real numeric array.
                means[i, n] = x.mean().item()
                stds[i, n] = x.std().item()

    return means, stds


if __name__ == "__main__":
    mean_samples, std_samples = simulate(size, N, L)

    # Grand average over every (layer, trial) sample.
    print(mean_samples.mean(), std_samples.mean())

    # Average over trials, leaving one value per layer.
    mean_per_layer = mean_samples.mean(-1)
    std_per_layer = std_samples.mean(-1)

    print(mean_per_layer)
    print(std_per_layer)



