#  Copyright 2021 The PlenOctree Authors.
#  Redistribution and use in source and binary forms, with or without
#  modification, are permitted provided that the following conditions are met:
#
#  1. Redistributions of source code must retain the above copyright notice,
#  this list of conditions and the following disclaimer.
#
#  2. Redistributions in binary form must reproduce the above copyright notice,
#  this list of conditions and the following disclaimer in the documentation
#  and/or other materials provided with the distribution.
#
#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
#  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
#  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
#  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
#  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
#  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
#  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
#  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
#  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
#  POSSIBILITY OF SUCH DAMAGE.
import math
import torch
import time
import numpy as np
import torch.nn.functional as F
from utils.brdf_utils import rendering_equation
#from sphericalHarmonics import shEvaluate
# Constant factors of the hardcoded real spherical-harmonic polynomials used by
# eval_sh and eval_sh_coef. Each table is ordered like the coefficient indices
# it multiplies; the signs of the band-1 terms are applied at the use sites.

# Band 0: multiplies coefficient index 0.
C0 = 0.28209479177387814
# Band 1: shared magnitude for coefficient indices 1-3.
C1 = 0.4886025119029199
# Band 2: C2[i] multiplies coefficient index 4 + i.
C2 = [
    1.0925484305920792,
    -1.0925484305920792,
    0.31539156525252005,
    -1.0925484305920792,
    0.5462742152960396
]
# Band 3: C3[i] multiplies coefficient index 9 + i.
C3 = [
    -0.5900435899266435,
    2.890611442640554,
    -0.4570457994644658,
    0.3731763325901154,
    -0.4570457994644658,
    1.445305721320277,
    -0.5900435899266435
]
# Band 4: C4[i] multiplies coefficient index 16 + i.
C4 = [
    2.5033429417967046,
    -1.7701307697799304,
    0.9461746957575601,
    -0.6690465435572892,
    0.10578554691520431,
    -0.6690465435572892,
    0.47308734787878004,
    -1.7701307697799304,
    0.6258357354491761,
]

# C0=torch.tensor(C0,device="cuda")
# C1=torch.tensor(C1,device="cuda")
# C2=torch.tensor(C2,device="cuda")
# C3=torch.tensor(C3,device="cuda")
# C4=torch.tensor(C4,device="cuda")
def eval_sh(deg, sh, dirs):
    """
    Evaluate a spherical-harmonic expansion at the given directions using
    hardcoded SH polynomials.

    Only indexing and elementwise arithmetic are used, so any array type
    supporting those (torch / numpy / jax) can be passed.
    ... can be 0 or more batch dimensions.
    Args:
        deg: int SH degree, 0-4 supported
        sh: SH coefficients [..., C, K] with K >= (deg + 1) ** 2
        dirs: directions [..., 3] (expected to be unit length)
    Returns:
        [..., C]
    """
    assert 0 <= deg <= 4
    assert sh.shape[-1] >= (deg + 1) ** 2

    # Band 0 is direction independent.
    out = C0 * sh[..., 0]
    if deg <= 0:
        return out

    # Keep a trailing axis of size 1 so the components broadcast against C.
    x, y, z = dirs[..., 0:1], dirs[..., 1:2], dirs[..., 2:3]
    out = (out -
           C1 * y * sh[..., 1] +
           C1 * z * sh[..., 2] -
           C1 * x * sh[..., 3])
    if deg <= 1:
        return out

    xx, yy, zz = x * x, y * y, z * z
    xy, yz, xz = x * y, y * z, x * z
    out = (out +
           C2[0] * xy * sh[..., 4] +
           C2[1] * yz * sh[..., 5] +
           C2[2] * (2.0 * zz - xx - yy) * sh[..., 6] +
           C2[3] * xz * sh[..., 7] +
           C2[4] * (xx - yy) * sh[..., 8])
    if deg <= 2:
        return out

    out = (out +
           C3[0] * y * (3 * xx - yy) * sh[..., 9] +
           C3[1] * xy * z * sh[..., 10] +
           C3[2] * y * (4 * zz - xx - yy) * sh[..., 11] +
           C3[3] * z * (2 * zz - 3 * xx - 3 * yy) * sh[..., 12] +
           C3[4] * x * (4 * zz - xx - yy) * sh[..., 13] +
           C3[5] * z * (xx - yy) * sh[..., 14] +
           C3[6] * x * (xx - 3 * yy) * sh[..., 15])
    if deg <= 3:
        return out

    out = (out + C4[0] * xy * (xx - yy) * sh[..., 16] +
           C4[1] * yz * (3 * xx - yy) * sh[..., 17] +
           C4[2] * xy * (7 * zz - 1) * sh[..., 18] +
           C4[3] * yz * (7 * zz - 3) * sh[..., 19] +
           C4[4] * (zz * (35 * zz - 30) + 3) * sh[..., 20] +
           C4[5] * xz * (7 * zz - 3) * sh[..., 21] +
           C4[6] * (xx - yy) * (7 * zz - 1) * sh[..., 22] +
           C4[7] * xz * (xx - 3 * yy) * sh[..., 23] +
           C4[8] * (xx * (xx - 3 * yy) - yy * (3 * xx - yy)) * sh[..., 24])
    return out
def eval_sh_coef(deg, dirs):
    """
    Evaluate the hardcoded SH basis polynomials themselves (no coefficients)
    at the given directions.

    ... can be 0 or more batch dimensions.
    Args:
        deg: int SH degree, 0-4 supported
        dirs: torch.Tensor of directions [..., 3] (expected to be unit length)
    Returns:
        torch.Tensor [..., (deg + 1) ** 2] on dirs.device, created with
        torch's default dtype
    """
    assert 4 >= deg >= 0
    n_basis = (deg + 1) ** 2
    basis = torch.zeros(dirs.shape[:-1] + (n_basis,), device=dirs.device)

    # Band 0 is a constant.
    basis[..., 0] = C0
    if deg <= 0:
        return basis

    x, y, z = dirs[..., 0], dirs[..., 1], dirs[..., 2]
    basis[..., 1] = -C1 * y
    basis[..., 2] = C1 * z
    basis[..., 3] = -C1 * x
    if deg <= 1:
        return basis

    xx, yy, zz = x * x, y * y, z * z
    xy, yz, xz = x * y, y * z, x * z
    basis[..., 4] = C2[0] * xy
    basis[..., 5] = C2[1] * yz
    basis[..., 6] = C2[2] * (2.0 * zz - xx - yy)
    basis[..., 7] = C2[3] * xz
    basis[..., 8] = C2[4] * (xx - yy)
    if deg <= 2:
        return basis

    basis[..., 9] = C3[0] * y * (3 * xx - yy)
    basis[..., 10] = C3[1] * xy * z
    basis[..., 11] = C3[2] * y * (4 * zz - xx - yy)
    basis[..., 12] = C3[3] * z * (2 * zz - 3 * xx - 3 * yy)
    basis[..., 13] = C3[4] * x * (4 * zz - xx - yy)
    basis[..., 14] = C3[5] * z * (xx - yy)
    basis[..., 15] = C3[6] * x * (xx - 3 * yy)
    if deg <= 3:
        return basis

    basis[..., 16] = C4[0] * xy * (xx - yy)
    basis[..., 17] = C4[1] * yz * (3 * xx - yy)
    basis[..., 18] = C4[2] * xy * (7 * zz - 1)
    basis[..., 19] = C4[3] * yz * (7 * zz - 3)
    basis[..., 20] = C4[4] * (zz * (35 * zz - 30) + 3)
    basis[..., 21] = C4[5] * xz * (7 * zz - 3)
    basis[..., 22] = C4[6] * (xx - yy) * (7 * zz - 1)
    basis[..., 23] = C4[7] * xz * (xx - 3 * yy)
    basis[..., 24] = C4[8] * (xx * (xx - 3 * yy) - yy * (3 * xx - yy))
    return basis
def RGB2SH(rgb):
    """Convert a colour value to a band-0 SH coefficient: (rgb - 0.5) / C0."""
    centered = rgb - 0.5
    return centered / C0

def SH2RGB(sh):
    """Convert a band-0 SH coefficient back to a colour value: sh * C0 + 0.5."""
    scaled = sh * C0
    return scaled + 0.5
    
def ToVector(phi, theta):
    """
    Convert spherical angles to Cartesian unit vectors.

    Args:
        phi: torch.Tensor azimuth angle, at least 3-D (components are
            concatenated along dim 2)
        theta: torch.Tensor polar angle measured from +z, same shape as phi
    Returns:
        torch.Tensor with (sin(theta)cos(phi), sin(theta)sin(phi), cos(theta))
        concatenated along dim 2
    """
    sin_theta = torch.sin(theta)
    x_comp = sin_theta * torch.cos(phi)
    y_comp = sin_theta * torch.sin(phi)
    z_comp = torch.cos(theta)
    return torch.cat((x_comp, y_comp, z_comp), dim=2)
def factorial(x):
    """
    Return x! as a float.

    Computed iteratively so that large arguments do not hit Python's
    recursion limit; results beyond float range become ``inf``.

    Args:
        x: non-negative integer (an integral float such as 3.0 is accepted)
    Returns:
        float value of x!
    Raises:
        ValueError: if x is negative or not an integer value (the recursive
            formulation never terminated for such inputs).
    """
    if x < 0 or x != int(x):
        raise ValueError("factorial() requires a non-negative integer, got %r" % (x,))
    result = 1.0
    for k in range(1, int(x) + 1):
        result = k * result
    return result
def P(l, m, x):
    """
    Evaluate the associated Legendre polynomial P_l^m(x) by upward recurrence.

    Helper tensors are allocated on ``x.device`` so the function works on CPU
    or any GPU instead of requiring the default CUDA device.

    Args:
        l: int band index
        m: int order, expected to satisfy 0 <= m <= l
        x: torch.Tensor of arguments (typically cos(theta) in [-1, 1])
    Returns:
        torch.Tensor with the shape of x
    """
    # Closed form for P_m^m: (-1)^m (2m - 1)!! (1 - x^2)^(m / 2).
    pmm = 1.0
    if m > 0:
        somx2 = torch.sqrt((1.0 - x) * (1.0 + x))
        fact = 1.0
        for _ in range(m):
            pmm = pmm * ((-fact) * somx2)
            fact += 2.0

    if l == m:
        # pmm is still a Python float when m == 0; broadcast to x's shape.
        return pmm * torch.ones(x.shape, device=x.device)

    # P_{m+1}^m = x (2m + 1) P_m^m
    pmmp1 = x * (2.0 * m + 1.0) * pmm
    if l == m + 1:
        return pmmp1

    # Raise the band from m + 2 up to l with the three-term recurrence.
    pll = torch.zeros(x.shape, device=x.device).float()
    for ll in range(m + 2, l + 1):
        pll = ((2.0 * ll - 1.0) * x * pmmp1 - (ll + m - 1.0) * pmm) / (ll - m)
        pmm = pmmp1
        pmmp1 = pll

    return pll


def K(l, m):
    """
    Return the SH normalisation factor
    sqrt((2l + 1) (l - m)! / (4 pi (l + m)!)) as a Python float.
    """
    numerator = (2 * l + 1) * factorial(l - m)
    denominator = 4 * torch.pi * factorial(l + m)
    return math.sqrt(numerator / denominator)

def SH(l, m, theta, phi):
    """
    Evaluate the real spherical harmonic of band ``l`` and order ``m``.

    The broadcast helper for the m == 0 case is allocated on ``phi.device``
    rather than a hard-coded CUDA device.

    Args:
        l: int band index
        m: int order, -l <= m <= l
        theta: torch.Tensor polar angles
        phi: torch.Tensor azimuth angles, broadcastable with theta
    Returns:
        torch.Tensor of basis values
    """
    sqrt2 = pow(2.0, 0.5)
    cos_theta = torch.cos(theta)
    if m == 0:
        # No azimuthal dependence; the ones tensor only broadcasts to phi's shape.
        return K(l, m) * P(l, m, cos_theta) * torch.ones(phi.shape, device=phi.device).float()
    if m > 0:
        return sqrt2 * K(l, m) * torch.cos(m * phi) * P(l, m, cos_theta).float()
    return sqrt2 * K(l, -m) * torch.sin(-m * phi) * P(l, -m, cos_theta).float()
	
def shTerms(l):
    """Return the number of SH coefficients for bands 0..l, i.e. (l + 1)^2."""
    bands = l + 1
    return bands * bands

def shIndex(l, m):
    """Return the flat coefficient index of band l, order m: l*l + l + m."""
    band_centre = l * l + l
    return band_centre + m

def shEvaluate(theta, phi, lmax):
    """
    Evaluate every real SH basis function up to band ``lmax``.

    The output is allocated on ``theta.device`` rather than a hard-coded
    CUDA device.

    Args:
        theta: torch.Tensor of polar angles, 2-D (its first two dimensions
            size the output)
        phi: torch.Tensor of azimuth angles matching theta
        lmax: int highest band to evaluate
    Returns:
        torch.Tensor [theta.shape[0], theta.shape[1], (lmax + 1) ** 2]; the
        value for (l, m) is stored at index shIndex(l, m)
    """
    coeffsMatrix = torch.zeros(
        (theta.shape[0], theta.shape[1], shTerms(lmax)), device=theta.device)
    for l in range(lmax + 1):
        for m in range(-l, l + 1):
            coeffsMatrix[..., shIndex(l, m)] = SH(l, m, theta, phi).squeeze()
    return coeffsMatrix
def rotation_between_z(vec):
    """
    Build the rotation matrices that map the +z axis onto each vector.

    https://math.stackexchange.com/questions/180418/calculate-rotation-matrix-to-align-vector-a-to-vector-b-in-3d/476311#476311

    The result is allocated on ``vec.device`` rather than a hard-coded CUDA
    device.

    Args:
        vec: torch.Tensor [..., 3] (expected to be unit length)

    Returns:
        R: float32 torch.Tensor [..., 3, 3] with R @ (0, 0, 1) == vec. Where
        vec's z component + 1 is not positive (vec == -z) the formula is
        singular and -identity is returned instead.
    """
    device = vec.device
    # Rotation axis v = z x vec = (-vec_y, vec_x, 0).
    v1 = -vec[..., 1]
    v2 = vec[..., 0]
    v3 = torch.zeros_like(v1)
    v11 = v1 * v1
    v22 = v2 * v2
    v33 = v3 * v3
    v12 = v1 * v2
    v13 = v1 * v3
    v23 = v2 * v3
    # 1 + cos(angle); clamped so the division stays finite near vec == -z.
    cos_p_1 = (vec[..., 2] + 1).clamp_min(1e-7)
    R = torch.zeros(vec.shape[:-1] + (3, 3,), dtype=torch.float32, device=device)
    # R = I + [v]_x + [v]_x^2 / (1 + cos)
    R[..., 0, 0] = 1 + (-v33 - v22) / cos_p_1
    R[..., 0, 1] = -v3 + v12 / cos_p_1
    R[..., 0, 2] = v2 + v13 / cos_p_1
    R[..., 1, 0] = v3 + v12 / cos_p_1
    R[..., 1, 1] = 1 + (-v33 - v11) / cos_p_1
    R[..., 1, 2] = -v1 + v23 / cos_p_1
    R[..., 2, 0] = -v2 + v13 / cos_p_1
    R[..., 2, 1] = v1 + v23 / cos_p_1
    R[..., 2, 2] = 1 + (-v22 - v11) / cos_p_1
    fallback = -torch.eye(3, dtype=torch.float32, device=device).expand_as(R)
    R = torch.where((vec[..., 2] + 1 > 0)[..., None, None], R, fallback)
    return R
def fibonacci_sphere_sampling(normals, sample_num, random_rotate=True):
    """
    Generate Fibonacci-spiral directions on the hemisphere around each normal.

    Samples are first laid out around +z with z in (0, 1] and then rotated so
    that +z maps onto the corresponding normal. Tensors are created on
    ``normals.device`` rather than a hard-coded CUDA device, and the random
    spiral offset is drawn per flattened normal so that multi-dimensional
    batches (e.g. [H, W, 3]) work with ``random_rotate=True``.

    Args:
        normals: torch.Tensor [..., 3] (expected to be unit length)
        sample_num: int number of directions per normal
        random_rotate: if True, add an independent random azimuth offset to
            the spiral of every normal
    Returns:
        incident_dirs: [..., sample_num, 3] unit directions
        incident_areas: [..., sample_num, 1], every entry equal to 2 * pi
    """
    device = normals.device
    pre_shape = normals.shape[:-1]
    if len(pre_shape) > 1:
        normals = normals.reshape(-1, 3)
    delta = np.pi * (3.0 - np.sqrt(5.0))  # golden angle

    # fibonacci sample around the z axis (upper hemisphere only)
    idx = torch.arange(sample_num, dtype=torch.float, device=device)[None]
    z = 1 - 2 * idx / (2 * sample_num - 1)
    rad = torch.sqrt(1 - z ** 2)
    theta = delta * idx
    if random_rotate:
        # Shape the offset from the *flattened* batch so that it lines up
        # with the batch dimension of the rotation matrices below.
        offset = torch.rand(*normals.shape[:-1], 1, device=device) * 2 * np.pi
        theta = offset + theta
    y = torch.cos(theta) * rad
    x = torch.sin(theta) * rad
    z_samples = torch.stack([x, y, z.expand_as(y)], dim=-2)

    # rotate the z-aligned samples onto each normal
    rotation_matrix = rotation_between_z(normals)
    incident_dirs = rotation_matrix @ z_samples
    incident_dirs = F.normalize(incident_dirs, dim=-2).transpose(-1, -2)
    incident_areas = torch.ones_like(incident_dirs)[..., 0:1] * 2 * np.pi
    if len(pre_shape) > 1:
        incident_dirs = incident_dirs.reshape(*pre_shape, sample_num, 3)
        incident_areas = incident_areas.reshape(*pre_shape, sample_num, 1)
    return incident_dirs, incident_areas
def sample_incident_rays(normals, is_training=False, sample_num=24):
    """
    Sample incident directions around each normal.

    Thin wrapper over fibonacci_sphere_sampling: the spiral is randomly
    rotated when ``is_training`` is truthy and deterministic otherwise.

    Returns:
        (incident_dirs, incident_areas) exactly as produced by
        fibonacci_sphere_sampling, e.g. [N, S, 3] and [N, S, 1]
    """
    randomize = True if is_training else False
    return fibonacci_sphere_sampling(normals, sample_num, random_rotate=randomize)

def ProjectFunction(order, func, sample_side, N, visibility,V,albedo,metallic,roughness):
    """
    Monte-Carlo project visibility * cosine * BRDF onto the SH basis.

    Directions are drawn with fibonacci_sphere_sampling around each normal.
    Sampling, the cosine term, visibility and the SH basis are computed under
    ``torch.no_grad()``; the BRDF (rendering_equation) is evaluated outside of
    it, so gradients can only flow through the BRDF inputs.

    Changes relative to the earlier implementation: the stratified-sampling
    code whose results were immediately overwritten and the duplicated
    sampler call were removed, debug printing / CUDA-synchronised timing were
    dropped, and ``squeeze(-1)`` replaces bare ``squeeze()`` so a batch size
    or sample count of 1 no longer collapses the wrong dimension.

    Args:
        order: number of SH bands; degree ``order - 1`` is evaluated
        func: unused, kept for call compatibility
        sample_side: sample_side ** 2 directions are drawn per point
        N: normals, presumably [B, 3] -- TODO confirm with callers
        visibility: per-point visibility SH coefficients; transposed on dims
            1/2 and viewed as [B, 1, 1, K] (presumably [B, K, 1])
        V: outgoing (view) directions, presumably [B, 3]
        albedo, metallic, roughness: per-point material inputs [B, C],
            forwarded to rendering_equation after repeating over samples
    Returns:
        H: sample mean of visibility * cos * BRDF * SH basis, scaled by
            4 * pi; [B, 3, order ** 2] assuming rendering_equation returns a
            tensor broadcastable to [B, S, 3]
        out_occ: [B] mean clamped visibility over the samples
    """
    sample_count = sample_side ** 2
    with torch.no_grad():
        d, _ = fibonacci_sphere_sampling(N, sample_count, random_rotate=True)
        theta = torch.acos(d[..., 2])
        phi = torch.atan2(d[..., 1], d[..., 0])

        N = N.unsqueeze(1).repeat(1, sample_count, 1)
        # mean over 3 components * 3 == dot(d, N); negative lobes are zeroed.
        cos = torch.mul(d, N).mean(dim=2) * 3
        cos = torch.where(cos <= 0.0, torch.zeros_like(cos), cos)

        shs_visibility = visibility.transpose(1, 2).view(albedo.shape[0], 1, 1, -1)
        # Only drop the trailing channel axis; a bare squeeze() would also
        # drop a batch or sample axis of size 1.
        visibility = eval_sh(order - 1, shs_visibility, d).squeeze(-1)
        visibility = torch.clamp(visibility + 0.5, 0.0, 1.0)
        out_occ = visibility.mean(dim=1)

        coeffsMatrix = shEvaluate(theta, phi, order - 1)

        H = visibility * cos
        V = V.unsqueeze(1).repeat(1, sample_count, 1)

    # Evaluated outside no_grad on purpose (matches the previous behaviour).
    BRDF1 = rendering_equation(
        output_dirs=V,
        normals=N,
        base_color=albedo.unsqueeze(1).repeat(1, sample_count, 1),
        roughness=roughness.unsqueeze(1).repeat(1, sample_count, 1),
        metallic=metallic.unsqueeze(1).repeat(1, sample_count, 1),
        incident_dirs=d)

    H = H.unsqueeze(2).expand(-1, -1, 3) * BRDF1
    # [B, S, 3, 1] * [B, S, 1, K] -> [B, S, 3, K], averaged over the samples.
    H = (H.unsqueeze(3) * coeffsMatrix.unsqueeze(2)).mean(dim=1)
    weight = 4.0 * torch.pi
    H = H * weight
    return H, out_occ
def ProjectFunction2(order, func, sample_side, N, visibility,V,albedo,metallic,roughness,light_shs,indir_shs):
    """
    Monte-Carlo estimate of outgoing radiance from SH direct and indirect light.

    Directions are drawn with fibonacci_sphere_sampling around each normal.
    Sampling, the cosine term and visibility are computed under
    ``torch.no_grad()``; the BRDF (rendering_equation) and both lighting
    evaluations run outside of it, so gradients can flow through the
    material inputs and the lighting coefficients.

    Changes relative to the earlier implementation: the stratified-sampling
    code whose results were immediately overwritten, the duplicated sampler
    call and the unused shEvaluate call were removed, debug printing /
    CUDA-synchronised timing and unreachable code after the return were
    dropped, and ``squeeze(-1)`` replaces bare ``squeeze()`` so a batch size
    or sample count of 1 no longer collapses the wrong dimension.

    Args:
        order: number of SH bands; degree ``order - 1`` is evaluated
        func: unused, kept for call compatibility
        sample_side: sample_side ** 2 directions are drawn per point
        N: normals, presumably [B, 3] -- TODO confirm with callers
        visibility: per-point visibility SH coefficients; transposed on dims
            1/2 and viewed as [B, 1, 1, K] (presumably [B, K, 1])
        V: outgoing (view) directions, presumably [B, 3]
        albedo, metallic, roughness: per-point material inputs [B, C],
            forwarded to rendering_equation after repeating over samples
        light_shs: global light SH coefficients, transposed on dims 0/1
            (presumably [K, 3])
        indir_shs: per-point indirect-light SH coefficients, transposed on
            dims 1/2 (presumably [B, K, 3])
    Returns:
        H_full: sample mean of vis * cos * BRDF * (light + indirect), times 2 * pi
        out_occ: [B] mean clamped visibility over the samples
        H_dir: direct-light part, additionally divided by pi
        H_indir: indirect-light part (indirect radiance clamped at 0)
    """
    batch = N.shape[0]
    sample_count = sample_side ** 2
    with torch.no_grad():
        d, _ = fibonacci_sphere_sampling(N, sample_count, random_rotate=True)

        N = N.unsqueeze(1).repeat(1, sample_count, 1)
        # mean over 3 components * 3 == dot(d, N); negative lobes are zeroed.
        cos = torch.mul(d, N).mean(dim=2) * 3
        cos = torch.where(cos <= 0.0, torch.zeros_like(cos), cos)

        shs_visibility = visibility.transpose(1, 2).view(albedo.shape[0], 1, 1, -1)
        # Only drop the trailing channel axis; a bare squeeze() would also
        # drop a batch or sample axis of size 1.
        visibility = eval_sh(order - 1, shs_visibility, d).squeeze(-1)
        visibility = torch.clamp(visibility + 0.5, 0.0, 1.0)
        out_occ = visibility.mean(dim=1)

        H = visibility * cos
        V = V.unsqueeze(1).repeat(1, sample_count, 1)

    # Evaluated outside no_grad on purpose (matches the previous behaviour).
    BRDF1 = rendering_equation(
        output_dirs=V,
        normals=N,
        base_color=albedo.unsqueeze(1).repeat(1, sample_count, 1),
        roughness=roughness.unsqueeze(1).repeat(1, sample_count, 1),
        metallic=metallic.unsqueeze(1).repeat(1, sample_count, 1),
        incident_dirs=d)

    H = H.unsqueeze(2).expand(-1, -1, 3) * BRDF1

    # expand() yields the same [B, S, 3, K] shape as repeat() without copying.
    light_coeffs = light_shs.transpose(0, 1).unsqueeze(0).unsqueeze(1).expand(
        batch, sample_count, -1, -1)
    indir_coeffs = indir_shs.transpose(1, 2).unsqueeze(1).expand(
        -1, sample_count, -1, -1)
    light = eval_sh(order - 1, light_coeffs, d)
    indir_light = eval_sh(order - 1, indir_coeffs, d)
    indir_light = torch.clamp(indir_light, min=0.0)

    H_full = (H * (light + indir_light)).mean(dim=1)
    # NOTE(review): only the direct term is divided by pi, so H_full is not
    # H_dir + H_indir -- kept as-is, confirm this is intended.
    H_dir = (H * light).mean(dim=1) / torch.pi
    H_indir = (H * indir_light).mean(dim=1)
    H_full = H_full * 2.0 * torch.pi
    H_dir = H_dir * 2.0 * torch.pi
    H_indir = H_indir * 2.0 * torch.pi
    return H_full, out_occ, H_dir, H_indir
    #std::unique_ptr<std::vector<double>> coeffs(new std::vector<double>());
    #coeffs->assign(GetCoefficientCount(order), 0.0);

    # // generate sample_side^2 uniformly and stratified samples over the sphere
    # std::random_device rd;
    # std::mt19937 gen(rd());
    # std::uniform_real_distribution<> rng(0.0, 1.0);
    # for (int t = 0; t < sample_side; t++) {
    #     for (int p = 0; p < sample_side; p++) {
    #     double alpha = (t + rng(gen)) / sample_side;
    #     double beta = (p + rng(gen)) / sample_side;
    #     // See http://www.bogotobogo.com/Algorithms/uniform_distribution_sphere.php
    #     double phi = 2.0 * M_PI * beta;
    #     double theta = acos(2.0 * alpha - 1.0);

    #     // evaluate the analytic function for the current spherical coords
    #     double func_value = func(phi, theta);

    #     // evaluate the SH basis functions up to band O, scale them by the
    #     // function's value and accumulate them over all generated samples
    #     for (int l = 0; l <= order; l++) {
    #         for (int m = -l; m <= l; m++) {
    #         double sh = EvalSH(l, m, phi, theta);
    #         (*coeffs)[GetIndex(l, m)] += func_value * sh;
    #         }
    #     }
    #     }
    

#   // scale by the probability of a particular sample, which is
#   // 4pi/sample_side^2. 4pi for the surface area of a unit sphere, and
#   // 1/sample_side^2 for the number of samples drawn uniformly.
#   double weight = 4.0 * M_PI / (sample_side * sample_side);
#   for (unsigned int i = 0; i < coeffs->size(); i++) {
#      (*coeffs)[i] *= weight;
#   }

#   return coeffs;
# }
