function C = tprod(A, B, do_gather)
% TPROD  Tensor-tensor product (t-product) of two 3-way tensors: C = A*B
%
%   C = tprod(A, B)
%   C = tprod(A, B, do_gather)
%
% Inputs
%   A         - n1*n2*n3 tensor
%   B         - n2*l*n3  tensor
%   do_gather - (optional, default true) only relevant when the GPU path
%               is taken: true  -> the result is gathered back to host
%                                  memory,
%                         false -> the result is returned as a gpuArray.
%               When no usable GPU is found the computation runs on the
%               CPU and C is an ordinary array regardless of this flag.
%
% Output
%   C         - n1*l*n3  tensor
%
% Method
%   Both tensors are transformed with an FFT along the third dimension,
%   the frontal slices are multiplied pairwise in the Fourier domain, and
%   the result is transformed back with an inverse FFT along the third
%   dimension. If both inputs are real, the (numerically tiny) imaginary
%   part of the result is discarded.
%
% Errors
%   'Inner tensor dimensions must agree.' if size(A,2) ~= size(B,1) or
%   size(A,3) ~= size(B,3).
%
% version 2.0 - 09/10/2017
%
% Written by Canyi Lu (canyilu@gmail.com)
%
%
% References:
% Canyi Lu, Tensor-Tensor Product Toolbox. Carnegie Mellon University.
% June, 2018. https://github.com/canyilu/tproduct.
%
% Canyi Lu, Jiashi Feng, Yudong Chen, Wei Liu, Zhouchen Lin and Shuicheng
% Yan, Tensor Robust Principal Component Analysis with A New Tensor Nuclear
% Norm, arXiv preprint arXiv:1804.03728, 2018
%

% Default: return the result in host memory.
if nargin < 3 || isempty(do_gather)
    do_gather = true;
end

% Dimension check (cheap, done before any data is moved or transformed).
[n1, n2, n3] = size(A);
[m1, m2, m3] = size(B);
if n2 ~= m1 || n3 ~= m3
    error('Inner tensor dimensions must agree.');
end

% Remember realness of the caller's data before anything is transformed.
inputs_real = isreal(A) && isreal(B);

% Use the GPU when the data already lives there, or when a device is
% present. gpuDeviceCount is unavailable without the Parallel Computing
% Toolbox, hence the try/catch.
use_gpu = isa(A, 'gpuArray') || isa(B, 'gpuArray');
if ~use_gpu
    try
        use_gpu = gpuDeviceCount() > 0;
    catch
        use_gpu = false;
    end
end

if use_gpu
    % Move to the GPU (no-op for data that is already a gpuArray).
    Ag = gpuArray(A);
    Bg = gpuArray(B);

    % FFT along the 3rd dimension, on the GPU.
    Af = fft(Ag, [], 3);     % size: n1 x n2 x n3
    Bf = fft(Bg, [], 3);     % size: n2 x m2 x n3

    % Page-wise matrix product: for p = 1..n3,
    % Cf(:,:,p) = Af(:,:,p) * Bf(:,:,p).
    Cf = pagefun(@mtimes, Af, Bf);   % size: n1 x m2 x n3

    % Inverse FFT back to the original domain.
    Cg = ifft(Cf, [], 3);

    % Real inputs give a real result in exact arithmetic; drop round-off.
    if inputs_real
        Cg = real(Cg);
    end

    % Optionally bring the result back to host memory.
    if do_gather
        C = gather(Cg);
    else
        C = Cg;    % keep on the GPU
    end
else
    % CPU fallback: same algorithm with an explicit loop over the frontal
    % slices. Every slice is multiplied (no conjugate-symmetry shortcut),
    % so complex inputs are handled correctly as well.
    Af = fft(A, [], 3);
    Bf = fft(B, [], 3);

    % Match the class mtimes would produce (single if either is single).
    if isa(Af, 'single') || isa(Bf, 'single')
        out_class = 'single';
    else
        out_class = 'double';
    end
    Cf = zeros(n1, m2, n3, out_class);
    for p = 1 : n3
        Cf(:, :, p) = Af(:, :, p) * Bf(:, :, p);
    end

    C = ifft(Cf, [], 3);
    if inputs_real
        C = real(C);
    end
end
