function [U, S, V] = tsvd(X, opt, do_gather)
% TSVD  GPU-accelerated tensor SVD (t-SVD): X = U * S * V^*.
%
%   [U, S, V] = tsvd(X, opt, do_gather)
%
%   The tensor is transformed with an FFT along its third dimension, every
%   frontal slice is factored with a matrix SVD on the GPU, and the factors
%   are transformed back with an inverse FFT along the third dimension.
%
%   X         - n1 x n2 x n3 tensor, real or complex. It is passed through
%               gpuArray(), so a CPU array is moved to the GPU.
%   opt       - 'full' (default) | 'econ' | 'skinny'
%                 'full'   : U is n1 x n1 x n3, S is n1 x n2 x n3,
%                            V is n2 x n2 x n3.
%                 'econ'   : economy-size factors, with k = min(n1, n2):
%                            U is n1 x k x n3, S is k x k x n3,
%                            V is n2 x k x n3.
%                 'skinny' : 'econ' factors truncated to the estimated
%                            tubal rank.
%   do_gather - (optional, default true)
%                 true  : outputs are gathered back to CPU arrays
%                 false : outputs stay on the GPU as gpuArray
%
%   For real X, only about half of the Fourier slices are factored; the
%   remaining ones are filled in through conjugate symmetry, and the real
%   part of the final factors is returned. For complex X that symmetry does
%   not hold, so every Fourier slice is factored independently.
%
%   Requires Parallel Computing Toolbox and a usable CUDA GPU.

    if ~exist('opt', 'var') || isempty(opt)
        opt = 'full';
    end
    if ~exist('do_gather', 'var') || isempty(do_gather)
        do_gather = true;
    end

    % Validate the option before any GPU work so that a bad option is
    % reported without paying for a host-to-device copy and an FFT.
    is_skinny = strcmp(opt, 'skinny');
    is_econ   = is_skinny || strcmp(opt, 'econ');
    if ~is_econ && ~strcmp(opt, 'full')
        error('Unknown option: %s. Use ''full'', ''econ'' or ''skinny''.', opt);
    end

    % size() with three outputs folds trailing dimensions into n3, while the
    % FFT below only transforms dimension 3, so reject such input explicitly.
    if ndims(X) > 3
        error('tsvd:invalidInput', 'X must have at most 3 dimensions.');
    end

    [n1, n2, n3] = size(X);
    real_input = isreal(X);

    % Move to the GPU and transform along the third dimension.
    Xf = fft(gpuArray(X), [], 3);

    % Preallocate the Fourier-domain factors with the class/complexity of Xf.
    if is_econ
        k = min(n1, n2);
        Uf = zeros(n1, k, n3, 'like', Xf);
        Sf = zeros(k,  k, n3, 'like', Xf);
        Vf = zeros(n2, k, n3, 'like', Xf);
    else
        Uf = zeros(n1, n1, n3, 'like', Xf);
        Sf = zeros(n1, n2, n3, 'like', Xf);
        Vf = zeros(n2, n2, n3, 'like', Xf);
    end

    % For real input, slice n3+2-i is the complex conjugate of slice i, so
    % only slices 1 .. floor(n3/2)+1 need an SVD. For complex input there is
    % no such relation and all n3 slices are decomposed.
    if real_input
        last_slice = floor(n3 / 2) + 1;
    else
        last_slice = n3;
    end

    for i = 1:last_slice
        Xi = Xf(:, :, i);
        mirror = n3 + 2 - i;

        % Slice 1 (DC) and, for even n3, slice n3/2+1 (Nyquist) are their own
        % mirror image. For real input they are real-valued; dropping the
        % stored imaginary part keeps their SVD in real arithmetic so the
        % resulting factors are real as well.
        self_conjugate = real_input && (i == 1 || mirror == i);
        if self_conjugate
            Xi = real(Xi);
        end

        [Ui, Si, Vi] = tsvd_slice_svd(Xi, is_econ);
        Uf(:, :, i) = Ui;
        Sf(:, :, i) = Si;
        Vf(:, :, i) = Vi;

        if real_input && ~self_conjugate
            Uf(:, :, mirror) = conj(Ui);
            Sf(:, :, mirror) = Si;
            Vf(:, :, mirror) = conj(Vi);
        end
    end

    % ---- skinny: truncate to the estimated tubal rank ----
    if is_skinny
        % Average over the Fourier slices of each singular value. Sf is
        % stored with the complexity of Xf, so take the real part before
        % using it in max/eps/comparison.
        s1 = real(diag(sum(Sf, 3))) / n3;
        tol = max(n1, n2) * eps(max(s1));
        % Gather so that the index range below is built from a host scalar.
        trank = gather(sum(s1 > tol));

        Uf = Uf(:, 1:trank, :);
        Vf = Vf(:, 1:trank, :);
        Sf = Sf(1:trank, 1:trank, :);
    end

    % Inverse FFT back to the original domain (still on the GPU).
    Ug = ifft(Uf, [], 3);
    Sg = ifft(Sf, [], 3);
    Vg = ifft(Vf, [], 3);

    % For real input the factors are real up to round-off; drop the
    % numerically negligible imaginary part.
    if real_input
        Ug = real(Ug);
        Sg = real(Sg);
        Vg = real(Vg);
    end

    % Return CPU arrays or keep the results on the GPU.
    if do_gather
        U = gather(Ug);
        S = gather(Sg);
        V = gather(Vg);
    else
        U = Ug;
        S = Sg;
        V = Vg;
    end
end

function [Ui, Si, Vi] = tsvd_slice_svd(A, use_econ)
% TSVD_SLICE_SVD  Matrix SVD of one frontal slice, economy-size if requested.
    if use_econ
        [Ui, Si, Vi] = svd(A, 'econ');
    else
        [Ui, Si, Vi] = svd(A);
    end
end
