/**
 * @file kernels_cpu.h
 * @brief Soft True Damerau-Levenshtein CPU Kernel Declarations
 *
 * CPU implementations that mirror the CUDA interface for seamless dispatch.
 *
 * Damerau differs from OSA (Optimal String Alignment) in that transpositions
 * can span variable distances based on character positions, using precomputed
 * trans_src indices.
 */

#pragma once

namespace d2p {
namespace damerau {
namespace cpu {

// Stand-in for positive infinity in minimization.
// Note: 1e30f is a large *finite* float, not IEEE-754 infinity, so arithmetic
// on it yields ordinary finite results rather than inf/NaN propagation.
constexpr float PINF = 1e30f;

/**
 * @brief Forward pass for Soft Damerau-Levenshtein (CPU)
 *
 * Declaration only; the definition is not part of this header.
 *
 * NOTE(review): buffer shapes, strides and exact semantics are not visible
 * from this declaration. The descriptions below are inferred from parameter
 * names and const-qualification and should be confirmed against the
 * implementation.
 *
 * @param sub_costs      Read-only float buffer; presumably substitution costs.
 * @param trans_src      Read-only int buffer of precomputed transposition
 *                       source indices (see file header).
 * @param alpha          Writable float buffer; presumably receives the forward
 *                       table — TODO confirm.
 * @param damerau_score  Writable float buffer; presumably receives the final
 *                       score(s) — TODO confirm.
 * @param lengths        Read-only int buffer; presumably per-item sequence
 *                       lengths — TODO confirm layout.
 * @param ins_cost       Scalar cost; presumably for insertion.
 * @param del_cost       Scalar cost; presumably for deletion.
 * @param trans_cost     Scalar cost; presumably for transposition.
 * @param B              Presumably the batch size — TODO confirm.
 * @param max_L1         Presumably the maximum length of the first sequence.
 * @param max_L2         Presumably the maximum length of the second sequence.
 * @param T              Scalar; presumably the softening temperature —
 *                       TODO confirm.
 */
void damerau_forward_cpu(
    const float* sub_costs,
    const int* trans_src,
    float* alpha,
    float* damerau_score,
    const int* lengths,
    float ins_cost, float del_cost, float trans_cost,
    int B, int max_L1, int max_L2,
    float T
);

/**
 * @brief Backward pass for Soft Damerau-Levenshtein (CPU)
 *
 * Declaration only; the definition is not part of this header.
 *
 * NOTE(review): buffer shapes, strides and exact semantics are not visible
 * from this declaration. The descriptions below are inferred from parameter
 * names and const-qualification and should be confirmed against the
 * implementation.
 *
 * @param alpha          Read-only float buffer; presumably the forward table
 *                       written by damerau_forward_cpu — TODO confirm.
 * @param sub_costs      Read-only float buffer; presumably substitution costs.
 * @param trans_src      Read-only int buffer of precomputed transposition
 *                       source indices (see file header).
 * @param damerau_score  Read-only float buffer; presumably the score(s) from
 *                       the forward pass — TODO confirm.
 * @param beta           Writable float buffer; presumably the backward table.
 * @param posteriors     Writable float buffer; presumably per-edit posterior
 *                       values — TODO confirm layout.
 * @param grad_T         Writable float buffer; presumably the gradient with
 *                       respect to T.
 * @param grad_ins       Writable float buffer; presumably the gradient with
 *                       respect to ins_cost.
 * @param grad_del       Writable float buffer; presumably the gradient with
 *                       respect to del_cost.
 * @param grad_trans     Writable float buffer; presumably the gradient with
 *                       respect to trans_cost.
 * @param lengths        Read-only int buffer; presumably per-item sequence
 *                       lengths — TODO confirm layout.
 * @param ins_cost       Scalar cost; presumably for insertion.
 * @param del_cost       Scalar cost; presumably for deletion.
 * @param trans_cost     Scalar cost; presumably for transposition.
 * @param B              Presumably the batch size — TODO confirm.
 * @param max_L1         Presumably the maximum length of the first sequence.
 * @param max_L2         Presumably the maximum length of the second sequence.
 * @param T              Scalar; presumably the softening temperature —
 *                       TODO confirm.
 */
void damerau_backward_cpu(
    const float* alpha,
    const float* sub_costs,
    const int* trans_src,
    const float* damerau_score,
    float* beta,
    float* posteriors,
    float* grad_T,
    float* grad_ins,
    float* grad_del,
    float* grad_trans,
    const int* lengths,
    float ins_cost, float del_cost, float trans_cost,
    int B, int max_L1, int max_L2,
    float T
);

/**
 * @brief Hessian-vector product for Soft Damerau-Levenshtein (CPU)
 *
 * Declaration only; the definition is not part of this header.
 *
 * NOTE(review): buffer shapes, strides and exact semantics are not visible
 * from this declaration. The descriptions below are inferred from parameter
 * names and const-qualification and should be confirmed against the
 * implementation.
 *
 * @param alpha          Read-only float buffer; presumably the forward table
 *                       written by damerau_forward_cpu — TODO confirm.
 * @param sub_costs      Read-only float buffer; presumably substitution costs.
 * @param trans_src      Read-only int buffer of precomputed transposition
 *                       source indices (see file header).
 * @param damerau_score  Read-only float buffer; presumably the score(s) from
 *                       the forward pass — TODO confirm.
 * @param V              Read-only float buffer; presumably the direction
 *                       vector the Hessian is applied to — TODO confirm.
 * @param d_alpha        Writable float buffer; presumably the directional
 *                       derivative of alpha.
 * @param d_score        Writable float buffer; presumably the directional
 *                       derivative of the score.
 * @param beta           Writable float buffer; presumably the backward table.
 * @param d_beta         Writable float buffer; presumably the directional
 *                       derivative of beta.
 * @param H_scores       Writable float buffer; presumably receives the
 *                       Hessian-vector product result — TODO confirm layout.
 * @param lengths        Read-only int buffer; presumably per-item sequence
 *                       lengths — TODO confirm layout.
 * @param ins_cost       Scalar cost; presumably for insertion.
 * @param del_cost       Scalar cost; presumably for deletion.
 * @param trans_cost     Scalar cost; presumably for transposition.
 * @param B              Presumably the batch size — TODO confirm.
 * @param max_L1         Presumably the maximum length of the first sequence.
 * @param max_L2         Presumably the maximum length of the second sequence.
 * @param T              Scalar; presumably the softening temperature —
 *                       TODO confirm.
 */
void damerau_hvp_cpu(
    const float* alpha,
    const float* sub_costs,
    const int* trans_src,
    const float* damerau_score,
    const float* V,
    float* d_alpha,
    float* d_score,
    float* beta,
    float* d_beta,
    float* H_scores,
    const int* lengths,
    float ins_cost, float del_cost, float trans_cost,
    int B, int max_L1, int max_L2,
    float T
);

}  // namespace cpu
}  // namespace damerau
}  // namespace d2p
