{
    "uuid": "f7b532c1-3fd7-5a2b-87b4-522592ff6dbe",
    "question": "When training with non-English image-text pairs, what is the loss function of the TriKD?",
    "answer_format": "",
    "tags": [
        "formula",
        "single",
        "subjective"
    ],
    "conference": [],
    "reasoning_steps": [
        "First, get the content of section 3.2 and summarize the answer."
    ],
    "evaluator": {
        "eval_func": "eval_reference_answer_with_llm",
        "eval_kwargs": {
            "reference_answer": "When training with non-English image-text pairs, only ITC loss is applied, as the CLIP text encoder does not support non-English languages. Therefore, the loss function for the TriKD is $\\mathcal{L}_{\\text{TriKD}} = \\mathcal{L}_{\\text{ITC}}$. And the Image-Text Contrastive(ITC) loss is formulated as the average of image-to-text($\\mathcal{L}_{\\text{i2x}}$) loss and text-to-image($\\mathcal{L}_{\\text{x2i}})$) loss: $\\mathcal{L}_{\\text{ITC}} = 1/2(\\mathcal{L}_{\\text{i2x}} + \\mathcal{L}_{\\text{x2i}}) = 1/2[\\ell(h^I, h^X) + \\ell(h^X, h^I)]$. Here, $h^I$ represents the $\\ell_2$-normalized output from the CLIP image encoder and CLIP-projector, and $h^X$ represents the $\\ell_2$-normalized output from the Multilingual Text Encoder (MTE) and X-projector for a given image-text pair.",
            "question": "When training with non-English image-text pairs, what is the loss function of the TriKD?"
        }
    },
    "state": {
        "gpt-4o-2024-05-13": false,
        "gui-gpt-4o-2024-05-13": true
    },
    "annotator": "human",
    "anchor_pdf": [
        "ff651d37-e725-5752-9c38-3361bc54723d"
    ],
    "reference_pdf": []
}