{
    "paper_1": 
        {
            "links": "https://arxiv.org/pdf/2404.01318",
            "paper_title": "JailbreakBench: An Open Robustness Benchmark for Jailbreaking Large Language Models",
            "paper_abstract": ""
        }, 
    "paper_2": 
        {
            "links": "https://arxiv.org/pdf/2402.04249",
            "paper_title": "HarmBench: A Standardized Evaluation Framework for Automated Red Teaming and Robust Refusal",
            "paper_abstract": ""
        },
    "paper_3":
        {
            "links": "https://arxiv.org/pdf/2312.00027",
            "paper_title": "Stealthy and Persistent Unalignment on Large Language Models via Backdoor Injections",
            "paper_abstract": ""
        },
    "paper_4":
        {
            "links": "https://arxiv.org/pdf/2410.09024",
            "paper_title": "AgentHarm: A Benchmark for Measuring Harmfulness of LLM Agents",
            "paper_abstract": ""
        },
    "paper_5":
        {
            "links": "https://openreview.net/pdf?id=cw5mgd71jW",
            "paper_title": "Many-shot Jailbreaking",
            "paper_abstract": ""
        },
    "paper_6":
        {
            "links": "https://arxiv.org/pdf/2312.02119",
            "paper_title": "Tree of Attacks: Jailbreaking Black-Box LLMs Automatically",
            "paper_abstract": ""
        },
    "paper_7":
        {
            "links": "https://arxiv.org/pdf/2404.02151",
            "paper_title": "JAILBREAKING LEADING SAFETY-ALIGNED LLMS WITH SIMPLE ADAPTIVE ATTACKS",
            "paper_abstract": ""
        },
    "paper_8":
        {
            "links": "https://arxiv.org/pdf/2502.04322",
            "paper_title": "SPEAK EASY: Eliciting Harmful Jailbreaks from LLMs with Simple Interactions",
            "paper_abstract": ""
        },
    "paper_9":
        {
            "links": "https://arxiv.org/pdf/2406.09321",
            "paper_title": "JailbreakEval: An Integrated Toolkit for Evaluating Jailbreak Attempts Against Large Language Models",
            "paper_abstract": ""
        },
    "paper_10":
        {
            "links": "https://huggingface.co/meta-llama/Llama-Guard-3-8B",
            "paper_title": "meta-llama/Llama-Guard-4-12B",
            "paper_abstract": ""
        },
    "paper_11":
        {
            "links": "https://arxiv.org/pdf/2502.14744",
            "paper_title": "HiddenDetect: Detecting Jailbreak Attacks against Large Vision-Language Models via Monitoring Hidden States",
            "paper_abstract": ""
        },
    "paper_12":
        {
            "links": "https://arxiv.org/pdf/2406.10248",
            "paper_title": "On the Worst Prompt Performance of Large Language Models",
            "paper_abstract": ""
        },
    "paper_13":
        {
            "links": "https://arxiv.org/pdf/2303.08896",
            "paper_title": "SELFCHECKGPT: Zero-Resource Black-Box Hallucination Detection for Generative Large Language Models",
            "paper_abstract": ""
        },
    "paper_14":
        {
            "links": "https://arxiv.org/pdf/2405.20003",
            "paper_title": "Kernel Language Entropy: Fine-grained Uncertainty Quantification for LLMs from Semantic Similarities",
            "paper_abstract": ""
        },
    "paper_15":
        {
            "links": "https://arxiv.org/pdf/2503.00172",
            "paper_title": "A Survey of Uncertainty Estimation Methods on Large Language Models",
            "paper_abstract": ""
        },
    "paper_16":
        {
            "links": "https://arxiv.org/pdf/2507.06306",
            "paper_title": "Humans overrely on overconfident language models, across languages",
            "paper_abstract": ""
        },
    "paper_17":
        {
            "links": "https://www.nature.com/articles/s41586-024-07421-0",
            "paper_title": "Detecting hallucinations in large language models using semantic entropy",
            "paper_abstract": ""
        },
    "paper_18":
        {
            "links": "https://arxiv.org/pdf/2508.08992v1",
            "paper_title": "Prospect Theory Fails for LLMs: Revealing Instability of Decision-Making under Epistemic Uncertainty",
            "paper_abstract": ""
        },
    "paper_19":
        {
            "links": "https://arxiv.org/pdf/2406.13261v3",
            "paper_title": "BeHonest: Benchmarking Honesty in Large Language Models",
            "paper_abstract": ""
        },
    "paper_20":
        {
            "links": "https://arxiv.org/pdf/2412.18171v1",
            "paper_title": "Token Highlighter: Inspecting and Mitigating Jailbreak Prompts for Large Language Models",
            "paper_abstract": ""
        },
    "paper_21":
        {
            "links": "https://arxiv.org/pdf/2502.15435v1",
            "paper_title": "Single-pass Detection of Jailbreaking Input in Large Language Models",
            "paper_abstract": ""
        },
    "paper_22":
        {
            "links": "https://arxiv.org/pdf/2410.20774",
            "paper_title": "Are LLM-Judges Robust to Expressions of Uncertainty? Investigating the effect of Epistemic Markers on LLM-based Evaluation",
            "paper_abstract": ""
        },
    "paper_23":
        {
            "links": "https://arxiv.org/pdf/2505.24778",
            "paper_title": "Revisiting Epistemic Markers in Confidence Estimation: Can Markers Accurately Reflect Large Language Models' Uncertainty?",
            "paper_abstract": ""
        },
    "paper_24":
        {
            "links": "https://arxiv.org/pdf/2412.14737",
            "paper_title": "On Verbalized Confidence Scores for LLMs",
            "paper_abstract": ""
        },
    "paper_25":
        {
            "links": "https://arxiv.org/pdf/2410.06707",
            "paper_title": "Calibrating Verbalized Probabilities for Large Language Models",
            "paper_abstract": ""
        },
    "paper_26":
        {
            "links": "https://arxiv.org/pdf/2411.06528v2",
            "paper_title": "Epistemic Integrity in Large Language Models",
            "paper_abstract": ""
        },
    "paper_27":
        {
            "links": "https://arxiv.org/pdf/2403.00867",
            "paper_title": "Gradient Cuff: Detecting Jailbreak Attacks on Large Language Models by Exploring Refusal Loss Landscapes",
            "paper_abstract": ""
        },
    "paper_28":
        {
            "links": "https://arxiv.org/pdf/2402.13494",
            "paper_title": "GradSafe: Detecting Jailbreak Prompts for LLMs via Safety-Critical Gradient Analysis",
            "paper_abstract": ""
        },
    "paper_29":
        {
            "links": "https://arxiv.org/pdf/2506.00085",
            "paper_title": "COSMIC: Generalized Refusal Direction Identification in LLM Activations",
            "paper_abstract": ""
        },
    "paper_30":
        {
            "links": "https://arxiv.org/pdf/2406.18495",
            "paper_title": "WILDGUARD: Open One-stop Moderation Tools for Safety Risks, Jailbreaks, and Refusals of LLMs",
            "paper_abstract": ""
        },
    "paper_31":
        {
            "links": "https://arxiv.org/pdf/2402.16192v2",
            "paper_title": "Defending Large Language Models against Jailbreak Attacks via Semantic Smoothing",
            "paper_abstract": ""
        },
    "paper_32": 
        {
            "links": "https://arxiv.org/pdf/2502.11084v2",
            "paper_title": "",
            "paper_abstract": ""
        },
    "paper_33": 
        {
            "links": "https://arxiv.org/abs/2412.14093",
            "paper_title": "",
            "paper_abstract": ""
        }, 
    "paper_34": 
        {
            "links": "https://arxiv.org/html/2505.03563v2",
            "paper_title": "",
            "paper_abstract": ""
        },
    "paper_35": 
        {
            "links": "https://arxiv.org/pdf/2406.15927",
            "paper_title": "",
            "paper_abstract": ""
        },
    "paper_36": 
        {
            "links": "https://arxiv.org/abs/2406.14598",
            "paper_title": "",
            "paper_abstract": ""
        },
    "paper_37": 
        {
            "links": "https://arxiv.org/pdf/2406.19314",
            "paper_title": "",
            "paper_abstract": ""
        },
    "paper_38": 
        {
            "links": "https://arxiv.org/pdf/2410.22685v1",
            "paper_title": "",
            "paper_abstract": ""
        },
    "paper_39": 
        {
            "links": "https://arxiv.org/pdf/2405.20947v5",
            "paper_title": "",
            "paper_abstract": ""
        },
    "paper_40": 
        {
            "links": "https://arxiv.org/pdf/2505.23856v1",
            "paper_title": "",
            "paper_abstract": ""
        }
}