% Golchin and Surdeanu: arXiv preprint on benchmarking contamination in LLMs.
% NOTE(review): the citation key says 2023 while the year field and the arXiv
% identifier (2406.*) both say 2024 -- confirm title, identifier and year
% against the arXiv record before relying on this entry.
@article{golchin2023time,
  author  = {Golchin, Shahriar and Surdeanu, Mihai},
  title   = {Time to rethink benchmarks: Benchmarking for contamination in large language models},
  journal = {arXiv preprint arXiv:2406.04244},
  year    = {2024},
}

% arXiv preprint on benchmark contamination detected via rephrased samples.
% NOTE(review): the author list now follows the arXiv record for 2311.04850;
% the citation key is intentionally unchanged so existing \cite commands
% keep resolving. Confirm against arXiv before submission.
@article{shi2024rethinking,
  author  = {Yang, Shuo and Chiang, Wei-Lin and Zheng, Lianmin and Gonzalez, Joseph E. and Stoica, Ion},
  title   = {Rethinking benchmark and contamination for language models with rephrased samples},
  journal = {arXiv preprint arXiv:2311.04850},
  year    = {2023},
}

% arXiv preprint on measuring benchmark leakage in LLMs.
% NOTE(review): the author list now follows the arXiv record for 2404.18824;
% the citation key is intentionally unchanged so existing \cite commands
% keep resolving. Confirm against arXiv before submission.
@article{ren2024benchmarking,
  author  = {Xu, Ruijie and Wang, Zengzhi and Fan, Run-Ze and Liu, Pengfei},
  title   = {Benchmarking benchmark leakage in large language models},
  journal = {arXiv preprint arXiv:2404.18824},
  year    = {2024},
}

% arXiv preprint surveying data leakage across software-engineering benchmarks.
% The benchmark name and the acronym are brace-protected so sentence-casing
% bibliography styles do not lowercase them.
% NOTE(review): the author list now follows the arXiv record for 2502.06215;
% the citation key is intentionally unchanged. Confirm against arXiv.
@article{li2025lessleak,
  author  = {Zhou, Xin and Weyssow, Martin and Widyasari, Ratnadira and Zhang, Ting and He, Junda and Lyu, Yunbo and Chang, Jianming and Zhang, Beiqi and Huang, Dan and Lo, David},
  title   = {{LessLeak-Bench}: A first investigation of data leakage in {LLMs} across 83 software engineering benchmarks},
  journal = {arXiv preprint arXiv:2502.06215},
  year    = {2025},
}

% arXiv preprint on contamination introduced at search/retrieval time.
% The year is 2025: the arXiv identifier prefix 2508 encodes August 2025.
% NOTE(review): the author list now follows the arXiv record for 2508.13180;
% the citation key is intentionally unchanged so existing \cite commands
% keep resolving. Confirm against arXiv before submission.
@article{nguyen2024search,
  author  = {Han, Ziwen and Mankikar, Meher and Michael, Julian and Wang, Zifan},
  title   = {Search-time data contamination},
  journal = {arXiv preprint arXiv:2508.13180},
  year    = {2025},
}

% arXiv preprint proposing a kernel-divergence score for dataset leakage.
% NOTE(review): authors and the word "Quantifying" in the title now follow
% the arXiv record for 2502.00678; the citation key is intentionally
% unchanged so existing \cite commands keep resolving. Confirm against arXiv.
@article{golchin2024contaminated,
  author  = {Choi, Hyeong Kyu and Khanov, Maxim and Wei, Hongxin and Li, Yixuan},
  title   = {How contaminated is your benchmark? Quantifying dataset leakage in large language models with kernel divergence},
  journal = {arXiv preprint arXiv:2502.00678},
  year    = {2025},
}

% Chain-of-thought prompting paper; conference proceedings entry (volume 35).
@inproceedings{wei2022chain,
  author    = {Wei, Jason and Wang, Xuezhi and Schuurmans, Dale and Bosma, Maarten and Xia, Fei and Chi, Ed and Le, Quoc V and Zhou, Denny and others},
  title     = {Chain-of-thought prompting elicits reasoning in large language models},
  booktitle = {Advances in neural information processing systems},
  volume    = {35},
  pages     = {24824--24837},
  year      = {2022},
}

% Tree-of-thoughts paper, cited here by its arXiv preprint.
@article{yao2023tree,
  author  = {Yao, Shunyu and Yu, Dian and Zhao, Jeffrey and Shafran, Izhak and Griffiths, Thomas L and Cao, Yuan and Narasimhan, Karthik},
  title   = {Tree of thoughts: Deliberate problem solving with large language models},
  journal = {arXiv preprint arXiv:2305.10601},
  year    = {2023},
}

% LLMs-as-optimizers paper, cited here by its arXiv preprint.
@article{yang2023large,
  author  = {Yang, Chengrun and Wang, Xuezhi and Lu, Yifeng and Liu, Hanxiao and Le, Quoc V and Zhou, Denny and Chen, Xinyun},
  title   = {Large language models as optimizers},
  journal = {arXiv preprint arXiv:2309.03409},
  year    = {2023},
}

% Few-shot learners (GPT-3) paper; conference proceedings entry (volume 33).
% Typed as inproceedings with a booktitle, matching how the other paper from
% this proceedings series is entered in this file, instead of article/journal.
@inproceedings{brown2020language,
  author    = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and others},
  title     = {Language models are few-shot learners},
  booktitle = {Advances in neural information processing systems},
  volume    = {33},
  pages     = {1877--1901},
  year      = {2020},
}

% Llama foundation-model paper, cited here by its arXiv preprint.
% Accented author names use brace-wrapped LaTeX escapes.
@article{touvron2023llama,
  author  = {Touvron, Hugo and Lavril, Thibaut and Izacard, Gautier and Martinet, Xavier and Lachaux, Marie-Anne and Lacroix, Timoth{\'e}e and Rozi{\`e}re, Baptiste and Goyal, Naman and Hambro, Eric and Azhar, Faisal and others},
  title   = {Llama: Open and efficient foundation language models},
  journal = {arXiv preprint arXiv:2302.13971},
  year    = {2023},
}

% Paper on leakage of code-generation evaluation datasets (Findings of EMNLP 2024).
% Entered as inproceedings with the proceedings title in booktitle, because
% the venue is a conference proceedings volume rather than a journal.
% NOTE(review): fifth author surname corrected and the remaining authors
% added per the ACL Anthology record; page numbers still to be filled in.
@inproceedings{matton2024leakage,
  author    = {Matton, Alexandre and Sherborne, Tom and Aumiller, Dennis and Tommasone, Elena and Alizadeh, Milad and He, Jingyi and Ma, Raymond and Voisin, Maxime and Gilsenan-McMahon, Ellen and Gall{\'e}, Matthias},
  title     = {On leakage of code generation evaluation datasets},
  booktitle = {Findings of the Association for Computational Linguistics: {EMNLP} 2024},
  year      = {2024},
}

% Training-data extraction attack paper; USENIX Security 2021 proceedings entry.
% Accented author names use brace-wrapped LaTeX escapes (the same form the
% Touvron entry in this file uses) so they render and sort correctly.
@inproceedings{carlini2021extracting,
  author    = {Carlini, Nicholas and Tram{\`e}r, Florian and Wallace, Eric and Jagielski, Matthew and Herbert-Voss, Ariel and Lee, Katherine and Roberts, Adam and Brown, Tom and Song, Dawn and Erlingsson, {\'U}lfar and others},
  title     = {Extracting training data from large language models},
  booktitle = {30th USENIX Security Symposium (USENIX Security 21)},
  pages     = {2633--2650},
  year      = {2021},
}

% arXiv preprint investigating data contamination in modern LLM benchmarks.
% NOTE(review): the author list now follows the arXiv record for 2311.09783
% (five authors, so no trailing "and others"). Confirm against arXiv.
@article{deng2023investigating,
  author  = {Deng, Chunyuan and Zhao, Yilun and Tang, Xiangru and Gerstein, Mark and Cohan, Arman},
  title   = {Investigating data contamination in modern benchmarks for large language models},
  journal = {arXiv preprint arXiv:2311.09783},
  year    = {2023},
}