% NOTE(review): the citation key (2023, "time") and the author pair point to
% the 2023 "Time Travel in LLMs" preprint; the previous title, arXiv id and
% year did not agree with the key. Title, id and year were changed to that
% paper. Confirm against the arXiv abstract page before merging.
% The key is unchanged so existing citations keep resolving.
@article{golchin2023time,
  author  = {Golchin, Shahriar and Surdeanu, Mihai},
  title   = {Time travel in {LLMs}: Tracing data contamination in large language models},
  journal = {arXiv preprint arXiv:2308.08493},
  year    = {2023},
}

% NOTE(review): author list replaced with the one believed to belong to
% arXiv:2311.04850; confirm against the arXiv abstract page.
% The key no longer mirrors the first author or year but is kept so existing
% citations keep resolving.
@article{shi2024rethinking,
  author  = {Yang, Shuo and Chiang, Wei-Lin and Zheng, Lianmin and Gonzalez, Joseph E and Stoica, Ion},
  title   = {Rethinking benchmark and contamination for language models with rephrased samples},
  journal = {arXiv preprint arXiv:2311.04850},
  year    = {2023},
}

% NOTE(review): author list replaced with the one believed to belong to
% arXiv:2404.18824; confirm against the arXiv abstract page.
% The key is kept unchanged so existing citations keep resolving.
@article{ren2024benchmarking,
  author  = {Xu, Ruijie and Wang, Zengzhi and Fan, Run-Ze and Liu, Pengfei},
  title   = {Benchmarking benchmark leakage in large language models},
  journal = {arXiv preprint arXiv:2404.18824},
  year    = {2024},
}

% NOTE(review): author list replaced with the one believed to belong to
% arXiv:2502.06215; confirm against the arXiv abstract page.
% The benchmark name and the acronym are braced so sentence-casing styles do
% not lowercase them. The key is kept unchanged for existing citations.
@article{li2025lessleak,
  author  = {Zhou, Xin and Weyssow, Martin and Widyasari, Ratnadira and Zhang, Ting and He, Junda and Lyu, Yunbo and Chang, Jianming and Zhang, Beiqi and Huang, Dan and Lo, David},
  title   = {{LessLeak-Bench}: A first investigation of data leakage in {LLMs} across 83 software engineering benchmarks},
  journal = {arXiv preprint arXiv:2502.06215},
  year    = {2025},
}

% The arXiv identifier 2508.13180 encodes August 2025, so the year is 2025.
% NOTE(review): author list replaced with the one believed to belong to this
% arXiv id; confirm against the arXiv abstract page.
% The key is kept unchanged so existing citations keep resolving.
@article{nguyen2024search,
  author  = {Han, Ziwen and Mankikar, Meher and Michael, Julian and Wang, Zifan},
  title   = {Search-time data contamination},
  journal = {arXiv preprint arXiv:2508.13180},
  year    = {2025},
}

% NOTE(review): authors and the word "Quantifying" in the title were changed
% to what is believed to match arXiv:2502.00678; confirm against the arXiv
% abstract page. "Quantifying" is braced because BibTeX styles only keep the
% capital after a colon, not after a question mark.
% The key is kept unchanged so existing citations keep resolving.
@article{golchin2024contaminated,
  author  = {Choi, Hyeong Kyu and Khanov, Maxim and Wei, Hongxin and Li, Yixuan},
  title   = {How contaminated is your benchmark? {Quantifying} dataset leakage in large language models with kernel divergence},
  journal = {arXiv preprint arXiv:2502.00678},
  year    = {2025},
}

% Conference paper (proceedings volume 35, pages 24824--24837).
% NOTE(review): the exported list skipped one author (Ichter) and ended in
% "and others"; the list below is believed complete. Confirm against the
% proceedings page.
@inproceedings{wei2022chain,
  author    = {Wei, Jason and Wang, Xuezhi and Schuurmans, Dale and Bosma, Maarten and Ichter, Brian and Xia, Fei and Chi, Ed and Le, Quoc V and Zhou, Denny},
  title     = {Chain-of-thought prompting elicits reasoning in large language models},
  booktitle = {Advances in neural information processing systems},
  volume    = {35},
  pages     = {24824--24837},
  year      = {2022},
}

% Preprint entry; the arXiv identifier is carried in the journal field,
% following the convention used throughout this file.
@article{yao2023tree,
  author  = {Yao, Shunyu and Yu, Dian and Zhao, Jeffrey and Shafran, Izhak and Griffiths, Thomas L and Cao, Yuan and Narasimhan, Karthik},
  title   = {Tree of thoughts: Deliberate problem solving with large language models},
  journal = {arXiv preprint arXiv:2305.10601},
  year    = {2023},
}

% Preprint entry. The method name is braced so sentence-casing bibliography
% styles keep "ReAct" instead of rendering "React".
@article{yao2022react,
  author  = {Yao, Shunyu and Zhao, Jeffrey and Yu, Dian and Du, Nan and Shafran, Izhak and Narasimhan, Karthik and Cao, Yuan},
  title   = {{ReAct}: Synergizing reasoning and acting in language models},
  journal = {arXiv preprint arXiv:2210.03629},
  year    = {2022},
}

% NOTE(review): author list replaced with the one believed to belong to
% arXiv:2306.02224; confirm against the arXiv abstract page.
% "Auto-GPT" is braced so sentence-casing styles do not render "Auto-gpt".
% The key is kept unchanged so existing citations keep resolving.
@article{zhang2023autogpt,
  author  = {Yang, Hui and Yue, Sifu and He, Yunzhong},
  title   = {{Auto-GPT} for online decision making: Benchmarks and additional opinions},
  journal = {arXiv preprint arXiv:2306.02224},
  year    = {2023},
}

% Software repository reference. The organisation name is double-braced so
% it is treated as a single corporate author, and "Auto-GPT" is braced so
% sentence-casing styles do not render "Auto-gpt".
@misc{autogpt2023benchmark,
  author       = {{Significant Gravitas}},
  title        = {{Auto-GPT} Benchmarks: A repo built for the purpose of benchmarking the performance of agents},
  howpublished = {\url{https://github.com/Significant-Gravitas/Auto-GPT-Benchmarks}},
  year         = {2023},
}

% NOTE(review): co-author list replaced with the one believed to belong to
% arXiv:2310.06800; confirm against the arXiv abstract page.
@article{chandrasekaran2023test,
  author  = {Chandrasekaran, Jaganmohan and Cody, Tyler and McCarthy, Nicola and Lanus, Erin and Freeman, Laura},
  title   = {Test \& evaluation best practices for machine learning-enabled systems},
  journal = {arXiv preprint arXiv:2310.06800},
  year    = {2023},
}

% Journal article. The acronym is braced so sentence-casing styles do not
% render "Pmlb". Journal names are not recased by standard styles, so the
% journal is stored with its proper capitalisation.
@article{olson2017pmlb,
  author    = {Olson, Randal S and La Cava, William and Orzechowski, Patryk and Urbanowicz, Ryan J and Moore, Jason H},
  title     = {{PMLB}: a large benchmark suite for machine learning evaluation and comparison},
  journal   = {BioData Mining},
  volume    = {10},
  number    = {1},
  pages     = {1--13},
  year      = {2017},
  publisher = {BioMed Central},
}

% NOTE(review): author list replaced with the one believed to belong to the
% NeuroImage article at volume 257, article 119298; confirm against the
% publisher page. The key no longer mirrors the first author or year but is
% kept so existing citations keep resolving.
@article{bouthillier2021recommendations,
  author    = {Leenings, Ramona and Winter, Nils R and Dannlowski, Udo and Hahn, Tim},
  title     = {Recommendations for machine learning benchmarks in neuroimaging},
  journal   = {NeuroImage},
  volume    = {257},
  pages     = {119298},
  year      = {2022},
  publisher = {Elsevier},
}

% Web page reference; the corporate author is double-braced so it is treated
% as one indivisible name.
% NOTE(review): title, author and year could not be checked from this file;
% verify them against the linked page.
@misc{mlops2024crisp,
  author       = {{ML-Ops Community}},
  title        = {Machine Learning Operations},
  howpublished = {\url{https://ml-ops.org/content/crisp-ml}},
  year         = {2024},
}

% Online textbook chapter reference. "ML" is braced so sentence-casing styles
% do not render "Ml".
% NOTE(review): title, author and year could not be checked from this file;
% verify them against the linked page.
@misc{mlsys2024benchmarking,
  author       = {{ML Systems Community}},
  title        = {{ML} Systems Textbook},
  howpublished = {\url{https://www.mlsysbook.ai/contents/core/benchmarking/benchmarking}},
  year         = {2024},
}

% Conference paper: typed as inproceedings with the proceedings title in
% booktitle, matching how the other entry for this venue in the file
% (wei2022chain) is recorded. The booktitle spelling is kept identical to
% that entry so the venue has a single spelling across the file.
@inproceedings{brown2020language,
  author    = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and others},
  title     = {Language models are few-shot learners},
  booktitle = {Advances in neural information processing systems},
  volume    = {33},
  pages     = {1877--1901},
  year      = {2020},
}

% Preprint entry. The model name is stored with its published capitalisation
% and braced so bibliography styles cannot recase it.
@article{touvron2023llama,
  author  = {Touvron, Hugo and Lavril, Thibaut and Izacard, Gautier and Martinet, Xavier and Lachaux, Marie-Anne and Lacroix, Timoth{\'e}e and Rozi{\`e}re, Baptiste and Goyal, Naman and Hambro, Eric and Azhar, Faisal and others},
  title   = {{LLaMA}: Open and efficient foundation language models},
  journal = {arXiv preprint arXiv:2302.13971},
  year    = {2023},
}

% Proceedings paper: typed as inproceedings with the full proceedings title
% in booktitle rather than an abbreviated venue in the journal field.
% NOTE(review): the fifth author's surname was corrected and the remaining
% authors added from what is believed to be the published author list;
% confirm against the ACL Anthology record.
@inproceedings{matton2024leakage,
  author    = {Matton, Alexandre and Sherborne, Tom and Aumiller, Dennis and Tommasone, Elena and Alizadeh, Milad and He, Jingyi and Ma, Raymond and Voisin, Maxime and Gilsenan-McMahon, Ellen and Gall{\'e}, Matthias},
  title     = {On leakage of code generation evaluation datasets},
  booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2024},
  year      = {2024},
}

% Conference paper. Accented names use braced LaTeX escapes, as the file
% already does for other entries, so they sort and render correctly.
% NOTE(review): the trailing "and others" was replaced by the two remaining
% authors believed to complete the list; confirm against the USENIX page.
@inproceedings{carlini2021extracting,
  author    = {Carlini, Nicholas and Tram{\`e}r, Florian and Wallace, Eric and Jagielski, Matthew and Herbert-Voss, Ariel and Lee, Katherine and Roberts, Adam and Brown, Tom and Song, Dawn and Erlingsson, {\'U}lfar and Oprea, Alina and Raffel, Colin},
  title     = {Extracting training data from large language models},
  booktitle = {30th USENIX Security Symposium (USENIX Security 21)},
  pages     = {2633--2650},
  year      = {2021},
}

% NOTE(review): co-author list replaced with the one believed to belong to
% arXiv:2311.09783; confirm against the arXiv abstract page.
@article{deng2023investigating,
  author  = {Deng, Chunyuan and Zhao, Yilun and Tang, Xiangru and Gerstein, Mark and Cohan, Arman},
  title   = {Investigating data contamination in modern benchmarks for large language models},
  journal = {arXiv preprint arXiv:2311.09783},
  year    = {2023},
}