% NOTE(review): the key says 2023, but the year field and the arXiv ID (2406.*)
% both say 2024. Key left unchanged so existing citations keep resolving.
@article{golchin2023time,
  author  = {Golchin, Shahriar and Surdeanu, Mihai},
  title   = {Time to rethink benchmarks: Benchmarking for contamination in large language models},
  journal = {arXiv preprint arXiv:2406.04244},
  year    = {2024},
}

@article{shi2024rethinking,
  author  = {Shi, Mengzhou and Fried, Daniel and Gao, Tianyu and Levy, Omer},
  title   = {Rethinking benchmark and contamination for language models with rephrased samples},
  journal = {arXiv preprint arXiv:2311.04850},
  year    = {2023},
}

@article{ren2024benchmarking,
  author  = {Ren, Ruixiang and Deng, Yizhe and Sakaguchi, Keisuke and Zhang, Yifan and Zhang, Meng and Choi, Yejin and Graham, Yuntian},
  title   = {Benchmarking benchmark leakage in large language models},
  journal = {arXiv preprint arXiv:2404.18824},
  year    = {2024},
}

% The benchmark name and the acronym are brace-protected so sentence-casing
% styles do not render them as "Lessleak-bench" / "llms".
@article{li2025lessleak,
  author  = {Li, Jialun and Han, Jingwen and Zhou, Zhengran and Wu, Jiarong and Liu, Shuai and Wang, Tao and Zhou, Minlie and Zhang, Dongmei},
  title   = {{LessLeak-Bench}: A first investigation of data leakage in {LLMs} across 83 software engineering benchmarks},
  journal = {arXiv preprint arXiv:2502.06215},
  year    = {2025},
}

% Year corrected to 2025: arXiv identifiers are YYMM.NNNNN, so 2508.13180 was
% submitted in August 2025 and cannot be a 2024 preprint.
% The key keeps its 2024 suffix so existing citations keep resolving.
% NOTE(review): confirm the author list against the arXiv abstract page.
@article{nguyen2024search,
  author  = {Nguyen, Duc Anh and Nguyen, Duc Minh and Nguyen, Duc Duy and Pham, Duc Huy},
  title   = {Search-time data contamination},
  journal = {arXiv preprint arXiv:2508.13180},
  year    = {2025},
}

% NOTE(review): the key says 2024, but the year field and the arXiv ID (2502.*)
% both say 2025. Key left unchanged so existing citations keep resolving.
@article{golchin2024contaminated,
  author  = {Golchin, Shahriar and Chen, Weiwei and Surdeanu, Mihai},
  title   = {How contaminated is your benchmark? Measuring dataset leakage in large language models with kernel divergence},
  journal = {arXiv preprint arXiv:2502.00678},
  year    = {2025},
}

@inproceedings{wei2022chain,
  author    = {Wei, Jason and Wang, Xuezhi and Schuurmans, Dale and Bosma, Maarten and Xia, Fei and Chi, Ed and Le, Quoc V and Zhou, Denny and others},
  title     = {Chain-of-thought prompting elicits reasoning in large language models},
  booktitle = {Advances in neural information processing systems},
  volume    = {35},
  pages     = {24824--24837},
  year      = {2022},
}

@article{yao2023tree,
  author  = {Yao, Shunyu and Yu, Dian and Zhao, Jeffrey and Shafran, Izhak and Griffiths, Thomas L and Cao, Yuan and Narasimhan, Karthik},
  title   = {Tree of thoughts: Deliberate problem solving with large language models},
  journal = {arXiv preprint arXiv:2305.10601},
  year    = {2023},
}

% "ReAct" is brace-protected so sentence-casing styles do not print "React".
@article{yao2022react,
  author  = {Yao, Shunyu and Zhao, Jeffrey and Yu, Dian and Du, Nan and Shafran, Izhak and Narasimhan, Karthik and Cao, Yuan},
  title   = {{ReAct}: Synergizing reasoning and acting in language models},
  journal = {arXiv preprint arXiv:2210.03629},
  year    = {2022},
}

% "Auto-GPT" is brace-protected so sentence-casing styles do not print "Auto-gpt".
@article{zhang2023autogpt,
  author  = {Zhang, Ziniu and Wang, Shuaiwen and Fang, Bowei and Zhou, Yongkang and Zhang, Xiangyu and others},
  title   = {{Auto-GPT} for online decision making: Benchmarks and additional opinions},
  journal = {arXiv preprint arXiv:2306.02224},
  year    = {2023},
}

% The project name is brace-protected so sentence-casing styles do not print
% "Auto-gpt benchmarks".
@misc{autogpt2023benchmark,
  author       = {{Significant Gravitas}},
  title        = {{Auto-GPT} {Benchmarks}: A repo built for the purpose of benchmarking the performance of agents},
  howpublished = {\url{https://github.com/Significant-Gravitas/Auto-GPT-Benchmarks}},
  year         = {2023},
}

@article{chandrasekaran2023test,
  author  = {Chandrasekaran, Jaganmohan and Feng, Yang and Memon, Atif and Sinha, Saurabh and others},
  title   = {Test \& evaluation best practices for machine learning-enabled systems},
  journal = {arXiv preprint arXiv:2310.06800},
  year    = {2023},
}

% "PMLB" is brace-protected against sentence-casing ("Pmlb"); the journal name
% is stored with its proper capitalisation because styles do not recase it.
@article{olson2017pmlb,
  author    = {Olson, Randal S and La Cava, William and Orzechowski, Patryk and Urbanowicz, Ryan J and Moore, Jason H},
  title     = {{PMLB}: a large benchmark suite for machine learning evaluation and comparison},
  journal   = {BioData Mining},
  volume    = {10},
  number    = {1},
  pages     = {1--13},
  year      = {2017},
  publisher = {BioMed Central},
}

% NOTE(review): the key says 2021 while the year field says 2022; key kept so
% existing citations keep resolving. Please verify author and year against the
% NeuroImage record for article 119298.
@article{bouthillier2021recommendations,
  author    = {Bouthillier, Xavier and others},
  title     = {Recommendations for machine learning benchmarks in neuroimaging},
  journal   = {NeuroImage},
  volume    = {257},
  pages     = {119298},
  year      = {2022},
  publisher = {Elsevier},
}

@misc{mlops2024crisp,
  author       = {{ML-Ops Community}},
  title        = {Machine Learning Operations},
  howpublished = {\url{https://ml-ops.org/content/crisp-ml}},
  year         = {2024},
}

% "ML" is brace-protected so sentence-casing styles do not print "Ml".
@misc{mlsys2024benchmarking,
  author       = {{ML Systems Community}},
  title        = {{ML} Systems Textbook},
  howpublished = {\url{https://www.mlsysbook.ai/contents/core/benchmarking/benchmarking}},
  year         = {2024},
}

% NeurIPS is a conference proceedings series, so this is an inproceedings entry
% with the venue in booktitle rather than an article with the venue in journal.
@inproceedings{brown2020language,
  author    = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and others},
  title     = {Language models are few-shot learners},
  booktitle = {Advances in neural information processing systems},
  volume    = {33},
  pages     = {1877--1901},
  year      = {2020},
}

% The model name is written "LLaMA" in the paper title; brace-protected so
% sentence-casing styles keep that capitalisation.
@article{touvron2023llama,
  author  = {Touvron, Hugo and Lavril, Thibaut and Izacard, Gautier and Martinet, Xavier and Lachaux, Marie-Anne and Lacroix, Timoth{\'e}e and Rozi{\`e}re, Baptiste and Goyal, Naman and Hambro, Eric and Azhar, Faisal and others},
  title   = {{LLaMA}: Open and efficient foundation language models},
  journal = {arXiv preprint arXiv:2302.13971},
  year    = {2023},
}

% Findings of EMNLP is a conference proceedings volume, so this is an
% inproceedings entry with the full proceedings title in booktitle.
@inproceedings{matton2024leakage,
  author    = {Matton, Alexandre and Sherborne, Tom and Aumiller, Dennis and Tommasone, Elena and Aloui, Milad},
  title     = {On leakage of code generation evaluation datasets},
  booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2024},
  year      = {2024},
}

% Diacritics restored on two author names, written as BibTeX special characters
% so sorting and label generation stay correct under classic BibTeX.
@inproceedings{carlini2021extracting,
  author    = {Carlini, Nicholas and Tram{\`e}r, Florian and Wallace, Eric and Jagielski, Matthew and Herbert-Voss, Ariel and Lee, Katherine and Roberts, Adam and Brown, Tom and Song, Dawn and Erlingsson, {\'U}lfar and others},
  title     = {Extracting training data from large language models},
  booktitle = {30th USENIX Security Symposium (USENIX Security 21)},
  pages     = {2633--2650},
  year      = {2021},
}

% Web page, not a journal article: as an article entry it lacked the required
% journal field and its howpublished URL was ignored by standard styles.
% "LLMs" is brace-protected against sentence-casing.
@misc{openai2024learning,
  author       = {{OpenAI}},
  title        = {Learning to reason with {LLMs}},
  howpublished = {\url{https://openai.com/index/learning-to-reason-with-llms/}},
  year         = {2024},
}

% Year corrected to 2025 to agree with the article identifier in the URL below
% (s41557-025-...). The key keeps its 2024 suffix so existing citations resolve.
% NOTE(review): replace the corporate author with the article's real author
% list, and add volume/pages, from the publisher record.
@article{chembench2024nature,
  author  = {{ChemBench Consortium}},
  title   = {A framework for evaluating the chemical knowledge and reasoning abilities of large language models against the expertise of chemists},
  journal = {Nature Chemistry},
  year    = {2025},
  note    = {Available at: \url{https://www.nature.com/articles/s41557-025-01815-x}},
}

% The benchmark name and "LLM" are brace-protected so sentence-casing styles
% do not print "Chemsafetybench" / "llm".
@article{chemsafetybench2024,
  author  = {Wang, Haochen and Zhang, Xiangyu and Li, Yiran and Chen, Tong and others},
  title   = {{ChemSafetyBench}: Benchmarking {LLM} safety on chemistry domain},
  journal = {arXiv preprint arXiv:2411.16736},
  year    = {2024},
}

% Blog post, not a journal article: as an article entry it lacked the required
% journal field and its howpublished URL was ignored by standard styles.
% "LLMs" is brace-protected against sentence-casing.
@misc{google2024curie,
  author       = {{Google Research}},
  title        = {Evaluating progress of {LLMs} on scientific problem-solving},
  howpublished = {\url{https://research.google/blog/evaluating-progress-of-llms-on-scientific-problem-solving/}},
  year         = {2024},
}

% The benchmark name is brace-protected so sentence-casing styles do not print
% "Sciknoweval".
% NOTE(review): OpenReview is a hosting platform, not a journal; confirm the
% actual venue (or arXiv identifier) and update the journal field.
@article{sciknoweval2024,
  author  = {Luo, Haowei and Zhang, Fanqi and Chen, Bo and Liu, Xiangang and others},
  title   = {{SciKnowEval}: Evaluating multi-level scientific knowledge of large language models},
  journal = {OpenReview},
  year    = {2024},
}

% Accented letters are written as LaTeX escapes, matching the rest of this file
% and staying safe under 8-bit classic BibTeX. "NeurIPS" is brace-protected
% against sentence-casing.
@article{pineau2021improving,
  author    = {Pineau, Joelle and Vincent-Lamarre, Philippe and Sinha, Koustuv and Larivi{\`e}re, Vincent and Beygelzimer, Alina and d'Alch{\'e}-Buc, Florence and Fox, Emily and Larochelle, Hugo},
  title     = {Improving reproducibility in machine learning research (a report from the {NeurIPS} 2019 reproducibility program)},
  journal   = {The Journal of Machine Learning Research},
  volume    = {22},
  number    = {1},
  pages     = {7459--7478},
  year      = {2021},
  publisher = {JMLR.org},
}

@article{desai2025reproducibility,
  author    = {Desai, Shrey and others},
  title     = {What is reproducibility in artificial intelligence and machine learning research?},
  journal   = {AI Magazine},
  year      = {2025},
  publisher = {Wiley Online Library},
}

% "NLP" is brace-protected so sentence-casing styles do not print "nlp".
@article{roegiest2023reproducibility,
  author  = {Roegiest, Adam and Kumar, Anshuman and others},
  title   = {Reproducibility in {NLP}: What have we learned from the checklist?},
  journal = {arXiv preprint arXiv:2306.09562},
  year    = {2023},
}

@article{semmelrock2025reproducibility,
  author    = {Semmelrock, Jonas and Weinmann, Sarah},
  title     = {Reproducibility in machine-learning-based research: Overview, barriers, and drivers},
  journal   = {AI Magazine},
  year      = {2025},
  publisher = {Wiley Online Library},
}

% "ML" is brace-protected so sentence-casing styles do not print "ml-based".
@misc{kapoor2024reproducibility,
  author       = {Kapoor, Sayash and Narayanan, Arvind},
  title        = {Leakage and the reproducibility crisis in {ML}-based science},
  howpublished = {\url{https://reproducible.cs.princeton.edu/}},
  year         = {2024},
}

% Conference paper: stored as inproceedings with the venue in booktitle rather
% than as an article with the conference name in journal. The venue is given in
% its proper capitalisation because styles do not recase booktitle.
@inproceedings{kohavi1995study,
  author    = {Kohavi, Ron},
  title     = {A study of cross-validation and bootstrap for accuracy estimation and model selection},
  booktitle = {International Joint Conference on Artificial Intelligence},
  volume    = {14},
  pages     = {1137--1145},
  year      = {1995},
  publisher = {Lawrence Erlbaum Associates Ltd},
}

@article{domingos2012few,
  author    = {Domingos, Pedro},
  title     = {A few useful things to know about machine learning},
  journal   = {Communications of the ACM},
  volume    = {55},
  number    = {10},
  pages     = {78--87},
  year      = {2012},
  publisher = {ACM New York, NY, USA},
}

@book{hastie2009elements,
  author    = {Hastie, Trevor and Tibshirani, Robert and Friedman, Jerome},
  title     = {The elements of statistical learning: data mining, inference, and prediction},
  publisher = {Springer Science \& Business Media},
  year      = {2009},
}

% "REFORMS" is brace-protected so sentence-casing styles do not print "Reforms".
% NOTE(review): PMC (PubMed Central) is an archive, not a journal; look up the
% publishing journal, volume and author list from the PMC record linked below
% and update the journal and author fields.
@article{roberts2024reforms,
  author  = {Roberts, Kirk and others},
  title   = {{REFORMS}: Consensus-based recommendations for machine-learning-based science},
  journal = {PMC},
  year    = {2024},
  note    = {Available at: \url{https://pmc.ncbi.nlm.nih.gov/articles/PMC11092361/}},
}