{"reasoning_agent": 0.2, "tool_agent": 0.16666666666666666}{"code_generator": 0.18285714285714286, "test_generator": 0.07428571428571429}{"code_generator": 0.3028571428571429, "test_generator": 0.18285714285714286}{"reasoning_agent": 0.26666666666666666, "tool_agent": 0.3333333333333333}{"reasoning_agent": 0.26666666666666666, "tool_agent": 0.26666666666666666}{"code_generator": 0.2857142857142857, "test_generator": 0.13142857142857142}{"code_generator": 0.2, "test_generator": 0.04}{"reasoning_agent": 0.16666666666666666, "tool_agent": 0.1}{"reasoning_agent": 0.5, "tool_agent": 0.4666666666666667}{"code_generator": 0.4342857142857143, "test_generator": 0.2914285714285714}{"code_generator": 0.44, "test_generator": 0.29714285714285715}{"code_generator": 0.3878787878787879, "test_generator": 0.14545454545454545}{"code_generator": 0.2786885245901639, "test_generator": 0.040983606557377046}{"code_generator": 0.03821656050955414, "test_generator": 0.0}{"code_generator": 0.4434389140271493, "test_generator": 0.05429864253393665}{"code_generator": 0.027149321266968326, "test_generator": 0.00904977375565611}{"code_generator": 0.43891402714932126, "test_generator": 0.058823529411764705}{"code_generator": 0.38461538461538464, "test_generator": 0.04524886877828054}{"code_generator": 0.39819004524886875, "test_generator": 0.09049773755656108}{"code_generator": 0.19004524886877827, "test_generator": 0.00904977375565611}{"reasoning_agent": 0.472, "tool_agent": 0.49}{"reasoning_agent": 0.1, "tool_agent": 0.06666666666666667}{"code_generator": 0.392, "test_generator": 0.206}{"code_generator": 0.218, "test_generator": 0.034}{"code_generator": 0.18552036199095023, "test_generator": 0.03167420814479638}{"code_generator": 0.09954751131221719, "test_generator": 0.00904977375565611}{"code_generator": 0.058823529411764705, "test_generator": 0.013574660633484163}{"code_generator": 0.09049773755656108, "test_generator": 0.004524886877828055}{"code_generator": 0.0, "test_generator": 0.0}{"code_generator": 0.0, "test_generator": 0.0}{"code_generator": 0.0, "test_generator": 0.0}{"code_generator": 0.0, "test_generator": 0.0}{"code_generator": 0.0, "test_generator": 0.0}{"code_generator": 0.0, "test_generator": 0.0}{"code_generator": 0.0, "test_generator": 0.0}{"code_generator": 0.0, "test_generator": 0.0}{"code_generator": 0.0, "test_generator": 0.0}{"code_generator": 0.0, "test_generator": 0.0}{"code_generator": 0.0, "test_generator": 0.0}{"code_generator": 0.00904977375565611, "test_generator": 0.004524886877828055}{"code_generator": 0.38461538461538464, "test_generator": 0.11764705882352941}{"code_generator": 0.36199095022624433, "test_generator": 0.10407239819004525}{"code_generator": 0.3438914027149321, "test_generator": 0.08597285067873303}{"code_generator": 0.29411764705882354, "test_generator": 0.03167420814479638}{"code_generator": 0.29411764705882354, "test_generator": 0.058823529411764705}{"code_generator": 0.29411764705882354, "test_generator": 0.05429864253393665}{"reasoning_agent": 0.432, "tool_agent": 0.484}{"code_generator": 0.334841628959276, "test_generator": 0.05429864253393665}{"code_generator": 0.334841628959276, "test_generator": 0.058823529411764705}{"reasoning_agent": 0.1, "tool_agent": 0.06666666666666667}{"reasoning_agent": 0.03333333333333333, "tool_agent": 0.03333333333333333}{"code_generator": 0.3257918552036199, "test_generator": 0.04072398190045249}{"code_generator": 0.33031674208144796, "test_generator": 0.07692307692307693}{"code_generator": 0.29411764705882354, "test_generator": 0.013574660633484163}{"code_generator": 0.2895927601809955, "test_generator": 0.013574660633484163}{"reasoning_agent": 0.418, "tool_agent": 0.39}{"reasoning_agent": 0.522, "tool_agent": 0.596}{"code_generator": 0.49321266968325794, "test_generator": 0.11764705882352941}{"reasoning_agent": 0.0, "tool_agent": 0.6}{"reasoning_agent": 0.01, "tool_agent": 0.6}{"reasoning_agent": 0.534, "tool_agent": 0.598}{"reasoning_agent": 0.586, "tool_agent": 0.504}{"reasoning_agent": 0.56, "tool_agent": 0.502}{"reasoning_agent": 0.56, "tool_agent": 0.502}{"reasoning_agent": 0.642, "tool_agent": 0.566}{"reasoning_agent": 0.936, "tool_agent": 0.954}{"reasoning_agent": 0.818, "tool_agent": 0.818}{"reasoning_agent": 0.82, "tool_agent": 0.826}{"reasoning_agent": 0.8248673237300985, "tool_agent": 0.8225928733889311}{"reasoning_agent": 0.819560272934041, "tool_agent": 0.7429871114480667}{"reasoning_agent": 0.6338134950720242, "tool_agent": 0.015163002274450341}{"reasoning_agent": 0.818, "tool_agent": 0.816}{"reasoning_agent": 0.814, "tool_agent": 0.754}{"reasoning_agent": 0.816, "tool_agent": 0.758}{"reasoning_agent": 0.13333333333333333, "tool_agent": 0.03333333333333333}{"reasoning_agent": 0.13333333333333333, "tool_agent": 0.13333333333333333}{"reasoning_agent": 0.13333333333333333, "tool_agent": 0.16666666666666666}{"reasoning_agent": 0.0, "tool_agent": 0.0}{"reasoning_agent": 0.13333333333333333, "tool_agent": 0.16666666666666666}{"reasoning_agent": 0.1, "tool_agent": 0.1}{"reasoning_agent": 0.1, "tool_agent": 0.16666666666666666}{"reasoning_agent": 0.13333333333333333, "tool_agent": 0.0}{"reasoning_agent": 0.13333333333333333, "tool_agent": 0.0}{"reasoning_agent": 0.1, "tool_agent": 0.16666666666666666}{"reasoning_agent": 0.16666666666666666, "tool_agent": 0.26666666666666666}{"reasoning_agent": 0.16666666666666666, "tool_agent": 0.13333333333333333}{"reasoning_agent": 0.16666666666666666, "tool_agent": 0.23333333333333334}{"reasoning_agent": 0.16666666666666666, "tool_agent": 0.06666666666666667}{"code_generator": 0.024242424242424242, "test_generator": 0.006060606060606061}{"code_generator": 0.03636363636363636, "test_generator": 0.024242424242424242}{"code_generator": 0.144, "test_generator": 0.062}{"code_generator": 0.082, "test_generator": 0.002}{"code_generator": 0.05142857142857143, "test_generator": 0.0}{"code_generator": 0.18285714285714286, "test_generator": 0.07428571428571429}{"reasoning_agent": 0.2, "tool_agent": 0.2}{"reasoning_agent": 0.2, "tool_agent": 0.2}{"reasoning_agent": 0.2, "tool_agent": 0.26666666666666666}{"reasoning_agent": 0.4666666666666667, "tool_agent": 0.43333333333333335}{"reasoning_agent": 0.1, "tool_agent": 0.1}{"reasoning_agent": 0.2, "tool_agent": 0.26666666666666666}{"reasoning_agent": 0.0, "tool_agent": 0.0}{"reasoning_agent": 0.3, "tool_agent": 0.16666666666666666}{"reasoning_agent": 0.0, "tool_agent": 0.0}{"code_generator": 0.142, "test_generator": 0.06}{"plan_agent": 0.0, "tool_call_agent": 0.0}{"plan_agent": 0.0, "tool_call_agent": 0.0}{"plan_agent": 0.0, "tool_call_agent": 0.0}{"plan_agent": 0.0, "tool_call_agent": 0.0}{"plan_agent": 0.0, "tool_call_agent": 0.0}{"plan_agent": 0.0, "tool_call_agent": 0.0}{"plan_agent": 0.0, "tool_call_agent": 0.0}{"plan_agent": 0.0, "tool_call_agent": 0.0}{"plan_agent": 0.0, "tool_call_agent": 0.0}{"plan_agent": 0.0, "tool_call_agent": 0.0}{"reasoning_agent": 0.5192878338278932, "tool_agent": 0.4658753709198813}{"reasoning_agent": 0.48813056379821956, "tool_agent": 0.37537091988130566}{"reasoning_agent": 0.0, "tool_agent": 0.0}{"reasoning_agent": 0.2, "tool_agent": 0.26666666666666666}{"reasoning_agent": 0.2, "tool_agent": 0.3}{"reasoning_agent": 0.13333333333333333, "tool_agent": 0.2}{"timestamp":"2025-09-12 15:31:46","task":"math","benchmark":"AIME24","reasoning":true,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":1}
{"timestamp":"2025-09-12 15:31:56","task":"math","benchmark":"AIME24","reasoning":true,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":3}
{"timestamp":"2025-09-12 15:32:08","task":"math","benchmark":"AIME24","reasoning":true,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":5}
{"timestamp":"2025-09-12 15:32:19","task":"math","benchmark":"AIME24","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":1}
{"timestamp":"2025-09-12 15:32:29","task":"math","benchmark":"AIME24","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":3}
{"timestamp":"2025-09-12 15:32:41","task":"math","benchmark":"AIME24","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":5}
{"timestamp":"2025-09-12 15:32:51","task":"math","benchmark":"AIME25","reasoning":true,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":1}
{"timestamp":"2025-09-12 15:33:01","task":"math","benchmark":"AIME25","reasoning":true,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":3}
{"timestamp":"2025-09-12 15:33:10","task":"math","benchmark":"AIME25","reasoning":true,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":5}
{"timestamp":"2025-09-12 15:33:23","task":"math","benchmark":"AIME25","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":1}
{"timestamp":"2025-09-12 15:33:35","task":"math","benchmark":"AIME25","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":3}
{"timestamp":"2025-09-12 15:33:45","task":"math","benchmark":"AIME25","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":5}
{"timestamp":"2025-09-12 16:55:58","task":"math","benchmark":"AIME24","reasoning":true,"model":"/home/lah003/models/Qwen3-8B","max_turns":1}
{"timestamp":"2025-09-12 16:56:11","task":"math","benchmark":"AIME24","reasoning":true,"model":"/home/lah003/models/Qwen3-8B","max_turns":3}
{"timestamp":"2025-09-12 16:56:23","task":"math","benchmark":"AIME24","reasoning":true,"model":"/home/lah003/models/Qwen3-8B","max_turns":5}
{"timestamp":"2025-09-12 16:56:32","task":"math","benchmark":"AIME24","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":1}
{"timestamp":"2025-09-12 16:56:44","task":"math","benchmark":"AIME24","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":3}
{"timestamp":"2025-09-12 16:56:57","task":"math","benchmark":"AIME24","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":5}
{"timestamp":"2025-09-12 16:57:09","task":"math","benchmark":"AIME25","reasoning":true,"model":"/home/lah003/models/Qwen3-8B","max_turns":1}
{"timestamp":"2025-09-12 16:57:18","task":"math","benchmark":"AIME25","reasoning":true,"model":"/home/lah003/models/Qwen3-8B","max_turns":3}
{"timestamp":"2025-09-12 16:57:28","task":"math","benchmark":"AIME25","reasoning":true,"model":"/home/lah003/models/Qwen3-8B","max_turns":5}
{"timestamp":"2025-09-12 16:57:40","task":"math","benchmark":"AIME25","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":1}
{"timestamp":"2025-09-12 17:04:48","task":"math","benchmark":"AIME24","reasoning":true,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":1}
{"reasoning_agent": 0.03333333333333333, "tool_agent": 0.06666666666666667}{"timestamp":"2025-09-12 17:06:09","task":"math","benchmark":"AIME24","reasoning":true,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":3}
{"timestamp":"2025-09-12 17:07:59","task":"math","benchmark":"AIME24","reasoning":true,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":5}
{"timestamp":"2025-09-12 17:08:27","task":"math","benchmark":"AIME24","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":1}
{"timestamp":"2025-09-12 17:08:45","task":"math","benchmark":"AIME24","reasoning":true,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":1}
{"reasoning_agent": 0.06666666666666667, "tool_agent": 0.1}{"timestamp":"2025-09-12 17:10:02","task":"math","benchmark":"AIME24","reasoning":true,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":3}
{"reasoning_agent": 0.2, "tool_agent": 0.23333333333333334}{"timestamp":"2025-09-12 17:13:27","task":"math","benchmark":"AIME24","reasoning":true,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":5}
{"reasoning_agent": 0.23333333333333334, "tool_agent": 0.3}{"timestamp":"2025-09-12 17:18:40","task":"math","benchmark":"AIME24","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":1}
{"reasoning_agent": 0.1, "tool_agent": 0.1}{"timestamp":"2025-09-12 17:19:49","task":"math","benchmark":"AIME24","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":3}
{"reasoning_agent": 0.2, "tool_agent": 0.13333333333333333}{"timestamp":"2025-09-12 17:21:39","task":"math","benchmark":"AIME24","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":5}
{"reasoning_agent": 0.16666666666666666, "tool_agent": 0.13333333333333333}{"timestamp":"2025-09-12 17:23:24","task":"math","benchmark":"AIME25","reasoning":true,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":1}
{"reasoning_agent": 0.1, "tool_agent": 0.1}{"timestamp":"2025-09-12 17:24:41","task":"math","benchmark":"AIME25","reasoning":true,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":3}
{"reasoning_agent": 0.3333333333333333, "tool_agent": 0.26666666666666666}{"timestamp":"2025-09-12 17:28:14","task":"math","benchmark":"AIME25","reasoning":true,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":5}
{"reasoning_agent": 0.3333333333333333, "tool_agent": 0.3}{"timestamp":"2025-09-12 17:33:05","task":"math","benchmark":"AIME25","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":1}
{"reasoning_agent": 0.13333333333333333, "tool_agent": 0.03333333333333333}{"timestamp":"2025-09-12 17:34:02","task":"math","benchmark":"AIME25","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":3}
{"reasoning_agent": 0.1, "tool_agent": 0.13333333333333333}{"timestamp":"2025-09-12 17:36:27","task":"math","benchmark":"AIME25","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":5}
{"reasoning_agent": 0.13333333333333333, "tool_agent": 0.2}{"timestamp":"2025-09-12 17:38:49","task":"math","benchmark":"OlympiadBench_test","reasoning":true,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":1}
{"timestamp":"2025-09-12 17:45:09","task":"math","benchmark":"OlympiadBench_test","reasoning":true,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":3}
{"timestamp":"2025-09-12 17:45:29","task":"math","benchmark":"OlympiadBench_test","reasoning":true,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":5}
{"timestamp":"2025-09-12 17:45:42","task":"math","benchmark":"OlympiadBench_test","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":1}
{"timestamp":"2025-09-12 17:45:55","task":"math","benchmark":"OlympiadBench_test","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":3}
{"timestamp":"2025-09-12 17:46:09","task":"math","benchmark":"OlympiadBench_test","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":5}
{"timestamp":"2025-09-12 17:46:18","task":"math","benchmark":"gsm8k_test","reasoning":true,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":1}
{"timestamp":"2025-09-12 17:46:29","task":"math","benchmark":"gsm8k_test","reasoning":true,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":3}
{"timestamp":"2025-09-12 17:46:39","task":"math","benchmark":"gsm8k_test","reasoning":true,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":5}
{"timestamp":"2025-09-12 17:46:42","task":"math","benchmark":"AIME24","reasoning":true,"model":"/home/lah003/models/Qwen3-4B","max_turns":1}
{"timestamp":"2025-09-12 17:46:49","task":"math","benchmark":"gsm8k_test","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":1}
{"timestamp":"2025-09-12 17:46:55","task":"math","benchmark":"AIME24","reasoning":true,"model":"/home/lah003/models/Qwen3-4B","max_turns":3}
{"timestamp":"2025-09-12 17:46:59","task":"math","benchmark":"gsm8k_test","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":3}
{"timestamp":"2025-09-12 17:47:05","task":"math","benchmark":"AIME24","reasoning":true,"model":"/home/lah003/models/Qwen3-4B","max_turns":5}
{"timestamp":"2025-09-12 17:47:09","task":"math","benchmark":"gsm8k_test","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":5}
{"timestamp":"2025-09-12 17:47:16","task":"math","benchmark":"AIME24","reasoning":false,"model":"/home/lah003/models/Qwen3-4B","max_turns":1}
{"timestamp":"2025-09-12 17:47:19","task":"code","benchmark":"apps","reasoning":true,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":1}
{"timestamp":"2025-09-12 17:47:25","task":"math","benchmark":"AIME24","reasoning":false,"model":"/home/lah003/models/Qwen3-4B","max_turns":3}
{"timestamp":"2025-09-12 17:47:29","task":"code","benchmark":"apps","reasoning":true,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":3}
{"timestamp":"2025-09-12 17:47:38","task":"math","benchmark":"AIME24","reasoning":false,"model":"/home/lah003/models/Qwen3-4B","max_turns":5}
{"timestamp":"2025-09-12 17:47:38","task":"code","benchmark":"apps","reasoning":true,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":5}
{"timestamp":"2025-09-12 17:47:48","task":"code","benchmark":"apps","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":1}
{"timestamp":"2025-09-12 17:47:49","task":"math","benchmark":"AIME25","reasoning":true,"model":"/home/lah003/models/Qwen3-4B","max_turns":1}
{"timestamp":"2025-09-12 17:47:59","task":"math","benchmark":"AIME25","reasoning":true,"model":"/home/lah003/models/Qwen3-4B","max_turns":3}
{"timestamp":"2025-09-12 17:48:00","task":"code","benchmark":"apps","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":3}
{"timestamp":"2025-09-12 17:48:12","task":"math","benchmark":"AIME25","reasoning":true,"model":"/home/lah003/models/Qwen3-4B","max_turns":5}
{"timestamp":"2025-09-12 17:48:12","task":"code","benchmark":"apps","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":5}
{"timestamp":"2025-09-12 17:48:21","task":"math","benchmark":"AIME25","reasoning":false,"model":"/home/lah003/models/Qwen3-4B","max_turns":1}
{"timestamp":"2025-09-12 17:48:26","task":"code","benchmark":"livecodebench","reasoning":true,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":1}
{"timestamp":"2025-09-12 17:48:31","task":"math","benchmark":"AIME25","reasoning":false,"model":"/home/lah003/models/Qwen3-4B","max_turns":3}
{"timestamp":"2025-09-12 17:48:37","task":"code","benchmark":"livecodebench","reasoning":true,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":3}
{"timestamp":"2025-09-12 17:48:43","task":"math","benchmark":"AIME25","reasoning":false,"model":"/home/lah003/models/Qwen3-4B","max_turns":5}
{"timestamp":"2025-09-12 17:48:48","task":"code","benchmark":"livecodebench","reasoning":true,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":5}
{"timestamp":"2025-09-12 17:48:53","task":"math","benchmark":"OlympiadBench_test","reasoning":true,"model":"/home/lah003/models/Qwen3-4B","max_turns":1}
{"timestamp":"2025-09-12 17:48:57","task":"code","benchmark":"livecodebench","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":1}
{"timestamp":"2025-09-12 17:49:03","task":"math","benchmark":"OlympiadBench_test","reasoning":true,"model":"/home/lah003/models/Qwen3-4B","max_turns":3}
{"timestamp":"2025-09-12 17:49:07","task":"code","benchmark":"livecodebench","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":3}
{"timestamp":"2025-09-12 17:49:13","task":"math","benchmark":"OlympiadBench_test","reasoning":true,"model":"/home/lah003/models/Qwen3-4B","max_turns":5}
{"timestamp":"2025-09-12 17:49:17","task":"code","benchmark":"livecodebench","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":5}
{"timestamp":"2025-09-12 17:49:23","task":"math","benchmark":"OlympiadBench_test","reasoning":false,"model":"/home/lah003/models/Qwen3-4B","max_turns":1}
{"timestamp":"2025-09-12 17:49:28","task":"code","benchmark":"code_contests","reasoning":true,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":1}
{"timestamp":"2025-09-12 17:49:32","task":"math","benchmark":"OlympiadBench_test","reasoning":false,"model":"/home/lah003/models/Qwen3-4B","max_turns":3}
{"timestamp":"2025-09-12 17:49:40","task":"code","benchmark":"code_contests","reasoning":true,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":3}
{"timestamp":"2025-09-12 17:49:42","task":"math","benchmark":"OlympiadBench_test","reasoning":false,"model":"/home/lah003/models/Qwen3-4B","max_turns":5}
{"timestamp":"2025-09-12 17:49:50","task":"code","benchmark":"code_contests","reasoning":true,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":5}
{"timestamp":"2025-09-12 17:49:54","task":"math","benchmark":"gsm8k_test","reasoning":true,"model":"/home/lah003/models/Qwen3-4B","max_turns":1}
{"timestamp":"2025-09-12 17:50:00","task":"code","benchmark":"code_contests","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":1}
{"timestamp":"2025-09-12 17:50:05","task":"math","benchmark":"gsm8k_test","reasoning":true,"model":"/home/lah003/models/Qwen3-4B","max_turns":3}
{"timestamp":"2025-09-12 17:50:10","task":"code","benchmark":"code_contests","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":3}
{"timestamp":"2025-09-12 17:50:15","task":"math","benchmark":"gsm8k_test","reasoning":true,"model":"/home/lah003/models/Qwen3-4B","max_turns":5}
{"timestamp":"2025-09-12 17:50:24","task":"math","benchmark":"gsm8k_test","reasoning":false,"model":"/home/lah003/models/Qwen3-4B","max_turns":1}
{"timestamp":"2025-09-12 17:50:27","task":"code","benchmark":"code_contests","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":5}
{"timestamp":"2025-09-12 17:50:36","task":"math","benchmark":"gsm8k_test","reasoning":false,"model":"/home/lah003/models/Qwen3-4B","max_turns":3}
{"timestamp":"2025-09-12 17:50:46","task":"math","benchmark":"gsm8k_test","reasoning":false,"model":"/home/lah003/models/Qwen3-4B","max_turns":5}
{"timestamp":"2025-09-12 17:50:57","task":"code","benchmark":"apps","reasoning":true,"model":"/home/lah003/models/Qwen3-4B","max_turns":1}
{"timestamp":"2025-09-12 17:51:07","task":"code","benchmark":"apps","reasoning":true,"model":"/home/lah003/models/Qwen3-4B","max_turns":3}
{"timestamp":"2025-09-12 17:51:16","task":"code","benchmark":"apps","reasoning":true,"model":"/home/lah003/models/Qwen3-4B","max_turns":5}
{"timestamp":"2025-09-12 17:51:26","task":"code","benchmark":"apps","reasoning":false,"model":"/home/lah003/models/Qwen3-4B","max_turns":1}
{"timestamp":"2025-09-12 17:51:35","task":"code","benchmark":"apps","reasoning":false,"model":"/home/lah003/models/Qwen3-4B","max_turns":3}
{"timestamp":"2025-09-12 17:51:45","task":"code","benchmark":"apps","reasoning":false,"model":"/home/lah003/models/Qwen3-4B","max_turns":5}
{"timestamp":"2025-09-12 17:51:55","task":"code","benchmark":"livecodebench","reasoning":true,"model":"/home/lah003/models/Qwen3-4B","max_turns":1}
{"timestamp":"2025-09-12 17:52:05","task":"code","benchmark":"livecodebench","reasoning":true,"model":"/home/lah003/models/Qwen3-4B","max_turns":3}
{"timestamp":"2025-09-12 17:52:15","task":"code","benchmark":"livecodebench","reasoning":true,"model":"/home/lah003/models/Qwen3-4B","max_turns":5}
{"timestamp":"2025-09-12 17:52:25","task":"code","benchmark":"livecodebench","reasoning":false,"model":"/home/lah003/models/Qwen3-4B","max_turns":1}
{"timestamp":"2025-09-12 17:52:35","task":"code","benchmark":"livecodebench","reasoning":false,"model":"/home/lah003/models/Qwen3-4B","max_turns":3}
{"timestamp":"2025-09-12 17:52:45","task":"code","benchmark":"livecodebench","reasoning":false,"model":"/home/lah003/models/Qwen3-4B","max_turns":5}
{"timestamp":"2025-09-12 17:52:54","task":"code","benchmark":"code_contests","reasoning":true,"model":"/home/lah003/models/Qwen3-4B","max_turns":1}
{"timestamp":"2025-09-12 17:53:06","task":"code","benchmark":"code_contests","reasoning":true,"model":"/home/lah003/models/Qwen3-4B","max_turns":3}
{"timestamp":"2025-09-12 17:53:15","task":"code","benchmark":"code_contests","reasoning":true,"model":"/home/lah003/models/Qwen3-4B","max_turns":5}
{"timestamp":"2025-09-12 17:53:28","task":"code","benchmark":"code_contests","reasoning":false,"model":"/home/lah003/models/Qwen3-4B","max_turns":1}
{"timestamp":"2025-09-12 17:53:35","task":"code","benchmark":"code_contests","reasoning":false,"model":"/home/lah003/models/Qwen3-4B","max_turns":3}
{"timestamp":"2025-09-12 17:53:47","task":"code","benchmark":"code_contests","reasoning":false,"model":"/home/lah003/models/Qwen3-4B","max_turns":5}
{"timestamp":"2025-09-12 17:55:39","task":"code","benchmark":"apps","reasoning":true,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":1}
{"timestamp":"2025-09-12 17:57:02","task":"code","benchmark":"apps","reasoning":true,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":3}
{"timestamp":"2025-09-12 17:57:13","task":"code","benchmark":"apps","reasoning":true,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":5}
{"timestamp":"2025-09-12 17:57:15","task":"code","benchmark":"apps","reasoning":true,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":1}
{"code_generator": 0.122, "test_generator": 0.036}{"timestamp":"2025-09-12 18:06:42","task":"code","benchmark":"apps","reasoning":true,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":3}
{"code_generator": 0.178, "test_generator": 0.048}{"timestamp":"2025-09-12 18:42:02","task":"code","benchmark":"apps","reasoning":true,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":5}
{"timestamp":"2025-09-12 18:48:39","task":"code","benchmark":"apps","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":1}
{"timestamp":"2025-09-12 18:48:44","task":"code","benchmark":"apps","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":3}
{"timestamp":"2025-09-12 18:51:12","task":"code","benchmark":"apps","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":1}
{"timestamp":"2025-09-12 18:51:17","task":"code","benchmark":"apps","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":3}
{"timestamp":"2025-09-12 18:51:20","task":"code","benchmark":"apps","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":1}
{"code_generator": 0.138, "test_generator": 0.054}{"timestamp":"2025-09-12 18:53:05","task":"code","benchmark":"apps","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":3}
{"code_generator": 0.13, "test_generator": 0.05}{"timestamp":"2025-09-12 18:56:44","task":"code","benchmark":"apps","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":5}
{"code_generator": 0.176, "test_generator": 0.082}{"timestamp":"2025-09-12 19:02:24","task":"code","benchmark":"livecodebench","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":1}
{"code_generator": 0.13714285714285715, "test_generator": 0.05142857142857143}{"timestamp":"2025-09-12 19:03:39","task":"code","benchmark":"livecodebench","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":3}
{"code_generator": 0.17142857142857143, "test_generator": 0.10285714285714286}{"timestamp":"2025-09-12 19:05:55","task":"code","benchmark":"livecodebench","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":5}
{"code_generator": 0.14857142857142858, "test_generator": 0.08}{"timestamp":"2025-09-12 19:09:30","task":"code","benchmark":"code_contests","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":1}
{"code_generator": 0.030303030303030304, "test_generator": 0.01818181818181818}{"timestamp":"2025-09-12 19:10:51","task":"code","benchmark":"code_contests","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":3}
{"code_generator": 0.03636363636363636, "test_generator": 0.01818181818181818}{"timestamp":"2025-09-12 19:13:33","task":"code","benchmark":"code_contests","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":5}
{"code_generator": 0.024242424242424242, "test_generator": 0.006060606060606061}{"timestamp":"2025-09-12 19:17:34","task":"math","benchmark":"OlympiadBench_test","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":1}
{"reasoning_agent": 0.45103857566765576, "tool_agent": 0.22255192878338279}{"timestamp":"2025-09-12 19:20:22","task":"math","benchmark":"OlympiadBench_test","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":3}
{"reasoning_agent": 0.47032640949554894, "tool_agent": 0.36350148367952523}{"timestamp":"2025-09-12 19:26:11","task":"math","benchmark":"OlympiadBench_test","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":5}
{"reasoning_agent": 0.4762611275964392, "tool_agent": 0.3649851632047478}{"timestamp":"2025-09-12 19:34:32","task":"math","benchmark":"gsm8k_test","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":1}
{"reasoning_agent": 0.816, "tool_agent": 0.682}{"timestamp":"2025-09-12 19:35:55","task":"math","benchmark":"gsm8k_test","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":3}
{"reasoning_agent": 0.82, "tool_agent": 0.828}{"timestamp":"2025-09-12 19:37:34","task":"math","benchmark":"gsm8k_test","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":5}
{"reasoning_agent": 0.824, "tool_agent": 0.82}{"timestamp":"2025-09-13 01:46:00","task":"math","benchmark":"AIME24","reasoning":false,"model":"/home/lah003/models/Qwen3-4B","max_turns":1}
{"reasoning_agent": 0.13333333333333333, "tool_agent": 0.2}{"timestamp":"2025-09-13 01:46:59","task":"math","benchmark":"AIME24","reasoning":false,"model":"/home/lah003/models/Qwen3-4B","max_turns":3}
{"reasoning_agent": 0.16666666666666666, "tool_agent": 0.23333333333333334}{"timestamp":"2025-09-13 01:48:37","task":"math","benchmark":"AIME24","reasoning":false,"model":"/home/lah003/models/Qwen3-4B","max_turns":5}
{"reasoning_agent": 0.16666666666666666, "tool_agent": 0.3333333333333333}{"timestamp":"2025-09-13 01:50:46","task":"math","benchmark":"AIME25","reasoning":false,"model":"/home/lah003/models/Qwen3-4B","max_turns":1}
{"reasoning_agent": 0.16666666666666666, "tool_agent": 0.23333333333333334}{"timestamp":"2025-09-13 01:51:37","task":"math","benchmark":"AIME25","reasoning":false,"model":"/home/lah003/models/Qwen3-4B","max_turns":3}
{"reasoning_agent": 0.2, "tool_agent": 0.26666666666666666}{"timestamp":"2025-09-13 01:53:18","task":"math","benchmark":"AIME25","reasoning":false,"model":"/home/lah003/models/Qwen3-4B","max_turns":5}
{"reasoning_agent": 0.2, "tool_agent": 0.3333333333333333}{"timestamp":"2025-09-13 01:55:25","task":"math","benchmark":"OlympiadBench_test","reasoning":false,"model":"/home/lah003/models/Qwen3-4B","max_turns":1}
{"reasoning_agent": 0.49258160237388726, "tool_agent": 0.3916913946587537}{"timestamp":"2025-09-13 01:58:17","task":"math","benchmark":"OlympiadBench_test","reasoning":false,"model":"/home/lah003/models/Qwen3-4B","max_turns":3}
{"reasoning_agent": 0.5222551928783383, "tool_agent": 0.47774480712166173}{"timestamp":"2025-09-13 02:03:20","task":"math","benchmark":"OlympiadBench_test","reasoning":false,"model":"/home/lah003/models/Qwen3-4B","max_turns":5}
{"reasoning_agent": 0.5252225519287834, "tool_agent": 0.4792284866468843}{"timestamp":"2025-09-13 02:10:55","task":"math","benchmark":"gsm8k_test","reasoning":false,"model":"/home/lah003/models/Qwen3-4B","max_turns":1}
{"reasoning_agent": 0.926, "tool_agent": 0.866}{"timestamp":"2025-09-13 02:13:02","task":"math","benchmark":"gsm8k_test","reasoning":false,"model":"/home/lah003/models/Qwen3-4B","max_turns":3}
{"reasoning_agent": 0.93, "tool_agent": 0.952}{"timestamp":"2025-09-13 02:14:55","task":"math","benchmark":"gsm8k_test","reasoning":false,"model":"/home/lah003/models/Qwen3-4B","max_turns":5}
{"reasoning_agent": 0.932, "tool_agent": 0.95}{"timestamp":"2025-09-13 02:16:53","task":"code","benchmark":"apps","reasoning":false,"model":"/home/lah003/models/Qwen3-4B","max_turns":1}
{"code_generator": 0.296, "test_generator": 0.126}{"timestamp":"2025-09-13 02:20:17","task":"code","benchmark":"apps","reasoning":false,"model":"/home/lah003/models/Qwen3-4B","max_turns":3}
{"code_generator": 0.354, "test_generator": 0.248}{"timestamp":"2025-09-13 02:26:47","task":"code","benchmark":"apps","reasoning":false,"model":"/home/lah003/models/Qwen3-4B","max_turns":5}
{"code_generator": 0.354, "test_generator": 0.278}{"timestamp":"2025-09-13 02:36:20","task":"code","benchmark":"livecodebench","reasoning":false,"model":"/home/lah003/models/Qwen3-4B","max_turns":1}
{"code_generator": 0.13714285714285715, "test_generator": 0.06285714285714286}{"timestamp":"2025-09-13 02:38:39","task":"code","benchmark":"livecodebench","reasoning":false,"model":"/home/lah003/models/Qwen3-4B","max_turns":3}
{"code_generator": 0.26285714285714284, "test_generator": 0.18285714285714286}{"timestamp":"2025-09-13 02:42:40","task":"code","benchmark":"livecodebench","reasoning":false,"model":"/home/lah003/models/Qwen3-4B","max_turns":5}
{"code_generator": 0.2342857142857143, "test_generator": 0.17714285714285713}{"timestamp":"2025-09-13 02:49:07","task":"code","benchmark":"code_contests","reasoning":false,"model":"/home/lah003/models/Qwen3-4B","max_turns":1}
{"code_generator": 0.09696969696969697, "test_generator": 0.01818181818181818}{"timestamp":"2025-09-13 02:51:29","task":"code","benchmark":"code_contests","reasoning":false,"model":"/home/lah003/models/Qwen3-4B","max_turns":3}
{"code_generator": 0.13333333333333333, "test_generator": 0.04242424242424243}{"timestamp":"2025-09-13 02:56:19","task":"code","benchmark":"code_contests","reasoning":false,"model":"/home/lah003/models/Qwen3-4B","max_turns":5}
{"code_generator": 0.1696969696969697, "test_generator": 0.07272727272727272}{"timestamp":"2025-09-13 20:24:43","task":"math","benchmark":"AIME24","reasoning":true,"model":"/home/lah003/models/Qwen3-8B","max_turns":1}
{"timestamp":"2025-09-13 20:25:12","task":"math","benchmark":"AIME24","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":1}
{"timestamp":"2025-09-13 20:25:25","task":"math","benchmark":"AIME24","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":3}
{"timestamp":"2025-09-13 20:25:35","task":"math","benchmark":"AIME24","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":5}
{"timestamp":"2025-09-13 20:25:45","task":"math","benchmark":"AIME25","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":1}
{"timestamp":"2025-09-13 20:25:55","task":"math","benchmark":"AIME25","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":3}
{"timestamp":"2025-09-13 20:26:05","task":"math","benchmark":"AIME25","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":5}
{"timestamp":"2025-09-13 20:26:15","task":"math","benchmark":"OlympiadBench_test","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":1}
{"timestamp":"2025-09-13 20:26:25","task":"math","benchmark":"OlympiadBench_test","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":3}
{"timestamp":"2025-09-13 20:26:34","task":"math","benchmark":"OlympiadBench_test","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":5}
{"timestamp":"2025-09-13 20:26:43","task":"math","benchmark":"gsm8k_test","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":1}
{"timestamp":"2025-09-13 20:26:52","task":"math","benchmark":"gsm8k_test","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":3}
{"timestamp":"2025-09-13 20:27:02","task":"math","benchmark":"gsm8k_test","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":5}
{"timestamp":"2025-09-13 20:27:11","task":"code","benchmark":"apps","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":1}
{"timestamp":"2025-09-13 20:27:22","task":"code","benchmark":"apps","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":3}
{"timestamp":"2025-09-13 20:27:32","task":"code","benchmark":"apps","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":5}
{"timestamp":"2025-09-13 20:27:42","task":"code","benchmark":"livecodebench","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":1}
{"timestamp":"2025-09-13 20:27:52","task":"code","benchmark":"livecodebench","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":3}
{"timestamp":"2025-09-13 20:28:02","task":"code","benchmark":"livecodebench","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":5}
{"timestamp":"2025-09-13 20:28:11","task":"code","benchmark":"code_contests","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":1}
{"timestamp":"2025-09-13 20:28:20","task":"code","benchmark":"code_contests","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":3}
{"timestamp":"2025-09-13 20:28:30","task":"code","benchmark":"code_contests","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":5}
{"timestamp":"2025-09-13 21:03:39","task":"math","benchmark":"AIME24","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":1}
{"reasoning_agent": 0.2, "tool_agent": 0.23333333333333334}{"timestamp":"2025-09-13 21:05:09","task":"math","benchmark":"AIME24","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":3}
{"reasoning_agent": 0.23333333333333334, "tool_agent": 0.3}{"timestamp":"2025-09-13 21:07:49","task":"math","benchmark":"AIME24","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":5}
{"reasoning_agent": 0.2, "tool_agent": 0.36666666666666664}{"timestamp":"2025-09-13 21:11:10","task":"math","benchmark":"AIME25","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":1}
{"reasoning_agent": 0.13333333333333333, "tool_agent": 0.16666666666666666}{"timestamp":"2025-09-13 21:12:39","task":"math","benchmark":"AIME25","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":3}
{"reasoning_agent": 0.13333333333333333, "tool_agent": 0.23333333333333334}{"timestamp":"2025-09-13 21:15:18","task":"math","benchmark":"AIME25","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":5}
{"reasoning_agent": 0.13333333333333333, "tool_agent": 0.23333333333333334}{"timestamp":"2025-09-13 21:17:59","task":"math","benchmark":"OlympiadBench_test","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":1}
{"reasoning_agent": 0.5192878338278932, "tool_agent": 0.3249258160237389}{"timestamp":"2025-09-13 21:22:06","task":"math","benchmark":"OlympiadBench_test","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":3}
{"timestamp":"2025-09-13 21:26:46","task":"math","benchmark":"OlympiadBench_test","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":5}
{"timestamp":"2025-09-13 21:27:08","task":"math","benchmark":"AIME24","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":1}
{"reasoning_agent": 0.2, "tool_agent": 0.2}{"timestamp":"2025-09-13 21:30:24","task":"math","benchmark":"AIME24","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":3}
{"reasoning_agent": 0.23333333333333334, "tool_agent": 0.3}{"timestamp":"2025-09-13 21:35:53","task":"math","benchmark":"AIME24","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":5}
{"reasoning_agent": 0.23333333333333334, "tool_agent": 0.36666666666666664}{"timestamp":"2025-09-13 21:41:19","task":"math","benchmark":"AIME25","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":1}
{"reasoning_agent": 0.16666666666666666, "tool_agent": 0.16666666666666666}{"timestamp":"2025-09-13 21:44:35","task":"math","benchmark":"AIME25","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":3}
{"reasoning_agent": 0.16666666666666666, "tool_agent": 0.23333333333333334}{"timestamp":"2025-09-13 21:47:47","task":"math","benchmark":"AIME25","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":5}
{"reasoning_agent": 0.23333333333333334, "tool_agent": 0.36666666666666664}{"timestamp":"2025-09-13 21:51:27","task":"math","benchmark":"OlympiadBench_test","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":1}
{"reasoning_agent": 0.5519287833827893, "tool_agent": 0.32344213649851633}{"timestamp":"2025-09-13 21:59:07","task":"math","benchmark":"OlympiadBench_test","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":3}
{"reasoning_agent": 0.5667655786350149, "tool_agent": 0.4732937685459941}{"timestamp":"2025-09-13 22:11:29","task":"math","benchmark":"OlympiadBench_test","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":5}
{"reasoning_agent": 0.56973293768546, "tool_agent": 0.4821958456973294}{"timestamp":"2025-09-13 22:26:49","task":"math","benchmark":"gsm8k_test","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":1}
{"reasoning_agent": 0.936, "tool_agent": 0.924}{"timestamp":"2025-09-13 22:31:09","task":"math","benchmark":"gsm8k_test","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":3}
{"reasoning_agent": 0.936, "tool_agent": 0.958}{"timestamp":"2025-09-13 22:34:16","task":"math","benchmark":"gsm8k_test","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":5}
{"reasoning_agent": 0.936, "tool_agent": 0.958}{"timestamp":"2025-09-13 22:36:40","task":"code","benchmark":"apps","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":1}
{"code_generator": 0.302, "test_generator": 0.212}{"timestamp":"2025-09-13 22:41:30","task":"code","benchmark":"apps","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":3}
{"code_generator": 0.444, "test_generator": 0.386}{"timestamp":"2025-09-13 22:52:43","task":"code","benchmark":"apps","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":5}
{"code_generator": 0.448, "test_generator": 0.404}{"timestamp":"2025-09-13 23:10:30","task":"code","benchmark":"livecodebench","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":1}
{"code_generator": 0.21714285714285714, "test_generator": 0.10857142857142857}{"timestamp":"2025-09-13 23:13:53","task":"code","benchmark":"livecodebench","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":3}
{"code_generator": 0.2914285714285714, "test_generator": 0.21142857142857144}{"timestamp":"2025-09-13 23:22:10","task":"code","benchmark":"livecodebench","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":5}
{"code_generator": 0.29714285714285715, "test_generator": 0.25142857142857145}{"timestamp":"2025-09-13 23:35:13","task":"code","benchmark":"code_contests","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":1}
{"code_generator": 0.15757575757575756, "test_generator": 0.05454545454545454}{"timestamp":"2025-09-13 23:38:33","task":"code","benchmark":"code_contests","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":3}
{"code_generator": 0.17575757575757575, "test_generator": 0.10909090909090909}{"timestamp":"2025-09-13 23:48:09","task":"code","benchmark":"code_contests","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":5}
{"code_generator": 0.17575757575757575, "test_generator": 0.09696969696969697}
{"sample_reasoning_agent": 0.0, "sample_tool_agent": 0.0, "aggreted_agent": 0.0}{"sample_reasoning_agent": 0.0, "sample_tool_agent": 0.0, "aggreted_agent": 0.0}
{"sample_reasoning_agent": 0.0, "sample_tool_agent": 0.0, "aggreted_agent": 0.0}{"sample_reasoning_agent": 0.0, "sample_tool_agent": 0.0, "aggreted_agent": 0.0}{"sample_reasoning_agent": 0.0, "sample_tool_agent": 0.0, "aggreted_agent": 0.0}{"sample_reasoning_agent": 0.0, "sample_tool_agent": 0.0, "aggreted_agent": 0.0}{"sample_reasoning_agent": 0.0, "sample_tool_agent": 0.0, "aggreted_agent": 0.0}{"sample_reasoning_agent": 0.0, "sample_tool_agent": 0.0, "aggreted_agent": 0.0}{"sample_reasoning_agent": 0.0, "sample_tool_agent": 0.0, "aggreted_agent": 0.0}{"sample_reasoning_agent": 0.0, "sample_tool_agent": 0.0, "aggreted_agent": 0.0}{"sample_reasoning_agent": 0.0, "sample_tool_agent": 0.0, "aggreted_agent": 0.0}{"sample_reasoning_agent": 0.0, "sample_tool_agent": 0.0, "aggreted_agent": 0.0}nnn{"sample_reasoning_agent": 0.0, "sample_tool_agent": 0.0, "aggreted_agent": 0.0}{"sample_reasoning_agent": 0.0, "sample_tool_agent": 0.0, "aggreted_agent": 0.0}{"sample_reasoning_agent": 0.0, "sample_tool_agent": 0.0, "aggreted_agent": 0.0}{"sample_reasoning_agent": 0.0, "sample_tool_agent": 0.0, "aggreted_agent": 0.0}{"sample_reasoning_agent": 0.0, "sample_tool_agent": 0.0, "aggreted_agent": 0.0}{"sample_reasoning_agent": 0.0, "sample_tool_agent": 0.0, "aggreted_agent": 0.0}{"sample_reasoning_agent": 0.0, "sample_tool_agent": 0.0, "aggreted_agent": 0.0}{"sample_reasoning_agent": 0.0, "sample_tool_agent": 0.0, "aggreted_agent": 0.0}{"sample_reasoning_agent": 0.0, "sample_tool_agent": 0.0, "aggreted_agent": 0.0}{"timestamp":"2025-09-14 03:05:52","task":"math","benchmark":"AIME24","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":1}
{"timestamp":"2025-09-14 03:06:03","task":"math","benchmark":"AIME24","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":3}
{"timestamp":"2025-09-14 03:06:14","task":"math","benchmark":"AIME24","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":5}
{"timestamp":"2025-09-14 03:06:24","task":"math","benchmark":"AIME25","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":1}
{"timestamp":"2025-09-14 03:06:34","task":"math","benchmark":"AIME25","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":3}
{"timestamp":"2025-09-14 03:06:44","task":"math","benchmark":"AIME25","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":5}
{"timestamp":"2025-09-14 03:06:53","task":"math","benchmark":"OlympiadBench_test","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":1}
{"timestamp":"2025-09-14 03:07:05","task":"math","benchmark":"OlympiadBench_test","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":3}
{"sample_reasoning_agent": 0.0, "sample_tool_agent": 0.0, "aggreted_agent": 0.0}{"timestamp":"2025-09-14 03:07:30","task":"math","benchmark":"OlympiadBench_test","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":5}
{"timestamp":"2025-09-14 03:07:41","task":"math","benchmark":"gsm8k_test","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":1}
{"timestamp":"2025-09-14 03:07:51","task":"math","benchmark":"gsm8k_test","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":3}
{"timestamp":"2025-09-14 03:08:01","task":"math","benchmark":"gsm8k_test","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":5}
{"timestamp":"2025-09-14 03:08:12","task":"code","benchmark":"apps","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":1}
{"timestamp":"2025-09-14 03:08:22","task":"code","benchmark":"apps","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":3}
{"timestamp":"2025-09-14 03:08:32","task":"code","benchmark":"apps","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":5}
{"timestamp":"2025-09-14 03:08:42","task":"code","benchmark":"livecodebench","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":1}
{"timestamp":"2025-09-14 03:08:52","task":"code","benchmark":"livecodebench","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":3}
{"timestamp":"2025-09-14 03:09:02","task":"code","benchmark":"livecodebench","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":5}
{"timestamp":"2025-09-14 03:09:13","task":"code","benchmark":"code_contests","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":1}
{"timestamp":"2025-09-14 03:09:25","task":"code","benchmark":"code_contests","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":3}
{"timestamp":"2025-09-14 03:09:38","task":"code","benchmark":"code_contests","reasoning":false,"model":"/home/lah003/models/Qwen3-8B","max_turns":5}
{"sample_reasoning_agent": 0.0, "sample_tool_agent": 0.0, "aggreted_agent": 0.0}{"sample_reasoning_agent": 0.0, "sample_tool_agent": 0.0, "aggreted_agent": 0.0}{"timestamp":"2025-09-14 11:51:17","task":"code","benchmark":"apps","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":1}
{"timestamp":"2025-09-14 11:51:17","task":"code","benchmark":"apps","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":3}
{"timestamp":"2025-09-14 11:51:17","task":"code","benchmark":"apps","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":5}
{"timestamp":"2025-09-14 11:51:17","task":"code","benchmark":"livecodebench","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":1}
{"timestamp":"2025-09-14 11:51:17","task":"code","benchmark":"livecodebench","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":3}
{"timestamp":"2025-09-14 11:51:17","task":"code","benchmark":"livecodebench","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":5}
{"timestamp":"2025-09-14 11:51:17","task":"code","benchmark":"code_contests","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":1}
{"timestamp":"2025-09-14 11:51:17","task":"code","benchmark":"code_contests","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":3}
{"timestamp":"2025-09-14 11:51:17","task":"code","benchmark":"code_contests","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":5}
{"timestamp":"2025-09-14 11:51:18","task":"math","benchmark":"OlympiadBench_test","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":1}
{"timestamp":"2025-09-14 11:51:18","task":"math","benchmark":"OlympiadBench_test","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":3}
{"timestamp":"2025-09-14 11:51:18","task":"math","benchmark":"OlympiadBench_test","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":5}
{"timestamp":"2025-09-14 11:51:18","task":"math","benchmark":"gsm8k_test","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":1}
{"timestamp":"2025-09-14 11:51:18","task":"math","benchmark":"gsm8k_test","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":3}
{"timestamp":"2025-09-14 11:51:18","task":"math","benchmark":"gsm8k_test","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":5}
{"reasoning_agent": 0.0, "tool_agent": 0.06666666666666667}{"reasoning_agent": 0.16666666666666666, "tool_agent": 0.06666666666666667}{"reasoning_agent": 0.16666666666666666, "tool_agent": 0.1}{"reasoning_agent": 0.13333333333333333, "tool_agent": 0.1}{"reasoning_agent": 0.13333333333333333, "tool_agent": 0.1}{"reasoning_agent": 0.23333333333333334, "tool_agent": 0.36666666666666664}{"sample_reasoning_agent": 0.0, "sample_tool_agent": 0.0, "aggreted_agent": 0.26666666666666666}{"sample_reasoning_agent": 0.0, "sample_tool_agent": 0.0, "aggreted_agent": 0.0}{"sample_reasoning_agent": 0.0, "sample_tool_agent": 0.0, "aggreted_agent": 0.4666666666666667}{"sample_reasoning_agent": 0.0, "sample_tool_agent": 0.0, "aggreted_agent": 0.3333333333333333}{"sample_reasoning_agent": 0.0, "sample_tool_agent": 0.0, "aggreted_agent": 0.3333333333333333}{"sample_reasoning_agent": 0.0, "sample_tool_agent": 0.0, "aggreted_agent": 0.4666666666666667}{"sample_reasoning_agent": 0.3, "sample_tool_agent": 0.36666666666666664, "aggreted_agent": 0.36666666666666664}{"sample_reasoning_agent": 0.8666666666666667, "sample_tool_agent": 0.9666666666666667, "aggreted_agent": 0.4}{"sample_reasoning_agent": 0.4, "sample_tool_agent": 0.1, "aggreted_agent": 0.23333333333333334}{"sample_reasoning_agent": 0.8, "sample_tool_agent": 0.9666666666666667, "aggreted_agent": 0.4}{"sample_reasoning_agent": 1.8333333333333333, "sample_tool_agent": 1.3333333333333333, "aggreted_agent": 0.3}{"sample_reasoning_agent": 0.9, "sample_tool_agent": 0.8333333333333334, "aggreted_agent": 0.3333333333333333}{"sample_reasoning_agent": 0.6, "sample_tool_agent": 0.8666666666666667, "aggreted_agent": 0.26666666666666666}{"sample_reasoning_agent": 0.6666666666666666, "sample_tool_agent": 0.9333333333333333, "aggreted_agent": 0.3333333333333333}{"code_generator": 0.2057142857142857, "test_generator": 0.17142857142857143}{"code_generator": 0.17142857142857143, "test_generator": 0.13142857142857142}{"code_generator": 0.17142857142857143, "test_generator": 0.12}{"code_generator": 0.2, "test_generator": 0.17714285714285713}{"code_generator": 0.17142857142857143, "test_generator": 0.10285714285714286}{"code_generator": 0.2057142857142857, "test_generator": 0.17714285714285713}{"sample_reasoning_agent": 0.3333333333333333, "sample_tool_agent": 0.23333333333333334, "aggreted_agent": 0.36666666666666664}{"sample_reasoning_agent": 0.43333333333333335, "sample_tool_agent": 0.3333333333333333, "aggreted_agent": 0.43333333333333335}{"sample_reasoning_agent": 0.3333333333333333, "sample_tool_agent": 0.3333333333333333, "aggreted_agent": 0.3}{"sample_reasoning_agent": 0.39166666666666666, "sample_tool_agent": 0.38333333333333336, "aggreted_agent": 0.375}{"sample_reasoning_agent": 0.09166666666666666, "sample_tool_agent": 0.075, "aggreted_agent": 0.09166666666666666}{"sample_reasoning_agent": 0.125, "sample_tool_agent": 0.06666666666666667, "aggreted_agent": 0.11666666666666667}{"sample_reasoning_agent": 0.1, "sample_tool_agent": 0.05, "aggreted_agent": 0.11666666666666667}{"sample_reasoning_agent": 0.5, "sample_tool_agent": 0.5333333333333333, "aggreted_agent": 0.5666666666666667}{"sample_reasoning_agent": 0.65, "sample_tool_agent": 0.6666666666666666, "aggreted_agent": 0.6}{"sample_reasoning_agent": 0.2, "sample_tool_agent": 0.2, "aggreted_agent": 0.16666666666666666}{"sample_reasoning_agent": 0.1, "sample_tool_agent": 0.2, "aggreted_agent": 0.13333333333333333}{"sample_reasoning_agent": 0.18333333333333332, "sample_tool_agent": 0.21666666666666667, "aggreted_agent": 0.15}{"sample_reasoning_agent": 0.18333333333333332, "sample_tool_agent": 0.21666666666666667, "aggreted_agent": 0.13333333333333333}{"sample_reasoning_agent": 1.3333333333333333, "sample_tool_agent": 1.1166666666666667, "aggreted_agent": 0.2833333333333333}{"sample_reasoning_agent": 0.6333333333333333, "sample_tool_agent": 0.48333333333333334, "aggreted_agent": 0.6833333333333333}{"sample_reasoning_agent": 1.3166666666666667, "sample_tool_agent": 1.4, "aggreted_agent": 0.7166666666666667}{"sample_reasoning_agent": 0.4166666666666667, "sample_tool_agent": 0.4666666666666667, "aggreted_agent": 0.26666666666666666}{"sample_reasoning_agent": 0.2, "sample_tool_agent": 0.26666666666666666, "aggreted_agent": 0.26666666666666666}{"sample_reasoning_agent": 0.15, "sample_tool_agent": 0.23333333333333334, "aggreted_agent": 0.3}{"sample_reasoning_agent": 0.11666666666666667, "sample_tool_agent": 0.06666666666666667, "aggreted_agent": 0.11666666666666667}{"code_generator": 0.0, "test_generator": 0.0}{"code_generator": 0.0, "test_generator": 0.0}{"code_generator": 0.0, "test_generator": 0.0}{"code_generator": 0.18857142857142858, "test_generator": 0.12571428571428572}{"code_generator": 0.17, "test_generator": 0.112}{"code_generator": 0.162, "test_generator": 0.09}{"code_generator": 0.030303030303030304, "test_generator": 0.030303030303030304}{"code_generator": 0.166, "test_generator": 0.11}{"code_generator": 0.048484848484848485, "test_generator": 0.03636363636363636}{"code_generator": 0.03636363636363636, "test_generator": 0.012121212121212121}{"code_generator": 0.03636363636363636, "test_generator": 0.01818181818181818}{"code_generator": 0.176, "test_generator": 0.122}{"code_generator": 0.07878787878787878, "test_generator": 0.030303030303030304}{"code_generator": 0.186, "test_generator": 0.142}{"sample_reasoning_agent": 0.13333333333333333, "sample_tool_agent": 0.06666666666666667, "aggreted_agent": 0.1}{"sample_reasoning_agent": 0.13333333333333333, "sample_tool_agent": 0.06666666666666667, "aggreted_agent": 0.13333333333333333}{"reasoning_agent": 0.13333333333333333, "tool_agent": 0.06666666666666667}{"reasoning_agent": 0.16666666666666666, "tool_agent": 0.13333333333333333}{"reasoning_agent": 0.16666666666666666, "tool_agent": 0.06666666666666667}{"sample_reasoning_agent": 0.16666666666666666, "sample_tool_agent": 0.08333333333333333, "aggreted_agent": 0.16666666666666666}{"reasoning_agent": 0.15, "tool_agent": 0.06666666666666667}{"reasoning_agent": 0.13333333333333333, "tool_agent": 0.06666666666666667}{"reasoning_agent": 0.16666666666666666, "tool_agent": 0.0}{"plan_agent": 0.0, "tool_call_agent": 0.0}{"reasoning_agent": 0.16666666666666666, "tool_agent": 0.1}{"timestamp":"2025-09-15 19:58:27","task":"code","benchmark":"apps","reasoning":false,"model":"/home/lah003/models/Qwen3-1.7B","max_turns":1}
{"plan_agent": 0.0, "tool_call_agent": 0.0}{"plan_agent": 0.69, "tool_call_agent": 0.69}{"plan_agent": 0.7, "tool_call_agent": 0.7}{"plan_agent": 0.56, "tool_call_agent": 0.82}{"plan_agent": 0.23, "tool_call_agent": 0.52}{"plan_agent": 0.04, "tool_call_agent": 0.04}{"plan_agent": 0.18, "tool_call_agent": 0.18}{"tool_call_agent": 0.09, "plan_agent": 0.19}{"plan_agent": 0.18}{"tool_call_agent": 0.0, "plan_agent": 0.33}{"plan_agent": 0.28}{"plan_agent": 0.29}{"tool_call_agent": 0.0, "plan_agent": 0.14}{"tool_call_agent": 0.0, "plan_agent": 0.15}{"tool_call_agent": 0.0, "plan_agent": 0.11}{"tool_call_agent": 0.0, "plan_agent": 0.12}{"plan_agent": 0.12}{"tool_call_agent": 0.0, "plan_agent": 0.73}{"plan_agent": 0.46}{"plan_agent": 0.46}{"plan_agent": 0.17}{"tool_call_agent": 0.0, "plan_agent": 0.72}{"plan_agent": 0.05}{"tool_call_agent": 0.0, "plan_agent": 0.06}{"tool_call_agent": 0.0, "plan_agent": 0.04}{"tool_call_agent": 0.0, "plan_agent": 0.05}{"tool_call_agent": 0.0, "plan_agent": 0.06}{"tool_call_agent": 0.0, "plan_agent": 0.05}{"tool_call_agent": 0.0, "plan_agent": 0.1}{"plan_agent": 0.1}{"plan_agent": 0.0}{"plan_agent": 0.0}{"plan_agent": 0.36}{"tool_call_agent": 0.0, "plan_agent": 0.79}{"plan_agent": 0.0}{"plan_agent": 0.72}{"plan_agent": 0.04}{"tool_call_agent": 0.0, "plan_agent": 0.01}{"tool_call_agent": 0.0, "plan_agent": 0.02}{"tool_call_agent": 0.0, "plan_agent": 0.04}{"tool_call_agent": 0.0, "plan_agent": 0.0}{"tool_call_agent": 0.0, "plan_agent": 0.01}{"tool_call_agent": 0.0, "plan_agent": 0.13}{"tool_call_agent": 0.0, "plan_agent": 0.62}{"plan_agent": 0.02}{"tool_call_agent": 0.0, "plan_agent": 0.34}{"plan_agent": 0.0}{"tool_call_agent": 0.0, "plan_agent": 0.01}{"tool_call_agent": 0.0, "plan_agent": 0.59}{"plan_agent": 0.25}{"plan_agent": 0.0}{"tool_call_agent": 0.0, "plan_agent": 0.4}