import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
import eval.run_utils
import epic.epics_vipergpt
import epic.imgpatch
import epic.conformal_utils
import string
import traceback
import time
from utils import (
    get_prog_filepaths_from_dirs,
)
import torch
import argparse

def check_correctness(pred_raw : str, gt : str) -> bool:
    """Return whether a prediction matches the ground-truth answer.

    The prediction is normalized by deleting every character in
    ``string.punctuation`` and lower-casing the rest. It counts as correct
    when the normalized text occurs as a substring of ``gt``.

    Args:
        pred_raw: Raw predicted answer, or ``None`` when no answer exists.
        gt: Ground-truth answer. It is compared as-is (not normalized here).

    Returns:
        ``True`` if the normalized prediction is non-blank and contained in
        ``gt``; ``False`` otherwise, including when ``pred_raw`` is ``None``.
    """
    if pred_raw is None:
        return False

    pred = pred_raw.translate(str.maketrans('', '', string.punctuation)).lower()
    # The empty string is a substring of every string, so a prediction that is
    # empty (or only punctuation / whitespace) would otherwise be scored as
    # correct for any answer.
    if not pred.strip():
        return False
    return pred in gt

# Global namespace handed to a program when it is executed concretely.
# The control-flow helper names a program may call (if_stmt, for_stmt,
# and_expr, or_expr, not_expr) are bound to the concrete implementations from
# eval.run_utils, and `len` is the ordinary builtin.
CONCRETE_GLOBALS = dict(
    __builtins__=__builtins__,
    ImagePatch=epic.imgpatch.ImagePatch,
    bool_to_yesno=epic.epics_vipergpt.bool_to_yesno,
    if_stmt=eval.run_utils.concrete_if_stmt,
    for_stmt=eval.run_utils.concrete_for_loop,
    and_expr=eval.run_utils.concrete_and_expr,
    or_expr=eval.run_utils.concrete_or_expr,
    not_expr=eval.run_utils.concrete_not_expr,
    len=len,
)

# Global namespace handed to a program when it is executed abstractly.
# It exposes the same names as the concrete namespace, but the helpers
# (bool_to_yesno, if_stmt, for_stmt, and_expr, or_expr, not_expr) and `len`
# are bound to the abstract implementations from epic.conformal_utils.
ABSTRACT_GLOBALS = dict(
    __builtins__=__builtins__,
    ImagePatch=epic.imgpatch.ImagePatch,
    bool_to_yesno=epic.conformal_utils.abstract_bool_to_yesno,
    if_stmt=epic.conformal_utils.abstract_if_stmt,
    for_stmt=epic.conformal_utils.abstract_for_loop,
    and_expr=epic.conformal_utils.abstract_and_expr,
    or_expr=epic.conformal_utils.abstract_or_expr,
    not_expr=epic.conformal_utils.abstract_not_expr,
    len=epic.conformal_utils.abstract_len,
)

def get_exec_func_from_source(py_code, filename, exec_globals):
    """Compile and run program source, then return its ``execute_command``.

    Args:
        py_code: Python source text of the program.
        filename: Name given to ``compile`` (it shows up in tracebacks).
        exec_globals: Mapping of names the program may reference. It is
            shallow-copied, so the caller's mapping is never modified.

    Returns:
        The object bound to ``execute_command`` in the program's namespace
        after execution, or ``None`` if no such name is bound.

    Raises:
        SyntaxError: If ``py_code`` cannot be compiled.
        Exception: Anything raised by the program's top-level statements.
    """
    code = compile(py_code, filename=filename, mode='exec')

    # Run in ONE namespace. When exec() is given distinct globals and locals
    # mappings, the code runs as if it were a class body: top-level helpers and
    # imports land in the locals mapping, while functions defined by the
    # program resolve free names through the globals mapping only, so
    # `execute_command` could not see them. A private copy keeps programs
    # isolated from each other and from the shared template mapping.
    namespace = dict(exec_globals)
    exec(code, namespace)

    return namespace.get("execute_command")

def get_image_for_problem(problem_id : str, imgpatch_impl):
    """Return the wrapped image that belongs to ``problem_id``.

    A new context is built for ``imgpatch_impl`` through ``make_context``
    (recording off, sync on, round tracking off). The problem's ``imageId`` is
    read from ``problem_lookup`` and the raw image is fetched from
    ``image_lookup``.

    NOTE(review): ``make_context`` and ``imgpatch`` are not defined in this
    file's visible code; presumably they come from
    ``from eval.run_utils import *`` -- confirm.

    Args:
        problem_id: Key into ``problem_lookup``.
        imgpatch_impl: ImagePatch implementation forwarded to ``make_context``.

    Returns:
        An ``imgpatch.WrappedImage`` built from the raw image, its id and the
        new context. The raw image is ``None`` when the id is missing from
        ``image_lookup`` (it is fetched with ``.get``).

    Raises:
        KeyError: If ``problem_id`` is not in ``problem_lookup``.
    """
    # The context is created before any lookup, as in the original ordering.
    ctx = make_context(imgpatch_impl, recording=False, sync=True, track_rounds=False)
    img_id = problem_lookup[problem_id]['imageId']
    raw_img = image_lookup.get(img_id)
    return imgpatch.WrappedImage(raw_img, img_id, ctx)








from eval.run_utils import *
import datasets

# Dataset split to evaluate; it selects both dataset configurations below.
split = "val"
# Load the images and the instructions of that split from "lmms-lab/GQA".
dataset_images = datasets.load_dataset("lmms-lab/GQA", f"{split}_all_images", split=split)
dataset_instructions = datasets.load_dataset("lmms-lab/GQA", f"{split}_all_instructions", split=split)
# Image id -> image, so a problem's image can be found through its `imageId`.
image_lookup = {item["id"]: item["image"] for item in dataset_images}

# TODO: this is a hack to make it faster because iterating over the whole dataset to construct `problem_image_lookup` is too slow
import random
# Fixed seed so the same subset of instructions is drawn on every run.
random.seed(2025)
num_sample=1000
# Draw a full random permutation of the instruction indices and keep the first
# `num_sample` of them.
# NOTE(review): presumably the hard-coded `all_problem_ids` below were derived
# from exactly this draw; replacing it with `random.sample(..., num_sample)` is
# not guaranteed to select the same items -- confirm before simplifying.
indices = random.sample(range(len(dataset_instructions)), len(dataset_instructions))[0:num_sample]

# problem_image_lookup = {item["id"]: item["imageId"] for item in dataset_instructions}
# problem_image_lookup = {dataset_instructions[i]["id"]: dataset_instructions[i]["imageId"] for i in indices}
# Problem id -> full instruction record, for the sampled instructions only.
problem_lookup = {dataset_instructions[i]["id"]: dataset_instructions[i] for i in indices}

# Root directory of the compiled programs; caches and results are written
# beneath it further down.
DIRNAME = "datasets/gqa/epic_compiled"
# Directory holding one `<problem_id>.prog` file per problem.
PROG_DIR = DIRNAME + "/" + "progs_set"
# File extension of a program file.
SUFFIX = ".prog"
# all_problem_ids = tuple(f[:-len(SUFFIX)] for f in sorted(os.listdir(PROG_DIR)) if f.endswith(SUFFIX))
all_problem_ids = ('001001646', '00101462', '001020371', '001038165', '00107169', '00143855', '00152304', '00156120', '00176221', '00176309', '00199193', '00254961', '00261092', '00284964', '00293722', '00293806', '00356746', '00360182', '00367003', '00379599', '00389748', '00389964', '00401691', '00405746', '00416702', '00424458', '00440588', '00503889', '00504782', '00555072', '0055573', '00556086', '00562705', '0056441', '00577966', '00590453', '00647392', '00715574', '00743842', '00748177', '00748373', '00760714', '00760770', '00785575', '00804615', '00815086', '00824591', '00828907', '00896006', '00930227', '00942340', '00963351', '01107368', '01124197', '01132492', '01150568', '01165336', '01174074', '01174259', '01174305', '01208909', '01208951', '01218801', '01233337', '01297404', '01297752', '01327729', '01327842', '01331977', '01389765', '01393433', '01428176', '01442320', '01516213', '01584590', '01588728', '01640595', '01642500', '01656844', '01697916', '01718316', '01721353', '01839662', '01860023', '01889048', '01906018', '01910341', '01914251', '01945440', '01965903', '01971076', '021019172', '021025766', '021055662', '021056851', '021056883', '021065186', '021067013', '02127162', '02135522', '0213762', '02142713', '02143592', '02245618', '02269486', '02269569', '02270', '02383469', '02394546', '02405003', '02420529', '02426970', '02482241', '02497668', '0250564', '02532009', '02540124', '02540163', '02588016', '0260624', '02630209', '02653851', '02665137', '026861', '02693712', '02733330', '02750337', '02759734', '02802268', '02813067', '02824706', '02833193', '02871249', '02932796', '02933787', '02943254', '02948812', '0297503', '02975338', '02992317', '02992326', '02995759', '031007887', '03101567', '031016923', '031051779', '031066474', '03135896', '03186804', '03189562', '03196282', '03211328', '0327032', '0328486', '03309190', '0331757', '03322641', '03328149', '0333063', '03343145', '03408415', '03411725', '03423301', '03434830', '03437300', 
'03437773', '03449462', '03452525', '03454011', '03459474', '03463786', '03472171', '03472635', '0347884', '03492618', '03510314', '0354338', '03581702', '03583650', '03585325', '03610101', '03657584', '03674787', '03681784', '03726645', '03726794', '03768153', '03769712', '03791320', '03799224', '03804539', '0380924', '03810100', '03868474', '03884138', '03929817', '041005653', '041019566', '041024152', '04115049', '04145820', '04204942', '04232771', '04232833', '04248989', '04250787', '04285473', '04293467', '04306972', '04314741', '04319228', '04326017', '04345874', '0434949', '04358740', '04388592', '04396592', '04422189', '04485413', '04485556', '04502826', '04580140', '0459777', '04604866', '04682769', '04705574', '04706831', '04724030', '04780885', '04784882', '0479719', '04800494', '04812598', '04816856', '04891747', '04914314', '04917636', '04917654', '04943845', '04978697', '04978763', '04978802', '04987397', '04993979', '051005507', '051013502', '051025447', '05103809', '05193833', '05199507', '05282638', '05287707', '05299947', '05334564', '05370915', '05440606', '05451633', '05469417', '05473425', '05479905', '05490141', '05504941', '05506731', '05536529', '05540057', '05568180', '05575300', '0559995', '05603095', '05629315', '05633378', '0564281', '05692031', '05698579', '05735854', '05755391', '05795046', '05831953', '05841695', '05847756', '05858055', '05865514', '05888551', '05888832', '05943487', '05980399', '0598284', '061020897', '061024820', '061026858', '061052487', '06127659', '06143752', '06146423', '06185315', '06235406', '06292255', '06318952', '06347223', '06360461', '06377829', '06408804', '06412755', '06418321', '0642680', '064329', '06481536', '06587419', '06588188', '0662462', '06626817', '06632546', '06636411', '066444', '0669387', '06707517', '06731049', '06735443', '06742802', '06757361', '06764898', '06783788', '06792803', '06804214', '06838099', '06850545', '06850563', '068662', '06877375', '06895863', '069153', '06932469', 
'06937620', '06991548', '06997742', '07101280', '071013097', '071028087', '071031262', '071052129', '071054969', '07164907', '07165029', '07181520', '07188473', '07188976', '07197887', '07202024', '07207832', '0728', '07329608', '07329771', '07333746', '07344073', '07357647', '0738071', '07382653', '07382916', '07395734', '07405880', '07405947', '07410767', '07460659', '07478752', '07488199', '07497123', '07512871', '07519009', '07526178', '07536718', '07559691', '07577514', '07594341', '0761044', '07636962', '07638774', '07646997', '07685531', '07692734', '0770904', '07749706', '07755336', '07810397', '07859767', '07866307', '07872677', '07903683', '07920705', '07929532', '07933944', '07936948', '07973193', '081005729', '081009482', '08125373', '08140068', '0815317', '08161642', '08174235', '08209628', '08237569', '08256074', '08256182', '08259984', '08267463', '08276708', '0829420', '08333670', '08337216', '0837422', '08378545', '0838335', '08435362', '08449268', '08475818', '08476155', '08476220', '08529294', '0857154', '08575906', '08577190', '08581535', '08610796', '08610806', '08615994', '08619148', '08619158', '08695773', '08720235', '08757327', '08768431', '08790159', '08792436', '08830532', '08835495', '08873652', '08879229', '08897351', '08899398', '08924843', '08924908', '0893606', '0893735', '08948268', '08988089', '08988137', '091001930', '09101700', '091027918', '091028133', '091041591', '091050556', '091057518', '09124769', '09124924', '09134792', '09199060', '09199130', '092049', '092098', '09212403', '09212432', '09226335', '09229325', '09300735', '09302271', '09344240', '09344381', '09396917', '09430418', '09441733', '09476403', '09483768', '09515276', '09530364', '09562080', '09587331', '09623059', '09623550', '09686881', '09692951', '09710328', '09832266', '09841755', '09852922', '09894558', '09899820', '09905092', '09924431', '09944623', '101008568', '101014569', '101024729', '101030732', '1011386', '10135232', '10150398', '10150546', 
'10150823', '10153712', '10165773', '10178615', '10192691', '10214071', '10230386', '10259997', '10308049', '10353188', '10389676', '10432915', '10458593', '10479250', '10486600', '10486665', '10508396', '10508688', '10513478', '10539382', '10584162', '10592175', '10611499', '10676614', '10692227', '10720308', '10741400', '10744121', '10764373', '10767803', '10768582', '10770595', '10801497', '10801530', '10805015', '10838970', '10844408', '10872843', '10882764', '10890070', '10905806', '10981496', '111008275', '111015622', '111025018', '111054010', '11109097', '11121038', '11121227', '1118161', '11203341', '11236596', '11245647', '11299178', '11314126', '1133952', '11363103', '11363175', '11401138', '11409224', '11409225', '11444832', '11486259', '11530117', '11534522', '11565556', '11565568', '11573766', '11594835', '11611905', '11720154', '1172521', '11727292', '11764778', '11770346', '11771404', '11774225', '11799683', '11804510', '11852614', '11867697', '11917832', '1194023', '11941135', '121017386', '121021962', '121021992', '121029646', '12103496', '121044397', '121052836', '12164486', '12169759', '12171064', '12171286', '1217325', '12185363', '12212616', '12227083', '12228880', '12252370', '12256753', '12256924', '12290384', '12311348', '12324608', '12337259', '12389585', '12411355', '12436517', '12441088', '12472636', '12476903', '12486515', '12495058', '12518376', '12529731', '12553060', '1257426', '12585423', '12603203', '12633750', '12636393', '12649238', '12690980', '12758543', '12790828', '12824060', '12877502', '12905233', '12909810', '12931055', '12954787', '12974992', '131031196', '131031525', '131059577', '1310873', '13113081', '13113217', '13141912', '13144507', '13179529', '132279', '13291187', '13325032', '13331044', '133471', '13353399', '1337994', '1338104', '13398710', '13450765', '13473899', '13487043', '13507820', '13521934', '13522148', '13529710', '13556131', '13582977', '136596', '13663000', '13696850', '13702614', '13704637', 
'13719653', '13727189', '13741169', '13766925', '13774892', '13840064', '13846435', '13846504', '1384933', '13875948', '13883858', '13923518', '13927632', '13936586', '13949889', '13950007', '13962983', '13979724', '141024629', '14106963', '14115968', '14119747', '1419252', '1419267', '14195473', '14205794', '14206304', '14223524', '1424854', '14251491', '14262965', '14297899', '14301457', '14304671', '14311558', '14340006', '1434946', '14387147', '14442309', '14475748', '14489777', '1452406', '14525345', '14532211', '14549527', '14578986', '14579294', '14647171', '14745026', '14785456', '14789116', '1481847', '14822292', '1483254', '1486397', '14864984', '14898989', '14908162', '14930739', '14976228', '15193443', '15193637', '15242502', '15249108', '15258960', '15259041', '15261485', '15306274', '15310897', '15318214', '15413824', '15431717', '15437224', '15438405', '1545498', '15470821', '15471829', '15483073', '15483268', '15515630', '15526251', '15551520', '15554869', '15576963', '15592733', '15602984', '15606156', '15651065', '15665711', '15706799', '15724210', '15724756', '15747513', '15778112', '15832980', '15835441', '15849923', '15927175', '1593100', '15940467', '161013322', '161055627', '161060217', '161063540', '161071096', '16136989', '16153538', '16159849', '16208156', '16235127', '16246504', '16269474', '16275194', '16281994', '16312790', '16313390', '16370837', '16380213', '164171', '16447757', '16455041', '1646096', '16488555', '16492622', '16542203', '16558405', '16558585', '1662756', '1665343', '16686358', '16687448', '16733176', '16742254', '16757763', '16758114', '16758304', '16760290', '16803075', '16807486', '16817529', '16821117', '16821681', '1683109', '16840722', '1684683', '16995790', '171031453', '171046347', '171049310', '17123790', '17162527', '17181283', '17200035', '17207391', '17217010', '17222973', '17237601', '17249841', '17297684', '17310003', '17312112', '17317049', '17331131', '17336192', '1734966', '17394673', '17394994', 
'17429704', '17467548', '17476070', '17504839', '17585888', '17590063', '17600884', '17601491', '17694457', '1771689', '17726713', '17726753', '17726765', '17728532', '17736673', '17746371', '17762139', '17769067', '17797237', '17809999', '17821134', '17836636', '17861379', '17863703', '17866597', '17898250', '17938437', '179386', '17997839', '17999679', '18100464', '181047301', '181047312', '181055777', '18107163', '18107209', '18113660', '18121950', '18168931', '18175401', '18233828', '18252544', '18273447', '18275514', '18297545', '18300566', '18304862', '18313030', '18317831', '18318055', '18348399', '18351748', '18370691', '18370752', '18382215', '1838787', '18393780', '18413080', '18444016', '18444446', '18445079', '18460470', '18469632', '18500862', '1854915', '18555100', '18561799', '18619104', '18619394', '18627587', '18686103', '18727510', '18798420', '18819245', '18831955', '1883427', '18858529', '18890972', '18967896', '18986427', '191007011', '191038901', '191040689', '19172633', '19176702', '19177345', '19180760', '19201214', '19218195', '19225595', '19284338', '19284471', '19308153', '19311551', '19352518', '19359523', '19380149', '19417718', '19453281', '19474834', '19505727', '19512919', '19629133', '1969521', '19738869', '19748811', '19840182', '19860538', '19867135', '19867194', '19892545', '19922594', '19929745', '19932207', '19933801', '19971173')

# n = len(all_problem_ids)
# random.seed(2026)
# perm = torch.randperm(n)
# val_indices = perm[:n//2]
# test_indices = perm[n//2:]
# val_indices = (798, 244, 121, 673, 389, 520, 846, 768, 789, 738, 601, 413,  50, 767, 379, 148,  51, 574,  36,  18, 727, 719, 718, 800, 496, 230, 814,  98, 695, 567, 270, 460, 829,  78, 430, 584,  87, 575, 649, 752, 684,  41, 694, 598,   4, 361, 844, 396, 394, 309, 232, 838, 111,  62, 141, 178,  86, 436, 509, 884, 238, 205, 943,  90, 867, 352, 258, 878, 921,  47, 790, 442, 134, 793, 377, 735, 209, 918, 194, 495, 114, 469, 354, 160, 783, 451, 866, 606, 794,  46, 715,  60, 776, 251, 402, 211,  69, 158, 621, 737, 864, 728, 886, 119, 346,  52, 437, 262, 823, 399, 803, 686, 893, 755, 334,  40, 905, 583, 421, 403, 321, 433, 594, 422, 471, 820, 812,  42, 458, 271, 517, 162, 770, 924, 214, 638, 476, 357, 857, 432, 848, 720, 169, 835, 576, 312,  70, 705, 204, 547, 636, 322,  79, 626, 624, 602, 294, 703, 356, 166, 751, 139, 318, 713, 551, 116, 700, 881, 543, 897, 784, 849, 566, 395, 834, 333, 414, 511, 191, 896, 603, 165, 132, 371, 291, 254,  56, 146, 632, 642, 189, 585, 782, 257, 456,  77, 117, 175, 651,  44, 152, 307, 934, 450,  73,  23, 381, 431, 112, 224, 633, 519, 351, 151, 338, 285, 513, 325, 137, 781, 554, 197, 483, 219, 655, 577,  92, 680, 630, 193, 218, 888,  85, 416, 815, 316, 404, 420, 665, 341, 122, 406,  35, 548, 118, 711, 685, 909, 860, 391, 328,  93, 349, 734, 465, 455, 872, 863, 187, 415, 120, 859,  31, 136, 828,  34, 393, 927, 938, 405, 541, 670,  49, 648, 140, 922, 359, 348, 792, 453,  39, 707, 101, 367, 571, 506, 510, 298, 252, 902, 777, 236,  24, 306, 923, 479, 851, 434, 170, 714,  63,   6, 235, 795, 619, 616, 103, 290, 498, 568, 597,  99, 880, 580,  64, 392, 637, 305, 288, 459, 540, 411, 373, 739, 297, 412, 572, 843,  61, 445, 928, 746, 693, 190, 944, 250, 149, 147, 408, 721, 179, 242, 491, 388, 611, 910, 425, 821, 159, 300,  96, 747, 722, 787, 329, 482, 237, 668, 645, 195, 245, 788, 401, 940, 164, 234, 586, 915, 744, 268, 698, 691, 756, 464, 207, 625, 801, 365, 558, 899,  97, 937, 497, 614, 726, 876, 185, 400, 903, 877, 629, 733, 131, 521, 457, 301, 
383, 201, 748, 942, 375, 850, 595, 724, 410, 524, 336, 376, 281, 631, 397,   7, 658, 293, 538, 287, 908, 563, 259, 549, 441, 463, 708, 641, 661,  58, 935, 634, 716, 150, 710, 468, 486, 213, 531, 688, 183, 253, 810, 579, 130, 344, 883,  10,  74, 612, 308, 173, 206, 310, 273, 289, 539,   3, 644, 778, 608, 743,  43, 188, 530, 643, 492, 772, 856, 502, 920, 562, 639, 202, 260, 808, 335)
# test_indices = (817, 407, 936, 822, 323, 374, 901, 811,   9, 108, 895, 593, 662, 588, 839,  72, 676, 656, 439, 512, 552, 438, 135, 565, 570, 654, 494, 424, 858, 144, 681, 870, 592, 809, 725, 663,  82, 666, 284, 535,  48, 561, 760, 500, 740, 514, 314, 317, 324, 757, 472, 925, 650, 845, 587, 932, 785, 462, 241, 277, 702, 805, 855, 669, 545, 156, 546, 339, 750, 659, 429, 217, 461, 869, 865, 319, 536, 765, 278, 504, 827, 364,  25, 706, 723, 824, 186, 221, 763, 610, 331, 831, 773, 350, 894, 493, 199, 804, 887,   1, 761, 679, 555, 261, 771,  14, 912, 184, 292,  17, 675, 753, 104, 556, 340, 557, 239, 613, 171, 518, 919, 248, 447, 759, 930, 418, 225, 380, 862, 503,  45, 709, 286, 664,  71, 842, 564, 825,  84, 337,  66, 560, 481,  15, 444, 387, 473, 142, 123, 360, 480, 467, 330, 931, 419, 428, 507, 283, 353, 107, 916, 246, 177, 926, 182, 917, 749, 889, 730,  16, 699, 106, 240, 775, 332, 168, 313, 678, 227, 358,  29, 622,  28, 779, 409, 653, 731, 426, 573,  83, 154, 802, 515, 155, 578, 385,  53, 109, 210, 320, 904, 758, 386, 796,  75, 729, 906, 275, 390, 914, 816, 343, 427, 692, 841,  27, 704, 623, 891, 505, 216, 475, 591, 525, 875, 852, 466, 873, 582,  22, 742, 868,  94, 766, 220, 590,  55,  11, 362, 529, 378, 470, 861, 161, 100, 223, 454, 696, 478, 677, 303, 311, 488, 208, 600, 279, 501, 754, 532, 799, 553, 264, 885, 226,  12, 153, 133, 138, 127, 355, 826, 452,  20, 674, 900, 372, 523, 607, 200, 832,  80, 745, 176, 807, 296, 929, 542, 672, 533, 269, 508, 180, 327, 933, 605,  68, 440, 741, 617, 769,  37, 477, 181, 840, 128,  19, 145, 105, 687, 907, 599,   5, 326, 871, 255, 717, 818, 589, 443, 382, 569, 526, 274, 833, 370, 347, 615, 596, 423, 484, 689, 657, 203, 913, 345, 295, 231, 898, 157, 892, 874, 646, 628,  57,  67, 222, 192, 115, 233,  30, 647,  95, 212, 682,  33,  59,  88, 581, 847, 620, 806, 449, 448, 879, 635, 534, 946, 697, 229, 266, 249,  21, 174, 516, 247, 172, 701, 683, 853,  32, 315,  81,   2,  76,  38, 527, 384, 780, 522, 228, 102, 363, 125,   0, 369, 609, 
854, 499, 774, 786, 263,  26, 945, 474, 485, 911, 537, 671, 813, 446, 830, 652, 368, 764, 640, 215, 712, 198, 267, 302, 762, 819, 282, 836, 487, 890, 143, 366, 163, 124, 690, 265, 113, 276, 791, 797, 280, 299, 398, 435, 550, 490, 272, 304, 882, 544, 660, 732,   8, 941, 196,  13, 167, 618, 243,  89, 939, 667,  91,  54, 559, 110, 417, 837, 736, 256, 489, 627,  65, 604, 129, 126, 342, 528)
# OLD_val_indices = (397, 796, 640, 271, 391, 351, 687, 388, 207, 409, 559, 158, 655, 784, 826, 123,  99, 253, 219, 883, 177, 942, 903, 507, 919, 646, 466, 121, 264, 895, 857,  93, 363, 472, 727, 162, 496, 154, 301,  58, 144, 350, 830, 418, 716, 673, 552,  83, 155, 785, 336, 394, 516, 820, 642, 304, 359, 741, 124, 821, 436,  44, 710, 216, 451, 468, 519,  29, 691, 578, 876, 183, 246, 174, 146, 849, 379, 835, 230, 759, 116, 679, 750,  53, 240, 786, 495, 635, 539, 248, 531, 879, 871, 550, 781,  47, 532, 898, 512, 169)
# OLD_test_indices = (914, 594,   4, 838, 305,  77, 928, 267, 752, 697, 884, 182, 711, 624, 273, 563, 887, 818, 395, 474, 330, 140,   2, 171, 777, 916, 194, 524, 156, 678, 376, 242, 411, 538, 413,  16,  13, 433, 901, 274, 233, 754, 415, 877, 536, 505,   7, 454, 408,  76, 707, 658, 588, 657, 831, 275, 530, 484, 503, 852, 927, 295, 758, 294, 647,  78,  57, 601, 520, 132, 756, 153, 704, 521, 453, 540, 890, 494, 744, 562, 870, 787, 930, 819, 109, 455, 285, 373, 719,  69, 320, 859, 847, 316, 573,  55, 945, 390, 168, 214)


# Command-line interface. Arguments are parsed at module level because the
# threshold configures module-level state further down.
parser = argparse.ArgumentParser(
    description="Execute each program concretely and abstractly and record the results."
)
parser.add_argument(
    '-s', '--split',
    required=True,
    # Still required so existing invocations keep working.
    help="Split label. Accepted for compatibility; the visible code does not read it.",
)
parser.add_argument(
    '-t', '--thresh',
    required=True,
    # Let argparse validate the number: a malformed value now produces a usage
    # error instead of an uncaught ValueError from a later float() call.
    type=float,
    help="Global threshold multiplier applied to the conformal reference points.",
)
args = parser.parse_args()

# Multiplier used by scale_down / scale_up and in the results directory name.
GLOBAL_THRESHOLD = args.thresh
# SPLIT = args.split
# GLOBAL_THRESHOLD = 1
# GLOBAL_THRESHOLD = 0.5
# GLOBAL_THRESHOLD = 0.25
# GLOBAL_THRESHOLD = 0.125
# SPLIT = "val"
# SPLIT = "test"


# split_indices = val_indices if SPLIT == "val" else test_indices
# split_indices = val_indices if SPLIT == "val" else test_indices
# split_ids = tuple(all_problem_ids[i] for i in split_indices)

def scale_down(reference_point, threshold=None):
    """Scale a reference point toward 0 by a multiplicative threshold.

    Args:
        reference_point: Base value to scale.
        threshold: Multiplier to apply. When ``None`` (the default), the
            module-level ``GLOBAL_THRESHOLD`` is used, which is the behavior
            existing single-argument callers rely on.

    Returns:
        ``reference_point * threshold``.
    """
    if threshold is None:
        threshold = GLOBAL_THRESHOLD
    return reference_point * threshold

def scale_up(reference_point, threshold=None):
    """Scale a reference point toward 1 by a multiplicative threshold.

    The distance between ``reference_point`` and 1 is multiplied by the
    threshold, so a threshold of 1 leaves the point unchanged and a threshold
    of 0 yields 1.

    Args:
        reference_point: Base value to scale.
        threshold: Multiplier to apply. When ``None`` (the default), the
            module-level ``GLOBAL_THRESHOLD`` is used, which is the behavior
            existing single-argument callers rely on.

    Returns:
        ``1 - (1 - reference_point) * threshold``.
    """
    if threshold is None:
        threshold = GLOBAL_THRESHOLD
    return 1 - ((1 - reference_point) * threshold)

# Thresholds installed on the imgpatch_conformal module. Each value is a fixed
# reference point passed through scale_up / scale_down, both of which apply
# GLOBAL_THRESHOLD.
_CONFORMAL_THRESHOLDS = {
    "ABSTRACT_EXISTS_UPPER": scale_up(0.75),
    "ABSTRACT_EXISTS_LOWER": scale_down(0.25),
    "ABSTRACT_FIND_HIGH": scale_up(0.5),
    "ABSTRACT_FIND_LOW": scale_down(0.1),
    "ABSTRACT_SIMPLE_QUERY_THRESHOLD": scale_down(0.5),
    "ABSTRACT_VERIFYPROP_UPPER": scale_up(0.75),
    "ABSTRACT_VERIFYPROP_LOWER": scale_down(0.25),
}
for _threshold_name, _threshold_value in _CONFORMAL_THRESHOLDS.items():
    setattr(imgpatch_conformal, _threshold_name, _threshold_value)

# Cache directory used by imgpatch_conformal; created up front if missing.
imgpatch_conformal.CACHE_DIR = DIRNAME + "/conformal_cache"
os.makedirs(imgpatch_conformal.CACHE_DIR, exist_ok=True)

# Results are written to a directory named after the threshold in use.
EVAL_DIRNAME = f"{DIRNAME}/conformal_exec_{GLOBAL_THRESHOLD}"
os.makedirs(EVAL_DIRNAME, exist_ok=True)

# filepaths = get_prog_filepaths_from_dirs([f"{DIRNAME}/progs_set"])

# Imported explicitly so writing results does not depend on
# `from eval.run_utils import *` happening to re-export the json module.
import json

# Main evaluation loop: run every program once concretely and once abstractly,
# then write one JSON result file per problem. Problems are visited in a
# time-seeded random order, so the order differs between runs.
# NOTE(review): imgpatch_4o_all and imgpatch_conformal are not defined in this
# file's visible code; presumably they come from the eval.run_utils star
# import -- confirm.
split_ids_shuffled = list(all_problem_ids)
random.seed(int(time.time_ns()))
random.shuffle(split_ids_shuffled)
for i, problem_id in enumerate(split_ids_shuffled):
    print(f"[{i}/{len(all_problem_ids)}] Processing: {problem_id}")
    # Built from the shared PROG_DIR / SUFFIX constants rather than a second
    # copy of the same path literal.
    with open(f"{PROG_DIR}/{problem_id}{SUFFIX}", "r") as f:
        prog = f.read()

    answer : str = problem_lookup[problem_id]['answer']

    exec_filepath = os.path.join(EVAL_DIRNAME, f"{problem_id}.json")
    err_filepath = os.path.join(EVAL_DIRNAME, f"{problem_id}.err")

    image_concrete = get_image_for_problem(problem_id, imgpatch_4o_all)
    image_abstract = get_image_for_problem(problem_id, imgpatch_conformal)
    concrete_f = get_exec_func_from_source(prog, problem_id, CONCRETE_GLOBALS)
    abstract_f = get_exec_func_from_source(prog, problem_id, ABSTRACT_GLOBALS)

    # --- Concrete run: on any failure, record the traceback and skip. ---
    try:
        print("==== Eval Concrete ====")
        t_c_start = time.perf_counter()
        result_concrete = concrete_f(image_concrete)
        t_c_end = time.perf_counter()
    except Exception as e:
        print(f"Failed on {problem_id}")

        with open(err_filepath, "w") as f:
            f.write("CONCRETE ERROR\n" + "\n".join(traceback.format_exception(e)))
        traceback.print_exc()
        continue

    correct_concrete = check_correctness(result_concrete, answer)

    # --- Abstract run. ---
    try:
        print("==== Eval Abstract ====")
        t_a_start = time.perf_counter()
        result_abstract = abstract_f(image_abstract)
        t_a_end = time.perf_counter()
    except imgpatch_conformal.ConformalModelCacheConflict:
        # A cache conflict is deliberately not swallowed: it aborts the run.
        # Bare `raise` keeps the original traceback intact.
        raise
    except Exception as e:
        print(f"Failed on {problem_id}")

        with open(err_filepath, "w") as f:
            f.write("\n".join(traceback.format_exception(e)))
        traceback.print_exc()
        print(str(e))
        # Every other failure is skipped. The previous per-message if/elif
        # chain ended in `continue` on every branch, so it is collapsed here.
        continue

    print("==== Results ====")
    print("answer", answer)
    print("result_concrete", result_concrete)
    print("result_abstract", result_abstract)

    # Exact type checks are kept on purpose (not isinstance) so subclasses are
    # treated exactly as before.
    if type(result_abstract) is epic.conformal_utils.AbstractOther:
        # A set of possible answers: score each one and keep the distinct
        # outcomes, so the result can hold True, False or both.
        result_abstract_tuple = tuple(result_abstract._possibilities)
        correct_abstract = tuple({check_correctness(r, answer) for r in result_abstract_tuple})
    else:
        assert type(result_abstract) is not epic.conformal_utils.AbstractTuple
        result_abstract_tuple = result_abstract
        correct_abstract = (check_correctness(result_abstract, answer),)

    print("correct_concrete", correct_concrete)
    print("correct_abstract", correct_abstract)

    # Serialize before opening the output file: if a result is not JSON
    # serializable, the error is raised without leaving a truncated result
    # file behind. The text written is the same as json.dump would produce.
    payload = json.dumps({
        "concrete_result": result_concrete,
        "abstract_result": result_abstract_tuple,
        "gt": answer,
        "concrete_correct": correct_concrete,
        "abstract_correct": correct_abstract,
        "concrete_time": t_c_end - t_c_start,
        "abstract_time": t_a_end - t_a_start,
    })
    with open(exec_filepath, "w") as f:
        f.write(payload)