# %%
from datasets import load_dataset
lcb_codegen = load_dataset("livecodebench/code_generation_lite", version_tag="release_v6")
# %%
qs = {}
import base64
import zlib
import pickle
for u in lcb_codegen['test']:
    if u['difficulty'] != 'medium':
        continue
    qs.setdefault(u['difficulty'], []).append((u['question_content'], pickle.loads(zlib.decompress(base64.b64decode(u['private_test_cases'])))))

# %%
# %%
# %%
import os
from dotenv import load_dotenv
# init with /mnt/data/benchmarking/inspect_evals/.env
load_dotenv('/mnt/data/.env')
# %%
from functools import lru_cache

@lru_cache(maxsize=None)
def standardize_question(q_tuple):
    STATEMENT, TEST_CASES = q_tuple
    import json
    u1 = json.loads(TEST_CASES)
    rst = ''
    for idx,w in enumerate(u1):
        rst+=f'Test input {idx+1}:\n{w["input"]}\nTest output {idx+1}:\n{w["output"]}\n'
    TEST_CASES = rst.strip()
    import anthropic

    client = anthropic.Anthropic(
        api_key=os.environ.get("ANTHROPIC_API_KEY"),
    )

    msgs = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "The following is a competitive programming problem. You DONT NEED TO SOLVE IT but should standardize the task into human eval format.\n\n<statement>\nThere are three cards with letters $\\texttt{a}$, $\\texttt{b}$, $\\texttt{c}$ placed in a row in some order. You can do the following operation at most once: \n\n \n-  Pick two cards, and swap them.  Is it possible that the row becomes $\\texttt{abc}$ after the operation? Output \"YES\" if it is possible, and \"NO\" otherwise.\n\nInput\n\nThe first line contains a single integer $t$ ($1 \\leq t \\leq 6$) — the number of test cases.\n\nThe only line of each test case contains a single string consisting of each of the three characters $\\texttt{a}$, $\\texttt{b}$, and $\\texttt{c}$ exactly once, representing the cards.\n\nOutput\n\nFor each test case, output \"YES\" if you can make the row $\\texttt{abc}$ with at most one operation, or \"NO\" otherwise.\n\nYou can output the answer in any case (for example, the strings \"yEs\", \"yes\", \"Yes\" and \"YES\" will be recognized as a positive answer).Sample Input 1:\n6\n\nabc\n\nacb\n\nbac\n\nbca\n\ncab\n\ncba\n\n\n\nSample Output 1:\n\nYES\nYES\nYES\nNO\nNO\nYES\n\n\nNote\n\nIn the first test case, we don't need to do any operations, since the row is already $\\texttt{abc}$.\nIn the second test case, we can swap $\\texttt{c}$ and $\\texttt{b}$: $\\texttt{acb} \\to \\texttt{abc}$.\nIn the third test case, we can swap $\\texttt{b}$ and $\\texttt{a}$: $\\texttt{bac} \\to \\texttt{abc}$.\nIn the fourth test case, it is impossible to make $\\texttt{abc}$ using at most one operation.\n</statement>\n<test_cases>\nTest input 1:\n1\nabc\n\nTest output 1:\nYES\n\nTest input 2:\n3\nabc\nabc\nabc\n\nTest output 2:\nYES\nYES\nYES\n\nTest input 3:\n5\ncab\nacb\ncba\nbac\nbca\n\nTest output 3:\nNO\nYES\nYES\nYES\nNO\n\nTest input 4:\n6\nabc\nabc\nabc\nabc\nabc\nabc\n\nTest output 4:\nYES\nYES\nYES\nYES\nYES\nYES\n</test_cases>"
                    }
                ]
            },
            {
                "role": "assistant",
                "content": [
                    {
                        "type": "text",
                        "text": "Here's the standardized version:\n\n```\n<prompt>\ndef can_make_abc(s: str) -> bool:\n    \"\"\" Given a string of exactly three characters 'a', 'b', and 'c' (each appearing exactly once),\n    determine if it's possible to make the string \"abc\" with at most one swap operation.\n    A swap operation consists of picking two characters and swapping their positions.\n    \n    >>> can_make_abc('abc')\n    True\n    >>> can_make_abc('acb')\n    True\n    >>> can_make_abc('bca')\n    False\n    \"\"\"\n</prompt>\n<entry_point>\ncan_make_abc\n</entry_point>\n<test>\ndef check(candidate):\n    # Test case 1\n    assert candidate('abc') == True\n    \n    # Test case 2 (repeated 'abc')\n    assert candidate('abc') == True\n    \n    # Test case 3\n    assert candidate('cab') == False\n    assert candidate('acb') == True\n    assert candidate('cba') == True\n    assert candidate('bac') == True\n    assert candidate('bca') == False\n    \n    # Additional test cases\n    assert candidate('abc') == True\n</test>\n```"
                    }
                ]
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "The following is a competitive programming problem. You DONT NEED TO SOLVE IT but should standardize the task into human eval format.\n\n<statement>\nYou are given a string $s$ of length $n$, consisting of lowercase Latin letters, and an integer $k$.\n\nYou need to check if it is possible to remove exactly $k$ characters from the string $s$ in such a way that the remaining characters can be rearranged to form a palindrome. Note that you can reorder the remaining characters in any way.\n\nA palindrome is a string that reads the same forwards and backwards. For example, the strings \"z\", \"aaa\", \"aba\", \"abccba\" are palindromes, while the strings \"codeforces\", \"reality\", \"ab\" are not.\n\nInput\n\nEach test consists of multiple test cases. The first line contains a single integer $t$ ($1 \\leq t \\leq 10^4$) — the number of the test cases. This is followed by their description.\n\nThe first line of each test case contains two integers $n$ and $k$ ($0 \\leq k < n \\leq 10^5$) — the length of the string $s$ and the number of characters to be deleted.\n\nThe second line of each test case contains a string $s$ of length $n$, consisting of lowercase Latin letters.\n\nIt is guaranteed that the sum of $n$ over all test cases does not exceed $2 \\cdot 10^5$.\n\nOutput\n\nFor each test case, output \"YES\" if it is possible to remove exactly $k$ characters from the string $s$ in such a way that the remaining characters can be rearranged to form a palindrome, and \"NO\" otherwise.\n\nYou can output the answer in any case (uppercase or lowercase). For example, the strings \"yEs\", \"yes\", \"Yes\", and \"YES\" will be recognized as positive answers.Sample Input 1:\n14\n\n1 0\n\na\n\n2 0\n\nab\n\n2 1\n\nba\n\n3 1\n\nabb\n\n3 2\n\nabc\n\n6 2\n\nbacacd\n\n6 2\n\nfagbza\n\n6 2\n\nzwaafa\n\n7 2\n\ntaagaak\n\n14 3\n\nttrraakkttoorr\n\n5 3\n\ndebdb\n\n5 4\n\necadc\n\n5 3\n\ndebca\n\n5 3\n\nabaac\n\n\n\nSample Output 1:\n\nYES\nNO\nYES\nYES\nYES\nYES\nNO\nNO\nYES\nYES\nYES\nYES\nNO\nYES\n\n\nNote\n\nIn the first test case, nothing can be removed, and the string \"a\" is a palindrome.\n\nIn the second test case, nothing can be removed, but the strings \"ab\" and \"ba\" are not palindromes.\n\nIn the third test case, any character can be removed, and the resulting string will be a palindrome.\n\nIn the fourth test case, one occurrence of the character \"a\" can be removed, resulting in the string \"bb\", which is a palindrome.\n\nIn the sixth test case, one occurrence of the characters \"b\" and \"d\" can be removed, resulting in the string \"acac\", which can be rearranged to the string \"acca\".\n\nIn the ninth test case, one occurrence of the characters \"t\" and \"k\" can be removed, resulting in the string \"aagaa\", which is a palindrome.\n</statement>\n<test_cases>\nTest input 1:\n5\n10 3\naaabbbcccd\n10 1\naaabbccddd\n10 0\naaabbccddd\n10 9\nabcdefghij\n10 2\naabbccddee\n\nTest output 1:\nYES\nYES\nNO\nYES\nYES\nTest input 2:\n5\n10 5\naaabbbbccc\n10 5\naaabbbcccc\n10 4\naabbccddeeff\n11 3\naabbccddeeff\n10 8\naaabbbbccc\nTest output 2:\nYES\nYES\nYES\nYES\nYES\nTest input 3:\n5\n15 7\naaabbbcccdddeee\n15 8\naaabbccddeeffggh\n17 9\naaabbbcccdddeeeff\n21 10\naabbccddeeffgghhiijj\n20 0\naabbccddeeffgghhiijjkk\nTest output 3:\nYES\nYES\nYES\nYES\nYES\n</test_cases>"
                    }
                ]
            },
            {
                "role": "assistant",
                "content": [
                    {
                        "type": "text",
                        "text": "Here's the standardized version:\n\n```\n<prompt>\ndef can_form_palindrome_after_removal(n: int, k: int, s: str) -> bool:\n    \"\"\" Given a string s of length n consisting of lowercase Latin letters and an integer k,\n    determine if it's possible to remove exactly k characters from the string such that\n    the remaining characters can be rearranged to form a palindrome.\n    \n    A palindrome is a string that reads the same forwards and backwards.\n    \n    Args:\n        n: The length of the string (0 <= k < n <= 10^5)\n        k: The number of characters to remove\n        s: The input string of lowercase Latin letters\n    \n    Returns:\n        True if it's possible to remove k characters and rearrange the rest into a palindrome,\n        False otherwise.\n    \n    >>> can_form_palindrome_after_removal(1, 0, 'a')\n    True\n    >>> can_form_palindrome_after_removal(2, 0, 'ab')\n    False\n    >>> can_form_palindrome_after_removal(3, 2, 'abc')\n    True\n    \"\"\"\n</prompt>\n<entry_point>\ncan_form_palindrome_after_removal\n</entry_point>\n<test>\ndef check(candidate):\n    # Test cases from the problem\n    assert candidate(1, 0, 'a') == True\n    assert candidate(2, 0, 'ab') == False\n    assert candidate(2, 1, 'ba') == True\n    assert candidate(3, 1, 'abb') == True\n    assert candidate(3, 2, 'abc') == True\n    assert candidate(6, 2, 'bacacd') == True\n    assert candidate(6, 2, 'fagbza') == False\n    assert candidate(6, 2, 'zwaafa') == False\n    assert candidate(7, 2, 'taagaak') == True\n    assert candidate(14, 3, 'ttrraakkttoorr') == True\n    assert candidate(5, 3, 'debdb') == True\n    assert candidate(5, 4, 'ecadc') == True\n    assert candidate(5, 3, 'debca') == False\n    assert candidate(5, 3, 'abaac') == True\n    \n    # Additional test cases\n    assert candidate(10, 3, 'aaabbbcccd') == True\n    assert candidate(10, 1, 'aaabbccddd') == True\n    assert candidate(10, 0, 'aaabbccddd') == False\n    assert candidate(10, 9, 'abcdefghij') == True\n    assert candidate(10, 2, 'aabbccddee') == True\n    \n    assert candidate(10, 5, 'aaabbbbccc') == True\n    assert candidate(10, 5, 'aaabbbcccc') == True\n    assert candidate(12, 4, 'aabbccddeeff') == True\n    assert candidate(12, 3, 'aabbccddeeff') == True\n    assert candidate(10, 8, 'aaabbbbccc') == True\n    \n    assert candidate(15, 7, 'aaabbbcccdddeee') == True\n    assert candidate(16, 8, 'aaabbccddeeffggh') == True\n    assert candidate(17, 9, 'aaabbbcccdddeeeff') == True\n    assert candidate(20, 10, 'aabbccddeeffgghhiijj') == True\n    assert candidate(22, 0, 'aabbccddeeffgghhiijjkk') == True\n</test>\n```"
                    }
                ]
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f"The following is a competitive programming problem. You DONT NEED TO SOLVE IT but should standardize the task into human eval format.\n\n<statement>\n{STATEMENT}\n</statement>\n<test_cases>\n{TEST_CASES}\n</test_cases>"
                    }
                ]
            }
        ]
    # print(msgs[-1]['content'][0]['text'])

    ct = ""
    eof = False
    with client.messages.stream(
        model="claude-opus-4-1-20250805", 
        max_tokens=20000,
        temperature=1,
        system="You are an excellent data manipulator. You should NOT solve the competitive programming problem given to you and a solution from you is NOT required. Instead, your task is to standardize the given competitive programming problem to the standard human eval format. So instead of writing a complete program, the task will become completing a python function with spec (coming from the problem statement) and passing a unit test (coming from the sample input/output). The unit tests should reflect the TEST cases (the part between <test_cases></test_cases>). While leaving the essence of the problem intact, make the input/output as pythonic as possible (e.g. return boolean instead of \"YES\" or \"NO\" strings).\n\nThe following is an example human eval data point. You should standardize the task given to you like this:\n\n```\n<prompt>\nfrom typing import List, Optional\n\n\ndef longest(strings: List[str]) -> Optional[str]:\n    \"\"\" Out of list of strings, return the longest one. Return the first one in case of multiple\n    strings of the same length. Return None in case the input list is empty.\n    >>> longest(['a', 'b', 'c'])\n    'a'\n    >>> longest(['a', 'bb', 'ccc'])\n    'ccc'\n    \"\"\"\n</prompt>\n<entry_point>\nlongest\n</entry_point>\n<test>\ndef check(candidate):\n    assert candidate([]) == None\n    assert candidate(['x', 'y', 'z']) == 'x'\n    assert candidate(['x', 'yyy', 'zzzz', 'www', 'kkkk', 'abc']) == 'zzzz'\n</test>\n```",
        messages=msgs,
    ) as stream:
        for event in stream:
            if event.type == "text":
                ct += event.text
            elif event.type == "content_block_stop":
                eof = True
    try:
        # find the part between <prompt> and </prompt>
        prompt = ct.split('<prompt>')[1].split('</prompt>')[0].strip()
        test = ct.split('<test>')[1].split('</test>')[0].strip()
        entry_point = ct.split('<entry_point>')[1].split('</entry_point>')[0].strip()
        return prompt, test, entry_point
    except KeyboardInterrupt:
        raise KeyboardInterrupt
    except:
        return None, None, None

# %%
# # %%
# for u in qs['medium']:
#     if len(u[1]) > 1000:
#         continue
#     print(u[0])
#     print('-'*100)
#     import json
#     u1 = json.loads(u[1])
#     rst = ''
#     for idx,w in enumerate(u1):
#         rst+=f'Test input {idx+1}:\n{w["input"]}\nTest output {idx+1}:\n{w["output"]}\n'
#     break
# print(rst)
# %%
standardize_question(todo[0])
# %%
from multiprocessing import Pool
standardized = {}
from tqdm import tqdm
for k,v in qs.items():
    todo = [(u[0], u[1]) for u in v if len(u[1]) < 40000]
    print(f'{k}: {len(todo)}')
    with Pool(processes=20) as pool:
        standardized[k] = list(tqdm(pool.imap(standardize_question, todo), total=len(todo)))
# %%
def number_of_lines(s):
    return len(s.split('\n'))

cnt = 0
tests = []
for k in qs.keys():
    good_tests = [t for t in standardized[k] if t[1].count('assert candidate') >= 5]#[-30:]
    print(len(good_tests))
    tests.extend(good_tests)
    print(k, len(good_tests))

# %%
import json
with open('/mnt/data/benchmarking/inspect_evals/tests/humanevalplus/lcb_manytests.json','r') as f:
    previous_split = json.load(f)

with open('lcb_manytests_tmp.json', 'r') as f:
    all_tests = json.load(f)

def signature(s):
    u = map(lambda x:x.strip().replace(' ','').replace('\t','').replace('"','\''), s[1].split('\n')[1:])
    # find the longest one
    longest = max(u, key=len)
    return longest


def signature2(s):
    u = map(lambda x:x.strip().replace(' ','').replace('\t','').replace('"','\''), s[1].split('\n')[1:])
    u = [p for p in u if 'candidate' in p]
    # find the longest one
    longest = max(u, key=len)
    return longest

for a in previous_split:
    u = signature(a)
    same_signatures = []
    for t in all_tests:
        if signature(t) == u:
            same_signatures.append(t)
    if len(same_signatures) == 0:
        for t in all_tests:
            if signature2(t) == signature2(a):
                same_signatures.append(t)
        if len(same_signatures) == 0:
            for t in all_tests:
                if t[-1] == a[-1]:
                    same_signatures.append(t)
            if len(same_signatures) == 0:
                print('WARNING: one case not found')
                continue
            else:
                assert len(same_signatures) == 1
        else:
            assert len(same_signatures) == 1
    else:
        assert len(same_signatures) == 1
    
    # find that test and remove it
    del all_tests[all_tests.index(same_signatures[0])]
    assert len(same_signatures) == 1

print(f'now {len(all_tests)} tests left')


with open('lcb_manytests_split2.json', 'w') as f:
    json.dump(all_tests, f)

# %%
# %%
print(qs['hard'][-1][0])
print(qs['hard'][-1][1])
# %%
{k: len(v) for k, v in qs.items()}
# %%

