import abc
import argparse
import base64
import datetime
import io
import json
import os
from pathlib import Path
import re
from string import Template
from typing import Literal

from PIL import Image
import ale_bench
from ale_bench.data import Problem, ScoreType
from ale_bench.result import CaseResult, JudgeResult, Result
from ale_bench.utils import parse_statement
import tqdm


# Constants
# Directory one level above the directory that contains this file.
EXP_ROOT_DIR = Path(__file__).resolve().parents[1]

# Human-readable name (with toolchain version) of every submission language,
# keyed by the language ID used for `--code_language` and for evaluation calls.
CODE_LANGUAGE_STRING = {
    "cpp20": "C++20 (gcc 12.2.0)",
    "python": "Python (CPython 3.11.4)",
    "rust": "Rust (rustc 1.70.0)",
}
# External libraries the model is told it may use, keyed by language ID.
# Each value is a pre-formatted bullet list that `create_initial_message`
# inserts verbatim into the implementation prompt.
CODE_LANGUAGE_LIBRARIES = {
    "cpp20": """- AC Library@1.5.1
- Boost@1.82.0
- GMP@6.2.1
- Eigen@3.4.0-2ubuntu2""",
    "python": """- numpy==1.24.1
- scipy==1.10.1
- networkx==3.0
- sympy==1.11.1
- sortedcontainers==2.4.0
- more-itertools==9.0.0
- shapely==2.0.0
- bitarray==2.6.2
- PuLP==2.7.0
- mpmath==1.2.1
- pandas==1.5.2
- z3-solver==4.12.1.0
- scikit-learn==1.2.0
- ortools==9.5.2237
- ac-library-python
- setuptools==66.0.0
- cppyy==2.4.1
- torch==1.13.1
- polars==0.15.15
- lightgbm==3.3.1
- gmpy2==2.1.5
- numba==0.57.0""",
    "rust": """- ac-library-rs@=0.1.1
- once_cell@=1.18.0
- static_assertions@=1.1.0
- varisat@=0.2.2
- memoise@=0.3.2
- argio@=0.2.0
- bitvec@=1.0.1
- counter@=0.5.7
- hashbag@=0.1.11
- pathfinding@=4.3.0
- recur-fn@=2.2.0
- indexing@=0.4.1
- amplify@=3.14.2
- amplify_derive@=2.11.3
- amplify_num@=0.4.1
- easy-ext@=1.0.1
- multimap@=0.9.0
- btreemultimap@=0.1.1
- bstr@=1.6.0
- az@=1.2.1
- glidesort@=0.1.2
- tap@=1.0.1
- omniswap@=0.1.0
- multiversion@=0.7.2
- num@=0.4.1
- num-bigint@=0.4.3
- num-complex@=0.4.3
- num-integer@=0.1.45
- num-iter@=0.1.43
- num-rational@=0.4.1
- num-traits@=0.2.15
- num-derive@=0.4.0
- ndarray@=0.15.6
- nalgebra@=0.32.3
- alga@=0.9.3
- libm@=0.2.7
- rand@=0.8.5
- getrandom@=0.2.10
- rand_chacha@=0.3.1
- rand_core@=0.6.4
- rand_hc@=0.3.2
- rand_pcg@=0.3.1
- rand_distr@=0.4.3
- petgraph@=0.6.3
- indexmap@=2.0.0
- regex@=1.9.1
- lazy_static@=1.4.0
- ordered-float@=3.7.0
- ascii@=1.1.0
- permutohedron@=0.2.4
- superslice@=1.0.0
- itertools@=0.11.0
- itertools-num@=0.1.3
- maplit@=1.0.2
- either@=1.8.1
- im-rc@=15.1.0
- fixedbitset@=0.4.2
- bitset-fixed@=0.1.0
- proconio@=0.4.5
- text_io@=0.1.12
- rustc-hash@=1.1.0
- smallvec@=1.11.0""",
}
# Fence tag written after the opening ``` for each language ID.
# "markdown" is not a submission language; it is the fence used for summaries.
CODE_BLOCK_LANGUAGE_NAME = dict(cpp20="cpp", python="python", rust="rust", markdown="md")
# Example fenced block shown to the model for each language ID.
CODE_BLOCK_STRING = {
    lang: f"```{CODE_BLOCK_LANGUAGE_NAME[lang]}\n{placeholder}\n```"
    for lang, placeholder in (
        ("cpp20", "// Your code here"),
        ("python", "# Your code here"),
        ("rust", "// Your code here"),
        ("markdown", "Your summary here"),
    )
}
# Compiled pattern capturing the body of a fenced block for each language ID.
# DOTALL lets the non-greedy group span multiple lines.
CODE_BLOCK_MATCH = {
    lang: re.compile(rf"```{fence}\n(.+?)\n```", re.DOTALL)
    for lang, fence in CODE_BLOCK_LANGUAGE_NAME.items()
}

# System prompt text, keyed by prompt language ("en" / "ja").
# NOTE(review): not referenced elsewhere in this file; presumably passed to
# `BaseLLM(system_prompt=...)` by the concrete runner scripts — confirm.
SYSTEM_PROMPT = {
    "en": (
        "You are a world-class algorithm engineer, and you are very good at programming. "
        "Now, you are participating in a programming contest. "
        "You are asked to solve a heuristic problem, known as an NP-hard problem."
    ),
    "ja": (
        "あなたは世界トップクラスのアルゴリズムエンジニアであり、プログラミングがとても得意です。"
        "今、あなたはプログラミングコンテストに参加しています。"
        "あなたはNP困難問題として知られるヒューリスティック問題を解くよう求められています。"
    ),
}

# The four tables below are the pieces `create_initial_message` concatenates,
# each keyed by prompt language ("en" / "ja").

# Opening instruction: analyse the problem statement found at the end of the message.
CONSIDERATION_PROMPT = {
    "en": (
        "There is a problem statement at the end of this message. "
        "First, please analyze the problem statement. "
        "Please think about the essential points of the problem and possible algorithms to get higher rank in the contest. "
    ),
    "ja": (
        "このメッセージの最後に問題文があります。"
        "まず、問題文を分析してください。"
        "問題の本質的なポイントと、コンテストでより高い順位を得る可能性のあるアルゴリズムについて考えてください。"
    ),
}
# Implementation request used when `--code_language any` is selected.
# Placeholders: ${language_strings}, ${code_blocks}, ${libraries}.
IMPLEMENTATION_ANY_PROMPT = {
    "en": Template(
        "Next, please implement your solution in any of the following languages: ${language_strings}. "
        "Your solution code should be written in the specified code block as follows:\n${code_blocks}\n"
        "You can use external libraries for each language. "
        "The available libraries are as follows:\n${libraries}\n\n"
    ),
    "ja": Template(
        "続いて、次のいずれかの言語で解法を実装してください: ${language_strings}。"
        "解法コードは、次のように指定されたコードブロックに記述してください:\n${code_blocks}\n"
        "各言語で外部ライブラリを使用できます。"
        "使用可能なライブラリは次の通りです:\n${libraries}\n\n"
    ),
}
# Implementation request used when one specific language is selected.
# Placeholders: ${language}, ${code_block}, ${libraries}.
IMPLEMENTATION_SPECIFIC_PROMPT = {
    "en": Template(
        "Next, please implement your solution in ${language}. "
        "Your solution code should be written in the ${code_block} code block. "
        "You can use external libraries as follows:\n${libraries}\n\n"
    ),
    "ja": Template(
        "続いて、${language}で解法を実装してください。"
        "解法コードは、${code_block}コードブロックに記述してください。"
        "使用可能なライブラリは次の通りです:\n${libraries}\n\n"
    ),
}
# Header placed directly before the problem statement.
# Placeholders: ${time_limit} and ${memory_limit}; callers in this file pass
# `problem.constraints.memory_limit // 1024 // 1024` for the latter.
PROBLEM_HEADER_PROMPT = {
    "en": Template(
        "[Problem statement]\n"
        "Execution time limit: ${time_limit} sec / Memory limit: ${memory_limit} MB\n"
    ),
    "ja": Template(
        "[問題文]\n"
        "実行時間制限: ${time_limit} sec / メモリ制限: ${memory_limit} MB\n"
    ),
}

# Retry prompts built by `no_code_block_message` when no usable code block was
# found in the model's response; keyed by prompt language ("en" / "ja").

# Variant for `--code_language any`. Placeholders: ${language_strings}, ${code_blocks}.
NO_CODE_BLOCK_ANY_PROMPT = {
    "en": Template(
        "No valid code block found. "
        "Please implement your solution in any of the following languages: ${language_strings}. "
        "Your solution code should be written in the specified code block as follows:\n${code_blocks}"
    ),
    "ja": Template(
        "有効なコードブロックが見つかりませんでした。"
        "次のいずれかの言語で解法を実装してください: ${language_strings}。"
        "解法コードは、次のように指定されたコードブロックに記述してください:\n${code_blocks}"
    ),
}
# Variant for one specific language. Placeholders: ${language}, ${code_block}.
NO_CODE_BLOCK_SPECIFIC_PROMPT = {
    "en": Template(
        "No valid code block found. "
        "Please implement your solution in ${language}. "
        "Your solution code should be written in the ${code_block} code block."
    ),
    "ja": Template(
        "有効なコードブロックが見つかりませんでした。"
        "${language}で解法を実装してください。"
        "解法コードは、${code_block}コードブロックに記述してください。"
    ),
}

# Prompts for the refinement rounds; all keyed by prompt language ("en" / "ja").

# Feedback request used by `create_feedback_message` (conversation kept in one
# thread). Placeholder: ${feedback}.
FEEDBACK_PROMPT = {
    "en": Template(
        "${feedback}\n\n"
        "Based on the above feedback, please consider the ways to improve your solution. "
        "Firstly, please analyze this given feedback and list what insights can be gained from it. "
        "Then, based on the insights, please refine your code to achieve better performance. "
        "It can be a simple bug fix, the introduction of a new algorithm, or any degree of change from minor to major. "
    ),
    "ja": Template(
        "${feedback}\n\n"
        "上記のフィードバックをもとに、解法を改善する方法を考えてください。"
        "まず、このフィードバックを分析し、そこから得られる洞察を列挙してください。"
        "次に、その洞察に基づいて、より良いパフォーマンスを達成するためにコードを改良してください。"
        "単純なバグ修正、新しいアルゴリズムの導入、または小さな変更から大きな変更まで、どの程度の変更でも構いません。"
    ),
}
# Feedback request used by `create_feedback_message_with_summary`; it carries the
# running summary plus the best and latest submissions instead of chat history.
# Placeholders: ${action_summary}, ${best_code}, ${best_feedback}, ${latest_code},
# ${latest_feedback}, ${summary_code_block}.
FEEDBACK_PROMPT_WITH_SUMMARY = {
    "en": Template(
        "\n\n[Summary of your previous attempts]\n"
        "${action_summary}\n\n"
        "[Your best submission]\n"
        "### Code\n"
        "${best_code}\n\n"
        "### Feedback\n"
        "${best_feedback}\n\n"
        "[Your latest submission]\n"
        "### Code\n"
        "${latest_code}\n\n"
        "### Feedback\n"
        "${latest_feedback}\n\n"
        "Based on the above feedback, please consider the ways to improve your solution. "
        "Firstly, please analyze this given feedback and list what insights can be gained from it. "
        "Apart from that, please create a new summary including the content of the summary of your previous attempts in Markdown format in the ${summary_code_block} code block. "
        "If this code block in this format is not found, the summary of your previous attempts will not be input in the next turn. "
        "Then, based on the insights, please refine your code to achieve better performance. "
        "It can be a simple bug fix, the introduction of a new algorithm, or any degree of change from minor to major. "
    ),
    "ja": Template(
        "\n\n[あなたの過去の試行の要約]\n"
        "${action_summary}\n\n"
        "[あなたのベスト提出]\n"
        "### コード\n"
        "${best_code}\n\n"
        "### フィードバック\n"
        "${best_feedback}\n\n"
        "[あなたの最新の提出]\n"
        "### コード\n"
        "${latest_code}\n\n"
        "### フィードバック\n"
        "${latest_feedback}\n\n"
        "上記のフィードバックをもとに、解法を改善する方法を考えてください。"
        "まず、このフィードバックを分析し、そこから得られる洞察を列挙してください。"
        "またそれとは別に、これまでの試行の要約の内容も含めた新しい要約をMarkdown形式で${summary_code_block}コードブロックに記述してください。"
        "この様式のコードブロックが見つからない場合、次のターンにおけるあなたの過去の試行の要約は入力されません。"
        "次に、その洞察に基づいて、より良いパフォーマンスを達成するためにコードを改良してください。"
        "単純なバグ修正、新しいアルゴリズムの導入、または小さな変更から大きな変更まで、どの程度の変更でも構いません。"
    ),
}
# Closing instruction appended to a feedback prompt when `--code_language any`
# is selected. Placeholder: ${code_blocks}.
REFINE_ANY_PROMPT = {
    "en": Template("Your solution code should be written in the specified code block as follows:\n${code_blocks}"),
    "ja": Template("解法コードは、次のように指定されたコードブロックに記述してください:\n${code_blocks}"),
}
# Closing instruction appended to a feedback prompt for one specific language.
# Placeholder: ${code_block}.
REFINE_SPECIFIC_PROMPT = {
    "en": Template("Your solution code should be written in the ${code_block} code block."),
    "ja": Template("解法コードは、${code_block}コードブロックに記述してください。"),
}
# Text substituted for ${action_summary} by `create_feedback_message_with_summary`
# when the previous response contained no summary block.
NO_SUMMARY_PROMPT = {
    "en": "Your summary was not found. The summary must be written in the Markdown format in the ```md\n<!-- Your summary here -->\n``` code block.",
    "ja": "あなたの要約は見つかりませんでした。要約は必ずMarkdown形式で```md\n<!-- Your summary here -->\n```コードブロックに記述してください。",
}


# Custom JSON encoder / decoder
class CustomJSONEncoder(json.JSONEncoder):
    """JSON encoder that writes otherwise unsupported objects as tagged strings.

    - ``bytes`` become ``"BYTES_OBJECT_BASE64:<base64>"``.
    - ``datetime.datetime`` becomes ``"DATETIME_OBJECT:<isoformat>"``.
    - ``PIL.Image.Image`` becomes ``"PIL_IMAGE_OBJECT_BASE64:<base64 of the PNG>"``.

    Any other unsupported object is delegated to ``json.JSONEncoder.default``.
    """

    def default(self, obj):
        """Return the tagged-string form of *obj*, or defer to the base class."""
        if isinstance(obj, bytes):
            payload = base64.b64encode(obj).decode("ascii")
            return f"BYTES_OBJECT_BASE64:{payload}"
        if isinstance(obj, datetime.datetime):
            return f"DATETIME_OBJECT:{obj.isoformat()}"
        if isinstance(obj, Image.Image):
            # Serialize the image as PNG into memory, then base64 the raw bytes.
            with io.BytesIO() as png_stream:
                obj.save(png_stream, "png")
                payload = base64.b64encode(png_stream.getvalue()).decode("ascii")
            return f"PIL_IMAGE_OBJECT_BASE64:{payload}"
        return super().default(obj)


class CustomJSONDecoder(json.JSONDecoder):
    """JSON decoder that restores the tagged strings written by `CustomJSONEncoder`.

    Strings starting with ``BYTES_OBJECT_BASE64:``, ``DATETIME_OBJECT:`` or
    ``PIL_IMAGE_OBJECT_BASE64:`` are converted back into ``bytes``,
    ``datetime.datetime`` and ``PIL.Image.Image`` objects respectively.

    The encoder tags such values wherever they occur, so restoration is applied
    to dict values, to elements of (nested) lists, and to a non-dict top-level
    document. Restoring dict values alone would leave tagged strings inside
    lists untouched.
    """

    _BYTES_TAG = "BYTES_OBJECT_BASE64:"
    _DATETIME_TAG = "DATETIME_OBJECT:"
    _IMAGE_TAG = "PIL_IMAGE_OBJECT_BASE64:"

    def __init__(self, *args, **kwargs):
        # As before, passing `object_hook` explicitly is a TypeError: the hook is
        # always this class's own.
        super().__init__(*args, object_hook=self.object_hook, **kwargs)

    def decode(self, s, *args, **kwargs):
        """Decode *s*, also restoring tagged strings outside of any dict."""
        # Dicts were already handled by `object_hook`; `_restore` leaves them alone.
        return self._restore(super().decode(s, *args, **kwargs))

    def object_hook(self, obj):
        """Restore the tagged values of one decoded JSON object, in place."""
        # Only existing keys are reassigned, so mutating during iteration is safe.
        for key, value in obj.items():
            obj[key] = self._restore(value)
        return obj

    @classmethod
    def _restore(cls, value):
        """Return *value* with tagged strings replaced by the objects they encode."""
        if isinstance(value, list):
            # Nested dicts were already processed by their own `object_hook` call
            # and pass through unchanged.
            return [cls._restore(item) for item in value]
        if not isinstance(value, str):
            return value
        if value.startswith(cls._BYTES_TAG):
            return base64.b64decode(value.removeprefix(cls._BYTES_TAG))
        if value.startswith(cls._DATETIME_TAG):
            return datetime.datetime.fromisoformat(value.removeprefix(cls._DATETIME_TAG))
        if value.startswith(cls._IMAGE_TAG):
            image_data = base64.b64decode(value.removeprefix(cls._IMAGE_TAG))
            return Image.open(io.BytesIO(image_data))
        return value


# Abstract LLM class
class BaseLLM(abc.ABC):
    """Abstract interface for the chat-model client driven by `main_loop`.

    Attributes set by the constructor:
        system_prompt: System prompt text, or ``None``.
        log_dir: Log directory as a ``Path``; created on construction.
        num_retry: Retry budget for subclasses (not used by this base class).
        messages: Initially empty list for subclasses to store the conversation.
    """

    def __init__(
        self,
        system_prompt: str | None,
        log_dir: str | os.PathLike,
        num_retry: int = 5,
    ) -> None:
        """Store the settings and make sure *log_dir* exists."""
        log_path = Path(log_dir)
        log_path.mkdir(parents=True, exist_ok=True)
        self.system_prompt = system_prompt
        self.log_dir = log_path
        self.num_retry = num_retry
        self.messages = []

    @abc.abstractmethod
    def send_user_message(self, contents: list[str | Image.Image]) -> None:
        """Send *contents* as the next user message.

        `main_loop` reads the reply afterwards through `get_last_response`.
        """

    @abc.abstractmethod
    def send_user_message_new_thread(self, contents: list[str | Image.Image]) -> None:
        """Send *contents* as a user message in a new thread.

        NOTE(review): `main_loop` uses this in summarize mode with a message that
        repeats the problem statement, so implementations presumably discard the
        previous conversation — confirm against the concrete subclasses.
        """

    @abc.abstractmethod
    def get_last_response(self) -> str:
        """Return the text of the most recent model response."""

    @abc.abstractmethod
    def load_history(self, file_path: str | os.PathLike) -> None:
        """Load the conversation history from *file_path*."""

    @abc.abstractmethod
    def save_history(self, file_path: str | os.PathLike) -> None:
        """Save the conversation history to *file_path*."""


# Time limit handler
class TimeUp(Exception):
    """Signals that the scheduled deadline has passed (raised by `check_time_limit`)."""


def check_time_limit(end_time: datetime.datetime) -> None:
    """Raise `TimeUp` if the current time is later than *end_time*.

    The current time is taken in *end_time*'s own time zone (naive when
    *end_time* is naive), so the two values are always comparable.
    """
    current_time = datetime.datetime.now(tz=end_time.tzinfo)
    if current_time <= end_time:
        return
    raise TimeUp("Time up!")


# Command line arguments
class BaseNamespace(argparse.Namespace):
    code_language: Literal["any", "cpp20", "python", "rust"]
    duration: float | None
    exp_dir: Path
    first_accept: bool
    lite_version: bool
    num_codes: int
    num_no_code_patience: int
    num_workers: int
    problem_id: str
    prompt_language: Literal["en", "ja"]
    summarize: bool
    timer_margin_minutes: float
    use_image: bool


def get_common_argument_parser(description: str) -> argparse.ArgumentParser:
    """Build the argument parser holding the options shared by the runner scripts.

    Args:
        description: Text used as the parser's description.

    Returns:
        A parser whose parsed attributes match the fields of `BaseNamespace`.
    """
    # (flag, add_argument keyword arguments), in registration order.
    option_table = [
        ("--code_language", dict(type=str, choices=["any", "cpp20", "python", "rust"], required=True, help="Language of the code to run. If you set 'any', the model can choose any of the three languages.")),
        ("--duration", dict(type=float, default=None, help="Duration in hours to run the ALE-Bench. If you set this, the value must be greater than 1/30 (2 minutes).")),
        ("--exp_dir", dict(type=Path, required=True, help="Directory to save the experiment results.")),
        ("--first_accept", dict(action="store_true", help="Finish the run when the first code is accepted by public evaluation.")),
        ("--lite_version", dict(action="store_true", help="Use the lite version of ALE-Bench.")),
        ("--num_codes", dict(type=int, default=-1, help="Number of codes to generate. If -1, there is no limit.")),
        ("--num_no_code_patience", dict(type=int, default=0, help="Number of no code patience to stop the loop.")),
        ("--num_workers", dict(type=int, default=1, help="Number of workers to use. This is used for parallel processing of running the input cases.")),
        ("--problem_id", dict(type=str, required=True, help="Problem ID to run.")),
        ("--prompt_language", dict(type=str, choices=["en", "ja"], default="en", help="Language of the prompt.")),
        ("--summarize", dict(action="store_true", help="Summarize the history to reduce the size of the prompt.")),
        ("--timer_margin_minutes", dict(type=float, default=5.0, help="Margin in minutes to schedule the time limit.")),
        ("--use_image", dict(action="store_true", help="Use images in the problem statement.")),
    ]
    parser = argparse.ArgumentParser(description=description)
    for flag, options in option_table:
        parser.add_argument(flag, **options)
    return parser


# ALE-Bench
def merge_text_contents(contents: list[str | Image.Image]) -> list[str | Image.Image]:
    """Concatenate each run of consecutive strings, keeping images as separators.

    Args:
        contents: Message parts in order; every item is a string or a PIL image.

    Returns:
        A new list in which adjacent strings are joined into one string and
        images keep their positions. Text runs that are empty after joining are
        dropped. An empty input yields an empty list.

    Raises:
        ValueError: If an item is neither a string nor a PIL image.
    """
    merged_contents: list[str | Image.Image] = []
    pending_text: list[str] = []

    def flush_pending_text() -> None:
        # Emit the accumulated text run unless it is empty.
        text = "".join(pending_text)
        pending_text.clear()
        if text != "":
            merged_contents.append(text)

    # Every item, including the first, goes through the same type dispatch, so
    # an empty list or a leading image no longer breaks the accumulator.
    for content in contents:
        if isinstance(content, str):
            pending_text.append(content)
        elif isinstance(content, Image.Image):
            flush_pending_text()
            merged_contents.append(content)
        else:
            raise ValueError(f"Invalid content type: {type(content)}")
    flush_pending_text()
    return merged_contents


def create_initial_message(args: BaseNamespace, problem: Problem) -> list[str | Image.Image]:
    """Build the first user message: instructions followed by the problem statement.

    Args:
        args: Parsed options; `prompt_language`, `code_language` and `use_image`
            are read.
        problem: The problem whose constraints and statement are embedded.

    Returns:
        The message parts with adjacent strings merged by `merge_text_contents`.
        Images appear only when `args.use_image` is set.
    """
    prompt_language = args.prompt_language
    contents = [CONSIDERATION_PROMPT[prompt_language]]
    if args.code_language == "any":
        # Iterate the submission languages, not CODE_BLOCK_STRING: that table also
        # has a "markdown" entry with no counterpart in CODE_LANGUAGE_STRING, which
        # used to raise KeyError here.
        contents.append(IMPLEMENTATION_ANY_PROMPT[prompt_language].substitute(
            language_strings=", ".join(CODE_LANGUAGE_STRING.values()),
            code_blocks="\n".join(
                f"- {name}: {CODE_BLOCK_STRING[lang]}" for lang, name in CODE_LANGUAGE_STRING.items()
            ),
            libraries="\n".join(
                f"[{CODE_LANGUAGE_STRING[lang]}]\n{lib}" for lang, lib in CODE_LANGUAGE_LIBRARIES.items()
            ),
        ))
    else:
        contents.append(IMPLEMENTATION_SPECIFIC_PROMPT[prompt_language].substitute(
            language=CODE_LANGUAGE_STRING[args.code_language],
            code_block=CODE_BLOCK_STRING[args.code_language],
            libraries=CODE_LANGUAGE_LIBRARIES[args.code_language],
        ))
    contents.append(PROBLEM_HEADER_PROMPT[prompt_language].substitute(
        time_limit=problem.constraints.time_limit,
        memory_limit=problem.constraints.memory_limit // 1024 // 1024,
    ))
    statement = problem.statement if prompt_language == "en" else problem.statement_ja
    if args.use_image:
        # `parse_statement` yields the statement parts including the images.
        contents.extend(parse_statement(statement, problem.statement_images))
    else:
        contents.append(statement)
    return merge_text_contents(contents)


def no_code_block_message(args: BaseNamespace) -> str:
    """Return the retry prompt sent when a response contained no usable code block.

    Args:
        args: Parsed options; `prompt_language` and `code_language` are read.
    """
    if args.code_language == "any":
        # Iterate the submission languages, not CODE_BLOCK_STRING: that table also
        # has a "markdown" entry with no counterpart in CODE_LANGUAGE_STRING, which
        # used to raise KeyError here.
        return NO_CODE_BLOCK_ANY_PROMPT[args.prompt_language].substitute(
            language_strings=", ".join(CODE_LANGUAGE_STRING.values()),
            code_blocks="\n".join(
                f"- {name}: {CODE_BLOCK_STRING[lang]}" for lang, name in CODE_LANGUAGE_STRING.items()
            ),
        )
    return NO_CODE_BLOCK_SPECIFIC_PROMPT[args.prompt_language].substitute(
        language=CODE_LANGUAGE_STRING[args.code_language],
        code_block=CODE_BLOCK_STRING[args.code_language],
    )


def case_result_feedback(case_idx: int, case_result: CaseResult) -> str:
    """Format the detailed report of a single test case.

    Args:
        case_idx: Case number printed in the heading.
        case_result: Result whose score, execution time, memory usage, standard
            error and message are reported.

    Returns:
        A six-line string; detail lines are indented by four spaces and the
        memory usage is floor-divided by 1024 twice before being shown as MB.
    """
    memory_mb = case_result.memory_usage // 1024 // 1024
    report_lines = [
        f"- Case {case_idx}:",
        f"    Absolute score: {case_result.absolute_score}",
        f"    Execution time: {case_result.execution_time:.3f} sec",
        f"    Memory usage: {memory_mb} MB",
        f'    Standard error: "{case_result.error_str}"',
        f'    Message: "{case_result.message}"',
    ]
    return "\n".join(report_lines)


def result_feedback(result: Result) -> str:
    """Render the public evaluation result as feedback text for the model.

    For an accepted result the overall score and every per-case score are
    listed. Otherwise only one case is reported in detail: the first case whose
    judge result equals the overall judge result, or the first case when none
    matches.
    """
    overall = result.overall_judge_result
    header = f"[Public test result]\nOverall judge result: {overall.value}\n"
    if overall == JudgeResult.ACCEPTED:
        score_line = f"Overall absolute score: {result.overall_absolute_score}\n"
        per_case_lines = "\n".join(
            f"- Case {number}: {case.absolute_score}"
            for number, case in enumerate(result.case_results, 1)
        )
        return header + score_line + per_case_lines
    # Zero-based position of the case to report; defaults to the first case.
    reported = next(
        (position for position, case in enumerate(result.case_results) if case.judge_result == overall),
        0,
    )
    return header + case_result_feedback(reported + 1, result.case_results[reported])


def create_feedback_message(args: BaseNamespace, public_result: Result) -> list[str | Image.Image]:
    """Build the refinement request that follows a public evaluation.

    Args:
        args: Parsed options; `prompt_language` and `code_language` are read.
        public_result: Result rendered through `result_feedback`.

    Returns:
        A single-element list holding the feedback prompt followed by the
        code-block instruction.
    """
    feedback_text = FEEDBACK_PROMPT[args.prompt_language].substitute(feedback=result_feedback(public_result))
    if args.code_language == "any":
        # Iterate the submission languages, not CODE_BLOCK_STRING: that table also
        # has a "markdown" entry with no counterpart in CODE_LANGUAGE_STRING, which
        # used to raise KeyError here.
        refine_text = REFINE_ANY_PROMPT[args.prompt_language].substitute(
            code_blocks="\n".join(
                f"- {name}: {CODE_BLOCK_STRING[lang]}" for lang, name in CODE_LANGUAGE_STRING.items()
            ),
        )
    else:
        refine_text = REFINE_SPECIFIC_PROMPT[args.prompt_language].substitute(
            code_block=CODE_BLOCK_STRING[args.code_language],
        )
    return [feedback_text + refine_text]


def create_feedback_message_with_summary(
    args: BaseNamespace, problem: Problem, codes_history: list[tuple[Result | None, str, str]], last_summary: str | None,
) -> list[str | Image.Image]:
    """Build a self-contained refinement request for summarize mode.

    The message repeats the problem statement, then shows the running summary,
    the best submission so far and the latest submission, so it can be sent
    without the earlier conversation.

    Args:
        args: Parsed options; `prompt_language`, `code_language`, `use_image`
            and `first_accept` are read.
        problem: Problem whose constraints, statement and score type are used.
        codes_history: `(result, code_language, code)` entries, oldest first.
            Must not be empty: the last entry is taken as the latest submission.
        last_summary: Summary from the previous response, or ``None`` when none
            was found (the `NO_SUMMARY_PROMPT` text is shown instead).

    Returns:
        The message parts with adjacent strings merged by `merge_text_contents`.
    """
    prompt_language = args.prompt_language
    no_submission_text = "You have not submitted any code yet."

    best_result, best_code_language, best_code = select_submission_code(codes_history, problem.metadata.score_type, args.first_accept)
    best_feedback = result_feedback(best_result) if best_result is not None else no_submission_text
    latest_result, latest_code_language, latest_code = codes_history[-1]
    latest_feedback = result_feedback(latest_result) if latest_result is not None else no_submission_text
    # When the latest code is also the best one, it is not repeated.
    latest_is_best = latest_code == best_code

    contents = [PROBLEM_HEADER_PROMPT[prompt_language].substitute(
        time_limit=problem.constraints.time_limit,
        memory_limit=problem.constraints.memory_limit // 1024 // 1024,
    )]
    statement = problem.statement if prompt_language == "en" else problem.statement_ja
    if args.use_image:
        contents.extend(parse_statement(statement, problem.statement_images))
    else:
        contents.append(statement)
    contents.append(
        FEEDBACK_PROMPT_WITH_SUMMARY[prompt_language].substitute(
            action_summary=last_summary if last_summary is not None else NO_SUMMARY_PROMPT[prompt_language],
            best_code=f"```{CODE_BLOCK_LANGUAGE_NAME[best_code_language]}\n{best_code}\n```",
            best_feedback=best_feedback,
            latest_code=(
                "The latest code is the same as the best code."
                if latest_is_best else
                f"```{CODE_BLOCK_LANGUAGE_NAME[latest_code_language]}\n{latest_code}\n```"
            ),
            latest_feedback=(
                "The latest feedback is the same as the best feedback."
                if latest_is_best else
                latest_feedback
            ),
            summary_code_block=CODE_BLOCK_STRING["markdown"],
        )
    )
    if args.code_language == "any":
        # Iterate the submission languages, not CODE_BLOCK_STRING: that table also
        # has a "markdown" entry with no counterpart in CODE_LANGUAGE_STRING, which
        # used to raise KeyError here.
        contents.append(REFINE_ANY_PROMPT[prompt_language].substitute(
            code_blocks="\n".join(
                f"- {name}: {CODE_BLOCK_STRING[lang]}" for lang, name in CODE_LANGUAGE_STRING.items()
            ),
        ))
    else:
        contents.append(REFINE_SPECIFIC_PROMPT[prompt_language].substitute(
            code_block=CODE_BLOCK_STRING[args.code_language],
        ))
    return merge_text_contents(contents)


def get_code_from_response(response: str, code_language: str) -> tuple[str, str]:
    """Extract the submitted code from a model response.

    Args:
        response: Full text of the model response.
        code_language: A key of `CODE_BLOCK_MATCH`, or ``"any"`` to accept any
            submission language.

    Returns:
        ``(language, code)`` for the last fenced block of the requested language.
        With ``"any"`` the submission languages are tried in the order of
        `CODE_LANGUAGE_STRING` and the first one with a block wins.
        ``("", "")`` when no matching block exists.
    """
    if code_language == "any":
        # Only real submission languages are candidates. The "markdown" pattern
        # in CODE_BLOCK_MATCH matches summary blocks, which must never be
        # returned as code.
        candidate_languages = list(CODE_LANGUAGE_STRING)
    elif code_language in CODE_BLOCK_MATCH:
        candidate_languages = [code_language]
    else:
        candidate_languages = []
    for lang in candidate_languages:
        blocks = CODE_BLOCK_MATCH[lang].findall(response)
        if blocks:
            return lang, blocks[-1]  # The last block of this language
    return "", ""


def select_submission_code(codes_history: list[tuple[Result | None, str, str]], score_type: ScoreType, first_accept: bool) -> tuple[Result | None, str, str]:
    """Choose which recorded submission to treat as the best one.

    Args:
        codes_history: `(result, code_language, code)` entries, oldest first.
        score_type: Whether a larger or a smaller overall absolute score is better.
        first_accept: When true, the earliest accepted entry is returned.

    Returns:
        ``(None, "", "")`` for an empty history. Otherwise the earliest accepted
        entry (if `first_accept`), else the accepted entry with the strictly best
        score (the earliest one on ties), else the last entry of the history.
    """
    if not codes_history:
        return None, "", ""  # Nothing has been submitted
    # Starting thresholds: an accepted score must beat these to be selected.
    if score_type == ScoreType.MAXIMIZE:
        threshold = -1
    else:
        threshold = 1000000000000000000
    chosen_result, chosen_language, chosen_code = None, None, None
    for result, code_language, code in codes_history:
        if result is None or result.overall_judge_result != JudgeResult.ACCEPTED:
            continue
        if first_accept:
            return result, code_language, code
        is_better = (
            (score_type == ScoreType.MAXIMIZE and result.overall_absolute_score > threshold)
            or (score_type == ScoreType.MINIMIZE and result.overall_absolute_score < threshold)
        )
        if is_better:
            threshold = result.overall_absolute_score
            chosen_result, chosen_language, chosen_code = result, code_language, code
    if chosen_language is None or chosen_code is None:
        return codes_history[-1]  # No accepted submission: fall back to the latest
    return chosen_result, chosen_language, chosen_code


def replace_details_in_case_result(result: Result) -> Result:
    """Return a copy of *result* that keeps full details for at most one case.

    Input, output, standard error and local visualization are kept only for the
    first case whose judge result equals the overall judge result, and only when
    the overall result is not ACCEPTED. For every other case those four fields
    are set to ``None``; the remaining fields are copied unchanged.
    """
    # True until the single detailed case has been emitted.
    details_pending = result.overall_judge_result != JudgeResult.ACCEPTED
    slimmed_cases = []
    for case in result.case_results:
        keep_details = details_pending and case.judge_result == result.overall_judge_result
        slimmed_cases.append(CaseResult(
            input_str=case.input_str if keep_details else None,
            output_str=case.output_str if keep_details else None,
            error_str=case.error_str if keep_details else None,
            judge_result=case.judge_result,
            message=case.message,
            absolute_score=case.absolute_score,
            relative_score=case.relative_score,
            local_visualization=case.local_visualization if keep_details else None,
            execution_time=case.execution_time,
            memory_usage=case.memory_usage,
        ))
        if keep_details:
            details_pending = False
    return Result(
        allow_score_non_ac=result.allow_score_non_ac,
        resource_usage=result.resource_usage,
        case_results=slimmed_cases,
    )


def main_loop(args: BaseNamespace, llm: BaseLLM) -> None:
    """Run one ALE-Bench session: generate, evaluate and refine code, then submit.

    Steps:
        1. Start the session and compute the internal deadline (session end
           minus `args.timer_margin_minutes`).
        2. Send the initial message, extract a code block (re-asking up to
           `args.num_no_code_patience` times) and run the public evaluation.
        3. Repeat feedback -> new code -> public evaluation until the code
           budget, the deadline, an error, or (with `args.first_accept`) the
           first accepted submission ends the loop.
        4. Run the private evaluation on the submission picked by
           `select_submission_code` and write logs, results and histories into
           `args.exp_dir`.

    Errors inside steps 2-3 and inside the private evaluation are written to the
    log file instead of being propagated, so the artifacts are still saved.

    Args:
        args: Parsed command-line options.
        llm: Chat-model client used for all requests.

    Raises:
        ValueError: If `args.duration` (hours) does not exceed the timer margin.
    """
    # In order to avoid not having enough time to run the ALE-Bench
    if args.duration is not None and args.duration * 60 <= args.timer_margin_minutes:
        raise ValueError(f"Duration must be at least greater than timer_margin_minutes ({args.timer_margin_minutes}).")

    num_codes = args.num_codes
    if num_codes == -1:
        num_codes = 1000000000  # Set a large number to avoid stopping the loop
    # The result slot is None between extracting a code and finishing its evaluation.
    codes_history: list[tuple[Result | None, str, str]] = []
    summary_history: list[str | None] = []
    args.exp_dir.mkdir(parents=True, exist_ok=True)
    log_file = args.exp_dir / f"log_{args.problem_id}.txt"

    def log(message: str) -> None:
        # Append one line to the log file and close the handle right away.
        with log_file.open("a") as stream:
            print(message, file=stream)

    def dump_json(payload, path: Path) -> None:
        # CustomJSONEncoder is only consulted for values the default encoder
        # rejects (bytes, datetimes, PIL images), so plain payloads are unchanged.
        with path.open("w") as stream:
            json.dump(payload, stream, cls=CustomJSONEncoder)

    # Start the ALE-Bench session
    ale_bench_session = ale_bench.start(
        problem_id=args.problem_id,
        lite_version=args.lite_version,
        session_duration=datetime.timedelta(hours=args.duration) if args.duration is not None else None,
        num_workers=args.num_workers,
        run_visualization_server=False,
    )
    pbar = tqdm.tqdm(
        total=num_codes,
        desc=f"Running {args.problem_id}",
        unit="code",
        bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} codes",
    )

    # Schedule the time limit
    end_at = ale_bench_session.session_started_at
    if args.duration is None:
        end_at += ale_bench_session.problem.metadata.duration
    else:
        end_at += min(datetime.timedelta(hours=args.duration), ale_bench_session.problem.metadata.duration)
    # The margin is needed because the deadline is only checked between steps:
    # a running step (especially an LLM API call) is not interrupted.
    timer_at = end_at - datetime.timedelta(minutes=args.timer_margin_minutes)

    try:
        # Initial message
        check_time_limit(timer_at)
        llm.send_user_message(create_initial_message(args, ale_bench_session.problem))
        # Parse the response to extract the code
        code_language, code = get_code_from_response(llm.get_last_response(), args.code_language)
        for _ in range(args.num_no_code_patience):
            if code_language != "":
                break
            check_time_limit(timer_at)
            llm.send_user_message([no_code_block_message(args)])
            code_language, code = get_code_from_response(llm.get_last_response(), args.code_language)
        if code_language == "":
            raise ValueError("No code found in the response.")
        codes_history.append((None, code_language, code))
        num_codes -= 1
        pbar.update(1)
        # Public evaluation
        public_result = replace_details_in_case_result(ale_bench_session.public_eval(code, code_language))
        codes_history[-1] = (public_result, code_language, code)  # Update the history with the result
        if public_result.overall_judge_result == JudgeResult.ACCEPTED and args.first_accept:
            num_codes = 0  # Stop the loop if the first accepted code is found when `first_accept` is True
        last_summary = "Your first submission was done and you need to start logging your attempts from now on."
        summary_history.append(last_summary)

        # Feedback loop
        for _ in range(num_codes):
            # Send the feedback message and refine the code
            check_time_limit(timer_at)
            if args.summarize:
                llm.send_user_message_new_thread(create_feedback_message_with_summary(
                    args, ale_bench_session.problem, codes_history, last_summary
                ))  # Problem statement and current situation
            else:
                llm.send_user_message(create_feedback_message(args, public_result))
            # Parse the response to extract the code. As in the initial step, the
            # first response is always parsed and up to `num_no_code_patience`
            # re-asks follow, each of whose replies is parsed as well. Looping
            # only `num_no_code_patience` times never parsed anything at the
            # default patience of 0 and ignored the reply to the last re-ask.
            code_language, code = "", ""
            current_summary = None
            for attempt in range(args.num_no_code_patience + 1):
                last_response = llm.get_last_response()
                code_language, code = get_code_from_response(last_response, args.code_language)
                summary_block = CODE_BLOCK_MATCH["markdown"].findall(last_response)
                if len(summary_block) > 0:
                    # Keep the newest summary seen in this round: a reply to a
                    # re-ask may contain only the code block, without a summary.
                    current_summary = summary_block[-1]
                if code_language != "" or attempt == args.num_no_code_patience:
                    break
                check_time_limit(timer_at)
                llm.send_user_message([no_code_block_message(args)])
            if code_language == "":
                raise ValueError("No code found in the response.")
            last_summary = current_summary
            codes_history.append((None, code_language, code))
            summary_history.append(last_summary)
            pbar.update(1)
            # Public evaluation
            public_result = replace_details_in_case_result(ale_bench_session.public_eval(code, code_language))
            codes_history[-1] = (public_result, code_language, code)
            if public_result.overall_judge_result == JudgeResult.ACCEPTED and args.first_accept:
                break  # Stop the loop if the first accepted code is found when `first_accept` is True
    except TimeUp:
        log(f"[{args.problem_id}] Time is up! Stating the private evaluation.")
    except Exception as e:
        # Best effort: whatever was collected so far is still submitted and saved.
        log(f"[{args.problem_id}] Error: {e}")
    finally:
        pbar.close()

    # Private evaluation
    _, submission_code_language, submission_code = select_submission_code(
        codes_history, ale_bench_session.problem.metadata.score_type, args.first_accept
    )
    try:
        private_result, rank, performance = ale_bench_session.private_eval(submission_code, submission_code_language)
        log(f"[{args.problem_id}] Rank: {rank}, Performance: {performance}")
        # Save the result
        dump_json({
            "problem_id": args.problem_id, "rank": rank, "performance": performance, "private_result": private_result.model_dump(),
        }, args.exp_dir / f"private_result_{args.problem_id}.json")
    except Exception as e:
        # `Exception` rather than a bare `except`, so KeyboardInterrupt/SystemExit
        # propagate; the cause is logged next to the original message.
        log(f"[{args.problem_id}] Error: Private evaluation failed. ({args.problem_id})")
        log(f"[{args.problem_id}] Error: {e}")
    ale_bench_session.save(args.exp_dir / f"session_{args.problem_id}.json")
    dump_json([
        [None if r is None else r.model_dump(), cl, c] for r, cl, c in codes_history
    ], args.exp_dir / f"codes_history_{args.problem_id}.json")
    if args.summarize:
        dump_json(summary_history, args.exp_dir / f"summary_history_{args.problem_id}.json")
    llm.save_history(args.exp_dir / f"llm_history_{args.problem_id}.json")
