import random
import re
import ast
from typing import List, Tuple, Optional
import json
import pandas as pd
import backoff
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaForCausalLM, LlamaTokenizer
import openai
import os
from openai import OpenAI, OpenAIError
from ollama import Client as OllamaClient
import tiktoken
from typing import Dict, Any, Tuple, List, Optional

import time
import copy
import traceback
# ===================== Tokenizer helper (usage estimation) =====================

_HARMONY_TOKENIZER = None  # filled on the first get_tokenizer() call, reused afterwards


def get_tokenizer():
    """Return the tiktoken encoding used for local token-usage estimates.

    The encoding is named "o200k_harmony": it reuses the split pattern and
    merge ranks of ``o200k_base`` and extends its special tokens with the
    entries listed below plus ``<|reserved_N|>`` for N in 200013..201087.

    Building a ``tiktoken.Encoding`` is not free, and this function is called
    once per generation, so the encoding is built on first use and the same
    object is returned on every later call.
    """
    global _HARMONY_TOKENIZER
    if _HARMONY_TOKENIZER is None:
        o200k_base = tiktoken.get_encoding("o200k_base")
        _HARMONY_TOKENIZER = tiktoken.Encoding(
            name="o200k_harmony",
            pat_str=o200k_base._pat_str,
            mergeable_ranks=o200k_base._mergeable_ranks,
            special_tokens={
                **o200k_base._special_tokens,
                "<|startoftext|>": 199998,
                "<|endoftext|>": 199999,
                "<|reserved_200000|>": 200000,
                "<|reserved_200001|>": 200001,
                "<|return|>": 200002,
                "<|constrain|>": 200003,
                "<|reserved_200004|>": 200004,
                "<|channel|>": 200005,
                "<|start|>": 200006,
                "<|end|>": 200007,
                "<|message|>": 200008,
                "<|reserved_200009|>": 200009,
                "<|reserved_200010|>": 200010,
                "<|reserved_200011|>": 200011,
                "<|call|>": 200012,
            }
            | {f"<|reserved_{i}|>": i for i in range(200013, 201088)},
        )
    return _HARMONY_TOKENIZER

class LLM:
    def __init__(self,
                source,  # 'huggingface' or 'openai'
                lm_id,
                prompt_template_path,
                communication,
                cot,
                sampling_parameters,
                agent_id
                ):
        self.rooms_explored = None
        self.goal_desc = None
        self.agent_id = agent_id
        self.agent_name = "Alice" if agent_id == 0 else "Bob"
        self.oppo_name = "Alice" if agent_id == 1 else "Bob"
        self.oppo_pronoun = "she" if agent_id == 1 else "he"
        self.debug = sampling_parameters.debug
        self.rooms = []
        self.prompt_template_path = prompt_template_path
        self.single = 'single' in self.prompt_template_path
        df = pd.read_csv(self.prompt_template_path)
        self.prompt_template = df['prompt'][0].replace("$AGENT_NAME$", self.agent_name).replace("$OPPO_NAME$", self.oppo_name)
        self.evaluation_prompt_template = df['prompt'][2].replace("$AGENT_NAME$", self.agent_name).replace("$OPPO_NAME$", self.oppo_name)
        self.check_prompt_template = df['prompt'][1].replace("$AGENT_NAME$", self.agent_name).replace("$OPPO_NAME$", self.oppo_name)
        self.communication_prompt_template = df['prompt'][3].replace("$AGENT_NAME$", self.agent_name).replace("$OPPO_NAME$", self.oppo_name)

        self.communication = communication
        self.cot = cot  # {"none"|"true"|"light"|"medium"|"heavy"} or bool
        self.source = 'huggingface' if source == 'hf' else source
        self.lm_id = lm_id
        self.chat = any(k in lm_id for k in ['gpt-3.5-turbo', 'gpt-4', 'gpt-oss', 'gpt-4o', 'gpt-5', 'gemma', 'chat'])
        self.total_cost = 0
        self.oppo_plan = None

        self._rng = random.Random(42)
        self.cot_rounds = 1
        self.stop_on_convergence = True
        self.anneal_temperature = True

        # OpenAI + Ollama clients
        self.openai_client = None
        self.ollama_client = None

        # HF objects (optional)
        self.hf_model = None
        self.hf_tokenizer = None

        self.token_cost = 0
        self.message_cost = 0
        self.agent_memory = None
        self.env_api = None
        self.initial_plan = None

        # Sampling params (normalize keys across backends)
        if self.source == 'openai':
            if self.chat:
                self.sampling_params = {
                    # "num_ctx": getattr(sampling_parameters, "max_tokens", 1024),
                    "max_tokens": getattr(sampling_parameters, "max_tokens", 1024),
                    "temperature": getattr(sampling_parameters, "t", 1.0),
                    "top_p": getattr(sampling_parameters, "top_p", 1.0),
                    "n": getattr(sampling_parameters, "n", 1),
                }
            else:
                self.sampling_params = {
                    "max_tokens": getattr(sampling_parameters, "max_tokens", 1024),
                    "temperature": getattr(sampling_parameters, "t", 1.0),
                    "top_p": getattr(sampling_parameters, "top_p", 1.0),
                    "n": getattr(sampling_parameters, "n", 1),
                }
        elif self.source == 'huggingface':
            try:
                from transformers import LlamaForCausalLM, LlamaTokenizer
                self.hf_tokenizer = LlamaTokenizer.from_pretrained(self.lm_id, use_fast=True)
                self.hf_model = LlamaForCausalLM.from_pretrained(self.lm_id, device_map='auto', load_in_4bit=True)
            except Exception as e:
                raise RuntimeError(f"Failed to load HF model {self.lm_id}: {e}")
            self.sampling_params = {
                "max_new_tokens": getattr(sampling_parameters, "max_tokens", 256),
                "temperature": getattr(sampling_parameters, "t", 1.0),
                "top_p": getattr(sampling_parameters, "top_p", 1.0),
                "num_return_sequences": getattr(sampling_parameters, "n", 1),
                'use_cache': True,
                'return_dict_in_generate': True,
                'do_sample': True,
            }
        else:
            raise ValueError("invalid source")

        # Initialize engine
        self.generator = self._lm_engine(self.source, self.lm_id)

        # Runtime states populated by run(...)
        self.current_room: Optional[str] = None
        self.object_list = None
        self.holding_objects = None
        self.obj_per_room = None

        self.opponent_agent_id = 1 if self.agent_id == 0 else 0
        self.opponent_last_room = None
        self.roomname2id: dict[str, int] = {}
        self.goal_location_id: Optional[int] = None

        self.ACTION_PREFIX_RE = re.compile(r'^\s*\[([a-zA-Z_]+)\]\s*<.*?>\s*\(\d+\)\s*$')
        self.ACTION_ID_RE = re.compile(r'\((\d+)\)\s*$')
        self.base_url = ''

    # ----------------------------- Engine builder -----------------------------
    def _lm_engine(self, source: str, lm_id: str):
        """Build the generation callable for the configured backend.

        Returns a callable ``(prompt, sampling_params, is_check=False) ->
        (outputs, usage_dollars, usage_tokens)`` where
        ``outputs = [generated_text, generated_reasoning]``. The reasoning
        entry is only filled by the gpt-oss (Ollama) route; ``usage_dollars``
        is always 0 and ``usage_tokens`` is a best-effort local estimate.

        With ``source == 'openai'`` the model id picks the route: ids starting
        with "gpt-oss" use the Ollama client, ids starting with "gemma" use
        Ollama's OpenAI-compatible endpoint, everything else uses the OpenAI
        Responses API. The returned callable raises ``ValueError`` for an
        unknown ``source``.
        """

        # OpenAI + Ollama clients are created lazily to avoid unnecessary env lookups.
        def _get_openai(base_url=None, api_key=None):
            if self.openai_client is None:
                if base_url is None:
                    self.openai_client = OpenAI()
                else:
                    self.openai_client = OpenAI(base_url=base_url, api_key=api_key)
            return self.openai_client

        def _get_ollama():
            if self.ollama_client is None:
                host = os.environ.get("OLLAMA_HOST", "http://localhost:11434")
                self.ollama_client = OllamaClient(host=host)
            return self.ollama_client

        def _is_ollama_model(model_id: str) -> bool:
            return model_id.lower().startswith(("gpt-oss", "gemma"))

        @backoff.on_exception(backoff.expo, OpenAIError, max_tries=5)
        def _generate(prompt, sampling_params, is_check: bool = False):
            usage_tokens = 0
            generated_reasoning = ""

            if source == 'openai':
                # Route to Ollama for local models, OpenAI Responses for 4o-mini etc.
                if _is_ollama_model(lm_id):
                    # Compare case-insensitively, like _is_ollama_model does, so a
                    # mixed-case id cannot slip past both branches.
                    if "gpt-oss" in lm_id.lower():
                        ollama = _get_ollama()
                        think_effort = "medium"
                        if isinstance(self.cot, str) and self.cot not in ("none", "true"):
                            # NOTE(review): self.cot is forwarded verbatim; confirm its
                            # values are effort names the Ollama `think` option accepts.
                            think_effort = self.cot
                        # NOTE(review): sampling_params carries OpenAI-style keys
                        # ("max_tokens", "n"); confirm Ollama honours them as options.
                        resp = ollama.chat(model=lm_id, messages=prompt,
                                           think=("medium" if is_check else think_effort),
                                           options=sampling_params)
                        message = resp.message
                        generated_text = message.content
                        # `thinking` may be missing or None; always hand back a string.
                        generated_reasoning = getattr(message, 'thinking', None) or ""
                    else:
                        # Ollama serves its OpenAI-compatible API under /v1.
                        client = _get_openai(base_url="http://localhost:11434/v1", api_key="ollama")
                        resp = client.chat.completions.create(model=lm_id, messages=prompt, **sampling_params)
                        generated_text = resp.choices[0].message.content
                else:
                    client = _get_openai()
                    # Use Responses API (works for gpt-4o-mini, gpt-5-nano, etc.)
                    resp = client.responses.create(model=lm_id, input=prompt)
                    generated_text = resp.output_text

                # Estimate tokens (best-effort, local tokenizer). Building the
                # tokenizer is inside the try so a tokenizer failure cannot throw
                # away a generation that already succeeded.
                try:
                    if isinstance(prompt, list):
                        last_user = prompt[-1]['content'] if prompt else ""
                    else:
                        last_user = prompt if isinstance(prompt, str) else ""
                    _tok = get_tokenizer()
                    usage_tokens = sum(
                        len(_tok.encode(piece or ""))
                        for piece in (last_user, generated_text, generated_reasoning)
                    )
                except Exception:
                    usage_tokens = 0
                return [generated_text, generated_reasoning], 0, usage_tokens

            elif source == 'huggingface':
                # Simple HF text generation (no separate reasoning)
                if isinstance(prompt, list):
                    # Convert chat format to a flat prompt
                    flat = "\n".join([f"{m['role'].upper()}: {m['content']}" for m in prompt])
                else:
                    flat = str(prompt)
                # Follow the device the model was placed on instead of assuming 'cuda'.
                input_ids = self.hf_tokenizer(flat, return_tensors="pt").input_ids.to(self.hf_model.device)
                prompt_len = input_ids.shape[-1]
                # NOTE(review): this route samples with self.sampling_params and
                # ignores the sampling_params argument; confirm that is intended.
                with torch.inference_mode():
                    out = self.hf_model.generate(input_ids, pad_token_id=self.hf_tokenizer.eos_token_id, **self.sampling_params)
                text = self.hf_tokenizer.batch_decode(out.sequences[:, prompt_len:])
                text = [s.strip() for s in text]
                text = [s[:-4] if s.endswith('</s>') else s for s in text]
                gen = text[0] if text else ""
                try:
                    _tok = get_tokenizer()
                    usage_tokens = len(_tok.encode(flat)) + len(_tok.encode(gen))
                except Exception:
                    usage_tokens = 0
                return [gen, ""], 0, usage_tokens

            else:
                raise ValueError("invalid source")

        return _generate


    def reset(self, rooms_name, goal_objects):
        """Start a new episode with the given rooms and goal.

        Stores ``rooms_name``, renders ``goal_objects`` into the goal sentence
        and clears the per-episode counters, the opponent's last seen room and
        the initial plan.
        """
        self.rooms = rooms_name
        self.goal_desc = self.goal2description(goal_objects)

        # Per-episode bookkeeping starts from scratch.
        self.token_cost = self.message_cost = 0
        self.opponent_last_room = self.initial_plan = None


    def goal2description(self, goals):  # {predicate: count}
        """Render the goal mapping as one sentence.

        Args:
            goals: Mapping of object name to required count. ``None`` or an
                empty mapping is treated as "no goal objects".

        Returns:
            "Transport <count> <name>[s], ... to the bed.", with an "s" appended
            to names whose count is above 1, or "Transport nothing to the bed."
            when there are no goal objects.
        """
        items = [
            f"{count} {object_name}{'s' if count > 1 else ''}"
            for object_name, count in (goals or {}).items()
        ]
        if not items:
            # Slicing the trailing ", " off an empty list used to eat the verb itself.
            return "Transport nothing to the bed."
        return "Transport " + ", ".join(items) + " to the bed."


    def progress2text(self, current_step, satisfied, opponent_grabbed_objects, opponent_last_room,):
        """Describe the agent's current progress in first-person text for the prompt.

        The sentences cover, in order: steps taken, what has been delivered,
        what the agent holds, the current room, the other agent (skipped in
        single-agent mode) and every other room.

        Args:
            current_step: Step counter shown as "<current_step>/3000".
            satisfied: Delivered objects (dicts with 'name', 'id', 'type');
                duplicates are reported once and only type 0 entries are listed.
            opponent_grabbed_objects: The other agent's two held-object dicts.
            opponent_last_room: Room where the other agent was last seen, or None.

        Returns:
            The progress description as a single string.
        """
        s = f"I've taken {current_step}/3000 steps. "

        # What is known to be in each room, keyed by room name.
        found_in_room = {
            room: self._progress_room_contents(obj_list)
            for room, obj_list in self.obj_per_room.items()
        }

        if len(satisfied) == 0:
            if len(self.object_list[2]) == 0:
                s += "I haven't found the goal position bed. "
        else:
            s += f"{'I' if self.single else 'We'}'ve already transported "
            # The dicts are unhashable, so de-duplicate with a list and keep order.
            unique_satisfied = []
            for x in satisfied:
                if x not in unique_satisfied:
                    unique_satisfied.append(x)
            delivered = [f"<{x['name']}> ({x['id']})" for x in unique_satisfied if x['type'] == 0]
            s += ', '.join(delivered) if delivered else 'nothing'
            s += ' to the bed. '

        s += "I'm holding " + self._progress_holdings(self.holding_objects)

        pred_room = self.rooms_explored[self.current_room] if self.current_room in self.rooms_explored else 'none'
        here = found_in_room[self.current_room]
        if pred_room != 'all' and here == 'nothing':
            s += f"I'm in the {self.current_room}, where I've explored {pred_room} of it. "
        else:
            s += f"I'm in the {self.current_room}, where I've explored {pred_room} of it and found {here}. "

        ### opponent modeling
        if not self.single:
            # Same phrasing rules as for the agent's own hands, so two held
            # target objects are reported as "two target objects ...".
            ss = self._progress_holdings(opponent_grabbed_objects)
            if opponent_last_room is None:
                s += f"I don't know where {self.oppo_name} is. "
            elif opponent_last_room == self.current_room:
                s += f"I also see {self.oppo_name} here in the {self.current_room}, {self.oppo_pronoun} is holding {ss}"
            else:
                s += f"Last time I saw {self.oppo_name} was in the {opponent_last_room}, {self.oppo_pronoun} was holding {ss}"

        for room in self.rooms:
            if room == self.current_room:
                continue
            pred_room = self.rooms_explored[room] if room in self.rooms_explored else 'none'
            there = found_in_room[room]
            if pred_room != 'all' and there == 'nothing':
                s += f"I've explored {pred_room} of the {room}. "
            else:
                s += f"I've explored {pred_room} of the {room}, and I found {there} there. "

        return s

    @staticmethod
    def _progress_room_contents(obj_list):
        """Summarise one room's [targets, containers, beds] entry, or 'nothing'."""
        def listed(singular, plural, items):
            if len(items) == 0:
                return ""
            if len(items) == 1:
                return f"{singular} <{items[0]['name']}> ({items[0]['id']})"
            return f"{plural} " + ', '.join([f"<{x['name']}> ({x['id']})" for x in items])

        parts = [
            listed("a target object", "target objects", obj_list[0]),
            listed("a container", "containers", obj_list[1]),
            'the goal position bed' if len(obj_list[2]) > 0 else "",
        ]
        parts = [p for p in parts if p]
        if not parts:
            return 'nothing'
        if len(parts) == 1:
            return parts[0]
        return ', '.join(parts[:-1]) + ', and ' + parts[-1]

    @staticmethod
    def _progress_holdings(held):
        """Phrase what two hands are holding; the result always ends with '. '."""
        s_hold = ["", ""]
        for i, obj in enumerate(held):
            if obj['type'] == 0:
                s_hold[i] = f"a target object <{obj['name']}> ({obj['id']}). "
            elif obj['type'] == 1:
                # Container slots are filled front to back; the first None ends them.
                inside = []
                for j, o in enumerate(obj['contained']):
                    if o is None:
                        break
                    inside.append(f"<{obj['contained_name'][j]}> ({o})")
                if inside:
                    ss = f"target object{'s' if len(inside) > 1 else ''} {', '.join(inside)}"
                else:
                    ss = 'nothing'
                s_hold[i] = f"a container <{obj['name']}> ({obj['id']}) with {ss} in it. "

        if held[0]["type"] == 0 and held[1]['type'] == 0:
            return f"two target objects <{held[0]['name']}> ({held[0]['id']}) and <{held[1]['name']}> ({held[1]['id']}). "
        if s_hold[0] == "" and s_hold[1] == "":
            return "nothing. "
        if s_hold[0] != "" and s_hold[1] != "":
            # Drop the first phrase's trailing ". " before joining the two.
            return f"{s_hold[0][:-2]}, and {s_hold[1]}"
        return f"{s_hold[0]}{s_hold[1]}"


    def parse_answer(self, available_actions, text):
        """Map free-form model output onto one of the available actions.

        Matching gets progressively looser:
          1. an action appears verbatim (case-insensitive) in the text;
          2. an option letter is referenced ("option B", "B.", "(B)", ...);
          3. fuzzy: the action's object name and id both appear in the text;
          4. fuzzier: the option letter followed by a space, the action verb,
             the name or the id appears;
          5. the text reduces to a single letter once "Answer" and ":" are removed;
          6. otherwise a random action is chosen.
        When ``self.communication`` is set, the first action is skipped in the
        two fuzzy passes.

        Args:
            available_actions: Candidate action strings, in option order (A, B, ...).
            text: Raw model output; ``None`` is treated as empty text.

        Returns:
            ``(action, flags)`` where flags is 'AC' for passes 1-2,
            "Fuzzy match" for passes 3-5 and "failed to parse" for the random
            fallback.
        """
        # The generator can hand back a non-string (e.g. None) completion.
        text = text or ""
        flags = 'AC'

        lowered = text.lower()
        for action in available_actions:
            if action.lower() in lowered:
                return action, flags

        # Whitespace-separated words (newline and space only), empty strings dropped.
        words = [w for sent in text.split('\n') for w in sent.split(' ') if w]
        space_tokens = text.split(" ")

        for i, action in enumerate(available_actions):
            option = chr(ord('A') + i)
            if (f"option {option}" in text
                    or f"{option}." in words
                    or f"{option}," in words
                    or f"{option}\n" in space_tokens
                    or f"Option {option}" in text
                    or f"({option})" in words
                    or f"action {option}" in text
                    or (len(text) <= 2 and option in text)):
                return action, flags

        print("WARNING! Fuzzy match!")
        flags = "Fuzzy match"
        for i, action in enumerate(available_actions):
            if self.communication and i == 0:
                continue
            _, name, ident = self._action_keywords(action)
            if name in text and ident in text:
                return action, flags

        for i, action in enumerate(available_actions):
            if self.communication and i == 0:
                continue
            act, name, ident = self._action_keywords(action)
            option = chr(ord('A') + i)
            if f"{option} " in text or act in text or name in text or ident in text:
                return action, flags

        text = text.replace("Answer", '').replace(":", "").strip()
        if len(text) == 1:
            i = ord(text) - ord('A')
            if i in range(len(available_actions)):
                return available_actions[i], flags

        print("WARNING! No available action parsed!!! Random choose one")
        flags = "failed to parse"
        return random.choice(available_actions), flags

    @staticmethod
    def _action_keywords(action):
        """Return ``(verb, name, id)`` search keys for an action string.

        The name and id are the last two space-separated tokens with their
        first and last characters removed. Any key that does not apply to the
        action keeps the placeholder "None".
        """
        act = name = ident = "None"
        tokens = action.split(' ')
        if action.startswith('go to'):
            name, ident = tokens[-2][1:-1], tokens[-1][1:-1]
        elif action.startswith('explore'):
            act = 'explore'
            name, ident = tokens[-2][1:-1], tokens[-1][1:-1]
        elif action.startswith('go grasp'):
            act = 'grasp'
            name, ident = tokens[-2][1:-1], tokens[-1][1:-1]
        elif action.startswith('put'):
            act = 'put'
        elif action.startswith('transport'):
            act = 'transport'
        return act, name, ident

    def calculate_cost(self, pos1, pos2):
        return len(self.agent_memory.find_shortest_path(pos1, pos2)[0]) * 2

    def get_available_plans(self):
        """
        go to room {}
        explore current room {}
        go grasp target object / container {}
        holding both container and object: put obj into the container
        holding any goal objects: transport holding objects to the bed
        """
        available_plans = []
        action_cost = {}
        opp_pos = self.env_api['center_of_room'](self.opponent_last_room) if self.opponent_last_room else None
        if self.holding_objects[0]['type'] is None or self.holding_objects[1]['type'] is None:
            for obj in self.object_list[0]:
                text_action = f"go grasp target object <{obj['name']}> ({obj['id']})"
                cost = len(self.agent_memory.find_shortest_path(self.agent_memory.obs["agent"][:3], obj['position'])[0]) * 2
                oppo_cost = 'unknown'
                if self.opponent_last_room:
                    oppo_cost = self.calculate_cost(opp_pos, obj['position'])
                action_cost[text_action] = cost

                cost_text = f" - my cost: {cost} steps, opponent cost: {oppo_cost}"
                text_action = f"{text_action}{cost_text}"
                available_plans.append(text_action)
            if not (self.holding_objects[0]['type'] == 1 or self.holding_objects[1]['type'] == 1):
                for obj in self.object_list[1]:
                    text_action = f"go grasp container <{obj['name']}> ({obj['id']})"
                    cost = len(self.agent_memory.find_shortest_path(self.agent_memory.obs["agent"][:3], obj['position'])[0]) * 2
                    action_cost[text_action] = cost
                    oppo_cost = 'unknown'
                    if self.opponent_last_room:
                        oppo_cost = self.calculate_cost(opp_pos, obj['position'])
                    cost_text = f" - my cost: {cost} steps, opponent cost: {oppo_cost}"
                    text_action = f"{text_action}{cost_text}"
                    available_plans.append(text_action)
        else:
            if self.holding_objects[0]['type'] == 1 and self.holding_objects[0]['contained'][-1] is None and self.holding_objects[1]['type'] == 0:
                text_action = f"put <{self.holding_objects[1]['name']}> ({self.holding_objects[1]['id']}) into the container <{self.holding_objects[0]['name']}> ({self.holding_objects[0]['id']})"
                cost = 2
                action_cost[text_action] = cost
                text_action = f"{text_action} - cost: {cost} steps"
                available_plans.append(text_action)
            elif self.holding_objects[1]['type'] == 1 and self.holding_objects[1]['contained'][-1] is None and self.holding_objects[0]['type'] == 0:
                text_action = f"put <{self.holding_objects[0]['name']}> ({self.holding_objects[0]['id']}) into the container <{self.holding_objects[1]['name']}> ({self.holding_objects[1]['id']})"
                cost = 2
                action_cost[text_action] = cost
                text_action = f"{text_action} - cost: {cost} steps"
                available_plans.append(text_action)
        if any(obj['type'] is not None for obj in self.holding_objects) and len(self.object_list[2]) != 0:
            text_action = f"transport objects I'm holding to the bed"
            cost = len(self.agent_memory.find_shortest_path(self.agent_memory.obs["agent"][:3], self.object_list[2][0]['position'])[0]) * 2
            action_cost[text_action] = cost
            oppo_cost = 'unknown'
            if self.opponent_last_room:
                oppo_cost = self.calculate_cost(opp_pos, self.object_list[2][0]['position'])
            cost_text = f" - my cost: {cost} steps, opponent cost: {oppo_cost}"
            text_action = f"{text_action}{cost_text}"
            available_plans.append(text_action)
        for room in self.rooms:
            if room == self.current_room or room is None or room == 'None':
                continue
            text_action = f"go to {room}"
            room_pos = self.env_api['center_of_room'](room)
            cost = len(self.agent_memory.find_shortest_path(self.agent_memory.obs["agent"][:3], room_pos)[0]) * 2
            action_cost[text_action] = cost

            oppo_cost = 'unknown'
            if self.opponent_last_room:
                oppo_cost = self.calculate_cost(opp_pos, room_pos)
            cost_text = f" - my cost: {cost} steps, opponent cost: {oppo_cost}"
            text_action = f"{text_action}{cost_text}"

            available_plans.append(text_action)
        if self.current_room not in self.rooms_explored or self.rooms_explored[self.current_room] != 'all':
            text_action = f"explore current room {self.current_room}"
            cost = 1
            action_cost[text_action] = cost
            text_action = f"{text_action} - cost: {cost} steps"
            available_plans.append(text_action)

        plans = ""
        for i, plan in enumerate(available_plans):
            plans += f"{chr(ord('A') + i)}. {plan}\n"

        for idx, plan in enumerate(available_plans):
            replace_idx = plan.index(' -')
            available_plans[idx] = plan[:replace_idx]
        return plans, len(available_plans), available_plans, action_cost

    def _strip_code_fences(self, s: str) -> str:
        """Remove a surrounding Markdown code fence from ``s``.

        ``None`` becomes "". Text that does not start with three backticks is
        only whitespace-stripped. Otherwise the opening fence is removed
        (together with its language tag when the tag is followed by a line
        break, LF or CRLF) and a closing fence, if present, is removed too.
        """
        s = (s or "").strip()
        if s.startswith("```"):
            # The "tag + newline" part is optional so a one-line fence such as
            # ```B``` loses its opening backticks too, not just the closing ones.
            s = re.sub(r"^```(?:[a-zA-Z0-9_+\-]*\r?\n)?", "", s)
            if s.endswith("```"):
                s = s[:-3]
        return s.strip()

    def _safe_generator(self, prompt_obj, params, *, is_check: bool = False, retries: int = 2, sleep: float = 0.4):
        """Call ``self.generator`` with retries and clean up its string outputs.

        Every string output has its code fences stripped; other outputs pass
        through unchanged. The reported token count is added to
        ``self.token_cost`` and the running total is printed.

        Any exception triggers a retry after ``sleep * 1.5 ** attempt`` seconds,
        up to ``retries`` retries; the exception from the final attempt is
        re-raised.

        Returns:
            ``(outputs, usage, tokens)`` as reported by the generator, with the
            cleaned outputs.
        """
        attempt = 0
        while attempt <= retries:
            try:
                outputs, usage, tokens = self.generator(prompt_obj, params, is_check=is_check)
                cleaned = []
                for item in outputs:
                    cleaned.append(self._strip_code_fences(item) if isinstance(item, str) else item)
                self.token_cost += tokens
                print("token_cost:",self.token_cost)
                return cleaned, usage, tokens
            except Exception:
                if attempt >= retries:
                    # Out of retries: surface the last failure to the caller.
                    raise
                time.sleep(sleep * (1.5 ** attempt))
            attempt += 1


    def _build_prompt_block(
            self,
            current_step, satisfied, opponent_grabbed_objects, opponent_last_room,
            action_history: List[str], dialogue_history: List[str]
    ) -> Tuple[str, str, str, str]:
        """Return the base prompt with progress, action and dialogue text filled in.

        Only the 10 most recent actions (comma-separated) and the 3 most recent
        dialogue lines (newline-separated) are included.

        Returns:
            ``(prompt, progress_desc, action_history_desc, dialogue_history_desc)``.
        """
        progress_desc = self.progress2text(
            current_step, satisfied, opponent_grabbed_objects, opponent_last_room,)

        recent_actions = action_history
        if len(recent_actions) > 10:
            recent_actions = recent_actions[-10:]
        action_history_desc = ", ".join(recent_actions)

        recent_dialogue = dialogue_history
        if len(recent_dialogue) > 3:
            recent_dialogue = recent_dialogue[-3:]
        dialogue_history_desc = '\n'.join(recent_dialogue)

        # Substitute the placeholders one after another, in template order.
        prompt = self.prompt_template
        for placeholder, value in (
                ('$GOAL$', self.goal_desc),
                ('$PROGRESS$', progress_desc),
                ('$ACTION_HISTORY$', action_history_desc),
                ('$DIALOGUE_HISTORY$', dialogue_history_desc),
        ):
            prompt = prompt.replace(placeholder, value)
        return prompt, progress_desc, action_history_desc, dialogue_history_desc

    def _coerce_available_list(self, lst: List[str]) -> List[str]:
        """Return the entries of ``lst`` stripped, without blanks or duplicates.

        Falsy entries count as empty strings; the first occurrence of each
        stripped value keeps its position.
        """
        stripped = ((entry or "").strip() for entry in lst)
        # dict keys keep insertion order, which gives order-preserving de-duplication.
        return list(dict.fromkeys(item for item in stripped if item))

    def format_message(self, text: str) -> str:
        """Normalise a "send a message ..." action to ``send a message <"...">``.

        ``None`` and text without the "send a message" prefix are returned
        unchanged. Otherwise every ``<``, ``>`` and ``"`` is removed from the
        message body, repeated "send a message" prefixes are dropped, and the
        body is re-wrapped in the canonical form. The stripped input is printed.

        NOTE(review): the pattern's ``.`` does not match a newline, so only the
        first line of a multi-line message is kept; confirm that is intended.
        """
        if text is None:
            return text

        line = text.strip()
        print("original message:",line)

        match = re.match(r"(?:send a message)\s*(.*)", line)
        if match is None:
            # No prefix: hand the original string back as-is.
            return text

        # Strip the wrapping characters first, then any prefix that is left.
        body = match.group(1).translate(str.maketrans('', '', '<>"'))
        body = body.replace('send a message', "").strip()
        return 'send a message <"' + body + '">'

    def _is_action_in_available(self, action_str: str, available_list: List[str]) -> Tuple[str, bool]:
        """Check an action against the available list and return its normalised form.

        Args:
            action_str: Candidate action; surrounding whitespace is ignored.
            available_list: Actions that are currently allowed.

        Returns:
            ``(action, ok)``. ``ok`` is True when the action starts with
            "send a message" (always allowed; the action is then re-formatted
            by ``format_message``) or equals one of the stripped entries of
            ``available_list``. Otherwise the result is ``("", False)``.

        Note:
            The result is a tuple and therefore always truthy. Callers must
            unpack it and test the second element, not the tuple itself.
        """
        if not action_str:
            return "", False
        a = action_str.strip()
        # 1) "send a message" is always allowed.
        if a.startswith("send a message"):
            a = self.format_message(a)
            return a, True

        if any(a == x.strip() for x in available_list):
            return a, True

        return "", False


    # ========= (Planner) =========
    def _plan(
            self,
            mode: str,
            prompt: str,
            available_plans_list: List[str],
            *,
            stage_usages: Dict[str, float],
            stage_tokens: Dict[str, int]
    ) -> Tuple[str, Dict[str, Any]]:
        info: Dict[str, Any] = {}
        plan: Optional[str] = None

        if mode == "reasoning":
            if self.debug:
                print(f"[Planning(reasoning)] prompt:\n{prompt}")

            chat_prompt = [
                {"role": "system", "content": f"Reasoning: {self.cot}"},
                {"role": "user", "content": prompt}
            ]
            try:
                outputs, usage, tokens = self._safe_generator(chat_prompt if self.chat else prompt,
                                                              self.sampling_params)
                info['planning'] = [chat_prompt, outputs]
            except Exception as e:
                if self.debug:
                    print(f"[PLANNER] generator failed: {e} -> fallback first available")
                plan = available_plans_list[0]
                info.update({"plan": plan})
                return plan, info

            generated_samples = outputs[0] if outputs else ""
            generated_reasoning = outputs[1] if len(outputs) > 1 else ""

            stage_usages['planner'] = stage_usages.get('planner', 0.0) + float(usage)
            stage_tokens['planner'] = stage_tokens.get('planner', 0) + int(tokens)


            info['reasoning_traces'] = generated_reasoning
            info['final_reasoning'] = generated_samples

            if self.debug:
                print(f"[Planning(reasoning)] reasoning:\n{generated_reasoning}")
                print(f"[Planning(reasoning)] samples:\n{generated_samples}")

            answer, flags = self.parse_answer(available_plans_list, generated_samples)

            candidate = (answer or "").strip()
            if self._is_action_in_available(candidate, available_plans_list):
                plan = candidate
            else:
                chosen = None
                for a in available_plans_list:
                    if a in generated_samples:
                        chosen = a
                        break
                plan = chosen or available_plans_list[0]

            info['plan'] = plan
            return plan, info

        # mode == "base" : CoT k회 개선 루프 후 최종 액션만
        k = max(int(self.cot_rounds), 1)
        stop_on_convergence = bool(self.stop_on_convergence)
        anneal = bool(self.anneal_temperature)

        if self.debug:
            print(f"[Planning(base)] rounds={k}, stop_on_convergence={stop_on_convergence}, anneal={anneal}")
            print(f"[Planning(base)] base_prompt:\n{prompt}")

        base_params = copy.deepcopy(self.sampling_params)
        reasoning_traces = ""
        last_norm = None

        # Round 1
        round_prompt = prompt + " Let's think step by step."
        chat_prompt = [{"role": "user", "content": round_prompt}]
        try:
            outputs, usage, tokens = self._safe_generator(chat_prompt, base_params)
            info['planning'] = [chat_prompt, outputs]
        except Exception as e:
            if self.debug:
                print(f"[Planning(base) r1] generator failed: {e} -> fallback first available")
            plan = available_plans_list[0]
            info.update({"plan": plan})
            return plan, info

        generated_reasoning = outputs[0] if outputs else ""
        stage_usages['planner'] = stage_usages.get('planner', 0.0) + float(usage)
        stage_tokens['planner'] = stage_tokens.get('planner', 0) + int(tokens)

        reasoning_traces += generated_reasoning

        if self.debug:
            print(f"[CoT r1] reasoning:\n{generated_reasoning}")

        # Round 2..k
        for r in range(2, k + 1):
            if anneal and "temperature" in base_params:
                t0 = float(self.sampling_params.get("temperature", 1.0))
                t_min = 0.2
                frac = (r - 1) / (k - 1) if k > 1 else 1.0
                base_params["temperature"] = max(t_min, float(t0 - (t0 - t_min) * frac))

            reflection_prompt = (
                "You are refining your previous reasoning only for planning. "
                "1) Critique weaknesses or missing checks. "
                "2) Provide an improved, concise step-by-step plan. "
                "Do NOT output the final action yet."
            )
            round_prompt = (
                f"{prompt}\n\n"
                f"--- Previous reasoning (r{r - 1}) ---\n{generated_reasoning}\n\n"
                f"--- Instruction ---\n{reflection_prompt}"
            )
            chat_prompt = [{"role": "user", "content": round_prompt}]
            try:
                outputs, usage, tokens = self._safe_generator(chat_prompt, base_params)
                info['planning'] += [chat_prompt, outputs]
            except Exception as e:
                if self.debug:
                    print(f"[Planning(base) r{r}] generator failed: {e} (stop refine)")
                break

            generated_reasoning = outputs[0] if outputs else ""
            stage_usages['planner'] = stage_usages.get('planner', 0.0) + float(usage)
            stage_tokens['planner'] = stage_tokens.get('planner', 0) + int(tokens)

            reasoning_traces += generated_reasoning

            if self.debug:
                print(f"[CoT r{r}] reasoning:\n{generated_reasoning}")

            if stop_on_convergence:
                cur_norm = "".join(generated_reasoning.split())
                if last_norm is not None and cur_norm == last_norm:
                    if self.debug:
                        print(f"[CoT] Early stop at round {r} due to convergence.")
                    break
                last_norm = cur_norm

        # Final: action only
        final_reasoning = generated_reasoning
        chat_prompt = [
            {"role": "user", "content": prompt},
            {"role": "assistant", "content": final_reasoning},
            {"role": "user", "content": "Answer with only one best next action. So the answer is"}
        ]
        try:
            outputs, usage, tokens = self._safe_generator(chat_prompt, self.sampling_params)
            info['planning'] += [chat_prompt, outputs]
            if self.debug:
                print(f"[Planning(base) final] outputs: {outputs}")
        except Exception as e:
            if self.debug:
                print(f"[Planning(base) final] generator failed: {e}")
            outputs, usage, tokens = ([""], 0.0, 0)

        generated_samples = outputs[0] if outputs else ""
        stage_usages['planner'] = stage_usages.get('planner', 0.0) + float(usage)
        stage_tokens['planner'] = stage_tokens.get('planner', 0) + int(tokens)

        answer, _ = self.parse_answer(available_plans_list, generated_samples)
        candidate = (answer or "").strip()
        if self._is_action_in_available(candidate, available_plans_list):
            plan = candidate
        else:
            chosen = None
            for a in available_plans_list:
                if a in generated_samples:
                    chosen = a
                    break
            plan = chosen or available_plans_list[0]

        info.update({
            "reasoning_traces": reasoning_traces,
            "final_reasoning": final_reasoning,
            "plan": plan
        })
        return plan, info

    def run_comm(self, last_message, current_step, current_room, rooms_explored, holding_objects,
                 satisfied, object_list, obj_per_room,action_history, dialogue_history,
                 opponent_grabbed_objects, opponent_last_room):


        self.current_room = current_room
        self.rooms_explored = rooms_explored
        self.holding_objects = holding_objects
        self.object_list = object_list
        self.obj_per_room = obj_per_room

        self.opponent_last_room = opponent_last_room

        self.current_step = current_step
        prompt, progress_desc, action_history_desc, dialogue_history_desc = self._build_prompt_block(
            current_step, satisfied, opponent_grabbed_objects, opponent_last_room,
            action_history, dialogue_history
        )

        comm_prompt = (self.communication_prompt_template
                       .replace('$GOAL$', self.goal_desc)
                       .replace('$PROGRESS$', progress_desc)
                       .replace('$DIALOGUE_HISTORY$', dialogue_history_desc)
                       .replace('$ACTION_HISTORY$', action_history_desc)
                       .replace('$LAST_MESSAGE$', last_message))

        if self.debug:
            print(f"[Communicator] comm_prompt:\n{comm_prompt}")

        info: Dict[str, Any] = {}
        chat_prompt = [{"role": "user", "content": comm_prompt}]
        try:
            comm_outputs, usage, tokens = self._safe_generator(chat_prompt, self.sampling_params, is_check=True)
            info['communication'] = [chat_prompt, comm_outputs]
        except Exception as e:
            if self.debug:
                print(f"[Communicator] generator failed: {e} (skip communication)")
            comm_outputs, usage, tokens = ([""], 0.0, 0)

        comm_raw = comm_outputs[0] if comm_outputs else ""
        comm_raw = self.format_message(comm_raw)
        if self.debug:
            print(f"[Communicator] comm_raw: {comm_raw}")

        info.update({
            "comm_outputs": comm_outputs,
            "usage": tokens
        })
        action = {"type": 6,
                  "message": "1"+comm_raw.replace("send a message", "")}
        return action, info

    def _flag(
            self,
            plan: str,
            current_room: str,
            object_per_room,
            action_history: List[str],
    ) -> bool:
        flag = True

        obj_name, obj_id  = parse_name_id(plan)

        for obj_dict in self.visible_objects:
            if obj_dict['id'] == obj_id:
                flag = False
                break
        for obj_dict in object_per_room[current_room][0]:
            if obj_dict['id'] == obj_id:
                flag = False
                break
        for obj_dict in object_per_room[current_room][1]:
            if obj_dict['id'] == obj_id:
                flag = False
                break

        if 'explore' in plan or 'transport' in plan or 'put' in plan:
            flag = False

        recent_actions = action_history[-2:] if len(action_history) >= 2 else action_history
        if any("send a message" in a for a in recent_actions):
            flag = False

        return flag

    # ========= Verify → Evaluate → Select =========
    def _verify_evaluate_select(
            self,
            mode: str,
            plan: str,
            *,
            progress_desc: str,
            dialogue_history_desc: str,
            action_history_desc: str,
            available_plans_str: str,
            available_plans_list: List[str],
            stage_usages: Dict[str, float],
            stage_tokens: Dict[str, int],
            reasoning_trace_for_check: str
    ) -> Tuple[str, Dict[str, Any]]:
        info: Dict[str, Any] = {}

        # Verifier 프롬프트
        check_prompt = (self.check_prompt_template
                        .replace('$REASONING_TRACE$', reasoning_trace_for_check)
                        .replace('$GOAL$', self.goal_desc)
                        .replace('$PROGRESS$', progress_desc)
                        .replace('$DIALOGUE_HISTORY$', dialogue_history_desc)
                        .replace('$ACTION_HISTORY$', action_history_desc)
                        .replace('$AVAILABLE_ACTIONS$', available_plans_str))

        if self.debug:
            print(f"[Verifier] check_prompt:\n{check_prompt}")

        chat_prompt = [{"role": "user", "content": check_prompt}] if mode == "base" else [
            {"role": "system", "content": f"Reasoning: {self.cot}"},
            {"role": "user", "content": check_prompt}
        ]
        try:
            check_outputs, usage, tokens = self._safe_generator(chat_prompt, self.sampling_params, is_check=True)
            info['verification'] = [chat_prompt, check_outputs]
        except Exception as e:
            if self.debug:
                print(f"[Verifier] generator failed: {e} (skip verification)")
            check_outputs, usage, tokens = ([""], 0.0, 0)

        scen_tree_raw = check_outputs[0] if check_outputs else ""
        stage_usages['verifier'] = stage_usages.get('verifier', 0.0) + float(usage)
        stage_tokens['verifier'] = stage_tokens.get('verifier', 0) + int(tokens)
        self.total_cost += usage

        info['check_outputs'] = check_outputs
        info['check_usage'] = usage

        if self.debug:
            print(f"[Verifier] scenario_tree:\n{scen_tree_raw}")

        # Evaluator 프롬프트
        evaluation_prompt = (self.evaluation_prompt_template
                             .replace('$SCENARIO_TREE$', scen_tree_raw)
                             .replace('$GOAL$', self.goal_desc)
                             .replace('$PROGRESS$', progress_desc)
                             .replace('$DIALOGUE_HISTORY$', dialogue_history_desc)
                             .replace('$ACTION_HISTORY$', action_history_desc)
                             .replace('$AVAILABLE_ACTIONS$', available_plans_str))

        if self.debug:
            print(f"[Evaluator] evaluation_prompt:\n{evaluation_prompt}")
        chat_prompt = [{"role": "user", "content": evaluation_prompt}]
        try:
            evaluation_outputs, usage, tokens = self._safe_generator(chat_prompt, self.sampling_params, is_check=True)
            info['evaluation'] = [chat_prompt, evaluation_outputs]
        except Exception as e:
            if self.debug:
                print(f"[Evaluator] generator failed: {e} (keep original plan)")
            evaluation_outputs, usage, tokens = ([""], 0.0, 0)

        evaluation_raw = evaluation_outputs[0] if evaluation_outputs else ""
        stage_usages['evaluator'] = stage_usages.get('evaluator', 0.0) + float(usage)
        stage_tokens['evaluator'] = stage_tokens.get('evaluator', 0) + int(tokens)
        self.total_cost += usage

        info['evaluation_outputs'] = evaluation_outputs
        info['evaluation_usage'] = usage

        if self.debug:
            print(f"[Evaluator] output:\n{evaluation_raw}")

        # 파싱 → 랭킹 → 선택
        try:
            evals = self._safe_parse_evals(evaluation_raw)
            result = self.select_best_action(evals)
            candidate_plan = (result.get('best_action') or "").strip()
            ranked = result.get('ranked', [])

            print("candidate_plan:", candidate_plan)
            print("available_plans_list:", available_plans_list)
            candidate_plan = candidate_plan.split("-")[0][:-1]
            print(self._is_action_in_available(candidate_plan, available_plans_list))
            if candidate_plan and self._is_action_in_available(candidate_plan, available_plans_list)[1]:
                plan = self._is_action_in_available(candidate_plan, available_plans_list)[0]
            else:
                if self.initial_plan:
                    fallback = self.initial_plan
                else:
                    fallback = self._first_valid_action_from_ranked(ranked, available_plans_list)
                if fallback:
                    plan = fallback
                else:
                    if self.debug:
                        print("[Selector] No valid evaluated action found; keep original plan.")
        except Exception as e:
            if self.debug:
                print(f"[Evaluator] parse/select error: {e} (keep original plan)")
                traceback.print_exc()

        info['plan'] = plan
        return plan, info

    def _first_valid_action_from_ranked(self, ranked: List[Dict[str, Any]], available_list: List[str]) -> Optional[str]:
        for row in ranked or []:
            a = (row.get('action') or "").strip()
            if 'cost' in a:
                a = a.split('-')[0][:-1]
            refined_a, check = self._is_action_in_available(a, available_list)
            if check:
                return refined_a
        return None

    def normalize_1to5(self, x: int) -> float:
        """Linearly map a 1..5 score onto 0..1, clamping out-of-range input."""
        score = int(x)
        if score < 1:
            score = 1
        elif score > 5:
            score = 5
        return (score - 1) / 4.0

    def select_best_action(
            self,
            evals: Dict[str, Dict[str, Any]],
            **kw
    ) -> Dict[str, Any]:
        """
        최우선(utility 최대) 액션 및 요약 반환.
        """
        ranked = self.rank_scenarios(evals, **kw)
        best = ranked[0] if ranked else {}
        return {
            'best_action': best.get('action'),
            'best_action_id': best.get('action_id'),
            'best_scenario': best.get('scenario'),
            'utility': best.get('utility'),
            'L_used': best.get('L_used'),
            'G_used': best.get('G_used'),
            'C_used': best.get('C_used'),
            'ranked': ranked,  # 전체 랭킹 포함
        }

    def compute_utility(
            self,
            likelihood: int,
            gain: int,
            cost_penalty: int,
            action: str,
            lambda_cost: float = 1.0,
            normalize: bool = True,
    ) -> Tuple[float, float, float, float]:
        """
        반환: (U, L_used, P_used, cost)
        - L_used, P_used: 정규화 여부에 따라 0..1 또는 원점수(1..5)
        """
        L = self.normalize_1to5(likelihood) if normalize else float(likelihood)
        G = self.normalize_1to5(gain) if normalize else float(gain)
        C = self.normalize_1to5(cost_penalty) if normalize else float(cost_penalty)
        U = L * G - lambda_cost * C
        return U, L, G, C

    def get_action_id(self, action: str) -> int:
        m = self.ACTION_ID_RE.search(action)
        return int(m.group(1)) if m else -1


    def rank_scenarios(
            self,
            evals: Dict[str, Dict[str, Any]],
            lambda_cost: float = 1.0,
            normalize: bool = True,
    ) -> List[Dict[str, Any]]:
        """
        각 시나리오에 대해 유틸리티 계산 후 U 내림차순 정렬된 리스트 반환.
        """
        rows = []
        for scen, d in evals.items():
            L = int(d['Likelihood'])
            G = int(d['Gain'])
            C = int(d['CostPenalty'])
            A = str(d['Action'])
            U, L_used, G_used, C_used = self.compute_utility(
                L, G, C, A,
                lambda_cost=lambda_cost,
                normalize=normalize,
            )
            rows.append({
                'scenario': scen,
                'action': A,
                'action_id': self.get_action_id(A),
                'likelihood': L,
                'gain': G,
                'cost_penalty': C,
                'L_used': L_used,
                'G_used': G_used,
                'C_used': C_used,
                'utility': U,
            })
        rows.sort(key=lambda r: r['utility'], reverse=True)
        return rows


    def parse_evals_to_dict(self, evals_raw: Any) -> Dict[str, Dict[str, Any]]:
        """
        LLM 출력(e.g., 문자열) → Dict[str, Dict[str, Any]] 로 변환.
        처리 순서:
        1) 이미 dict면 그대로 반환
        2) code fence 제거
        3) JSON 시도 (double-quoted)
        4) ast.literal_eval 시도 (single-quoted 파이썬 dict)
        5) 실패 시 에러
        + 구조 검증/보정: Likelihood/Performance int, Action str 강제
        """
        if isinstance(evals_raw, dict):
            data = evals_raw
        else:
            s = str(evals_raw)
            s = self._strip_code_fences(s)

            try:
                data = json.loads(s)
            except Exception:

                try:
                    data = ast.literal_eval(s)
                except Exception as e:
                    raise ValueError(
                        f"Failed to parse evaluator output as dict. "
                        f"Got type={type(evals_raw)} preview={s[:120]!r}"
                    ) from e

        if not isinstance(data, dict):
            raise ValueError(f"Parsed evaluator output is not a dict. Got type={type(data)}")

        cleaned: Dict[str, Dict[str, Any]] = {}
        for scen, d in data.items():
            if not isinstance(d, dict):
                raise ValueError(f"Scenario '{scen}' is not a dict: {type(d)}")

            L = d.get('Likelihood')
            G = d.get('Gain')
            C = d.get('CostPenalty')
            A = d.get('Action')

            # 타입 보정
            try:
                L = int(L)
            except Exception:
                raise ValueError(f"Scenario '{scen}': Likelihood not int-like: {L!r}")
            try:
                G = int(G)
            except Exception:
                raise ValueError(f"Scenario '{scen}': Gain not int-like: {G!r}")
            try:
                C = int(C)
            except Exception:
                raise ValueError(f"Scenario '{scen}': CostPenalty not int-like: {C!r}")

            if not isinstance(A, str):
                A = "" if A is None else str(A)

            cleaned[scen] = {'Likelihood': L, 'Gain': G, 'CostPenalty': C, 'Action': A}

        return cleaned


    def _safe_parse_evals(self, evals_raw: Any) -> Dict[str, Dict[str, Any]]:
        import json, re

        def _try_json(s: str):
            return json.loads(s)

        def _strip(s: str) -> str:
            return self._strip_code_fences(s or "")

        s = _strip(evals_raw if isinstance(evals_raw, str) else str(evals_raw))


        try:
            return self.parse_evals_to_dict(s)
        except Exception:
            pass

        m = re.search(r"```json\s*(\{.*?\})\s*```", s, re.S | re.I)
        if m:
            block = m.group(1)
            try:
                return self.parse_evals_to_dict(block)
            except Exception:
                try:
                    return _try_json(block)
                except Exception:
                    pass


        def _largest_json_object(text: str):
            start_idx = None
            depth = 0
            best = None
            for i, ch in enumerate(text):
                if ch == '{':
                    if depth == 0:
                        start_idx = i
                    depth += 1
                elif ch == '}':
                    if depth > 0:
                        depth -= 1
                        if depth == 0 and start_idx is not None:
                            cand = text[start_idx:i + 1]
                            # 더 긴 후보를 보관
                            if best is None or len(cand) > len(best):
                                best = cand
            return best

        largest = _largest_json_object(s)
        if largest:
            try:
                return self.parse_evals_to_dict(largest)
            except Exception:

                import json
                return json.loads(largest)


        m2 = re.search(r"\{.*\}\s*$", s, re.S)
        if m2:
            try:
                return self.parse_evals_to_dict(m2.group(0))
            except Exception:
                import json
                return json.loads(m2.group(0))

        # 모두 실패 시 예외
        raise ValueError("Failed to extract JSON from evaluator output.")


    # ========= Unified pipeline =========
    def run_pipeline(
            self,
            mode: str,
            current_step: int,
            current_room: str,
            rooms_explored: dict,
            holding_objects: list,
            satisfied: list,
            object_list: list,
            obj_per_room: dict,
            action_history: list,
            dialogue_history: list,
            opponent_grabbed_objects: Optional[list] = None,
            opponent_last_room: Optional[str] = None,
    ) -> Tuple[Optional[str], Dict[str, Any]]:

        """mode: 'reasoning' | 'base'"""
        info: Dict[str, Any] = {}
        stage_usages: Dict[str, float] = {}
        stage_tokens: Dict[str, int] = {}


        # 1) prompt
        prompt, progress_desc, action_history_desc, dialogue_history_desc = self._build_prompt_block(
            current_step, satisfied, opponent_grabbed_objects, opponent_last_room,
            action_history, dialogue_history
        )

        # 2) available action
        message = None
        available_plans_str, num, available_plans_list, action_cost = self.get_available_plans()
        available_plans_list = self._coerce_available_list(available_plans_list)

        if num == 0 or (message is not None and num == 1):
            if self.debug:
                print("Warning! No available plans!")
            info.update({"num_available_actions": num, "plan": None})
            return None, info

        prompt = prompt.replace('$AVAILABLE_ACTIONS$', available_plans_str)


        # 3) planning
        plan, plan_info = self._plan(
            mode, prompt, available_plans_list,
            stage_usages=stage_usages, stage_tokens=stage_tokens
        )
        self.initial_plan = plan if self._is_action_in_available(plan,available_plans_list) else None
        info.update(plan_info)
        flag = self._flag(
            plan, current_room, obj_per_room, action_history)

        if flag:
            reasoning_trace_for_check = (
                    info.get('reasoning_traces')
                    or info.get('final_reasoning')
                    or ""
            )
            plan, v_info = self._verify_evaluate_select(
                mode, plan,
                progress_desc=progress_desc,
                dialogue_history_desc=dialogue_history_desc,
                action_history_desc=action_history_desc,
                available_plans_str=available_plans_str,
                available_plans_list=available_plans_list,
                stage_usages=stage_usages,
                stage_tokens=stage_tokens,
                reasoning_trace_for_check=reasoning_trace_for_check
            )
            info.update(v_info)


        if self.debug:
            print(f"[Final] plan: {plan}\n")

        info.update({
            "num_available_actions": num,
            "prompts": prompt,
            "plan": plan,
            "total_cost": self.total_cost,
            "usage": sum(stage_tokens.values()),
            "stage_usages": stage_usages,
            "stage_tokens": stage_tokens
        })
        return plan, info




    # ------------------------------- Dispatcher -------------------------------
    def run(
            self,
            current_step: int,
            current_room: str,
            rooms_explored: dict,
            holding_objects: list,
            satisfied: list,
            object_list: list,
            obj_per_room: dict,
            action_history: list,
            dialogue_history: list,
            opponent_grabbed_objects: Optional[list] = None,
            opponent_last_room: Optional[str] = None,
            visible_objects = None,
    ):
        """Route to reasoning or base pipeline depending on model family.
        Defaults to reasoning path for gpt-oss/gemma/gpt-4o-mini families.
        """

        self.current_room = current_room
        self.rooms_explored = rooms_explored
        self.holding_objects = holding_objects
        self.object_list = object_list
        self.obj_per_room = obj_per_room
        self.visible_objects = visible_objects
        self.opponent_last_room = opponent_last_room

        if 'gpt-oss' in self.lm_id:

            return self.run_pipeline(
                "reasoning",
                current_step,
                current_room,
                rooms_explored,
                holding_objects,
                satisfied,
                object_list,
                obj_per_room,
                action_history,
                dialogue_history,
                opponent_grabbed_objects,
                opponent_last_room,)
        else:
            return self.run_pipeline(
                "base",
                current_step,
                current_room,
                rooms_explored,
                holding_objects,
                satisfied,
                object_list,
                obj_per_room,
                action_history,
                dialogue_history,
                opponent_grabbed_objects,
                opponent_last_room,)


def parse_name_id(s: str):
    """Extract ``(name, id)`` from text containing ``<name> (123)``.

    The first ``<...>`` group immediately followed (optional whitespace
    allowed) by a parenthesized integer wins. Returns ``(None, None)`` when
    the text has no such pattern.
    """
    found = re.search(r"<([^<>]+)>\s*\((\d+)\)", s.strip())
    if found is None:
        return None, None
    return found.group(1), int(found.group(2))
