import random
import ast
import openai
import torch
import json
import os
import pandas as pd
from openai import OpenAIError
import backoff
import numpy as np
from openai import OpenAI
import re
import tiktoken
import time, copy, random
from typing import Dict, Any, Tuple, List, Optional
def get_tokenizer():
	"""Return the ``o200k_harmony`` tiktoken encoding, building it only once.

	The encoding reuses the pattern and merge ranks of ``o200k_base`` and
	extends its special tokens with the harmony control tokens plus the
	reserved id range 200013..201087.

	Constructing a ``tiktoken.Encoding`` over the full merge table is
	expensive, so the instance is cached on the function object after the
	first call and the same object is returned on every later call.
	"""
	cached = getattr(get_tokenizer, "_cached_encoding", None)
	if cached is not None:
		return cached

	o200k_base = tiktoken.get_encoding("o200k_base")
	harmony_tokens = {
		"<|startoftext|>": 199998,
		"<|endoftext|>": 199999,
		"<|reserved_200000|>": 200000,
		"<|reserved_200001|>": 200001,
		"<|return|>": 200002,
		"<|constrain|>": 200003,
		"<|reserved_200004|>": 200004,
		"<|channel|>": 200005,
		"<|start|>": 200006,
		"<|end|>": 200007,
		"<|message|>": 200008,
		"<|reserved_200009|>": 200009,
		"<|reserved_200010|>": 200010,
		"<|reserved_200011|>": 200011,
		"<|call|>": 200012,
	}
	reserved_tail = {f"<|reserved_{i}|>": i for i in range(200013, 201088)}

	encoding = tiktoken.Encoding(
		name="o200k_harmony",
		pat_str=o200k_base._pat_str,
		mergeable_ranks=o200k_base._mergeable_ranks,
		# Same merge order as before: base tokens, harmony overrides, reserved tail.
		special_tokens={**o200k_base._special_tokens, **harmony_tokens} | reserved_tail,
	)
	get_tokenizer._cached_encoding = encoding
	return encoding

class LLM:
	def __init__(self,
				source,  # backend selector: 'openai', 'huggingface' or 'debug'
				lm_id,
				prompt_template_path,
				communication,
				cot,
				sampling_parameters,
				agent_id,
				random_reasoning,
				cot_round
				):
		"""Set up agent identity, prompt templates, sampling parameters and the text generator.

		Args:
			source: 'openai', 'huggingface' or 'debug'; any other value raises ValueError.
			lm_id: model identifier; decides ``self.chat`` and which backend branch
				the generator takes.
			prompt_template_path: CSV file read with pandas. Rows 0-3 of its
				'prompt' column become the planning, check, evaluation and
				communication templates. A path containing 'single' sets ``self.single``.
			communication: stored as ``self.communication``.
			cot: stored as ``self.cot``; the generator uses it to choose between the
				ollama call and the OpenAI Responses call.
			sampling_parameters: object exposing ``debug``, ``max_tokens``, ``t``,
				``top_p`` and ``n`` (plus ``logprobs`` and ``echo`` for non-chat
				'openai' models).
			agent_id: 1 names the agent "Alice" (opponent "Bob"); any other value
				names it "Bob".
			random_reasoning: stored as ``self.random_reasoning``.
			cot_round: stored as ``self.cot_rounds``.

		Raises:
			ValueError: if ``source`` is not one of the three recognised values.
		"""
		self.goal_desc = None
		self.goal_location_with_r = None
		self.agent_id = agent_id
		self.opponent_agent_id = 2 if agent_id == 1 else 1
		self.agent_name = "Alice" if agent_id == 1 else "Bob"
		# NOTE(review): for an agent_id other than 1 or 2 both names come out as "Bob".
		self.oppo_name = "Alice" if agent_id == 2 else "Bob"
		self.oppo_pronoun = "she" if agent_id == 2 else "he"
		self.debug = sampling_parameters.debug
		self.goal_location = None
		self.goal_location_id = None
		self.roomname2id = {}
		self.rooms = []
		self.prompt_template_path = prompt_template_path
		self.single = 'single' in self.prompt_template_path
		df = pd.read_csv(self.prompt_template_path)
		# Each template has the $AGENT_NAME$ / $OPPO_NAME$ placeholders filled in once here.
		self.prompt_template = df['prompt'][0].replace("$AGENT_NAME$", self.agent_name).replace("$OPPO_NAME$", self.oppo_name)
		self.check_prompt_template = df['prompt'][1].replace("$AGENT_NAME$", self.agent_name).replace("$OPPO_NAME$", self.oppo_name)
		self.evaluation_prompt_template = df['prompt'][2].replace("$AGENT_NAME$", self.agent_name).replace("$OPPO_NAME$", self.oppo_name)
		self.communication_prompt_template = df['prompt'][3].replace("$AGENT_NAME$", self.agent_name).replace("$OPPO_NAME$", self.oppo_name)

		self.steps = 0

		self.communication = communication
		self.cot = cot
		self.source = source
		self.lm_id = lm_id
		# True when lm_id contains one of the listed chat-model name fragments.
		self.chat = 'gpt-3.5-turbo' in lm_id or 'gpt-4' in lm_id or 'gpt-oss' in lm_id or 'gpt-5-nano' in lm_id
		self.OPENAI_KEY = None
		self.total_cost = 0
		self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

		self.oppo_plan = None
		self.random_reasoning = random_reasoning
		# Seeded RNG; no use of it appears in the portion of the file reviewed.
		self._rng = random.Random(42)

		self.cot_rounds = cot_round
		self.stop_on_convergence = True 
		self.anneal_temperature = True

		# Per-action-prefix cost; every listed prefix currently costs 1.0.
		self.DEFAULT_COST_TABLE = {
			'send_message': 1.0,   
			'goput': 1.0,          
			'goexplore': 1.0,     
			'gocheck': 1.0,       
			'gograb': 1.0,        
		}

		# Matches a whole "[prefix] <name> (id)" action string; group 1 is the prefix.
		self.ACTION_PREFIX_RE = re.compile(r'^\s*\[([a-zA-Z_]+)\]\s*<.*?>\s*\(\d+\)\s*$')
		# Matches a trailing "(id)"; group 1 is the digits.
		self.ACTION_ID_RE     = re.compile(r'\((\d+)\)\s*$')

		if self.source == 'openai':
			# NOTE(review): reads the OPENAI_KEY environment variable; confirm this is the
			# variable the OpenAI() client built in _generate actually relies on.
			openai.api_key = os.getenv("OPENAI_KEY")
			if self.chat:
				# NOTE(review): the key is "num_ctx" but its value is max_tokens. This dict
				# is forwarded as `options=` to ollama.chat below - confirm the option name.
				self.sampling_params = {
					"num_ctx": sampling_parameters.max_tokens,
					"temperature": sampling_parameters.t,
					"top_p": sampling_parameters.top_p,
					"n": sampling_parameters.n,
				}
			else:
				self.sampling_params = {
					"max_tokens": sampling_parameters.max_tokens,
					"temperature": sampling_parameters.t,
					"top_p": sampling_parameters.top_p,
					"n": sampling_parameters.n,
					"logprobs": sampling_parameters.logprobs,
					"echo": sampling_parameters.echo,
				}
		elif source == 'huggingface':
			# This dict is splatted into model.generate / pipeline / chat.completions.create below.
			self.sampling_params = {
				"max_tokens": sampling_parameters.max_tokens,
				"temperature": sampling_parameters.t,
				"top_p": sampling_parameters.top_p,
				"n": sampling_parameters.n,
			}
		elif source == "debug":
			# In debug mode the raw parameter object is stored instead of a dict.
			self.sampling_params = sampling_parameters
		else:
			raise ValueError("invalid source")

		def lm_engine(source, lm_id, device):
			"""Load any local model needed for ``lm_id`` and return the ``_generate`` closure.

			The ``device`` argument is not referenced inside this function.
			"""
			if source == 'huggingface':
				from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
				from transformers import AutoProcessor, Gemma3nForConditionalGeneration
				print(f"loading huggingface model {lm_id}")
				if lm_id == "qwen3-4b":
					model_name = "Qwen/Qwen3-4B-Instruct-2507"
					tokenizer = AutoTokenizer.from_pretrained(model_name)
					model = AutoModelForCausalLM.from_pretrained(
						model_name,
						torch_dtype="auto",
						device_map="auto"
					)
				elif lm_id == "llama-3.2-3b":
					model_name = "meta-llama/Llama-3.2-3B-Instruct"
					# Rebinds the imported `pipeline` factory to the constructed pipeline object.
					pipeline = pipeline(
						"text-generation",
						model=model_name,
						torch_dtype=torch.bfloat16,
						device_map="auto",
					)
				print(f"loaded huggingface model {lm_id}")
	
			@backoff.on_exception(backoff.expo, OpenAIError)
			def _generate(prompt, sampling_params, is_check=False):
				"""Run one generation and return ``([text, reasoning], 0, token_count)``.

				``token_count`` is computed locally with the tokenizer from
				``get_tokenizer()`` over the last prompt message, the generated text
				and (for gpt-oss models) the reasoning text. The decorator retries
				with exponential backoff when an ``OpenAIError`` is raised.
				"""
				usage = 0
				if source == 'openai':
					try:
						if self.chat:
							if lm_id == "gpt-oss:20b":
								from ollama import Client
								ollama_host = os.environ.get("OLLAMA_HOST", "http://localhost:11434")
								ollama = Client(host=ollama_host)
							else:   
								client = OpenAI()
							# NOTE(review): the call below is chosen by self.cot, while `ollama` and
							# `client` are bound by lm_id above; a mismatched combination would hit
							# an unbound local name - confirm the allowed (lm_id, cot) pairs.
							if self.cot != "none" and self.cot != "true":
								if is_check:
									response = ollama.chat('gpt-oss:20b', messages=prompt, think="medium", options=sampling_params)
								else:
									response = ollama.chat('gpt-oss:20b', messages=prompt, think=self.cot, options=sampling_params)
							else:
								response = client.responses.create(
									model=lm_id, input=prompt
								)
							# print(json.dumps(response, indent=4))
							if "gpt-oss" in self.lm_id:
								generated_samples = response.message.content
								generated_reasoning = response.message.thinking
							elif 'gpt-4o-mini' in self.lm_id or 'gpt-5-nano' in self.lm_id:
								generated_samples = response.output_text
								generated_reasoning = ""
							else:
								# Only the first of the n choices is kept.
								generated_samples = [response.choices[i].message.content for i in range(sampling_params['n'])][0]
								generated_reasoning = ""
							
							if self.debug:
								with open(f"LLM/chat_raw.json", 'a') as f:
									f.write(json.dumps(generated_samples, indent=4))
									f.write('\n')
						elif "text-" in lm_id:
							# NOTE(review): `client` is only assigned inside the chat branch above, and
							# this branch never sets generated_reasoning, which the return below reads.
							response = client.completions.create(model=lm_id, prompt=prompt, **sampling_params)
							if self.debug:
								with open(f"LLM/raw.json", 'a') as f:
									f.write(json.dumps(response, indent=4))
									f.write('\n')
							generated_samples = [response.choices[i].text for i in range(sampling_params['n'])]
						else:
							raise ValueError(f"{lm_id} not available!")
					except OpenAIError as e:
						# Log, then re-raise so the backoff decorator can retry.
						print(e)
						raise e
				elif source == 'huggingface':
					if lm_id == "qwen3-4b":
						text = tokenizer.apply_chat_template(
							prompt,
							tokenize=False,
							add_generation_prompt=True,
							enable_thinking=False
						)
						model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
						# conduct text completion
						generated_ids = model.generate(
							**model_inputs,
							**sampling_params
						)
						# Keep only the tokens that follow the prompt.
						output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist() 

						generated_samples = tokenizer.decode(output_ids, skip_special_tokens=True)
						generated_reasoning = ""
					elif lm_id == "llama-3.2-3b":
						text = pipeline.tokenizer.apply_chat_template(
							prompt, 
							tokenize=False, 
							add_generation_prompt=True
						)
						terminators = [
							pipeline.tokenizer.eos_token_id,
							pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
						]
						outputs = pipeline(
							text,
							eos_token_id=terminators,
							pad_token_id = pipeline.tokenizer.eos_token_id,
							return_full_text=False,
							**sampling_params
						)
						generated_samples = outputs[0]["generated_text"]
						generated_reasoning = ""
					elif lm_id == "gemma3-4b" or lm_id == "gemma3-1b" or lm_id == "gemma3-12b" or lm_id == "gemma3-27b":
						client = OpenAI(
							base_url="http://localhost:11434/v1",  # Local Ollama API
							api_key="ollama"                       # Dummy key
						)
						# e.g. "gemma3-4b" becomes "gemma3:4B".
						_lm_id = lm_id.replace("-", ":").replace("b", "B")
						response = client.chat.completions.create(
							model=_lm_id, messages=prompt, **sampling_params
						)
						generated_samples = [response.choices[i].message.content for i in range(sampling_params['n'])][0]
						generated_reasoning = ""
					# NOTE(review): any other huggingface lm_id leaves generated_samples unassigned.
				elif source == "debug":
					# NOTE(review): a one-element list, whereas every other path returns a
					# 3-tuple that callers in this class unpack into three names.
					return ["navigation"]
				else:
					raise ValueError("invalid source")
				_tokenizer = get_tokenizer()
				if "gpt-oss" in self.lm_id:
					content_usage = _tokenizer.encode(prompt[-1]['content'])
					generated_samples_usage = _tokenizer.encode(generated_samples)
					generated_reasoning_usage = _tokenizer.encode(generated_reasoning)
				else:
					content_usage = _tokenizer.encode(prompt[-1]['content'])
					generated_samples_usage = _tokenizer.encode(generated_samples)
					# NOTE(review): this list has length 1, so it adds one to `usage` below.
					generated_reasoning_usage = [""]
				usage = len(content_usage) + len(generated_samples_usage) + len(generated_reasoning_usage)
				# The middle element is always 0; the token count goes last.
				return [generated_samples, generated_reasoning], 0, usage

			return _generate

		self.generator = lm_engine(self.source, self.lm_id, self.device)


	def reset(self, rooms_name, roomname2id, goal_location, unsatisfied):
		self.rooms = rooms_name
		self.roomname2id = roomname2id
		self.goal_location = goal_location
		self.goal_location_id = int(self.goal_location.split(' ')[-1][1:-1])
		self.goal_desc, self.goal_location_with_r = self.goal2description(unsatisfied, None)
		self.steps = 0

	def goal2description(self, goals, goal_location_room):  # {predicate: count}
		# print(goals)
		map_rel_to_pred = {
			'inside': 'into',
			'on': 'onto',
		}
		s = "Find and put "
		r = None
		for predicate, vl in goals.items():
			relation, obj1, obj2 = predicate.split('_')
			count = vl
			if count == 0:
				continue
			if relation == 'holds':
				continue
				# s += f"Alice holds a book, "
			elif relation == 'sit':
				continue
				# s += f"Alice sits in {obj2}, "
			else:
				s += f"{count} {obj1}{'s' if count > 1 else ''}, "
				r = relation
		if r is None:
			return "None."

		s = s[:-2] + f" {map_rel_to_pred[r]} the {self.goal_location}."
		# if type(goal_location_room) is not list:
		# 	s += f" in the {goal_location_room}."
		# else:
		# 	ss = ' or '.join([f'{room}' for room in goal_location_room])
		# 	s += f", which may be in the {ss}."
		return s, f"{map_rel_to_pred[r]} the {self.goal_location}"
		
	def parse_answer(self, available_actions, text):
		for idx, action in enumerate(available_actions):
			available_actions[idx] = available_actions[idx].split(' -')[0]
		
		for i in range(len(available_actions)):
			action = available_actions[i]
			if action in text:
				return action

		for i in range(len(available_actions)):
			action = available_actions[i]
			option = chr(ord('A') + i)
			# txt = text.lower()
			if f"option {option}" in text or f"{option}." in text.split(' ') or f"{option}," in text.split(' ') or f"Option {option}" in text or f"({option})" in text:
				return action
			
			parsed_action = action.split(' -')[0]
			if parsed_action in text:
				return action
			
			pattern = r"\((\d+)\)"
			match = re.search(pattern, text)
			if match:
				_id = match.group(1)  
				if _id in action:     
					return action

		text = text.replace(".", "").replace("*", "").replace("Answer: ", "")
		for i in range(len(available_actions)):
			action = available_actions[i]
			act, name, id = action.split(' ')
			option = chr(ord('A') + i)
			if f"{option} " in text or act in text or name in text or id in text:
				return action
			if f"{option}" == text:
				return action
		print("text", text)
		print("WARNING! No available action parsed!!! Random choose one")
		return random.choice(available_actions)
		
	def progress2text(self, opponent_same_room, current_room, grabbed_objects, unchecked_containers, ungrabbed_objects, goal_location_room, satisfied, opponent_grabbed_objects, opponent_last_room, room_explored):
		sss = {}
		for room, objs in ungrabbed_objects.items():
			cons = unchecked_containers[room]
			extra_obj = None
			if type(goal_location_room) is not list and goal_location_room == room:
				extra_obj = self.goal_location
			if objs is None and extra_obj is None and (room_explored is None or not room_explored[room]):
				sss[room] = f"The {room} is unexplored. "
				continue
			s = ""
			s_obj = ""
			s_con = ""
			if extra_obj is not None:
				s_obj = f"{extra_obj}, "
			if objs is not None and len(objs) > 0:
				if len(objs) == 1:
					x = objs[0]
					s_obj += f"<{x['class_name']}> ({x['id']})"
				else:
					ss = ', '.join([f"<{x['class_name']}> ({x['id']})" for x in objs])
					s_obj += ss
			elif extra_obj is not None:
				s_obj = s_obj[:-2]
			if cons is not None and len(cons) > 0:
				if len(cons) == 1:
					x = cons[0]
					s_con = f"an unchecked container <{x['class_name']}> ({x['id']})"
				else:
					ss = ', '.join([f"<{x['class_name']}> ({x['id']})" for x in cons])
					s_con = f"unchecked containers " + ss
			if s_obj == "" and s_con == "":
				s += 'nothing'
				if room_explored is not None and not room_explored[room]:
					s += ' yet'
			elif s_obj != "" and s_con != "":
				s += s_obj + ', and ' + s_con
			else:
				s += s_obj + s_con
			sss[room] = s

		if len(satisfied) == 0:
			s = ""
		else:
			s = f"{'I' if self.single else 'We'}'ve already found and put "
			s += ', '.join([f"<{x['class_name']}> ({x['id']})" for x in satisfied])
			s += ' ' + self.goal_location_with_r + '. '

		if len(grabbed_objects) == 0:
			s += "I'm holding nothing. "
		else:
			s += f"I'm holding <{grabbed_objects[0]['class_name']}> ({grabbed_objects[0]['id']}). "
			if len(grabbed_objects) == 2:
				s = s[:-2] + f" and <{grabbed_objects[1]['class_name']}> ({grabbed_objects[1]['id']}). "
		
		
		s += f"I'm in the {current_room['class_name']}, where I can currently see {sss[current_room['class_name']]}. "
		
		### opponent modeling ###
		if not self.single:
			ss = ""
			if len(opponent_grabbed_objects) == 0:
				ss += "nothing. "
			else:
				ss += f"<{opponent_grabbed_objects[0]['class_name']}> ({opponent_grabbed_objects[0]['id']}). "
				if len(opponent_grabbed_objects) == 2:
					ss = ss[:-2] + f" and <{opponent_grabbed_objects[1]['class_name']}> ({opponent_grabbed_objects[1]['id']}). "
			
			if opponent_last_room is None:
				s += f"I don't know where {self.oppo_name} is. "
			elif opponent_last_room == current_room['class_name']:
				if opponent_same_room:
					
					s += f"I also see {self.oppo_name} here in the {current_room['class_name']}, {self.oppo_pronoun} is holding {ss}"
				else:
					
					s += f"I last saw {self.oppo_name} in the {opponent_last_room}, and at that time, {self.oppo_pronoun} was holding {ss} I'm not sure where {self.oppo_pronoun} is now. "
			else:

				s += f"I last saw {self.oppo_name} in the {opponent_last_room}, and at that time, {self.oppo_pronoun} was holding {ss} I'm not sure where {self.oppo_pronoun} is now. "


		for room in self.rooms:
			if room == current_room['class_name']:
				continue
			if 'unexplored' in sss[room]:
				s += sss[room]
			else:

				s += f"When I last checked the {room}, I remember finding {sss[room]}. "

		return f"I've taken {self.steps}/250 steps. " + s


	def get_available_plans(self, physical_memory, grabbed_objects, unchecked_containers, ungrabbed_objects, message, room_explored):
		"""
		[goexplore] <room>
		[gocheck] <container>
		[gograb] <target object>
		[goput] <goal location>
		[send_message] <"">
		"""
		available_plans = []
		for room in self.rooms:
			if (room_explored is None or room_explored[room]) and unchecked_containers[room] is not None:
				continue
			my_distance = np.linalg.norm(physical_memory[self.roomname2id[room]] - physical_memory[self.agent_id], 2)
			my_distance = round(my_distance, 2)

			if physical_memory[self.opponent_agent_id] is not None:
				other_distance = np.linalg.norm(physical_memory[self.roomname2id[room]] - physical_memory[self.opponent_agent_id], 2)
				other_distance = round(other_distance, 2)
			else:
				other_distance = None

			if other_distance is not None:
				available_plans.append(f"[goexplore] <{room}> ({self.roomname2id[room]}) - my cost: {my_distance} meters, opponent cost: {other_distance} meters")
			else:
				available_plans.append(f"[goexplore] <{room}> ({self.roomname2id[room]}) - my cost: {my_distance} meters")

		if len(grabbed_objects) < 2:
			for cl in unchecked_containers.values():
				if cl is None:
					continue
				for container in cl:
					my_distance = np.linalg.norm(physical_memory[container['id']] - physical_memory[self.agent_id], 2)
					my_distance = round(my_distance, 2)

					if physical_memory[self.opponent_agent_id] is not None:
						other_distance = np.linalg.norm(physical_memory[container['id']] - physical_memory[self.opponent_agent_id], 2)
						other_distance = round(other_distance, 2)
					else:
						other_distance = None

					if other_distance is not None:
						available_plans.append(f"[gocheck] <{container['class_name']}> ({container['id']}) - my cost: {my_distance} meters, opponent cost: {other_distance} meters")
					else:
						available_plans.append(f"[gocheck] <{container['class_name']}> ({container['id']}) - my cost: {my_distance} meters")
			for ol in ungrabbed_objects.values():
				if ol is None:
					continue
				for obj in ol:
					my_distance = np.linalg.norm(physical_memory[obj['id']] - physical_memory[self.agent_id], 2)
					my_distance = round(my_distance, 2)

					if physical_memory[self.opponent_agent_id] is not None:
						other_distance = np.linalg.norm(physical_memory[obj['id']] - physical_memory[self.opponent_agent_id], 2)
						other_distance = round(other_distance, 2)
					else:
						other_distance = None

					if other_distance is not None:
						available_plans.append(f"[gograb] <{obj['class_name']}> ({obj['id']}) - my cost: {my_distance} meters, opponent cost: {other_distance} meters")
					else:
						available_plans.append(f"[gograb] <{obj['class_name']}> ({obj['id']}) - my cost: {my_distance} meters")

		if len(grabbed_objects) > 0 and self.goal_location_id in physical_memory:
			distance = np.linalg.norm(physical_memory[self.goal_location_id] - physical_memory[self.agent_id], 2)
			distance = round(distance, 2)
			available_plans.append(f"[goput] {self.goal_location} - my cost: {distance} meters")
		
		plans = ""
		for i, plan in enumerate(available_plans):
			plans += f"{chr(ord('A') + i)}. {plan}\n"

		return plans, len(available_plans), available_plans

	def parse_evals_to_dict(self, evals_raw: Any) -> Dict[str, Dict[str, Any]]:
		if isinstance(evals_raw, dict):
			data = evals_raw
		else:
			s = str(evals_raw)
			s = self._strip_code_fences(s)

			try:
				data = json.loads(s)
			except Exception:
				try:
					data = ast.literal_eval(s)
				except Exception as e:
					raise ValueError(
						f"Failed to parse evaluator output as dict. "
						f"Got type={type(evals_raw)} preview={s[:120]!r}"
					) from e

		if not isinstance(data, dict):
			raise ValueError(f"Parsed evaluator output is not a dict. Got type={type(data)}")

		cleaned: Dict[str, Dict[str, Any]] = {}
		for scen, d in data.items():
			if not isinstance(d, dict):
				raise ValueError(f"Scenario '{scen}' is not a dict: {type(d)}")

			L = d.get('Likelihood')
			G = d.get('Gain')
			C = d.get('CostPenalty')
			A = d.get('Action')

			try:
				L = int(L)
			except Exception:
				raise ValueError(f"Scenario '{scen}': Likelihood not int-like: {L!r}")
			try:
				G = int(G)
			except Exception:
				raise ValueError(f"Scenario '{scen}': Gain not int-like: {G!r}")
			try:
				C = int(C)
			except Exception:
				raise ValueError(f"Scenario '{scen}': CostPenalty not int-like: {C!r}")

			if not isinstance(A, str):
				A = "" if A is None else str(A)

			cleaned[scen] = {'Likelihood': L, 'Gain': G, 'CostPenalty': C, 'Action': A}

		return cleaned
		
	def normalize_1to5(self, x: int) -> float:
		"""Clamp ``int(x)`` to the range 1..5 and rescale it linearly to 0.0..1.0."""
		score = int(x)
		if score < 1:
			score = 1
		elif score > 5:
			score = 5
		return (score - 1) / 4.0

	def get_action_prefix(self, action: str) -> str:
		m = self.ACTION_PREFIX_RE.match(action)
		return m.group(1).lower() if m else 'unknown'

	def get_action_id(self, action: str) -> int:
		m = self.ACTION_ID_RE.search(action)
		return int(m.group(1)) if m else -1

	def action_cost(self, action: str, cost_table: Dict[str, float]) -> float:
		prefix = self.get_action_prefix(action)
		return cost_table.get(prefix, 1.0)

	def compute_utility(
		self,
		likelihood: int,
		gain: int,
		cost_penalty: int,
		action: str,
		lambda_cost: float = 1.0,
		normalize: bool = True,
	) -> Tuple[float, float, float, float]:
		L = self.normalize_1to5(likelihood) if normalize else float(likelihood)
		G = self.normalize_1to5(gain) if normalize else float(gain)
		C = self.normalize_1to5(cost_penalty) if normalize else float(cost_penalty)
		U = L * G - lambda_cost * C
		return U, L, G, C

	def rank_scenarios(
		self,
		evals: Dict[str, Dict[str, Any]],
		lambda_cost: float = 1.0,
		normalize: bool = True,
	) -> List[Dict[str, Any]]:
		rows = []
		for scen, d in evals.items():
			L = int(d['Likelihood'])
			G = int(d['Gain'])
			C = int(d['CostPenalty'])
			A = str(d['Action'])
			U, L_used, G_used, C_used = self.compute_utility(
				L, G, C, A,
				lambda_cost=lambda_cost,
				normalize=normalize,
			)
			rows.append({
				'scenario': scen,
				'action': A,
				'action_id': self.get_action_id(A),
				'likelihood': L,
				'gain': G,
				'cost_penalty': C,
				'L_used': L_used,
				'G_used': G_used,
				'C_used': C_used,
				'utility': U,
			})
		rows.sort(key=lambda r: r['utility'], reverse=True)
		return rows

	def select_best_action(
		self,
		evals: Dict[str, Dict[str, Any]],
		**kw
	) -> Dict[str, Any]:
		ranked = self.rank_scenarios(evals, **kw)
		best = ranked[0] if ranked else {}
		return {
			'best_action': best.get('action'),
			'best_action_id': best.get('action_id'),
			'best_scenario': best.get('scenario'),
			'utility': best.get('utility'),
			'L_used': best.get('L_used'),
			'G_used': best.get('G_used'),
			'C_used': best.get('C_used'),
			'ranked': ranked,
		}

	def _strip_code_fences(self, s: str) -> str:
		s = (s or "").strip()
		if s.startswith("```"):
			s = re.sub(r"^```[a-zA-Z0-9_+\-]*\n", "", s)
			if s.endswith("```"):
				s = s[:-3]
		return s.strip()

	def _safe_generator(self, prompt_obj, params, *, is_check: bool = False, retries: int = 2, sleep: float = 0.4):
		last_exc = None
		for t in range(retries + 1):
			try:
				outputs, usage, tokens = self.generator(prompt_obj, params, is_check=is_check)
				outs = [(self._strip_code_fences(o) if isinstance(o, str) else o) for o in outputs]
				return outs, usage, tokens
			except Exception as e:
				last_exc = e
				if t < retries:
					time.sleep(sleep * (1.5 ** t))
				else:
					raise last_exc

	def format_message(self, text: str) -> str:
		"""Normalise a ``[send_message]`` action to the form ``[send_message] <"...">``.

		Angle brackets and double quotes are removed from the message body,
		which is then trimmed. ``None`` and text that does not begin (after
		trimming) with ``[send_message]`` are returned unchanged.
		"""
		if text is None:
			return text
		parsed = re.match(r"(\[send_message\])\s*(.*)", text.strip())
		if parsed is None:
			# Not a send_message action; hand the input back untouched.
			return text
		prefix = parsed.group(1)
		body = parsed.group(2)
		for unwanted in ('<', '>', '"'):
			body = body.replace(unwanted, '')
		body = body.strip()
		return f'{prefix} <"{body}">'
		
	def _is_action_in_available(self, action_str: str, available_list: List[str]) -> bool:
		if not action_str:
			return "", False
		a = action_str.strip()
		if a.startswith("[send_message]"):
			a = self.format_message(a)
			return a, True

		if any(a == x.strip() for x in available_list):
			return a, True
			
		m = re.search(r"\((\d+)\)", a)
		if not m:
			return "", False
		aid = int(m.group(1))
		for x in available_list:
			mx = re.search(r"\((\d+)\)", x)
			if mx and int(mx.group(1)) == aid:
				return x, True
		return "", False

	def _first_valid_action_from_ranked(self, ranked: List[Dict[str, Any]], available_list: List[str]) -> Optional[str]:
		for row in ranked or []:
			a = (row.get('action') or "").strip()
			refined_a, check = self._is_action_in_available(a, available_list)
			if check:
				return refined_a
		return None

	def _coerce_available_list(self, lst: List[str]) -> List[str]:
		seen, cleaned = set(), []
		for a in lst:
			t = (a or "").strip()
			if t and t not in seen:
				seen.add(t)
				cleaned.append(t)
		return cleaned

	def _safe_parse_evals(self, evals_raw: Any) -> Dict[str, Dict[str, Any]]:
		import json, re

		def _try_json(s: str):
			return json.loads(s)

		def _strip(s: str) -> str:
			return self._strip_code_fences(s or "")

		s = _strip(evals_raw if isinstance(evals_raw, str) else str(evals_raw))

		try:
			return self.parse_evals_to_dict(s)
		except Exception:
			pass

		m = re.search(r"```json\s*(\{.*?\})\s*```", s, re.S | re.I)
		if m:
			block = m.group(1)
			try:
				return self.parse_evals_to_dict(block)
			except Exception:
				try:
					return _try_json(block)
				except Exception:
					pass

		def _largest_json_object(text: str) -> str | None:
			start_idx = None
			depth = 0
			best = None
			for i, ch in enumerate(text):
				if ch == '{':
					if depth == 0:
						start_idx = i
					depth += 1
				elif ch == '}':
					if depth > 0:
						depth -= 1
						if depth == 0 and start_idx is not None:
							cand = text[start_idx:i+1]
							# 더 긴 후보를 보관
							if best is None or len(cand) > len(best):
								best = cand
			return best

		largest = _largest_json_object(s)
		if largest:
			try:
				return self.parse_evals_to_dict(largest)
			except Exception:
				import json
				return json.loads(largest)

		m2 = re.search(r"\{.*\}\s*$", s, re.S)
		if m2:
			try:
				return self.parse_evals_to_dict(m2.group(0))
			except Exception:
				import json
				return json.loads(m2.group(0))

		# 모두 실패 시 예외
		raise ValueError("Failed to extract JSON from evaluator output.")

	def _build_prompt_block(
		self,
		opponent_same_room, current_room, grabbed_objects,
		unchecked_containers, ungrabbed_objects, goal_location_room,
		satisfied, opponent_grabbed_objects, opponent_last_room, room_explored,
		action_history: List[str], dialogue_history: List[str]
	) -> Tuple[str, str, str, str]:
		progress_desc = self.progress2text(
			opponent_same_room, current_room, grabbed_objects,
			unchecked_containers, ungrabbed_objects, goal_location_room,
			satisfied, opponent_grabbed_objects, opponent_last_room, room_explored
		)
		action_history_desc = ", ".join(action_history[-10:] if len(action_history) > 10 else action_history)
		dialogue_history_desc = '\n'.join(dialogue_history[-3:] if len(dialogue_history) > 3 else dialogue_history)

		prompt = (self.prompt_template
				.replace('$GOAL$', self.goal_desc)
				.replace('$PROGRESS$', progress_desc)
				.replace('$ACTION_HISTORY$', action_history_desc)
				.replace('$DIALOGUE_HISTORY$', dialogue_history_desc))
		return prompt, progress_desc, action_history_desc, dialogue_history_desc

	def _flag(
		self,
		plan: str,
		current_room: Dict[str, Any],
		unchecked_containers: Dict[str, List[Dict[str, Any]]],
		ungrabbed_objects: Dict[str, List[Dict[str, Any]]],
		action_history: List[str],
		opponent_same_room: bool
	) -> bool:
		flag = True
		cr = current_room['class_name']

		for container in unchecked_containers.get(cr, []):
			if str(container.get('id')) in plan:
				flag = False
				break
		if flag:
			for obj in ungrabbed_objects.get(cr, []):
				if str(obj.get('id')) in plan:
					flag = False
					break

		if "[goput]" in plan:
			flag = False

		recent_actions = action_history[-2:] if len(action_history) >= 2 else action_history
		if any("[send_message]" in a for a in recent_actions):
			flag = False

		if self.random_reasoning == "when":
			flag = (random.random() < 0.5) and flag

		if not self.communication:
			flag = False

		return flag

	def _plan(
		self,
		mode: str,
		prompt: str,
		available_plans_list: List[str],
		*,
		stage_usages: Dict[str, float],
		stage_tokens: Dict[str, int]
	) -> Tuple[str, Dict[str, Any]]:
		info: Dict[str, Any] = {}
		plan: Optional[str] = None

		if mode == "reasoning":
			if self.debug:
				print(f"[Planning(reasoning)] prompt:\n{prompt}")

			chat_prompt = [
				{"role": "system", "content": f"Reasoning: {self.cot}"},
				{"role": "user", "content": prompt}
			]
			try:
				outputs, usage, tokens = self._safe_generator(chat_prompt if self.chat else prompt, self.sampling_params)
				info['planning'] = [chat_prompt, outputs]
			except Exception as e:
				if self.debug:
					print(f"[PLANNER] generator failed: {e} -> fallback first available")
				plan = available_plans_list[0]
				info.update({"plan": plan})
				return plan, info

			generated_samples = outputs[0] if outputs else ""
			generated_reasoning = outputs[1] if len(outputs) > 1 else ""

			stage_usages['planner'] = stage_usages.get('planner', 0.0) + float(usage)
			stage_tokens['planner'] = stage_tokens.get('planner', 0) + int(tokens)
			self.total_cost += usage

			info['reasoning_traces'] = generated_reasoning
			info['final_reasoning'] = generated_samples

			if self.debug:
				print(f"[Planning(reasoning)] reasoning:\n{generated_reasoning}")
				print(f"[Planning(reasoning)] samples:\n{generated_samples}")

			candidate = (self.parse_answer(available_plans_list, generated_samples) or "").strip()
			if self._is_action_in_available(candidate, available_plans_list):
				plan = candidate
			else:
				chosen = None
				for a in available_plans_list:
					if a in generated_samples:
						chosen = a
						break
				plan = chosen or available_plans_list[0]

			info['plan'] = plan
			return plan, info

		k = max(int(self.cot_rounds), 1)
		stop_on_convergence = bool(self.stop_on_convergence)
		anneal = bool(self.anneal_temperature)

		if self.debug:
			print(f"[Planning(base)] rounds={k}, stop_on_convergence={stop_on_convergence}, anneal={anneal}")
			print(f"[Planning(base)] base_prompt:\n{prompt}")

		base_params = copy.deepcopy(self.sampling_params)
		reasoning_traces = ""
		last_norm = None

		round_prompt = prompt + " Let's think step by step."
		chat_prompt = [{"role": "user", "content": round_prompt}]
		try:
			outputs, usage, tokens = self._safe_generator(chat_prompt, base_params)
			info['planning'] = [chat_prompt, outputs]
		except Exception as e:
			if self.debug:
				print(f"[Planning(base) r1] generator failed: {e} -> fallback first available")
			plan = available_plans_list[0]
			info.update({"plan": plan})
			return plan, info

		generated_reasoning = outputs[0] if outputs else ""
		stage_usages['planner'] = stage_usages.get('planner', 0.0) + float(usage)
		stage_tokens['planner'] = stage_tokens.get('planner', 0) + int(tokens)
		self.total_cost += usage
		reasoning_traces += generated_reasoning

		if self.debug:
			print(f"[CoT r1] reasoning:\n{generated_reasoning}")

		for r in range(2, k + 1):
			if anneal and "temperature" in base_params:
				t0 = float(self.sampling_params.get("temperature", 1.0))
				t_min = 0.2
				frac = (r - 1) / (k - 1) if k > 1 else 1.0
				base_params["temperature"] = max(t_min, float(t0 - (t0 - t_min) * frac))

			reflection_prompt = (
				"You are refining your previous reasoning only for planning. "
				"1) Critique weaknesses or missing checks. "
				"2) Provide an improved, concise step-by-step plan. "
				"Do NOT output the final action yet."
			)
			round_prompt = (
				f"{prompt}\n\n"
				f"--- Previous reasoning (r{r-1}) ---\n{generated_reasoning}\n\n"
				f"--- Instruction ---\n{reflection_prompt}"
			)
			chat_prompt = [{"role": "user", "content": round_prompt}]
			try:
				outputs, usage, tokens = self._safe_generator(chat_prompt, base_params)
				info['planning'] += [chat_prompt, outputs]
			except Exception as e:
				if self.debug:
					print(f"[Planning(base) r{r}] generator failed: {e} (stop refine)")
				break

			generated_reasoning = outputs[0] if outputs else ""
			stage_usages['planner'] = stage_usages.get('planner', 0.0) + float(usage)
			stage_tokens['planner'] = stage_tokens.get('planner', 0) + int(tokens)
			self.total_cost += usage
			reasoning_traces += generated_reasoning

			if self.debug:
				print(f"[CoT r{r}] reasoning:\n{generated_reasoning}")

			if stop_on_convergence:
				cur_norm = "".join(generated_reasoning.split())
				if last_norm is not None and cur_norm == last_norm:
					if self.debug:
						print(f"[CoT] Early stop at round {r} due to convergence.")
					break
				last_norm = cur_norm

		# Final: action only
		final_reasoning = generated_reasoning
		chat_prompt = [
			{"role": "user", "content": prompt},
			{"role": "assistant", "content": final_reasoning},
			{"role": "user", "content": "Answer with only one best next action. So the answer is"}
		]
		try:
			outputs, usage, tokens = self._safe_generator(chat_prompt, self.sampling_params)
			info['planning'] += [chat_prompt, outputs]
			if self.debug:
				print(f"[Planning(base) final] outputs: {outputs}")
		except Exception as e:
			if self.debug:
				print(f"[Planning(base) final] generator failed: {e}")
			outputs, usage, tokens = ([""], 0.0, 0)

		generated_samples = outputs[0] if outputs else ""
		stage_usages['planner'] = stage_usages.get('planner', 0.0) + float(usage)
		stage_tokens['planner'] = stage_tokens.get('planner', 0) + int(tokens)
		self.total_cost += usage

		candidate = (self.parse_answer(available_plans_list, generated_samples) or "").strip()
		if self._is_action_in_available(candidate, available_plans_list):
			plan = candidate
		else:
			chosen = None
			for a in available_plans_list:
				if a in generated_samples:
					chosen = a
					break
			plan = chosen or available_plans_list[0]

		info.update({
			"reasoning_traces": final_reasoning,
			"final_reasoning": final_reasoning,
			"plan": plan
		})
		return plan, info

	def _verify_evaluate_select(
		self,
		mode: str,
		plan: str,
		*,
		progress_desc: str,
		dialogue_history_desc: str,
		action_history_desc: str,
		available_plans_str: str,
		available_plans_list: List[str],
		stage_usages: Dict[str, float],
		stage_tokens: Dict[str, int],
		reasoning_trace_for_check: str
	) -> Tuple[str, Dict[str, Any]]:
		info: Dict[str, Any] = {
		check_prompt = (self.check_prompt_template
						.replace('$REASONING_TRACE$', reasoning_trace_for_check)
						.replace('$GOAL$', self.goal_desc)
						.replace('$PROGRESS$', progress_desc)
						.replace('$DIALOGUE_HISTORY$', dialogue_history_desc)
						.replace('$ACTION_HISTORY$', action_history_desc)
						.replace('$AVAILABLE_ACTIONS$', available_plans_str))

		if self.debug:
			print(f"[Verifier] check_prompt:\n{check_prompt}")

		chat_prompt = [{"role": "user", "content": check_prompt}] if mode == "base" else [
			{"role": "system", "content": f"Reasoning: {self.cot}"},
			{"role": "user", "content": check_prompt}
		]
		try:
			check_outputs, usage, tokens = self._safe_generator(chat_prompt, self.sampling_params, is_check=True)
			info['verification'] = [chat_prompt, check_outputs]
		except Exception as e:
			if self.debug:
				print(f"[Verifier] generator failed: {e} (skip verification)")
			check_outputs, usage, tokens = ([""], 0.0, 0)

		scen_tree_raw = check_outputs[0] if check_outputs else ""
		stage_usages['verifier'] = stage_usages.get('verifier', 0.0) + float(usage)
		stage_tokens['verifier'] = stage_tokens.get('verifier', 0) + int(tokens)
		self.total_cost += usage

		info['check_outputs'] = check_outputs
		info['check_usage'] = usage

		if self.debug:
			print(f"[Verifier] scenario_tree:\n{scen_tree_raw}")

		evaluation_prompt = (self.evaluation_prompt_template
							.replace('$SCENARIO_TREE$', scen_tree_raw)
							.replace('$GOAL$', self.goal_desc)
							.replace('$PROGRESS$', progress_desc)
							.replace('$DIALOGUE_HISTORY$', dialogue_history_desc)
							.replace('$ACTION_HISTORY$', action_history_desc)
							.replace('$AVAILABLE_ACTIONS$', available_plans_str))

		if self.debug:
			print(f"[Evaluator] evaluation_prompt:\n{evaluation_prompt}")
		chat_prompt = [{"role": "user", "content": evaluation_prompt}]
		try:
			evaluation_outputs, usage, tokens = self._safe_generator(chat_prompt, self.sampling_params, is_check=True)
			info['evaluation'] = [chat_prompt, evaluation_outputs]
		except Exception as e:
			if self.debug:
				print(f"[Evaluator] generator failed: {e} (keep original plan)")
			evaluation_outputs, usage, tokens = ([""], 0.0, 0)

		evaluation_raw = evaluation_outputs[0] if evaluation_outputs else ""
		stage_usages['evaluator'] = stage_usages.get('evaluator', 0.0) + float(usage)
		stage_tokens['evaluator'] = stage_tokens.get('evaluator', 0) + int(tokens)
		self.total_cost += usage

		info['evaluation_outputs'] = evaluation_outputs
		info['evaluation_usage'] = usage

		if self.debug:
			print(f"[Evaluator] output:\n{evaluation_raw}")

		try:
			evals = self._safe_parse_evals(evaluation_raw)
			result = self.select_best_action(evals)
			candidate_plan = (result.get('best_action') or "").strip()
			ranked = result.get('ranked', [])

			if candidate_plan and self._is_action_in_available(candidate_plan, available_plans_list)[1]:
				plan = self._is_action_in_available(candidate_plan, available_plans_list)[0]
			else:
				fallback = self._first_valid_action_from_ranked(ranked, available_plans_list)
				if fallback:
					plan = fallback
				else:
					if self.debug:
						print("[Selector] No valid evaluated action found; keep original plan.")
		except Exception as e:
			if self.debug:
				print(f"[Evaluator] parse/select error: {e} (keep original plan)")

		info['plan'] = plan
		return plan, info

	def run_pipeline(
		self,
		mode: str,
		opponent_same_room, physical_memory, id_inside_room, current_room,
		grabbed_objects, satisfied, unchecked_containers, ungrabbed_objects,
		goal_location_room, action_history, dialogue_history,
		opponent_grabbed_objects, opponent_last_room, room_explored=None
	) -> Tuple[Optional[str], Dict[str, Any]]:
		"""mode: 'reasoning' | 'base'"""
		info: Dict[str, Any] = {}
		stage_usages: Dict[str, float] = {}
		stage_tokens: Dict[str, int] = {}
		token_cost_sum = 0  

		prompt, progress_desc, action_history_desc, dialogue_history_desc = self._build_prompt_block(
			opponent_same_room, current_room, grabbed_objects,
			unchecked_containers, ungrabbed_objects, goal_location_room,
			satisfied, opponent_grabbed_objects, opponent_last_room, room_explored,
			action_history, dialogue_history
		)

		message = None
		available_plans_str, num, available_plans_list = self.get_available_plans(
			physical_memory, grabbed_objects, unchecked_containers, ungrabbed_objects, message, room_explored
		)
		available_plans_list = self._coerce_available_list(available_plans_list)
		if num == 0 or (message is not None and num == 1):
			if self.debug:
				print("Warning! No available plans!")
			info.update({"num_available_actions": num, "plan": None})
			return None, info

		prompt = prompt.replace('$AVAILABLE_ACTIONS$', available_plans_str)

		plan, plan_info = self._plan(
			mode, prompt, available_plans_list,
			stage_usages=stage_usages, stage_tokens=stage_tokens
		)
		info.update(plan_info)

		flag = self._flag(
			plan, current_room, unchecked_containers, ungrabbed_objects, action_history, opponent_same_room
		)
		
		if flag:
			reasoning_trace_for_check = (
				info.get('reasoning_traces')
				or info.get('final_reasoning')
				or ""
			)
			plan, v_info = self._verify_evaluate_select(
				mode, plan,
				progress_desc=progress_desc,
				dialogue_history_desc=dialogue_history_desc,
				action_history_desc=action_history_desc,
				available_plans_str=available_plans_str,
				available_plans_list=available_plans_list,
				stage_usages=stage_usages,
				stage_tokens=stage_tokens,
				reasoning_trace_for_check=reasoning_trace_for_check
			)
			info.update(v_info)

		if plan is None:
			plan = available_plans_list[0]

		if '-' in plan:
			plan = plan.split("-")[0][:-1]

		if self.debug:
			print(f"[Final] plan: {plan}\n")

		info.update({
			"num_available_actions": num,
			"prompts": prompt,
			"plan": plan,
			"total_cost": self.total_cost,
			"usage": sum(stage_tokens.values()),
			"stage_usages": stage_usages,
			"stage_tokens": stage_tokens
		})
		return plan, info

	def run_reasoning_model(
		self, opponent_same_room, physical_memory, id_inside_room, current_room,
		grabbed_objects, satisfied, unchecked_containers, ungrabbed_objects,
		goal_location_room, action_history, dialogue_history,
		opponent_grabbed_objects, opponent_last_room, room_explored=None
	):
		return self.run_pipeline(
			"reasoning",
			opponent_same_room, physical_memory, id_inside_room, current_room,
			grabbed_objects, satisfied, unchecked_containers, ungrabbed_objects,
			goal_location_room, action_history, dialogue_history,
			opponent_grabbed_objects, opponent_last_room, room_explored
		)

	def run_base_model(
		self, opponent_same_room, physical_memory, id_inside_room, current_room,
		grabbed_objects, satisfied, unchecked_containers, ungrabbed_objects,
		goal_location_room, action_history, dialogue_history,
		opponent_grabbed_objects, opponent_last_room, room_explored=None
	):
		return self.run_pipeline(
			"base",
			opponent_same_room, physical_memory, id_inside_room, current_room,
			grabbed_objects, satisfied, unchecked_containers, ungrabbed_objects,
			goal_location_room, action_history, dialogue_history,
			opponent_grabbed_objects, opponent_last_room, room_explored
		)


	def run(self, steps, opponent_same_room, physical_memory, id_inside_room, current_room, grabbed_objects, satisfied, unchecked_containers, ungrabbed_objects, goal_location_room, action_history, dialogue_history, opponent_grabbed_objects, opponent_last_room, room_explored = None):
		self.steps = steps
		if "gpt-oss" in self.lm_id:
			return self.run_reasoning_model(opponent_same_room, physical_memory, id_inside_room, current_room, grabbed_objects, satisfied, unchecked_containers, ungrabbed_objects, goal_location_room, action_history, dialogue_history, opponent_grabbed_objects, opponent_last_room, room_explored)
		else:
			return self.run_base_model(opponent_same_room, physical_memory, id_inside_room, current_room, grabbed_objects, satisfied, unchecked_containers, ungrabbed_objects, goal_location_room, action_history, dialogue_history, opponent_grabbed_objects, opponent_last_room, room_explored)

	def run_comm(self, last_message, steps, opponent_same_room, id_inside_room, current_room, grabbed_objects, satisfied, unchecked_containers, ungrabbed_objects, goal_location_room, action_history, dialogue_history, opponent_grabbed_objects, opponent_last_room, room_explored = None):
		self.steps = steps
		prompt, progress_desc, action_history_desc, dialogue_history_desc = self._build_prompt_block(
			opponent_same_room, current_room, grabbed_objects,
			unchecked_containers, ungrabbed_objects, goal_location_room,
			satisfied, opponent_grabbed_objects, opponent_last_room, room_explored,
			action_history, dialogue_history
		)

		comm_prompt = (self.communication_prompt_template
						.replace('$GOAL$', self.goal_desc)
						.replace('$PROGRESS$', progress_desc)
						.replace('$DIALOGUE_HISTORY$', dialogue_history_desc)
						.replace('$ACTION_HISTORY$', action_history_desc)
						.replace('$LAST_MESSAGE$', last_message))

		if self.debug:
			print(f"[Communicator] comm_prompt:\n{comm_prompt}")

		info: Dict[str, Any] = {}
		chat_prompt = [{"role": "user", "content": comm_prompt}]
		try:
			comm_outputs, usage, tokens = self._safe_generator(chat_prompt, self.sampling_params, is_check=True)
			info['communication'] = [chat_prompt, comm_outputs]
		except Exception as e:
			if self.debug:
				print(f"[Communicator] generator failed: {e} (skip communication)")
			comm_outputs, usage, tokens = ([""], 0.0, 0)

		comm_raw = comm_outputs[0] if comm_outputs else ""
		comm_raw = self.format_message(comm_raw)
		if self.debug:
			print(f"[Communicator] comm_raw: {comm_raw}")
		
		info.update({
			"comm_outputs": comm_outputs,
			"usage": tokens
		})
		return comm_raw, info