import os
import sys
import time
import traceback
import signal
import fcntl

import json
import argparse
import random
import shutil
import numpy as np
import uuid
import math
import mpmath
import struct

# Repository root: the parent of the directory that contains this file.
# It is appended to sys.path before the project imports below (presumably so
# the `python_src` and `worker` packages resolve when this file is run as a
# script -- confirm against the repo layout). It is also the fallback base for
# the default "op" directory used by LocalFileLock.
root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(root_dir)


from python_src.dag import DAG
from python_src.dag_json import dag_from_json
from python_src.node import Node
from worker.evolution import make_non_dominated_sorter, mutate_one_step_remove_redundance
from worker.spec_loader import (
    build_from_legacy_params,
    build_fun_eval,
    build_training_data,
    get_search_config,
    load_spec,
    select_piece,
)
from python_src.precision import normalize_precision_model, precision_format_to_dag_dtype
def int64_to_double(bits):
    """Reinterpret a 64-bit integer bit pattern as a double.

    Args:
        bits: Integer holding the raw 64 bits. Both the unsigned form
            (``0 .. 2**64 - 1``) and the signed two's-complement form
            (``-2**63 .. -1``) are accepted.

    Returns:
        The float whose 8-byte native encoding equals ``bits``.

    Raises:
        struct.error: If ``bits`` is not an integer or does not fit in 64 bits.
    """
    try:
        raw = struct.pack('Q', bits)
    except struct.error:
        # Unsigned packing rejects negative values; a signed int64 carries the
        # same 64 bits, so retry with the signed format. Anything that is not
        # a 64-bit integer fails here as well and still raises struct.error.
        raw = struct.pack('q', bits)
    return struct.unpack('d', raw)[0]

class ConsoleLogger:
    """Write colorized, single-line log records to stdout.

    A record looks like ``[YYYY-MM-DD HH:MM:SS] [LEVEL] message``; extra
    fields, when present, are appended as `` | key=value, key=value``.
    """

    # ANSI escape prefix per level; any other level is printed without color.
    _LEVEL_COLORS = {
        'ERROR': "\033[91m",    # Red.
        'WARNING': "\033[93m",  # Yellow.
        'INFO': "\033[92m",     # Green.
        'DEBUG': "\033[94m",    # Blue.
    }
    _COLOR_RESET = "\033[0m"
    # Fields shown first in the detail section, in this order.
    _IMPORTANT_FIELDS = ('task_id', 'worker_id', 'task_tag')
    # Fields set on every record but never shown on the console.
    _HIDDEN_FIELDS = ('timestamp', 'node_type')

    def __init__(self, task_tag=None, worker_id=None):
        """Store the identifiers that are attached to every record.

        Args:
            task_tag: Optional tag added to each record as ``task_tag``.
            worker_id: Optional id added to each record as ``worker_id``.
        """
        self.task_tag = task_tag
        self.worker_id = worker_id

    def log(self, level, message, **kwargs):
        """Print one log record.

        Args:
            level: Level label; 'ERROR', 'WARNING', 'INFO' and 'DEBUG' are
                colorized, anything else is printed plain.
            message: Human-readable message text.
            **kwargs: Extra fields rendered as ``key=value``. A truthy
                ``task_tag`` / ``worker_id`` set on the logger overrides a
                field of the same name passed here.

        Logging is best-effort: ordinary exceptions raised while formatting
        or printing are suppressed so that logging never breaks the caller.
        ``KeyboardInterrupt`` and ``SystemExit`` are allowed to propagate.
        """
        try:
            # Add base fields.
            if self.task_tag:
                kwargs['task_tag'] = self.task_tag
            if self.worker_id:
                kwargs['worker_id'] = self.worker_id
            kwargs['timestamp'] = int(time.time() * 1000)
            kwargs['node_type'] = 'worker_node'

            timestamp_str = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
            console_msg = f"[{timestamp_str}] [{level}] {message}"

            # Important fields first, then the remaining visible fields in
            # the order they were supplied.
            log_details = [
                f"{key}={kwargs[key]}"
                for key in self._IMPORTANT_FIELDS
                if key in kwargs
            ]
            log_details.extend(
                f"{key}={value}"
                for key, value in kwargs.items()
                if key not in self._IMPORTANT_FIELDS
                and key not in self._HIDDEN_FIELDS
            )

            if log_details:
                console_msg += " | " + ", ".join(log_details)

            # Colorize by log level. The isinstance guard keeps unhashable
            # level objects on the plain-print path instead of failing the
            # dictionary lookup.
            color = self._LEVEL_COLORS.get(level) if isinstance(level, str) else None
            if color:
                print(f"{color}{console_msg}{self._COLOR_RESET}")
            else:
                print(console_msg)

        except Exception:
            # Deliberately best-effort (see docstring). Catching Exception
            # rather than using a bare except keeps Ctrl-C and sys.exit()
            # from being swallowed here.
            pass

class LocalFileLock:
    """Non-blocking advisory lock backed by ``fcntl.flock`` on a lock file.

    Lock files live in the directory named by ``ANUM_LOCK_DIR``; when that is
    unset, ``<op root>/.locks`` is used, where the op root is ``ANUM_OP_ROOT``
    or ``<root_dir>/op``. The directory is created on construction.

    While held, the lock file contains ``<worker_id>:<unix seconds>``.
    """

    def __init__(self, logger=None):
        """Prepare the lock directory.

        Args:
            logger: Optional object with a ``log(level, message, **fields)``
                method; when omitted nothing is logged.
        """
        self.logger = logger
        self.lock_file = None
        self.lock_path = None
        self.acquired = False

        default_op_root = os.getenv("ANUM_OP_ROOT") or os.path.join(root_dir, "op")
        self.lock_dir = os.getenv("ANUM_LOCK_DIR", os.path.join(default_op_root, ".locks"))
        os.makedirs(self.lock_dir, exist_ok=True)

    @staticmethod
    def _path_matches_handle(path, handle):
        """Return True when ``path`` still names the file open in ``handle``."""
        try:
            return os.path.samestat(os.fstat(handle.fileno()), os.stat(path))
        except OSError:
            return False

    def _discard_handle(self, handle):
        """Close a handle from a failed attempt and clear the held-lock state."""
        if handle is not None:
            try:
                handle.close()
            except Exception:
                pass
        self.lock_file = None
        self.acquired = False

    def acquire_lock(self, lock_name, worker_id, timeout=86400):
        """Try once to take the lock named ``lock_name``; never blocks.

        Args:
            lock_name: Logical lock name; ``:`` and ``/`` are replaced by
                ``_`` to form the lock file name.
            worker_id: Identifier written into the lock file.
            timeout: Accepted for interface compatibility; the attempt is a
                single non-blocking ``flock`` call, so the value is not used.

        Returns:
            True if the lock was taken, False if it is held elsewhere or the
            attempt failed.
        """
        lock_filename = lock_name.replace(':', '_').replace('/', '_') + '.lock'
        self.lock_path = os.path.join(self.lock_dir, lock_filename)
        handle = None

        try:
            # Append mode creates the file when needed but, unlike 'w', does
            # not truncate it: a losing contender must not erase the record
            # written by the current holder.
            handle = open(self.lock_path, 'a')
            fcntl.flock(handle.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)

            # release_lock() unlinks the path, so the file just locked may be
            # one that a releasing holder has already removed. Owning such a
            # file protects nothing; treat it as a failed attempt.
            if not self._path_matches_handle(self.lock_path, handle):
                raise OSError("lock file was replaced during acquisition")

            # The lock is ours: replace any stale record with our own.
            handle.truncate(0)
            handle.write(f"{worker_id}:{int(time.time())}")
            handle.flush()
            self.lock_file = handle
            self.acquired = True

            if self.logger:
                self.logger.log('INFO', "File lock acquired",
                               lock_file=self.lock_path,
                               worker_id=worker_id)
            return True

        except (IOError, OSError):
            self._discard_handle(handle)

            if self.logger:
                self.logger.log('DEBUG', "File lock is already held",
                               lock_file=self.lock_path,
                               worker_id=worker_id)
            return False

        except Exception as e:
            # Close the handle here too; otherwise the descriptor (and a lock
            # possibly taken on it) would leak.
            self._discard_handle(handle)

            if self.logger:
                self.logger.log('ERROR', f"Failed to acquire file lock: {str(e)}",
                               lock_file=self.lock_path,
                               worker_id=worker_id)
            return False

    def release_lock(self):
        """Release the lock and delete its file.

        Returns:
            True if nothing was held or the release succeeded, False if an
            error occurred. The instance is reset to the unlocked state in
            every case.
        """
        if not self.acquired or not self.lock_file:
            return True

        try:
            # Remove the path while the lock is still held. Unlinking after
            # unlocking would let one worker lock the old file and another
            # lock a freshly created file of the same name.
            if self.lock_path and os.path.exists(self.lock_path):
                os.unlink(self.lock_path)

            fcntl.flock(self.lock_file.fileno(), fcntl.LOCK_UN)
            self.lock_file.close()

            if self.logger:
                self.logger.log('INFO', "File lock released", lock_file=self.lock_path)

            return True

        except Exception as e:
            if self.logger:
                self.logger.log('ERROR', f"Failed to release file lock: {str(e)}",
                               lock_file=self.lock_path)
            return False
        finally:
            # Closing the descriptor drops the flock even when the explicit
            # unlock above failed; close() on a closed file is a no-op.
            try:
                self.lock_file.close()
            except Exception:
                pass
            self.lock_file = None
            self.lock_path = None
            self.acquired = False

    def __enter__(self):
        """Return the lock object; the lock itself is taken via acquire_lock()."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Release the lock, if held, when leaving the ``with`` block."""
        self.release_lock()

class DAGReconstructor:
    """Convert DAG objects to and from their JSON (dict) representation."""

    @staticmethod
    def from_json(dag_data):
        """Create a DAG object from JSON data.

        Delegates to ``dag_from_json`` with ``default_name="dag"``.
        """
        return dag_from_json(dag_data, default_name="dag")

    @staticmethod
    def to_json(dag):
        """Convert a DAG object to JSON data.

        Args:
            dag: Object exposing ``id``, ``num_inputs``, ``dtype``,
                ``optimization_error`` and ``nodes``; ``parent_id`` and
                ``name`` are optional. Each node exposes ``type``, ``prev``
                and ``next`` and optionally ``value``.

        Returns:
            A dict with the DAG fields and a ``nodes`` list in which every
            node's ``prev`` / ``next`` are positions within ``dag.nodes``.
            Missing fields default to ``parent_id=-1``, a time-based
            ``name`` and ``value=0.0``.

        Raises:
            ValueError: If a ``prev`` / ``next`` entry is not in ``dag.nodes``.
        """
        dag_dict = {
            "id": dag.id,
            "parent_id": getattr(dag, 'parent_id', -1),
            "name": getattr(dag, 'name', f"dag_{int(time.time())}"),
            "num_inputs": dag.num_inputs,
            "dtype": dag.dtype,
            "optimization_error": dag.optimization_error,
            "nodes": []
        }

        nodes = dag.nodes

        # Map object identity -> first position once, instead of scanning the
        # node list with list.index() for every edge. Identity also avoids
        # picking an earlier node that merely compares equal.
        position_of = {}
        for position, node in enumerate(nodes):
            position_of.setdefault(id(node), position)

        def index_of(neighbor):
            try:
                return position_of[id(neighbor)]
            except KeyError:
                # Not the same object as any listed node: fall back to the
                # equality-based search, which raises ValueError if absent.
                return nodes.index(neighbor)

        # Add node information.
        for i, node in enumerate(nodes):
            dag_dict["nodes"].append({
                "id": i,
                "type": node.type,
                "value": getattr(node, 'value', 0.0),
                "prev": [index_of(prev_node) for prev_node in node.prev],
                "next": [index_of(next_node) for next_node in node.next]
            })

        return dag_dict

class EvolutionProcessor:
    """Core worker-side DAG evolution processor.

    Holds the training data and search settings and forwards evolution
    batches to ``mutate_one_step_remove_redundance`` from ``worker.evolution``.
    """

    def __init__(
        self,
        num_mantain=40,
        fun="exp",
        spec_path=None,
        piece_id=None,
        search_config=None,
        logger=None,
        **kwargs,
    ):
        """Load training data and search settings.

        When ``spec_path`` is given, everything is derived from the spec
        file; otherwise the legacy ``fun``-based loader is used.

        Args:
            num_mantain: Value forwarded to the mutation routine; replaced by
                the search configuration's value when one is available.
            fun: Legacy function name (used only without ``spec_path``).
            spec_path: Path of the spec file, or None for legacy mode.
            piece_id: Piece selector passed to ``select_piece``.
            search_config: Optional search configuration; when falsy the one
                from ``get_search_config(spec)`` is used.
            logger: Optional object with ``log(level, message, **fields)``;
                when omitted nothing is logged.
            **kwargs: Recognised keys are ``num_steps`` (default 10), ``eps``
                and, in legacy mode only, ``v`` (passed on as ``fun_idx``).

        Raises:
            ValueError: In legacy mode, when the loader rejects ``fun``.
        """
        self.logger = logger
        self.num_mantain = num_mantain
        self.nds = make_non_dominated_sorter()
        self.search_config = None
        self.mutation_cfg = None
        self.optimizer_cfg = None
        self.num_steps = kwargs.get("num_steps", 10)
        self.population_size = None
        self.rng = None
        self.eval_dag_dtype = "float32"
        self.metric_format = "fp32"
        self.metric_type = "rel"

        if spec_path:
            spec = load_spec(spec_path)
            piece = select_piece(spec, piece_id)
            fun_eval = build_fun_eval(spec.target, spec.precision_model)
            precision_info = normalize_precision_model(spec.precision_model)
            self.train_X, self.train_Y = build_training_data(
                piece,
                spec.sampling,
                fun_eval,
                precision=spec.precision_model,
            )
            self.train_dataset = (self.train_X, self.train_Y)
            self.search_config = search_config or get_search_config(spec)
            if self.search_config:
                # The search configuration overrides the constructor values.
                self.num_mantain = self.search_config.evolution.num_mantain
                self.num_steps = self.search_config.evolution.mutation.steps_per_mutation
                self.mutation_cfg = self.search_config.evolution.mutation
                self.optimizer_cfg = self.search_config.optimizer
                self.population_size = self.search_config.evolution.population_size
            # Seed both the NumPy generator and the stdlib RNG from the spec.
            self.rng = np.random.default_rng(spec.sampling.seed)
            random.seed(spec.sampling.seed)
            self.eps = kwargs.get("eps", spec.metric.denom_eps)
            self.eval_dag_dtype = precision_format_to_dag_dtype(precision_info["compute_format"])
            self.metric_format = precision_info["output_format"]
            self.metric_type = getattr(spec.metric, "type", "rel")
            return

        # Use spec_loader for unified function loading and legacy --fun support.
        random.seed(1234567)
        N_data = 5000
        fun_idx = kwargs.get("v")
        eps = kwargs.get("eps", 1e-6)

        try:
            # Only the second and third items of the loader's result are kept.
            _, self.train_X, self.train_Y = build_from_legacy_params(
                fun=fun,
                start=None,  # Use the default interval.
                end=None,
                fun_idx=fun_idx,
                n_data=N_data,
                seed=1234567,
                eps=eps,
            )[:3]
            self.train_dataset = (self.train_X, self.train_Y)
            self.eps = eps
        except ValueError as e:
            raise ValueError(f"Unsupported function '{fun}': {e}") from e

    def _log(self, level, message, **fields):
        """Forward a record to the logger, if one was provided."""
        if self.logger:
            self.logger.log(level, message, **fields)

    def mutate_one_step_remove_redundance(self, compute_graph_list, best, num_mutate=2):
        """Run one worker-side DAG evolution batch.

        Args:
            compute_graph_list: Input DAGs; may be modified in place by the
                underlying routine.
            best: Current best entries; may be modified in place by the
                underlying routine.
            num_mutate: Forwarded unchanged to the underlying routine.

        Returns:
            The ``(evolved_graphs, evolved_best)`` pair returned by
            ``worker.evolution.mutate_one_step_remove_redundance``.

        Raises:
            Exception: Any error from the underlying routine is logged with
                its traceback and re-raised unchanged.
        """
        self._log('INFO', "Starting evolution batch", input_graphs=len(compute_graph_list), best_count=len(best), num_mutate=num_mutate)

        try:
            start_time = time.time()
            # Inside a method body this name resolves to the module-level
            # function imported from worker.evolution, not to this method.
            evolved_graphs, evolved_best = mutate_one_step_remove_redundance(
                compute_graph_list,
                self.num_mantain,
                best,
                self.train_dataset,
                num_mutate,
                self.nds,             # Pass the NDS instance from this class
                num_steps=self.num_steps,
                eps=self.eps,
                eval_dtype=self.eval_dag_dtype,
                metric_format=self.metric_format,
                mutation_cfg=self.mutation_cfg,
                optimizer_cfg=self.optimizer_cfg,
                rng=self.rng,
                population_size=self.population_size,
                metric_type=self.metric_type,
            )
            process_time = time.time() - start_time

            self._log('INFO', "Evolution batch complete",
                      process_time=f"{process_time:.2f}s",
                      output_graphs=len(evolved_graphs),
                      best_count=len(evolved_best))
            return evolved_graphs, evolved_best

        except Exception as e:
            # Log detailed error information and traceback, then let the
            # caller decide how to handle the failure.
            stack_trace_str = ''.join(traceback.format_exception(*sys.exc_info()))
            self._log('ERROR', f"mutate_one_step_remove_redundance failed: {str(e)}",
                      stack_trace=stack_trace_str)

            raise

class GracefulShutdownHandler:
    """Graceful shutdown handler that quickly recovers the current task.

    The owner sets ``current_processing_file``, ``task_current_folder`` and
    ``worker_id``; on SIGTERM / SIGINT the handler sets
    ``is_shutdown_requested`` and moves the processing file back into the
    task folder under its original name.
    """

    def __init__(self, logger=None):
        """Create a handler with no task registered.

        Args:
            logger: Optional object with ``log(level, message, **fields)``;
                when omitted nothing is logged.
        """
        self.logger = logger
        self.is_shutdown_requested = False
        self.current_processing_file = None
        self.task_current_folder = None
        self.worker_id = None

    def setup_signal_handlers(self):
        """Install this handler for SIGTERM and SIGINT.

        Installation errors are logged instead of raised.
        """
        try:
            signal.signal(signal.SIGTERM, self._signal_handler)
            signal.signal(signal.SIGINT, self._signal_handler)
            if self.logger:
                self.logger.log('INFO', "SIGTERM and SIGINT handlers installed")
        except Exception as e:
            if self.logger:
                self.logger.log('ERROR', f"Failed to install signal handlers: {str(e)}")

    def _signal_handler(self, signum, frame):
        """Signal handler: flag the shutdown and recover the current task."""
        signal_name = 'SIGTERM' if signum == signal.SIGTERM else 'SIGINT'
        if self.logger:
            self.logger.log('INFO', f"Received {signal_name}; recovering current task and exiting")

        self.is_shutdown_requested = True

        # Recover the currently processing task immediately.
        self._recover_current_processing_task()

    def _recover_current_processing_task(self):
        """Move the registered processing file back into the task folder.

        Does nothing unless a processing file, task folder and worker id are
        all set and the file exists. A file whose name lacks the
        ``.processing_<worker_id>`` marker is deleted. If the move fails the
        processing file is deleted as well. No exception escapes.
        """
        processing_file = self.current_processing_file
        if not processing_file or not self.task_current_folder or not self.worker_id:
            return

        try:
            if not os.path.exists(processing_file):
                return

            marker = f'.processing_{self.worker_id}'
            processing_name = os.path.basename(processing_file)

            if marker not in processing_name:
                # If the filename format is unexpected, remove it.
                try:
                    os.remove(processing_file)
                except OSError:
                    pass
                return

            # Recover the original filename from the processing filename.
            original_filename = processing_name.replace(marker, '')
            recovered_path = os.path.join(self.task_current_folder, original_filename)

            # Keep recovery lightweight so shutdown remains fast.
            try:
                shutil.move(processing_file, recovered_path)
                if self.logger:
                    self.logger.log('INFO', "Recovered unfinished task",
                                   recovered_file=original_filename)
            except Exception as move_error:
                # The move did not complete: drop the processing file so the
                # task cannot exist twice, and record why the move failed.
                try:
                    os.remove(processing_file)
                    if self.logger:
                        self.logger.log('INFO', "Move failed; removed processing file to avoid duplication",
                                       processing_file=processing_file,
                                       error=str(move_error))
                except Exception:
                    # Best effort inside a signal handler: nothing further
                    # can be done if the cleanup itself fails.
                    pass
        except Exception as e:
            if self.logger:
                self.logger.log('ERROR', f"Failed to recover task quickly: {str(e)}",
                               processing_file=processing_file)

class TaskProcessor:
    """Task processor."""
    
    def __init__(
        self,
        base_op_folder="./op",
        task_tag="default",
        worker_id=None,
        check_interval=5,
        num_mantain=40,
        logger=None,
        fun="exp",
        spec_path=None,
        piece_id=None,
        **kwargs,
    ):
        """Wire up folders, signal handling and the helper objects for one worker.

        Args:
            base_op_folder: Root folder under which ``<task_tag>/...`` lives.
            task_tag: Name of the task namespace inside ``base_op_folder``.
            worker_id: Worker identifier; a random 8-character hex id is
                generated when it is falsy.
            check_interval: Stored as ``self.check_interval``.
            num_mantain: Forwarded to ``EvolutionProcessor``.
            logger: Object with ``log(level, message, **fields)``.
            fun, spec_path, piece_id, **kwargs: Forwarded to
                ``EvolutionProcessor``.
        """
        self.task_tag = task_tag
        # Fall back to a random id when the caller did not supply one.
        self.worker_id = worker_id or uuid.uuid4().hex[:8]
        self.check_interval = check_interval
        self.logger = logger
        self.num_mantain = num_mantain

        # Every path of this worker hangs off <base_op_folder>/<task_tag>.
        tag_root = os.path.join(base_op_folder, task_tag)
        self.task_current_folder = os.path.join(tag_root, "task", "current")
        self.dag_pool_folder = os.path.join(tag_root, "dag", "pool")
        self.stop_file_path = os.path.join(tag_root, "STOP")

        # Graceful shutdown: tell the handler where tasks go back to and who
        # we are, then install the signal handlers.
        handler = GracefulShutdownHandler(logger=logger)
        handler.task_current_folder = self.task_current_folder
        handler.worker_id = self.worker_id
        self.shutdown_handler = handler
        handler.setup_signal_handlers()

        # Create whichever required folders are still missing.
        for required_dir in (self.task_current_folder, self.dag_pool_folder):
            if os.path.exists(required_dir):
                continue
            try:
                os.makedirs(required_dir, exist_ok=True)
                self.logger.log('INFO', "Created directory", path=required_dir)
            except Exception as e:
                self.logger.log('ERROR', f"Failed to create directory: {str(e)}", path=required_dir)

        # DAG id allocation.
        self.dag_id_manager = DagIdManager(
            task_tag=task_tag,
            logger=logger,
            base_op_folder=base_op_folder,
        )

        # Storage of best-of graphs.
        self.bestof_manager = BestOfGraphManager(
            base_op_folder=base_op_folder,
            task_tag=task_tag,
            logger=logger,
        )

        # Evolution engine plus the DAG <-> JSON converter.
        self.evolution_processor = EvolutionProcessor(
            num_mantain=num_mantain,
            logger=logger,
            fun=fun,
            spec_path=spec_path,
            piece_id=piece_id,
            **kwargs,
        )
        self.dag_reconstructor = DAGReconstructor()

    def _stop_requested(self):
        return os.path.exists(self.stop_file_path)
    
    def get_available_tasks(self):
        """Return available task files."""
        try:
            if not os.path.exists(self.task_current_folder):
                return []
            
            task_files = [f for f in os.listdir(self.task_current_folder) if f.endswith('.json')]
            return task_files
        except Exception as e:
            self.logger.log('ERROR', f"Failed to list task files: {str(e)}")
            return []
    
    def acquire_task(self, task_filename):
        """Claim one task file by renaming it under a per-task file lock.

        Args:
            task_filename: Name of a file inside the current-task folder.

        Returns:
            Path of the claimed file (original path plus
            ``.processing_<worker_id>``), or None when the lock could not be
            taken, the file is gone, or an error occurred.
        """
        source_path = os.path.join(self.task_current_folder, task_filename)

        # One lock per (task_tag, file) so unrelated tasks never contend.
        lock_name = f"{self.task_tag}:task_acquire:{task_filename}"

        # A new lock object per attempt keeps state from earlier attempts out.
        task_lock = LocalFileLock(logger=self.logger)

        try:
            # The lock only guards the rename below.
            got_lock = task_lock.acquire_lock(lock_name, self.worker_id, timeout=30)
            if not got_lock:
                self.logger.log('DEBUG', "Could not acquire task lock; another worker may have taken it",
                               filename=task_filename,
                               worker_id=self.worker_id)
                return None

            processing_path = source_path + f".processing_{self.worker_id}"

            if not os.path.exists(source_path):
                self.logger.log('DEBUG', "Task file no longer exists; another worker may have processed it",
                               filename=task_filename,
                               worker_id=self.worker_id)
                return None

            # Renaming while holding the lock is what claims the task.
            shutil.move(source_path, processing_path)
            self.logger.log('INFO', "Task file acquired",
                           filename=task_filename,
                           processing_path=processing_path,
                           worker_id=self.worker_id)
            return processing_path

        except Exception as e:
            self.logger.log('ERROR', f"Failed to acquire task file: {str(e)}",
                           filename=task_filename,
                           worker_id=self.worker_id)
            return None
        finally:
            # Always give the lock back, whatever happened above.
            try:
                task_lock.release_lock()
            except Exception as e:
                self.logger.log('DEBUG', f"Exception while releasing lock; usually harmless: {str(e)}",
                               filename=task_filename)
    
    def process_task_file(self, task_file_path):
        """Process a single task file."""
        try:
            # Read task file.
            with open(task_file_path, 'r') as f:
                task_data = json.load(f)
            
            task_id = task_data['meta'].get('task_id', 'unknown')
            
            # Reconstruct DAG objects.
            compute_graph_list = []
            for idx, dag_data in enumerate(task_data['dags']):
                try:
                    dag_id = dag_data.get('id', -1)
                    dag = self.dag_reconstructor.from_json(dag_data)
                    compute_graph_list.append(dag)
                except Exception as e:
                    self.logger.log('ERROR', f"Failed to reconstruct DAG: {str(e)}", dag_index=idx)
                    continue
            
            if not compute_graph_list:
                self.logger.log('WARNING', "No processable DAGs found", task_id=task_id)
                return None
            
            
            # Run evolution.
            best = []
            num_mutate = None if self.evolution_processor.population_size else 2
            
            start_time = time.time()
            evolved_graphs, evolved_best = self.evolution_processor.mutate_one_step_remove_redundance(
                compute_graph_list, best, num_mutate
            )
            process_time = time.time() - start_time
            
            self.logger.log('INFO', "Evolution completed",
                           task_id=task_id, 
                           process_time=f"{process_time:.2f}s",
                           input_graphs=len(compute_graph_list),
                           evolved_graphs=len(evolved_graphs),
                           best_count=len(evolved_best))
            
            # Build evolved DAG data list.
            evolved_dag_list = []
            for idx, dag in enumerate(evolved_graphs):
                dag_json = self.dag_reconstructor.to_json(dag)
                evolved_dag_list.append(dag_json)
            
            # Build bestof DAG data list.
            bestof_dag_list = []
            if evolved_best:
                for idx, best_tuple in enumerate(evolved_best):
                    # Each evolved_best entry is (dag, optimization result, compute cost).
                    best_dag = best_tuple[0]
                    best_dag_json = self.dag_reconstructor.to_json(best_dag)
                    bestof_dag_list.append(best_dag_json)
            
            self.logger.log('INFO', "Task processing complete",
                           task_id=task_id,
                           output_dags=len(evolved_dag_list),
                           bestof_graphs=len(bestof_dag_list))
            
            return {
                'evolved_dags': evolved_dag_list,
                'bestof_dags': bestof_dag_list,
                'task_id': task_id,
                'task_tag': self.task_tag,
                'worker_id': self.worker_id
            }
            
        except Exception as e:
            # Capture detailed exception information and traceback.
            exc_type, exc_value, exc_traceback = sys.exc_info()
            stack_trace = traceback.format_exception(exc_type, exc_value, exc_traceback)
            stack_trace_str = ''.join(stack_trace)
            
            self.logger.log('ERROR', f"Error while processing task file: {str(e)}",
                           task_file=task_file_path,
                           stack_trace=stack_trace_str)
            
            return None
    
    def save_dags_to_pool(self, task_result_data):
        """Save DAGs directly to the pool folder with atomic writes and unique IDs."""
        try:
            evolved_dags = task_result_data.get('evolved_dags', [])
            bestof_dags = task_result_data.get('bestof_dags', [])
            task_id = task_result_data.get('task_id', 'unknown')
            worker_id = task_result_data.get('worker_id', self.worker_id)
            
            saved_files = []
            bestof_saved_files = []
            timestamp = int(time.time() * 1000)
            
            # Save evolved DAGs.
            for idx, dag_data in enumerate(evolved_dags):
                try:
                    # Assign a unique DAG ID.
                    if 'id' not in dag_data or dag_data['id'] == -1:
                        dag_data['id'] = self.dag_id_manager.get_next_id()
                    
                    # Generate filename.
                    dag_filename = f"{self.task_tag}_dag_{timestamp}_{task_id}_{worker_id}_{idx}.json"
                    dag_file_path = os.path.join(self.dag_pool_folder, dag_filename)
                    temp_file_path = dag_file_path + f".tmp_{worker_id}"
                    
                    # Atomic write: write a temporary file, then rename it.
                    with open(temp_file_path, 'w') as f:
                        json.dump(dag_data, f, indent=2)
                    
                    # Atomic rename.
                    os.rename(temp_file_path, dag_file_path)
                    saved_files.append(dag_file_path)
                    
                except Exception as e:
                    self.logger.log('ERROR', f"Failed to save evolved DAG: {str(e)}",
                                   dag_index=idx, 
                                   task_id=task_id)
                    # Clean up temporary file.
                    try:
                        if os.path.exists(temp_file_path):
                            os.remove(temp_file_path)
                    except:
                        pass
            
            # Save bestof DAGs through BestOfGraphManager.
            for idx, bestof_dag_data in enumerate(bestof_dags):
                try:
                    # Assign a unique DAG ID if needed.
                    if 'id' not in bestof_dag_data or bestof_dag_data['id'] == -1:
                        bestof_dag_data['id'] = self.dag_id_manager.get_next_id()
                    
                    # Save the bestof graph.
                    saved_path = self.bestof_manager.save_bestof_graph(bestof_dag_data)
                    if saved_path:
                        bestof_saved_files.append(saved_path)
                    
                except Exception as e:
                    self.logger.log('ERROR', f"Failed to save bestof DAG: {str(e)}",
                                   bestof_index=idx, 
                                   task_id=task_id)
            
            self.logger.log('INFO', "DAG save complete",
                           task_id=task_id,
                           evolved_dags=len(evolved_dags),
                           bestof_dags=len(bestof_dags),
                           saved_files=len(saved_files),
                           bestof_saved_files=len(bestof_saved_files))
            
            return saved_files + bestof_saved_files
            
        except Exception as e:
            self.logger.log('ERROR', f"Failed to save DAGs to pool: {str(e)}")
            return []
    
    def monitor_and_process_tasks(self, stop_event=None):
        """Continuously monitor and process tasks."""

        # Initialize heartbeat counter.
        heartbeat_counter = 0
        
        while not (stop_event and stop_event.is_set()) and not self.shutdown_handler.is_shutdown_requested:
            try:
                if self._stop_requested():
                    self.logger.log('INFO', "STOP file detected; worker will exit")
                    break
                # Check whether a shutdown signal was received.
                if self.shutdown_handler.is_shutdown_requested:
                    self.logger.log('INFO', "Stop signal received; exiting")
                    break
                
                # Heartbeat log hook; optionally print every 10 checks.
                heartbeat_counter += 1
                #if heartbeat_counter % 10 == 0:
                #    print(f"Worker {self.worker_id} heartbeat | check_count: {heartbeat_counter}")
                
                # Fetch available tasks.
                try:
                    available_tasks = self.get_available_tasks()
                except Exception as e:
                    self.logger.log('ERROR', f"Failed to fetch available task list: {str(e)}")
                    time.sleep(self.check_interval)
                    continue
                
                if available_tasks:
                    self.logger.log('INFO', "Available tasks found", task_count=len(available_tasks))
                    
                    # Pick one task at random.
                    task_filename = random.choice(available_tasks)
                    self.logger.log('INFO', "Selected task file", filename=task_filename)
                    
                    # Try to acquire the task.
                    try:
                        processing_file_path = self.acquire_task(task_filename)
                    except Exception as e:
                        self.logger.log('ERROR', f"Exception while acquiring task: {str(e)}", filename=task_filename)
                        time.sleep(self.check_interval)
                        continue
                    
                    if processing_file_path:
                        # Record the processing file so signal handling can recover it quickly.
                        self.shutdown_handler.current_processing_file = processing_file_path
                        
                        self.logger.log('INFO', "Task acquired", filename=task_filename, processing_path=processing_file_path)
                        try:
                            # Process task.
                            try:
                                result_data = self.process_task_file(processing_file_path)
                            except Exception as e:
                                # Capture detailed exception information and traceback.
                                exc_type, exc_value, exc_traceback = sys.exc_info()
                                stack_trace = traceback.format_exception(exc_type, exc_value, exc_traceback)
                                stack_trace_str = ''.join(stack_trace)
                                
                                self.logger.log('ERROR', f"Error while processing task file: {str(e)}",
                                               filename=task_filename,
                                               stack_trace=stack_trace_str)
                                
                                result_data = None
                            
                            # Check the stop signal only at key points to avoid overhead.
                            if self.shutdown_handler.is_shutdown_requested:
                                # If the task has completed and produced results, save them quickly.
                                if result_data:
                                    try:
                                        saved_files = self.save_dags_to_pool(result_data)
                                        if saved_files:
                                            self.logger.log('INFO', "Saved DAG files before stopping",
                                                           filename=task_filename, 
                                                           saved_count=len(saved_files))
                                    except Exception as e:
                                        self.logger.log('ERROR', f"Failed to save DAGs before stopping: {str(e)}", filename=task_filename)
                                
                                # Remove the processing file.
                                try:
                                    if os.path.exists(processing_file_path):
                                        os.remove(processing_file_path)
                                except:
                                    pass
                                break
                            
                            if result_data:
                                # Save DAGs to the pool folder.
                                try:
                                    saved_files = self.save_dags_to_pool(result_data)
                                    if saved_files:
                                        self.logger.log('INFO', "DAG files saved",
                                                       filename=task_filename, 
                                                       saved_count=len(saved_files))
                                    else:
                                        self.logger.log('WARNING', "No DAG files were saved", filename=task_filename)
                                except Exception as e:
                                    self.logger.log('ERROR', f"Error while saving DAGs to pool: {str(e)}", filename=task_filename)
                            else:
                                self.logger.log('WARNING', "Task processing produced no result", filename=task_filename)
                            
                            # Delete processing file.
                            try:
                                if os.path.exists(processing_file_path):
                                    os.remove(processing_file_path)
                                    self.logger.log('INFO', "Deleted processing file", path=processing_file_path)
                            except Exception as e:
                                self.logger.log('ERROR', f"Error while deleting processing file: {str(e)}", path=processing_file_path)
                            
                            # Clear current processing-file record.
                            self.shutdown_handler.current_processing_file = None
                                
                        except Exception as e:
                            self.logger.log('ERROR', f"Error while handling task: {str(e)}", filename=task_filename)
                            # Ensure processing file is deleted.
                            try:
                                if os.path.exists(processing_file_path):
                                    os.remove(processing_file_path)
                                    self.logger.log('INFO', "Cleaned processing file for failed task", path=processing_file_path)
                            except Exception as e2:
                                self.logger.log('ERROR', f"Error while cleaning failed task file: {str(e2)}", path=processing_file_path)
                            
                            # Clear current processing-file record.
                            self.shutdown_handler.current_processing_file = None

                            if self._stop_requested():
                                self.logger.log('INFO', "STOP file detected; exiting after task processing")
                                break
                else:
                    # Wait when no task is available.
                    if self._stop_requested():
                        self.logger.log('INFO', "STOP file detected; worker will exit")
                        break
                    time.sleep(self.check_interval)
                    
            except Exception as e:
                # Capture detailed exception information and traceback.
                exc_type, exc_value, exc_traceback = sys.exc_info()
                stack_trace = traceback.format_exception(exc_type, exc_value, exc_traceback)
                stack_trace_str = ''.join(stack_trace)
                
                self.logger.log('ERROR', f"Error while monitoring tasks: {str(e)}",
                               stack_trace=stack_trace_str)
                
                time.sleep(self.check_interval)
        
        # Final log after stopping.
        self.logger.log('INFO', "Worker monitor loop ended")

class DagIdManager:
    """Global incrementing DAG ID counter shared with the master via local file lock.

    The counter is a plain-text integer stored in
    ``<base_op_folder>/<task_tag>/dag_id_counter.txt``. Every access made by
    this class is serialized with an exclusive ``flock`` on
    ``dag_id_counter.lock`` in the same folder, and the counter file is
    replaced atomically so a reader never observes a half-written value.
    """

    def __init__(self, task_tag="default", logger=None, base_op_folder="./op"):
        """Create the task folder if needed and load (or create) the counter.

        Args:
            task_tag: Sub-folder of ``base_op_folder`` holding this task's files.
            logger: Optional object exposing ``log(level, message, **fields)``.
            base_op_folder: Root folder under which the task folder lives.
        """
        self.task_tag = task_tag
        self.logger = logger
        self.current_id = 0

        # Set local file paths.
        self.base_folder = os.path.join(base_op_folder, task_tag)
        os.makedirs(self.base_folder, exist_ok=True)

        self.id_file = os.path.join(self.base_folder, "dag_id_counter.txt")
        self.lock_file_path = os.path.join(self.base_folder, "dag_id_counter.lock")

        # Initialize ID counter.
        self.current_id = self._load_or_create_counter()

    def _log(self, level, message, **fields):
        """Forward a record to the logger, if one was supplied."""
        if self.logger:
            self.logger.log(level, message, **fields)

    def _acquire_lock(self):
        """Open the lock file and take an exclusive flock on it.

        Returns:
            The open lock-file handle; pass it to ``_release_lock``.

        Raises:
            OSError: If the lock file cannot be opened or locked.
        """
        lock_file = open(self.lock_file_path, 'w')
        try:
            fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX)
        except OSError:
            lock_file.close()
            raise
        return lock_file

    @staticmethod
    def _release_lock(lock_file):
        """Unlock and close a handle returned by ``_acquire_lock``."""
        try:
            fcntl.flock(lock_file.fileno(), fcntl.LOCK_UN)
        except OSError:
            # Closing the handle below drops the lock as well.
            pass
        finally:
            lock_file.close()

    def _read_counter(self):
        """Return the integer stored in the counter file.

        Raises:
            OSError: If the file cannot be read.
            ValueError: If the content is not an integer.
        """
        with open(self.id_file, 'r') as f:
            return int(f.read().strip())

    def _load_or_create_counter(self):
        """Load or create the ID counter.

        Runs under the counter lock so that initialization cannot observe
        (and then overwrite) a value another process is in the middle of
        updating. If the lock itself cannot be taken, a warning is logged and
        initialization proceeds unlocked rather than failing construction.

        Returns:
            The stored counter value, or 0 when the file was missing or
            unreadable and has been (re)initialized.
        """
        lock_file = None
        try:
            lock_file = self._acquire_lock()
        except OSError as e:
            self._log('WARNING',
                      f"Could not lock DAG ID counter during initialization: {str(e)}")

        try:
            if os.path.exists(self.id_file):
                try:
                    current_id = self._read_counter()
                except (OSError, ValueError):
                    self._log('WARNING', "Failed to read DAG ID file; reinitializing")
                else:
                    self._log('INFO', "Loaded DAG ID counter from file",
                              current_id=current_id,
                              id_file=self.id_file)
                    return current_id

            # Initialize to 0.
            self._save_counter(0)
            self._log('INFO', "Initialized DAG ID counter", initial_id=0)
            return 0
        finally:
            if lock_file is not None:
                self._release_lock(lock_file)

    def _save_counter(self, value):
        """Save counter value to file.

        The value is written to a sibling temp file and moved into place with
        ``os.replace`` so the counter file is never seen empty or partial.

        Raises:
            OSError: If the temp file cannot be written or moved.
        """
        temp_path = f"{self.id_file}.tmp_{os.getpid()}"
        try:
            with open(temp_path, 'w') as f:
                f.write(str(value))
            os.replace(temp_path, self.id_file)
        except BaseException:
            # Do not leave a stray temp file behind on failure.
            try:
                os.remove(temp_path)
            except OSError:
                pass
            raise

    def get_next_id(self):
        """Get the next available DAG ID atomically.

        Returns:
            The incremented counter value. If anything goes wrong (lock, read
            or write failure), the current time in milliseconds is returned
            instead so the caller still receives an ID.
        """
        lock_file = None
        try:
            lock_file = self._acquire_lock()

            # Read current value; a missing file counts as 0.
            if os.path.exists(self.id_file):
                current_id = self._read_counter()
            else:
                current_id = 0

            # Increment and save.
            new_id = current_id + 1
            self._save_counter(new_id)
            self.current_id = new_id

            return new_id

        except Exception as e:
            self._log('ERROR', f"Failed to get DAG ID: {str(e)}")
            # Fallback.
            backup_id = int(time.time() * 1000)
            self._log('WARNING', "Using timestamp as fallback DAG ID", id=backup_id)
            return backup_id
        finally:
            if lock_file is not None:
                self._release_lock(lock_file)

class BestOfGraphManager:
    """Bestof graph storage using incrementing IDs and local file locks.

    Graphs are written as ``<id>.json`` under
    ``<base_op_folder>/<task_tag>/bestof``. IDs come from a plain-text counter
    (``bestof_id_counter.txt``) whose every access made by this class is
    serialized with an exclusive ``flock`` on ``bestof_id_counter.lock``.
    """

    def __init__(self, base_op_folder="./op", task_tag="default", logger=None):
        """Create the storage folder if needed and load (or create) the counter.

        Args:
            base_op_folder: Root folder under which the task folder lives.
            task_tag: Sub-folder of ``base_op_folder`` holding this task's files.
            logger: Optional object exposing ``log(level, message, **fields)``.
        """
        self.task_tag = task_tag
        self.logger = logger
        self.current_id = 0

        # Set storage directory.
        self.base_folder = os.path.join(base_op_folder, task_tag)
        self.bestof_folder = os.path.join(self.base_folder, "bestof")

        # exist_ok avoids a check-then-create race between workers that
        # start at the same time.
        os.makedirs(self.bestof_folder, exist_ok=True)

        # Set local file paths.
        self.id_file = os.path.join(self.base_folder, "bestof_id_counter.txt")
        self.lock_file_path = os.path.join(self.base_folder, "bestof_id_counter.lock")

        # Initialize ID counter.
        self.current_id = self._load_or_create_counter()

        self._log('INFO', "BestOfGraphManager initialized",
                  bestof_folder=self.bestof_folder,
                  current_id=self.current_id)

    def _log(self, level, message, **fields):
        """Forward a record to the logger, if one was supplied."""
        if self.logger:
            self.logger.log(level, message, **fields)

    def _acquire_lock(self):
        """Open the lock file and take an exclusive flock on it.

        Returns:
            The open lock-file handle; pass it to ``_release_lock``.

        Raises:
            OSError: If the lock file cannot be opened or locked.
        """
        lock_file = open(self.lock_file_path, 'w')
        try:
            fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX)
        except OSError:
            lock_file.close()
            raise
        return lock_file

    @staticmethod
    def _release_lock(lock_file):
        """Unlock and close a handle returned by ``_acquire_lock``."""
        try:
            fcntl.flock(lock_file.fileno(), fcntl.LOCK_UN)
        except OSError:
            # Closing the handle below drops the lock as well.
            pass
        finally:
            lock_file.close()

    @staticmethod
    def _remove_quietly(path):
        """Delete ``path`` if it exists, ignoring filesystem errors."""
        try:
            os.remove(path)
        except OSError:
            pass

    def _read_counter(self):
        """Return the integer stored in the counter file.

        Raises:
            OSError: If the file cannot be read.
            ValueError: If the content is not an integer.
        """
        with open(self.id_file, 'r') as f:
            return int(f.read().strip())

    def _load_or_create_counter(self):
        """Load or create the ID counter.

        Runs under the counter lock so that initialization cannot observe
        (and then overwrite) a value another process is in the middle of
        updating. If the lock itself cannot be taken, a warning is logged and
        initialization proceeds unlocked rather than failing construction.

        Returns:
            The stored counter value, or 0 when the file was missing or
            unreadable and has been (re)initialized.
        """
        lock_file = None
        try:
            lock_file = self._acquire_lock()
        except OSError as e:
            self._log('WARNING',
                      f"Could not lock BestOfGraph ID counter during initialization: {str(e)}")

        try:
            if os.path.exists(self.id_file):
                try:
                    current_id = self._read_counter()
                except (OSError, ValueError):
                    self._log('WARNING', "Failed to read BestOfGraph ID file; reinitializing")
                else:
                    self._log('INFO', "Loaded BestOfGraph ID counter from file",
                              current_id=current_id,
                              id_file=self.id_file)
                    return current_id

            # Initialize to 0.
            self._save_counter(0)
            self._log('INFO', "Initialized BestOfGraph ID counter", initial_id=0)
            return 0
        finally:
            if lock_file is not None:
                self._release_lock(lock_file)

    def _save_counter(self, value):
        """Save counter value to file.

        The value is written to a sibling temp file and moved into place with
        ``os.replace`` so the counter file is never seen empty or partial.

        Raises:
            OSError: If the temp file cannot be written or moved.
        """
        temp_path = f"{self.id_file}.tmp_{os.getpid()}"
        try:
            with open(temp_path, 'w') as f:
                f.write(str(value))
            os.replace(temp_path, self.id_file)
        except BaseException:
            # Do not leave a stray temp file behind on failure.
            self._remove_quietly(temp_path)
            raise

    def get_next_id(self):
        """Get the next available BestOfGraph ID atomically.

        Returns:
            The incremented counter value. If anything goes wrong (lock, read
            or write failure), the current time in milliseconds is returned
            instead so the caller still receives an ID.
        """
        lock_file = None
        try:
            lock_file = self._acquire_lock()

            # Read current value; a missing file counts as 0.
            if os.path.exists(self.id_file):
                current_id = self._read_counter()
            else:
                current_id = 0

            # Increment and save.
            new_id = current_id + 1
            self._save_counter(new_id)
            self.current_id = new_id

            return new_id

        except Exception as e:
            self._log('ERROR', f"Failed to get BestOfGraph ID: {str(e)}")
            # Fallback.
            backup_id = int(time.time() * 1000)
            self._log('WARNING', "Using timestamp as fallback BestOfGraph ID", id=backup_id)
            return backup_id
        finally:
            if lock_file is not None:
                self._release_lock(lock_file)

    def save_bestof_graph(self, graph_data):
        """Save bestof graph to file.

        Args:
            graph_data: JSON-serializable object to store.

        Returns:
            Path of the written ``<id>.json`` file, or ``None`` if saving
            failed (the failure is logged, not raised).
        """
        temp_file_path = None
        try:
            # Get incrementing ID.
            graph_id = self.get_next_id()

            # Filename format: {id}.json.
            file_path = os.path.join(self.bestof_folder, f"{graph_id}.json")

            # Atomic write: dump to a temp file, then move it into place.
            temp_file_path = f"{file_path}.tmp_{os.getpid()}"
            with open(temp_file_path, 'w') as f:
                json.dump(graph_data, f, indent=2)
            os.replace(temp_file_path, file_path)
            temp_file_path = None  # Moved into place; nothing left to clean up.

            self._log('INFO', "BestOf graph saved", graph_id=graph_id, file_path=file_path)

            return file_path

        except Exception as e:
            self._log('ERROR', f"Failed to save BestOfGraph: {str(e)}")
            return None
        finally:
            # A failed dump must not leave a partial temp file in the folder.
            if temp_file_path is not None:
                self._remove_quietly(temp_file_path)

if __name__ == "__main__":
    import threading
    from threading import Event

    # Command-line options, declared as (flags, keyword-arguments) pairs.
    # --num_maintain is an alias of the historical --num_mantain spelling;
    # both store into args.num_mantain.
    parser = argparse.ArgumentParser()
    for option_flags, option_kwargs in (
        (('--task_tag',),
         dict(type=str, default='default', help='Task identifier')),
        (('--check_interval',),
         dict(type=int, default=5, help='Task poll interval in seconds')),
        (('--run_time',),
         dict(type=int, default=0, help='Run time in seconds; 0 means unlimited')),
        (('--num_mantain', '--num_maintain'),
         dict(dest='num_mantain', type=int, default=40,
              help='Population size maintained during evolution')),
        (('--fun',),
         dict(type=str, default="exp", help='Legacy target function name')),
        (('--fun_idx',),
         dict(type=int, default=0, help='Legacy function index/parameter')),
        (('--eps',),
         dict(type=float, default=1e-6, help='Optimization tolerance')),
        (('--spec_path',),
         dict(type=str, default=None, help='Spec JSON path')),
        (('--piece_id',),
         dict(type=str, default=None, help='Piece ID within the spec')),
    ):
        parser.add_argument(*option_flags, **option_kwargs)

    args = parser.parse_args()

    # The ANUM_OP_ROOT environment variable overrides the default <root_dir>/op.
    base_op_root = os.environ.get("ANUM_OP_ROOT") or os.path.join(root_dir, "op")

    generated_worker_id = uuid.uuid4().hex[:8]

    # Startup banner.
    if args.run_time == 0:
        run_time_label = "unlimited"
    else:
        run_time_label = f"{args.run_time}s"
    for banner_line in (
        "=" * 60,
        f"Worker node started | {time.strftime('%Y-%m-%d %H:%M:%S')}",
        f"Task tag: {args.task_tag} | Worker ID: {generated_worker_id}",
        f"Check interval: {args.check_interval}s | Run time: {run_time_label}",
        f"Population size: {args.num_mantain}",
        "ID management: local file-lock mode",
    ):
        print(banner_line)

    # Make sure the working folders for this task tag exist.
    base_folder = os.path.join(base_op_root, args.task_tag)
    task_current_folder = os.path.join(base_folder, "task", "current")
    dag_pool_folder = os.path.join(base_folder, "dag", "pool")
    bestof_folder = os.path.join(base_folder, "bestof")
    for folder in (task_current_folder, dag_pool_folder, bestof_folder):
        os.makedirs(folder, exist_ok=True)

    logger = ConsoleLogger(task_tag=args.task_tag, worker_id=generated_worker_id)

    processor = TaskProcessor(
        base_op_folder=base_op_root,
        task_tag=args.task_tag,
        worker_id=generated_worker_id,
        check_interval=args.check_interval,
        num_mantain=args.num_mantain,
        logger=logger,
        fun=args.fun,
        spec_path=args.spec_path,
        piece_id=args.piece_id,
        v=args.fun_idx,
        eps=args.eps,
    )

    stop_event = Event()

    print(f"Worker node (ID: {generated_worker_id}) running...")
    print("=" * 60)

    try:
        if not args.run_time > 0:
            # Unlimited mode: run the monitor loop in this thread.
            print("Worker running; SIGTERM/SIGINT stops it quickly...")
            processor.monitor_and_process_tasks(stop_event)
        else:
            # Timed mode: run the monitor loop in a daemon thread while this
            # thread polls once per second for the deadline or a shutdown
            # request.
            print(f"Worker will stop after {args.run_time} seconds")

            worker_thread = threading.Thread(
                target=processor.monitor_and_process_tasks,
                args=(stop_event,),
                daemon=True,
            )
            worker_thread.start()

            start_time = time.time()
            while True:
                if time.time() - start_time >= args.run_time:
                    break
                if processor.shutdown_handler.is_shutdown_requested:
                    logger.log('INFO', "Stop signal received; ending timed run early")
                    break
                time.sleep(1)

            stop_event.set()
            if worker_thread.is_alive():
                worker_thread.join(timeout=5)

    except KeyboardInterrupt:
        print(f"\nCtrl+C received; Worker(ID: {generated_worker_id}) is stopping...")
        stop_event.set()
    except Exception as e:
        # Log the full traceback, then echo a short message to stdout.
        logger.log('ERROR', f"Worker runtime error: {str(e)}",
                   stack_trace=traceback.format_exc())

        print(f"Worker runtime error: {str(e)}")

    logger.log('INFO', f"Worker(ID: {generated_worker_id}) stopped")
    print(f"Worker(ID: {generated_worker_id}) stopped")
