# Split the comma-separated task names in TASK_LIST_STR into the TASK_LIST array.
IFS=',' read -ra TASK_LIST <<< "$TASK_LIST_STR"

# Print the terminal control sequence for the given tput arguments.
# Prints nothing and still succeeds when tput is missing or the terminal type
# is unset/unknown, so colour lookups never emit errors or a failing status
# when the script runs without a usable terminal.
# Arguments: the arguments to pass to tput (e.g. "setaf 1")
# Outputs:   the control sequence on stdout, or nothing when unavailable
_term_code() {
    tput "$@" 2>/dev/null || true
}

# Colour codes used to highlight status messages (empty when unsupported).
red=$(_term_code setaf 1)
green=$(_term_code setaf 2)
yellow=$(_term_code setaf 3)
blue=$(_term_code setaf 4)
reset=$(_term_code sgr0)

# Cores left untouched by this script and cores pinned to each job.
# Both keep the previously hard-coded values (16 and 4) unless already set.
RESERVED_CPUS=${RESERVED_CPUS:-16}
CPUS_PER_JOB=${CPUS_PER_JOB:-4}

# Fill the global cpu_list with "first-last" core ranges covering cores
# 0 .. ($1 - 1), each range spanning $2 cores.
# The final range is clamped so it never extends past the usable cores.
# Globals:   cpu_list (written)
# Arguments: $1 - number of usable cores
#            $2 - cores per range
# Returns:   0 when at least one range was produced, 1 otherwise
_build_cpu_list() {
    local usable=$1 width=$2
    local first last
    cpu_list=()
    # A non-positive width would never advance the loop below.
    if (( width < 1 )); then
        return 1
    fi
    for (( first = 0; first < usable; first += width )); do
        last=$(( first + width - 1 ))
        if (( last >= usable )); then
            last=$(( usable - 1 ))
        fi
        cpu_list+=("$first-$last")
    done
    (( ${#cpu_list[@]} > 0 ))
}

CPU_NUM=$(( $(nproc) - RESERVED_CPUS ))
cpu_list=()
if ! _build_cpu_list "$CPU_NUM" "$CPUS_PER_JOB"; then
    printf 'warning: no CPU ranges available (usable cores: %s, cores per job: %s); no task can be scheduled\n' \
        "$CPU_NUM" "$CPUS_PER_JOB" >&2
fi

# One job per CPU range.
MAX_JOBS=${#cpu_list[@]}

# Split the comma-separated entries in VALID_FORMAT_ENTRIES_STR into an array.
IFS=',' read -ra VALID_FORMAT_ENTRIES <<< "$VALID_FORMAT_ENTRIES_STR"
# Number of (task, iteration) pairs to run: tasks x runs per task.
RUNS_TOTAL=$(( ${#TASK_LIST[@]} * RUNS_PER_TASK ))


# Resource Pool Management
# CPU ranges that are currently free; starts as a copy of cpu_list.
declare -a cpu_available=("${cpu_list[@]}")
# GPU ids handed to runs in turn; cuda_index is the position of the next one.
# NOTE(review): device 0 is not in the pool - presumably reserved; confirm.
cuda_devices=(1 2 3)
cuda_index=0


declare -a task_queue=()      # pending entries: "task,iter,retry_count"
declare -A running_tasks=()   # pid -> "cpu,cuda,task,iter,retry,path"
declare -a failed_tasks=()    # entries: "task,iter"

# Task queue initialization.
# Append one "task,iter,0" entry to task_queue for every (iteration, task)
# pair, iteration-major: all tasks for iteration 1, then iteration 2, ...
# Globals:   task_queue (appended to)
# Arguments: $1     - number of iterations per task
#            $2...  - task names
_enqueue_initial_runs() {
    local runs=$1
    shift
    local run name
    for (( run = 1; run <= runs; run++ )); do
        for name in "$@"; do
            task_queue+=("$name,$run,0")  # format: task,iter,retry_count
        done
    done
    return 0
}

_enqueue_initial_runs "${RUNS_PER_TASK:-0}" "${TASK_LIST[@]}"


# Announce the tasks, runs per task, per-run timeout and generator in green.
printf '%sCOLLECTING %s with %s runs per task and %s timeout using %s%s\n' \
    "${green}" "${TASK_LIST[*]}" "${RUNS_PER_TASK}" "$TIMEOUT_DURATION_TRAJ" "${GENERATOR_NAME}" "${reset}"

# Return 0 if the run that logged into directory $1 completed, i.e. its
# output.log exists and contains the literal marker "*** Finished! ***".
# grep -F is required: read as a regular expression the asterisks are
# repetition operators, so lines without the full marker would also match.
# Arguments: $1 - log directory of the run
_run_succeeded() {
    local log_file="$1/output.log"
    [[ -f "$log_file" ]] && grep -qF -- '*** Finished! ***' "$log_file"
}

# Print one highlighted line with the current scheduler counters.
# tasks_done is derived: total runs minus pending, running and failed ones.
# Globals: cpu_available, task_queue, running_tasks, failed_tasks, RUNS_TOTAL,
#          yellow, reset (all read)
_print_progress() {
    local pending=${#task_queue[@]}
    local running=${#running_tasks[@]}
    local failed=${#failed_tasks[@]}
    echo "${yellow}cpu_available: ${#cpu_available[@]}, tasks_pending: $pending, tasks_running: $running, tasks_done: $((RUNS_TOTAL - pending - running - failed)), tasks_failed: $failed${reset}"
}

# With queued tasks, nothing running, and no CPU range or job slot, the loop
# below could never start a task and would spin forever; stop with an error.
if (( ${#task_queue[@]} > 0 && ${#running_tasks[@]} == 0 )) \
    && (( ${#cpu_available[@]} == 0 || ${MAX_JOBS:-0} <= 0 )); then
    echo "${red}No CPU range or job slot available; cannot schedule ${#task_queue[@]} queued task(s).${reset}" >&2
    exit 1
fi

# main loop: keep going while tasks are queued or still running
while (( ${#task_queue[@]} > 0 || ${#running_tasks[@]} > 0 )); do
    # check task finished
    for pid in "${!running_tasks[@]}"; do
        # kill -0 sends no signal; it only reports whether the pid still exists
        if kill -0 "$pid" 2>/dev/null; then
            continue
        fi

        # Record layout: cpu,cuda,task,iter,retry,path
        IFS=',' read -r cpu cuda task iter retry run_path <<< "${running_tasks[$pid]}"
        log_path="logs/$run_path"
        workspaces_path="workspaces/$run_path"  # only used by the optional cleanup below

        # check results
        if _run_succeeded "$log_path"; then
            echo "${green}✅ Success: $task iter $iter pid $pid (retry $retry)${reset}"
        else
            echo "${red}❌ Failed: $task iter $iter pid $pid (retry $retry)${reset}"
            # retry until MAX_RETRY attempts have been used, then give up
            if (( retry < MAX_RETRY )); then
                new_retry=$(( retry + 1 ))
                task_queue+=("$task,$iter,$new_retry")
                echo "${blue}🔄 Re-queueing: $task iter $iter (retry $new_retry)${reset}"
            else
                failed_tasks+=("$task,$iter")
                echo "${red}❌ Max retries reached for $task iter $iter. Marking as failed.${reset}"
            fi
            # remove failed logs
            # rm -rf "$log_path"
            # rm -rf "$workspaces_path"
        fi

        # Release resources
        cpu_available+=("$cpu")
        unset "running_tasks[$pid]"
        _print_progress
    done

    # start new tasks while a CPU range, a queued task and a job slot exist
    while (( ${#cpu_available[@]} > 0 && ${#task_queue[@]} > 0 && ${#running_tasks[@]} < MAX_JOBS )); do
        # Fetch task (entry layout: task,iter,retry_count)
        task_entry="${task_queue[0]}"
        task_queue=("${task_queue[@]:1}")
        IFS=',' read -ra task_parts <<< "$task_entry"
        task="${task_parts[0]}"
        iter="${task_parts[1]}"
        retry="${task_parts[2]}"

        # Allocate resources: first free CPU range, next GPU in rotation
        cpu="${cpu_available[0]}"
        cpu_available=("${cpu_available[@]:1}")
        cuda="${cuda_devices[cuda_index]}"
        cuda_index=$(( (cuda_index + 1) % ${#cuda_devices[@]} ))

        # Generate unique path (timestamp has one-second resolution)
        TIME=$(date +"%m%d%H%M%S")
        NUM_SAMPLING_SEQ=1
        NUM_BEAM=1
        PATH_SUFFIX="search_n${NUM_SAMPLING_SEQ}_b${NUM_BEAM}_${BENCHMARK}_${task}/step_${NUM_STEPS}_${GENERATOR_NAME}/$TIME"
        mkdir -p "logs/$PATH_SUFFIX"

        # Launch task in the background, pinned to the CPU range and GPU and
        # bounded by timeout; stdout and stderr go to the run's output.log.
        (
            # NOTE(review): job control is enabled inside the subshell,
            # presumably so the run gets its own process group - confirm.
            set -m
            CUDA_VISIBLE_DEVICES="$cuda" taskset -c "$cpu" timeout "$TIMEOUT_DURATION_TRAJ" python MLAgent_Ray/tree_search.py \
                --generator_model_name_or_path "$GENERATOR" \
                --controller_address "${GENERATOR_URL}" \
                --api_key "$HUB_KEY" \
                --api_url "$HUB_URL" \
                --task "$task" \
                --valid_format_entries "${VALID_FORMAT_ENTRIES[@]}" \
                --benchmark "$BENCHMARK" \
                --num_sampling_sequences "$NUM_SAMPLING_SEQ" \
                --n_beam "$NUM_BEAM" \
                --seed 42 \
                --template_name "$TEMPLATE_NAME" \
                --python "$PYTHON" \
                --log_dir "logs/$PATH_SUFFIX" \
                --work_dir "workspaces/$PATH_SUFFIX" \
                --agent_max_steps "$NUM_STEPS" \
                --max_time "$TIMEOUT_DURATION_TRAJ" \
                --edit_script_llm_name "$CODER" > "logs/$PATH_SUFFIX/output.log" 2>&1
        ) &
        pid=$!

        # Log task information
        running_tasks["$pid"]="$cpu,$cuda,$task,$iter,$retry,$PATH_SUFFIX"
        echo "🚀 Started: $task $TIME iter $iter (retry $retry) on CPU $cpu, CUDA $cuda (PID: $pid)"
        # The one-second pause also means consecutive launches get different
        # values of the second-resolution TIME used in PATH_SUFFIX.
        sleep 1
        _print_progress
        # echo "${blue}running_tasks: ${running_tasks[@]}${reset}"| sed 's/ /\n/g'
    done
    sleep 1
done

# Final report: the label, then the failed "task,iter" entries, with every
# space turned into a line break so each entry lands on its own line.
printf '%s\n' "failed_tasks: ${red}${failed_tasks[*]}${reset}" | tr ' ' '\n'
printf '%s\n' "All tasks completed!"