#!/bin/bash
# Driver for the "causal closed-loop evaluation V3" run: defines the experiment
# settings, creates a timestamped output directory and prints a start banner.
#
# Strict mode: abort on command failure (-e), on use of an unset variable (-u)
# and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# Experiment settings consumed by the rest of the script.
task=alfworld
model_type=llama3
split=test

# Assigned on its own line so a failing `date` is not masked by `readonly`.
timestamp=$(date +%Y%m%d_%H%M%S)

# Every run writes to its own directory: outputs/causal_v3_<split>_<timestamp>.
exp_name="causal_v3_${split}_${timestamp}"
output_path="outputs/${exp_name}"
model_name_or_path=/root/autodl-tmp/KnowSelf-main/output/knowself_llama3-8b-alfworld-rpo_merged/

readonly task model_type split timestamp exp_name output_path model_name_or_path

# Quoted so the path is always passed to mkdir as a single argument.
mkdir -p -- "${output_path}"

# Start banner (runtime text intentionally unchanged).
printf '%s\n' \
    "==========================================" \
    "🚀 因果闭环评估 V3（真正影响动作）" \
    "==========================================" \
    "输出路径: ${output_path}" \
    "开始时间: $(date)" \
    "==========================================" \
    ""

# Launch the causal-v3 ALFWorld evaluation module.
# The arguments are collected in an array and expanded with "${eval_args[@]}"
# so that each value reaches python as exactly one word, even if a path ever
# contains whitespace or glob characters.
eval_args=(
    --gpu_num 1
    --exp_config "${task}"
    --output_path "${output_path}"
    --select_agent_config deepseek
    --select_agent_name deepseek-chat
    --model_name_or_path "${model_name_or_path}"
    --select_knowledge_inst "eval_agent/prompt/instructions/select_knowledge_${task}.txt"
    --knowledge_base_path "knowledge_system_construction/automanual_${task}/autobuild_logs/rule_manager.json"
    --model_type "${model_type}"
    --split "${split}"
    --enable_causal_reasoning
    --causal_max_hypotheses 2
    --causal_validation_threshold 0.45
    --causal_action_weight 0.7
    --debug
    --override
)

# The two environment variables apply to this python process only:
# CUDA_VISIBLE_DEVICES=0 exposes a single GPU (matching --gpu_num 1) and
# VLLM_WORKER_MULTIPROC_METHOD=spawn is handed through to vLLM.
VLLM_WORKER_MULTIPROC_METHOD=spawn CUDA_VISIBLE_DEVICES=0 \
    python -m eval_agent.knowself_eval_vllm_alfworld_causal_v3 "${eval_args[@]}"

echo ""
echo "=========================================="
echo "✅ 评估完成！"
echo "=========================================="

#######################################
# Print success-rate and causal-action statistics for a results directory.
# Reads every *.json file directly inside the directory, except files whose
# name contains "causal_kb", and aggregates the "success",
# "causal_actions_used" and "steps" fields.
# Arguments: $1 - directory holding the per-task JSON result files
# Outputs:   summary on stdout; one warning per skipped file on stderr
# Returns:   exit status of the python interpreter
#######################################
summarize_results() {
    # The directory is handed over through the environment and the heredoc
    # delimiter is quoted, so the shell never rewrites the Python source.
    RESULTS_DIR="$1" python <<'PYEOF'
import json
import os
import sys
from glob import glob

output_path = os.environ["RESULTS_DIR"]

# Filter on the file name only: the directory path itself may contain
# arbitrary text and must not influence which files are counted.
task_files = [
    f for f in glob(f"{output_path}/*.json")
    if 'causal_kb' not in os.path.basename(f)
]

success_count = 0
total_causal_actions = 0
total_actions = 0

for task_file in task_files:
    try:
        with open(task_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
        if not isinstance(data, dict):
            raise ValueError("top-level JSON value is not an object")
        if data.get('success'):
            success_count += 1
        total_causal_actions += data.get('causal_actions_used', 0)
        total_actions += data.get('steps', 0)
    except (OSError, ValueError, TypeError) as exc:
        # Best effort: an unreadable or malformed file still counts as a task
        # (and therefore as a non-success), but it is reported, not hidden.
        print(f"warning: skipping {task_file}: {exc}", file=sys.stderr)

# max(1, ...) keeps both ratios defined when nothing was produced.
task_total = len(task_files)
success_rate = success_count / max(1, task_total) * 100
adoption_rate = total_causal_actions / max(1, total_actions) * 100

print(f"\n📊 详细结果:")
print(f"   总任务数: {task_total}")
print(f"   成功任务: {success_count}")
print(f"   ⭐ 成功率: {success_rate:.2f}%")
print(f"\n🔬 因果动作统计:")
print(f"   因果动作: {total_causal_actions}")
print(f"   总动作数: {total_actions}")
print(f"   ⭐ 采用率: {adoption_rate:.1f}%")
PYEOF
}

# Summarize the results of this run.
summarize_results "${output_path}"

#######################################
# Count the per-task result files in a directory.
# A result file is any *.json directly inside the directory whose file name
# does not contain "causal_kb". The filter looks at the file name only,
# because the run directory itself is named "causal_v3_..." and a filter on
# the full path would discard every file.
# Arguments: $1 - directory to inspect
# Outputs:   the number of matching files on stdout
#######################################
count_task_results() {
    local dir=$1
    local file name
    local count=0
    for file in "${dir}"/*.json; do
        # An unmatched glob stays literal; skip it.
        if [[ ! -e "${file}" ]]; then
            continue
        fi
        name=${file##*/}
        if [[ "${name}" == *causal_kb* ]]; then
            continue
        fi
        count=$((count + 1))
    done
    printf '%s\n' "${count}"
}

# Closing hints: where the artefacts are and how to inspect the causal log lines.
echo ""
echo "📁 关键文件:"
echo "  日志: ${output_path}/log.txt"
echo "  任务结果: $(count_task_results "${output_path}") 个"
echo ""
echo "🔍 查看因果影响:"
echo "  grep '采用因果建议' ${output_path}/log.txt | head -20"
echo "  grep '因果动作采用率' ${output_path}/log.txt | head -10"

