#!/bin/bash
# Full test-split evaluation on ALFWorld with causal reasoning enabled.
#
# Creates a fresh, timestamped directory under outputs/, runs the evaluation
# module against it, then prints summary statistics built from the JSON files
# and log.txt found in that directory.
set -e

task=alfworld
model_type=llama3
split=test  # full test split (134 tasks)
# Second-resolution timestamp keeps every run in its own output directory.
timestamp=$(date +%Y%m%d_%H%M%S)
exp_name="full_${split}_causal_${timestamp}"
output_path="outputs/${exp_name}"
model_name_or_path=/root/autodl-tmp/KnowSelf-main/output/knowself_llama3-8b-alfworld-rpo_merged/

# Quoted (and guarded with --) so the path is always passed as one operand.
mkdir -p -- "${output_path}"

# Start-of-run banner: output directory and start time, followed by one
# blank line. Emitted as a single heredoc instead of one echo per line.
cat <<EOF
==========================================
🚀 完整测试集评估 (134个任务)
==========================================
输出路径: ${output_path}
开始时间: $(date)
==========================================

EOF

# Run the full evaluation in a single invocation (no sharding; all 134 tasks).
# The arguments are collected in an array and expanded quoted so that every
# value reaches the Python module as exactly one word, even if it ever
# contains whitespace or glob characters.
eval_args=(
  --gpu_num 1
  --exp_config "${task}"
  --output_path "${output_path}"
  --select_agent_config deepseek
  --select_agent_name deepseek-chat
  --model_name_or_path "${model_name_or_path}"
  --select_knowledge_inst "eval_agent/prompt/instructions/select_knowledge_${task}.txt"
  --knowledge_base_path "knowledge_system_construction/automanual_${task}/autobuild_logs/rule_manager.json"
  --model_type "${model_type}"
  --split "${split}"
  --enable_causal_reasoning
  --causal_max_hypotheses 2
  --causal_validation_threshold 0.45
  --debug
  --override
)

# The two environment variables apply to this one command only.
VLLM_WORKER_MULTIPROC_METHOD=spawn CUDA_VISIBLE_DEVICES=0 \
  python -m eval_agent.knowself_eval_vllm_alfworld_causal "${eval_args[@]}"

# End-of-run banner: a blank line, then the completion message and end time.
# One printf call emits each argument on its own line.
printf '%s\n' \
  "" \
  "==========================================" \
  "✅ 评估完成！" \
  "结束时间: $(date)" \
  "=========================================="

# Generate the detailed statistics report.
#
# generate_report DIR
#   Prints to stdout:
#     - task statistics from DIR/*.json files whose file name does not contain
#       "causal_kb": file count, number with a truthy "success" field, success
#       rate over all such files, and the mean of the numeric "reward" fields;
#     - causal knowledge statistics summed over DIR/*_causal_kb.json.
#   Files that cannot be read or parsed are skipped with a warning on stderr.
#   Returns the exit status of the Python interpreter.
generate_report() {
  # The heredoc delimiter is quoted so the shell expands nothing inside the
  # Python source; the directory is handed over as argv[1] instead of being
  # spliced into a Python string literal, so quotes, backslashes or "$" in
  # the path cannot corrupt the program.
  python - "$1" <<'PYEOF'
import json
import os
import sys
from glob import escape, glob

output_path = sys.argv[1]
# escape() makes glob metacharacters in the directory name match literally.
pattern_root = escape(output_path)


def warn(message):
    """Print a diagnostic to stderr so the report on stdout stays clean."""
    print(f"warning: {message}", file=sys.stderr)


def load_json_object(path):
    """Return the JSON object stored in path, or None (after a warning) if
    the file cannot be read, is not valid JSON, or is not a JSON object."""
    try:
        with open(path, 'r') as f:
            data = json.load(f)
    except (OSError, ValueError) as exc:
        warn(f"skipping {path}: {exc}")
        return None
    if not isinstance(data, dict):
        warn(f"skipping {path}: top-level JSON value is not an object")
        return None
    return data


# Task results: test the file name only, so a directory whose name happens to
# contain "causal_kb" does not hide every result file.
task_files = glob(f"{pattern_root}/*.json")
task_files = [f for f in task_files if 'causal_kb' not in os.path.basename(f)]

print(f"\n📊 详细统计报告")
print("=" * 70)
print(f"总任务数: {len(task_files)}")

success_count = 0
reward_sum = 0
reward_count = 0

for task_file in task_files:
    task_data = load_json_object(task_file)
    if task_data is None:
        continue
    if task_data.get('success'):
        success_count += 1
    reward = task_data.get('reward')
    if isinstance(reward, (int, float)):
        reward_sum += reward
        reward_count += 1
    elif reward is not None:
        warn(f"ignoring non-numeric reward in {task_file}: {reward!r}")

if len(task_files) > 0:
    print(f"成功任务数: {success_count}")
    # Unreadable files stay in the denominator, i.e. they count as failures.
    print(f"成功率: {success_count/len(task_files)*100:.2f}%")

if reward_count > 0:
    print(f"平均奖励: {reward_sum/reward_count:.4f}")

# Causal knowledge base statistics.
causal_kb_files = glob(f"{pattern_root}/*_causal_kb.json")
total_knowledge = 0
total_hypotheses = 0
total_interventions = 0
total_validations = 0

for kb_file in causal_kb_files:
    kb_data = load_json_object(kb_file)
    if kb_data is None:
        continue
    try:
        total_knowledge += len(kb_data.get('knowledge_base', []))

        stats = kb_data.get('statistics', {})
        total_hypotheses += stats.get('total_hypotheses', 0)
        total_interventions += stats.get('total_interventions', 0)
        total_validations += (
            stats.get('successful_validations', 0) +
            stats.get('partial_validations', 0) +
            stats.get('failed_validations', 0)
        )
    except (AttributeError, TypeError) as exc:
        # Best effort: a file with an unexpected layout is reported, and the
        # remaining files are still processed.
        warn(f"unexpected structure in {kb_file}: {exc}")

print(f"\n📚 因果知识统计")
print("=" * 70)
print(f"知识库文件数: {len(causal_kb_files)}")
print(f"总知识条目: {total_knowledge}")
print(f"总假设生成: {total_hypotheses}")
print(f"总干预规划: {total_interventions}")
print(f"总效应验证: {total_validations}")

if len(causal_kb_files) > 0:
    print(f"平均每任务知识: {total_knowledge/len(causal_kb_files):.2f} 条")

print("\n" + "=" * 70)
PYEOF
}

generate_report "${output_path}"

# Final results pulled from the run log: the last "Success rate:" line, then
# the last causal-reasoning statistics section (header line plus up to 10
# lines after it).
# The log path is quoted so it is always a single operand, and "--" ends
# option parsing before the pattern. Each grep feeds a pipeline, and the
# script sets only `set -e` (no pipefail), so a missing match or a missing
# log does not abort the script.
echo ""
echo "📊 最终结果:"
grep -- "Success rate:" "${output_path}/log.txt" | tail -1
echo ""
grep -A 10 -- "因果推理统计:" "${output_path}/log.txt" | tail -11

# count_json_files DIR KIND
#   Prints the number of "*.json" entries directly inside DIR.
#     KIND=kb    counts only files named *_causal_kb.json
#     KIND=task  counts only files whose name does not contain "causal"
#   The filter is applied to the file name, never to the directory part: the
#   run directory itself is named "..._causal_...", so filtering whole paths
#   (as `ls | grep -v causal` did) discarded every task result.
count_json_files() {
  local dir=$1 kind=$2 path name count=0
  for path in "${dir}"/*.json; do
    # An unmatched glob is left as the literal pattern; skip it.
    [[ -e "${path}" ]] || continue
    name=${path##*/}
    case "${kind}" in
      kb)   [[ "${name}" == *_causal_kb.json ]] || continue ;;
      task) [[ "${name}" != *causal* ]] || continue ;;
    esac
    count=$((count + 1))
  done
  printf '%s\n' "${count}"
}

echo ""
echo "📁 输出文件:"
echo "  日志: ${output_path}/log.txt"
echo "  任务结果: $(count_json_files "${output_path}" task) 个"
echo "  因果知识库: $(count_json_files "${output_path}" kb) 个"

# Follow-up commands for inspecting this run, printed after a blank line.
# The command lines below are shown to the user only; nothing is executed.
cat <<EOF

🔍 查看详细内容:
  查看日志: tail -f ${output_path}/log.txt
  查看知识库: cat ${output_path}/0_causal_kb.json | python -m json.tool
  查看假设: grep 'H1:' ${output_path}/log.txt | head -20
EOF

