
# WebShop pipeline for Qwen/Qwen3-14B: train -> insight extraction -> eval,
# followed by the fewshot / insights modification (faithfulness) eval runs.
#
# Usage: start this script from a directory directly below the project root;
# it changes to the parent directory and runs train.py, insight_extraction.py
# and eval.py from there by relative path.
#
# The script is written in POSIX sh so it behaves the same under sh and bash.

# Abort on unhandled command failures and on use of unset variables.
set -eu

cd .. || {
  printf 'error: cannot change to the parent directory of %s\n' "$PWD" >&2
  exit 1
}

# Fail early with a clear message instead of letting python report a missing file.
if [ ! -f train.py ]; then
  printf 'error: train.py not found in %s\n' "$PWD" >&2
  exit 1
fi

readonly BENCHMARK=webshop
readonly LLM=Qwen/Qwen3-14B
readonly TAG=14B
# Run name written by train.py and loaded by insight_extraction.py.
readonly TRAIN_RUN="${BENCHMARK}_${TAG}"
# Run name written by insight_extraction.py and loaded by every eval run.
readonly INSIGHTS_RUN="insights-extraction-${TRAIN_RUN}"

# Number of eval runs that exited non-zero.
failed_evals=0

#######################################
# Run eval.py against the extracted insights with the shared settings.
# Globals:   BENCHMARK, INSIGHTS_RUN, LLM (read); failed_evals (written)
# Arguments: $1 - run_name for this evaluation
#            $2.. - extra overrides, inserted after agent.fewshot_strategy
# Returns:   0 always; a failing run is reported on stderr and counted so
#            the remaining, independent eval runs still execute.
#######################################
run_eval() {
  eval_run_name=$1
  shift
  if ! python eval.py \
    "benchmark=${BENCHMARK}" \
    "load_run_name=extracted_insights/${INSIGHTS_RUN}" \
    "run_name=${eval_run_name}" \
    agent.fewshot_strategy=task_similarity \
    "$@" \
    testing=false resume=false \
    "agent.llm=${LLM}"; then
    printf 'warning: eval run %s failed\n' "${eval_run_name}" >&2
    failed_evals=$((failed_evals + 1))
  fi
}

# Stage 1: training. Later stages load this run, so a failure stops the script.
python train.py \
  "benchmark=${BENCHMARK}" \
  "run_name=${TRAIN_RUN}" \
  "agent.llm=${LLM}" \
  testing=false resume=false

# Stage 2: insight extraction from the training run. The eval runs load its
# output, so a failure here also stops the script.
python insight_extraction.py \
  "benchmark=${BENCHMARK}" \
  "load_run_name=${TRAIN_RUN}" \
  "run_name=${INSIGHTS_RUN}" \
  "agent.llm=${LLM}" \
  agent.max_num_rules=10 agent.success_critique_num=8 \
  testing=false resume=false

# Stage 3: baseline evaluation.
run_eval "${TRAIN_RUN}"

# Test the fewshot modification feature.
for modification in empty shuffle irrelevant; do
  run_eval "${BENCHMARK}_faithfulness_${modification}_${TAG}" \
    agent.faithfulness_experiment=true \
    "agent.fewshot_modification_type=${modification}"
done

# Test the insights modification feature.
for modification in empty corrupted irrelevant filler_tokens; do
  run_eval "${BENCHMARK}_insights_${modification}_${TAG}" \
    agent.faithfulness_experiment=true \
    "agent.insights_modification_type=${modification}"
done

# # Test the combined modification (fewshot + insights).
# # NOTE: unlike the runs above, this disabled command does not pass agent.llm
# # and its run_name has no model-size suffix.
# python eval.py benchmark=webshop load_run_name=extracted_insights/insights-extraction-webshop_14B run_name=webshop_combined_corrupted agent.fewshot_strategy=task_similarity agent.faithfulness_experiment=true agent.fewshot_modification_type=corrupted agent.insights_modification_type=corrupted testing=false resume=false

# Report eval failures through the exit status instead of hiding them.
if [ "${failed_evals}" -gt 0 ]; then
  printf 'error: %s eval run(s) failed\n' "${failed_evals}" >&2
  exit 1
fi