# ---------------------------------------------------------------------------
# HH-RLHF evaluation
# ---------------------------------------------------------------------------

# Step 1: reformat the training data.
# NOTE(review): train_data is not passed to hh-reformat.py here; presumably the
# script falls back to a default dataset -- confirm.
train_data="anonymous/hh-rlhf_with_features"
python hh-reformat.py --save_local

# Step 2: add log_score.
# The dataset path is "out/" followed by train_data with everything up to and
# including its first "/" removed.
preprocessed_data="out/${train_data#*/}"
python compute-logprobs.py google/flan-t5-large \
    --load_local --save_local \
    --dataset "$preprocessed_data" \
    --options chosen rejected \
    --context human \
    --response-prefix assistant

# Reformat the evaluation data (this time the dataset is passed explicitly).
eval_data="anonymous/hh-generated_flan_t5_large_with_features2"
python ../feature_extract/hh-reformat.py --save_local --dataset "$eval_data"

# Add log_score to the evaluation data.
# Same path derivation as for the training data: "out/" plus eval_data with
# everything up to and including its first "/" removed.
preprocessed_data="out/${eval_data#*/}"
python compute-logprobs.py google/flan-t5-large \
    --load_local --save_local \
    --dataset "$preprocessed_data" \
    --singleton \
    --context prompt \
    --response-prefix response

## BoN(n=256)
# Run bon_candidate.py with the two logistic .pkl model paths, then run
# claude_eval.py on one .feather file per model.
candidate1=out/hh-rlhf_with_features_flan_t5_large_train_logistic.pkl
candidate2=out/hh-rlhf_with_features_flan_t5_large_test_logistic.pkl
python bon_candidate.py \
    --scored-samples anonymous/hh-generated_flan_t5_large_with_features2 \
    --model-paths "$candidate1" "$candidate2"

# Re-point each candidate at its .feather file. The file name embeds the model
# path with the leading "out/" stripped (presumably the naming scheme used by
# bon_candidate.py for its outputs -- confirm).
candidate1="out/hh-generated_flan_t5_large_with_features2_vanila_score_vs_${candidate1#*/}.feather"
candidate2="out/hh-generated_flan_t5_large_with_features2_vanila_score_vs_${candidate2#*/}.feather"
for comparison in "$candidate1" "$candidate2"; do
    python claude_eval.py --dataset "$comparison"
done


# BoN on the zero-shot scored samples with four scorers, run with --chunk_eval.
candidate1=out/hh-rlhf_with_features_flan_t5_large_flan_t5_zeroshot_train_logistic_zeroshot_g.pkl
candidate2=out/hh-rlhf_with_features_flan_t5_large_flan_t5_zeroshot_test_logistic_zeroshot_g.pkl
# NOTE(review): a "_test.json" variant used to be assigned to candidate3 and was
# immediately overwritten by the "_train.json" path below, so only the
# "_train.json" file was ever passed on. The dead assignment is kept as a
# comment; confirm which file is intended.
# candidate3=../reward_model/hh-generated_flan_t5_large_annotated_google_flan-t5-xl_test.json
candidate3=../reward_model/hh-generated_flan_t5_large_annotated_google_flan-t5-xl_train.json
candidate4=anonymous/hh-generated_flan_t5_rx_xl_all
python bon_candidate.py \
    --scored-samples anonymous/hh-generated_flan_t5_large_flan_t5_zeroshot \
    --model-paths "$candidate1" "$candidate2" "$candidate3" "$candidate4" \
    --chunk_eval

# Run claude_eval.py on the four .feather files for each index 0-5
# (presumably one set per chunk written under --chunk_eval -- confirm).
for i in {0..5}; do
    candidate1="out/hh-generated_flan_t5_large_flan_t5_zeroshot_${i}_vanila_score_vs_hh-rlhf_with_features_flan_t5_large_flan_t5_zeroshot_train_logistic_zeroshot_g.pkl.feather"
    candidate2="out/hh-generated_flan_t5_large_flan_t5_zeroshot_${i}_vanila_score_vs_hh-rlhf_with_features_flan_t5_large_flan_t5_zeroshot_test_logistic_zeroshot_g.pkl.feather"
    candidate3="out/hh-generated_flan_t5_large_flan_t5_zeroshot_${i}_vanila_score_vs_pm_score.feather"
    candidate4="out/hh-generated_flan_t5_large_flan_t5_zeroshot_${i}_vanila_score_vs_external_rm1.feather"
    for comparison in "$candidate1" "$candidate2" "$candidate3" "$candidate4"; do
        python claude_eval.py --dataset "$comparison"
    done
done

## scaling experiment
# flan-t5-small zero-shot variant: run bon_candidate.py with the two logistic
# .pkl model paths under --chunk_eval, then claude_eval.py per index 0-5.
candidate5=out/hh-rlhf_with_features_flan_t5_large_flan_t5_small_zeroshot_train_logistic_zeroshot_small_g_TEST_.pkl
candidate6=out/hh-rlhf_with_features_flan_t5_large_flan_t5_small_zeroshot_test_logistic_zeroshot_small_g_TEST_.pkl
python bon_candidate.py \
    --scored-samples anonymous/hh-generated_flan_t5_large_flan_t5_small_zeroshot \
    --model-paths "$candidate5" "$candidate6" \
    --chunk_eval

# The .feather prefix must match the --scored-samples name above
# ("..._flan_t5_small_zeroshot"), as it does in the base and large variants.
# The prefix used to be "..._flan_t5_zeroshot", which points at the files of
# the plain zero-shot run rather than this small-model run.
for i in {0..5}; do
    candidate5="out/hh-generated_flan_t5_large_flan_t5_small_zeroshot_${i}_vanila_score_vs_hh-rlhf_with_features_flan_t5_large_flan_t5_small_zeroshot_train_logistic_zeroshot_small_g_TEST_.pkl.feather"
    candidate6="out/hh-generated_flan_t5_large_flan_t5_small_zeroshot_${i}_vanila_score_vs_hh-rlhf_with_features_flan_t5_large_flan_t5_small_zeroshot_test_logistic_zeroshot_small_g_TEST_.pkl.feather"
    python claude_eval.py --dataset "$candidate5"
    python claude_eval.py --dataset "$candidate6"
done

# Scaling experiment, flan-t5-base zero-shot variant: run bon_candidate.py with
# the two logistic .pkl model paths under --chunk_eval, then claude_eval.py on
# the matching .feather files for each index 0-5.
candidate3=out/hh-rlhf_with_features_flan_t5_large_flan_t5_base_zeroshot_train_logistic_zeroshot_base_g_TEST_.pkl
candidate4=out/hh-rlhf_with_features_flan_t5_large_flan_t5_base_zeroshot_test_logistic_zeroshot_base_g_TEST_.pkl
python bon_candidate.py \
    --scored-samples anonymous/hh-generated_flan_t5_large_flan_t5_base_zeroshot \
    --model-paths "$candidate3" "$candidate4" \
    --chunk_eval

for i in 0 1 2 3 4 5; do
    candidate3="out/hh-generated_flan_t5_large_flan_t5_base_zeroshot_${i}_vanila_score_vs_hh-rlhf_with_features_flan_t5_large_flan_t5_base_zeroshot_train_logistic_zeroshot_base_g_TEST_.pkl.feather"
    candidate4="out/hh-generated_flan_t5_large_flan_t5_base_zeroshot_${i}_vanila_score_vs_hh-rlhf_with_features_flan_t5_large_flan_t5_base_zeroshot_test_logistic_zeroshot_base_g_TEST_.pkl.feather"
    for comparison in "$candidate3" "$candidate4"; do
        python claude_eval.py --dataset "$comparison"
    done
done


# Scaling experiment, flan-t5-large zero-shot variant: run bon_candidate.py
# with the two logistic .pkl model paths under --chunk_eval, then
# claude_eval.py on the matching .feather files for each index 0-5.
candidate1=out/hh-rlhf_with_features_flan_t5_large_flan_t5_large_zeroshot_train_logistic_zeroshot_large_g_TEST_.pkl
candidate2=out/hh-rlhf_with_features_flan_t5_large_flan_t5_large_zeroshot_test_logistic_zeroshot_large_g_TEST_.pkl
python bon_candidate.py \
    --scored-samples anonymous/hh-generated_flan_t5_large_flan_t5_large_zeroshot \
    --model-paths "$candidate1" "$candidate2" \
    --chunk_eval

for i in 0 1 2 3 4 5; do
    candidate1="out/hh-generated_flan_t5_large_flan_t5_large_zeroshot_${i}_vanila_score_vs_hh-rlhf_with_features_flan_t5_large_flan_t5_large_zeroshot_train_logistic_zeroshot_large_g_TEST_.pkl.feather"
    candidate2="out/hh-generated_flan_t5_large_flan_t5_large_zeroshot_${i}_vanila_score_vs_hh-rlhf_with_features_flan_t5_large_flan_t5_large_zeroshot_test_logistic_zeroshot_large_g_TEST_.pkl.feather"
    for comparison in "$candidate1" "$candidate2"; do
        python claude_eval.py --dataset "$comparison"
    done
done


# ---------------------------------------------------------------------------
# SHP evaluation
# ---------------------------------------------------------------------------
train_data="anonymous/shp_with_features_20k"

# Reformatting is disabled for SHP; the command is kept for reference.
# python hh-reformat.py --save_local

# Add log_score.
# Unlike the HH-RLHF section, the dataset name is used as-is (no "out/" path)
# and --load_local is not passed.
preprocessed_data=$train_data
python compute-logprobs.py google/flan-t5-large \
    --save_local \
    --dataset "$preprocessed_data" \
    --options A B \
    --context history \
    --response-prefix human_ref

# Add log_score to the SHP evaluation data.
# Per the original author's note, this dataset is the same as
# https://anonymous/anonymous/Localized_mle/blob/main/feature_extract/generated_flan_t5_large_annotated_gpt-3.5-turbo.json
eval_data="anonymous/shp-generated_flan_t5_large_with_features"
python compute-logprobs.py google/flan-t5-large \
    --save_local \
    --dataset "$eval_data" \
    --singleton \
    --context prompt \
    --response-prefix response

## BoN -> Claude eval
# Run bon_candidate.py with the two SHP logistic .pkl model paths, then run
# claude_eval.py on one .feather file per model.
candidate1=out/shp_with_features_20k_flan_t5_large_train_logistic_g.pkl
candidate2=out/shp_with_features_20k_flan_t5_large_test_logistic_g.pkl
python bon_candidate.py \
    --scored-samples anonymous/shp-generated_flan_t5_large_with_features \
    --model-paths "$candidate1" "$candidate2"

# Re-point each candidate at its .feather file; the file name embeds the model
# path with the leading "out/" stripped.
candidate1="out/shp-generated_flan_t5_large_with_features_vanila_score_vs_${candidate1#*/}.feather"
candidate2="out/shp-generated_flan_t5_large_with_features_vanila_score_vs_${candidate2#*/}.feather"
for comparison in "$candidate1" "$candidate2"; do
    python claude_eval.py --dataset "$comparison"
done

## BoN(n=256)
# BoN on the SHP zero-shot scored samples with four scorers, run with
# --chunk_eval.
candidate1=out/shp_with_features_20k_flan_t5_large_flan_t5_zeroshot_train_logistic_zeroshot_g.pkl
candidate2=out/shp_with_features_20k_flan_t5_large_flan_t5_zeroshot_test_logistic_zeroshot_g.pkl
# NOTE(review): a "_test.json" variant used to be assigned to candidate3 and was
# immediately overwritten by the "_train.json" path below, so only the
# "_train.json" file was ever passed on. The dead assignment is kept as a
# comment; confirm which file is intended.
# candidate3=../reward_model/shp-generated_flan_t5_large_annotated_google_flan-t5-xl_test.json
candidate3=../reward_model/shp-generated_flan_t5_large_annotated_google_flan-t5-xl_train.json
candidate4=data/shp-generated_flan_t5_large_flan_t5_zeroshot_sileod
python bon_candidate.py \
    --scored-samples anonymous/shp-generated_flan_t5_large_flan_t5_zeroshot \
    --model-paths "$candidate1" "$candidate2" "$candidate3" "$candidate4" \
    --chunk_eval

# Run claude_eval.py on the four .feather files for each index 0-5
# (presumably one set per chunk written under --chunk_eval -- confirm).
for i in {0..5}; do
    candidate1="out/shp-generated_flan_t5_large_flan_t5_zeroshot_${i}_vanila_score_vs_shp_with_features_20k_flan_t5_large_flan_t5_zeroshot_train_logistic_zeroshot_g.pkl.feather"
    candidate2="out/shp-generated_flan_t5_large_flan_t5_zeroshot_${i}_vanila_score_vs_shp_with_features_20k_flan_t5_large_flan_t5_zeroshot_test_logistic_zeroshot_g.pkl.feather"
    candidate3="out/shp-generated_flan_t5_large_flan_t5_zeroshot_${i}_vanila_score_vs_pm_score.feather"
    # NOTE(review): the "_sileod" suffix after ".feather" differs from the
    # HH-RLHF counterpart ("..._vs_external_rm1.feather"); presumably the file
    # was renamed by hand -- confirm it exists under this exact name.
    candidate4="out/shp-generated_flan_t5_large_flan_t5_zeroshot_${i}_vanila_score_vs_external_rm1.feather_sileod"
    for comparison in "$candidate1" "$candidate2" "$candidate3" "$candidate4"; do
        python claude_eval.py --dataset "$comparison"
    done
done
