# Robust In-Context Reinforcement Learning Under Reward Poisoning Attacks

## Setup

```bash
pip install -r requirements.txt
```

## Running Experiments

### Bandit
```bash
python collect_data.py --env bandit --H 500 --dim 5 --var 0.3 --envs_eval 200
python train.py --env bandit --arch 1 --var 0.3 --H 500 --dim 5 --envs_eval 200 --num_epochs 500
# Run with --seed 0-9 for replications

python train_bandit_adv.py \
    --env bandit --arch 1 --variance 0.3 --context_len 500 --n_actions 5 --n_epochs 500 --epoch 400 \
    --n_envs_eval 200 --n_rounds 20 --eps_episodes 0.8 --eps_steps 0.1 --victim_iters 20 --victim_lr 0.00003 --attacker_iters 20 --attacker_lr 0.03 --max_poison_diff 3.0 \
    --attacker_against dpt --seed 0
# Run with different --attacker_against algs and --seed 0-9 for different algorithms and replications
# Run with --attacker_against dpt and --log_round_rewards to log performance throughout rounds

# Plot per-round training performance across replications (requires the runs above to have used --log_round_rewards):
python bandit_plot_adv_rounds.py --env bandit --arch 1 --variance 0.3 --context_len 500 --n_actions 5 --n_envs_eval 200 --n_epochs 500 --epoch 400 --n_rounds 20 --n_seeds 10 --victim_iters 20 --victim_lr 0.00003 --attacker_iters 20 --attacker_lr 0.03 --max_poison_diff 3.0 --attacker_against dpt
```

Reproduce the bandit results (on a cluster running Slurm; adjust the sbatch settings for your own cluster):
```bash
# Assume collect_data.py was run
# Assume train.py was run for seeds 0-9

export EPS_STEPS=0.1  # Repeat for 0.2 and 0.4

# Train attackers:
sbatch --export EPS_STEPS=${EPS_STEPS},AGAINST=dpt b_adv_train_batch.sh  # AT-DPT
sbatch --export EPS_STEPS=${EPS_STEPS},AGAINST=dpt_frozen b_adv_train_batch.sh  # DPT (frozen)
sbatch --export EPS_STEPS=${EPS_STEPS},AGAINST=ts b_adv_train_batch.sh  # TS
sbatch --export EPS_STEPS=${EPS_STEPS},AGAINST=rts b_adv_train_batch.sh  # RTS (modified)
sbatch --export EPS_STEPS=${EPS_STEPS},AGAINST=ucb b_adv_train_batch.sh  # UCB1.0
sbatch --export EPS_STEPS=${EPS_STEPS},AGAINST=crucb b_adv_train_batch.sh  # crUCB
sbatch --export EPS_STEPS=${EPS_STEPS},AGAINST=crucb_v b_adv_train_batch.sh  # crUCB (low sigma_0 parameter)
sbatch --export EPS_STEPS=${EPS_STEPS},AGAINST=crucb_p b_adv_train_batch.sh  # crUCB (modified)

# Evaluate different algorithms
sbatch --export EPS_STEPS=${EPS_STEPS},AGAINST=dpt b_ev_alg_against_all.sh
sbatch --export EPS_STEPS=${EPS_STEPS},AGAINST=dpt_frozen b_ev_alg_against_all.sh
sbatch --export EPS_STEPS=${EPS_STEPS},AGAINST=ts b_ev_alg_against_all.sh
sbatch --export EPS_STEPS=${EPS_STEPS},AGAINST=ucb b_ev_alg_against_all.sh
sbatch --export EPS_STEPS=${EPS_STEPS},AGAINST=rts b_ev_alg_against_all.sh
sbatch --export EPS_STEPS=${EPS_STEPS},AGAINST=rts_u b_ev_alg_against_all.sh
sbatch --export EPS_STEPS=${EPS_STEPS},AGAINST=rts_k b_ev_alg_against_all.sh
sbatch --export EPS_STEPS=${EPS_STEPS},AGAINST=crucb b_ev_alg_against_all.sh
sbatch --export EPS_STEPS=${EPS_STEPS},AGAINST=crucb_v b_ev_alg_against_all.sh
sbatch --export EPS_STEPS=${EPS_STEPS},AGAINST=crucb_p b_ev_alg_against_all.sh
# sbatch --export EPS_STEPS=${EPS_STEPS} b_ev_clean.sh  # Evaluate all on clean environment

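# Print the results. NOTE: the directory name below hard-codes epss0.1; presumably it must be changed to match EPS_STEPS when repeating for 0.2 and 0.4.
python3 bandit_print_adv_againsts.py --setup_dir bandit_envs100000_ctxlen500_actions5_variance0.3_arch1_epochs500_epoch400_evalenvs200stepsNone_rounds20_epse0.8_epss0.1_victimiters20lr3e-05_attackeriters20lr0.03_maxpoisondiff3.0reg10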
```

### Bandit, adaptive attacker

Reproduce bandit results with adaptive attacker:
```bash
# Assume collect_data.py was run
# Assume train.py was run for seeds 0-9

export EPS_STEPS=0.1  # Repeat for 0.2 and 0.4

# Train attackers
sbatch --export EPS_STEPS=${EPS_STEPS},N_ROUNDS=400,AGAINST=dpt b_adv_train_adaptive_batch.sh
sbatch --export EPS_STEPS=${EPS_STEPS},N_ROUNDS=400,AGAINST=ts b_adv_train_adaptive_batch.sh

# Evaluate different algorithms
sbatch --export EPS_STEPS=${EPS_STEPS},N_ROUNDS=400,AGAINST=dpt b_ev_alg_against_all_adaptive.sh
sbatch --export EPS_STEPS=${EPS_STEPS},N_ROUNDS=400,AGAINST=nonadaptive_dpt b_ev_alg_against_all_adaptive.sh
sbatch --export EPS_STEPS=${EPS_STEPS},N_ROUNDS=400,AGAINST=dpt_frozen b_ev_alg_against_all_adaptive.sh
sbatch --export EPS_STEPS=${EPS_STEPS},N_ROUNDS=400,AGAINST=ts b_ev_alg_against_all_adaptive.sh
sbatch --export EPS_STEPS=${EPS_STEPS},N_ROUNDS=400,AGAINST=rts b_ev_alg_against_all_adaptive.sh
sbatch --export EPS_STEPS=${EPS_STEPS},N_ROUNDS=400,AGAINST=ucb b_ev_alg_against_all_adaptive.sh
sbatch --export EPS_STEPS=${EPS_STEPS},N_ROUNDS=400,AGAINST=crucb_p b_ev_alg_against_all_adaptive.sh
```


### Darkroom

```bash
# Darkroom
python collect_data2.py --env darkroom --n_states 25
python train2.py --env darkroom --n_states 25 --n_epochs 400 --lr 1e-4
# Run with --seed 0-9 for replications, or sbatch d_train_batch.sh

python mdp_train_adv.py --env darkroom --n_states 25 --arch 1 --n_epochs 600 --epoch 200 --n_steps_eval 100 --max_poison_diff 10 --victim_lr 0.0003 --attacker_lr 0.001 --n_rounds 300 --attacker_against dpt
# Run with different --attacker_against algs and --seed 0-9 for different algorithms and replications
```

Reproduce Darkroom results:
```bash
# Assume collect_data2.py was run
# Assume train2.py was run for seeds 0-9

export EPS_STEPS=0.1  # Repeat for 0.2 and 0.4

# Train attackers
sbatch --export EPS_STEPS=${EPS_STEPS},AGAINST=dpt d_adv_train_batch.sh
sbatch --export EPS_STEPS=${EPS_STEPS},AGAINST=dpt_frozen d_adv_train_batch.sh
sbatch --export EPS_STEPS=${EPS_STEPS},AGAINST=npg,ATTACKER_ITERS=1 d_adv_train_batch.sh
sbatch --export EPS_STEPS=${EPS_STEPS},AGAINST=ql,ATTACKER_ITERS=1 d_adv_train_batch.sh

# Evaluate different algorithms
sbatch --export EPS_STEPS=${EPS_STEPS},AGAINST=dpt d_eval_alg_against_all.sh
sbatch --export EPS_STEPS=${EPS_STEPS},AGAINST=dpt_frozen d_eval_alg_against_all.sh
sbatch --export EPS_STEPS=${EPS_STEPS},AGAINST=ql d_eval_alg_against_all.sh
sbatch --export EPS_STEPS=${EPS_STEPS} d_eval_npg_against_all.sh
```


### Miniworld

```bash
for i in {0..11}; do
    start=$(( i * 5000 ))
    end=$(( (i + 1) * 5000 - 1 ))
    xvfb-run -a -s "-screen 0 1024x768x24 -ac +extension GLX +render -noreset" python3 collect_data.py --env miniworld --H 250 --env_id_start $start --env_id_end $end &
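done
# The collection jobs above run in the background (&); make sure they have all finished (e.g. run `wait`) before training.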

python3 train.py --env miniworld --envs 60000 --H 250 --lr 0.0001 --layer 4 --head 4 --shuffle --num_epochs 400
# Run with --seed 0-9 for replications

python mdp_train_adv.py --env miniworld --n_envs 60000 --arch 1 --context_len 250 --n_actions 4 --n_epochs 1000 --epoch 200 --n_envs_eval 40 --max_poison_diff 5 --victim_iters 30 --victim_lr 0.00003 --attacker_iters 20 --attacker_lr 0.01 --n_rounds 100 --attacker_against dpt
# Run with different --attacker_against algs and --seed 0-9 for different algorithms and replications
```

Reproduce Miniworld results:
```bash
# Assume collect_data.py was run
# Assume train.py was run for seeds 0-9

export EPS_STEPS=0.1  # Repeat for 0.2 and 0.4

# Train attackers
sbatch --export EPS_STEPS=${EPS_STEPS},AGAINST=dpt m_adv_train_batch.sh
sbatch --export EPS_STEPS=${EPS_STEPS},AGAINST=dpt_frozen m_adv_train_batch.sh

# Evaluate different algorithms
sbatch --export EPS_STEPS=${EPS_STEPS},AGAINST=dpt m_eval_alg_against_all.sh
sbatch --export EPS_STEPS=${EPS_STEPS},AGAINST=dpt_frozen m_eval_alg_against_all.sh
# Evaluate PPO
sbatch --export EPS_STEPS=${EPS_STEPS},AGAINST=unifrand m_eval_ppo_against_cl_ur_oneseed.sh
sbatch --export AGAINST=clean m_eval_ppo_against_cl_ur_oneseed.sh
# Show PPO results
python mw_print_ppo_againsts.py --setup_dir miniworld_envs60000_ctxlen250_arch1_epochs1000_epoch200_evalenvs40stepsNone_rounds100_epse0.8_epss${EPS_STEPS}_victimiters30lr3e-05_attackeriters20lr0.01_maxpoisondiff5.0reg10 --n_seeds 10
```


## Test
```bash
pip install pytest ddt
python -m pytest -v -s ./test
```
