# SimCSE for Stealing Sentence Embedding Encoders

## Running the code

Examples of how to run the code:

Train on the NLI dataset with the Tiny BERT architecture (to train BERT or
RoBERTa instead, change `--model_name_or_path` to the respective model name):

```shell

# Value passed to torch.distributed.launch as --nproc_per_node.
NUM_GPU=2

# Fill these in before running. The placeholders are quoted on purpose:
# a bare <fill_in> on the command line is parsed by the shell as
# input/output redirections instead of being passed as an argument.
TRAIN_FILE="<fill_in>"
OUTPUT_DIR="<fill_in>"

# Randomly set a port number ($RANDOM is 0-32767, so the port is 1000-33767).
# If you encounter "address already used" error, just run again or manually set an available port id.
PORT_ID=$(( RANDOM + 1000 ))

# Allow multiple threads
export OMP_NUM_THREADS=8

# Use distributed data parallel
# If you only want to use one card, uncomment the following line and comment the line with "torch.distributed.launch"
# python train.py \
python -m torch.distributed.launch --nproc_per_node "$NUM_GPU" --master_port "$PORT_ID" train.py \
    --model_name_or_path prajjwal1/bert-tiny \
    --train_file "$TRAIN_FILE" \
    --output_dir "$OUTPUT_DIR" \
    --num_train_epochs 10 \
    --per_device_train_batch_size 128 \
    --learning_rate 5e-5 \
    --max_seq_length 32 \
    --evaluation_strategy steps \
    --metric_for_best_model stsb_spearman \
    --load_best_model_at_end \
    --eval_steps 125 \
    --pooler_type cls \
    --overwrite_output_dir \
    --temp 0.05 \
    --do_train \
    --do_eval \
    --fp16 \
    "$@"  # forward any extra command-line arguments to train.py
```

Run Dataset Inference (DI) for NLP:

```shell

#!/bin/bash
# Runs DatasetInference_v2.py on the qqp and flickr data splits. For each
# dataset it evaluates, in order: the victim model trained on that dataset,
# the victim models of the other datasets ("independent"), and the stolen
# copies of the victim.
#
# The *_model, *_with_* and *_train1/_train2/_test variables below are looked
# up by name through ${!var} indirection, so they appear unused to linters.

# Victim models: set root_victim to the directory that holds them.
root_victim=""
qqp_model="${root_victim}/my-sup-simcse-tiny-bert-qqp-10epochs_unconverted"
flickr_model="${root_victim}/my-sup-simcse-tiny-bert-flickr-10epochs_unconverted"
mnli_model="${root_victim}/my-sup-simcse-tiny-bert-mnli-10epochs_unconverted"
# nli_model is defined for reference; no loop below selects it.
nli_model="${root_victim}/my-sup-simcse-tiny-bert-nli-10epochs_unconverted"

# Stolen models, named <victim dataset>_with_<dataset used for stealing>.
root_stolen="${root_victim}/stolen_models/deprecated_because_stolen_with_too_few_queries"
qqp_with_qqp="${root_stolen}/qqp-with-qqp-full-mse-fromCL-standard"
qqp_with_flickr="${root_stolen}/qqp-with-flickr-full-mse-fromCL-standard"
qqp_with_mnli="${root_stolen}/qqp-with-mnli-full-mse-fromCL-standard"
flickr_with_qqp="${root_stolen}/flickr-with-qqp-full-mse-fromCL-standard"
flickr_with_flickr="${root_stolen}/flickr-with-flickr-full-mse-fromCL-standard"
flickr_with_mnli="${root_stolen}/flickr-with-mnli-full-mse-fromCL-standard"

# Datasets: set root_data to the directory that holds the CSV files.
root_data=""
qqp_train1="${root_data}/qqp/DI-cleaned-1-qqp_train.csv"
qqp_train2="${root_data}/qqp/DI-cleaned-2-qqp_train.csv"
qqp_test="${root_data}/qqp/DI-cleaned-qqp_test.csv"
flickr_train1="${root_data}/flickr30k/DI-train1-full.csv"
flickr_train2="${root_data}/flickr30k/DI-train2-full.csv"
flickr_test="${root_data}/flickr30k/DI-test-full.csv"

# One log name shared by every run started from this invocation.
logname=$(date +"%Y-%m-%d-%T")

for data in qqp flickr; do
  # Choose the --bandwidth value per dataset on every iteration, so the
  # result does not depend on the order of the dataset list.
  if [[ "$data" == "flickr" ]]; then
    bandwidth="1"
  else
    bandwidth="0.1"
  fi

  # Names of the variables holding this dataset's splits and victim model.
  train1="${data}_train1"
  train2="${data}_train2"
  test_split="${data}_test"
  victim="${data}_model"

  # Arguments shared by all runs for this dataset; arrays keep paths that
  # contain spaces or glob characters intact.
  split_args=(--train1 "${!train1}" --train2 "${!train2}" --test "${!test_split}")
  run_args=(--logname "$logname" --bandwidth "$bandwidth")

  python3 DatasetInference_v2.py "${split_args[@]}" --use_pooler True --model "${!victim}" "${run_args[@]}"
  echo "finish DI for victim ${data}"

  # Victim models trained on a different dataset serve as independent models.
  for independent in qqp flickr mnli; do
    if [[ "$data" != "$independent" ]]; then
      independent_model="${independent}_model"
      python3 DatasetInference_v2.py "${split_args[@]}" --use_pooler True --model "${!independent_model}" "${run_args[@]}"
      echo "finish DI for independent ${independent}"
    fi
  done

  # NOTE(review): the stolen-model runs do not pass --use_pooler, exactly as
  # in the original commands; confirm this is intended.
  for stolen in qqp flickr mnli; do
    stolen_model="${data}_with_${stolen}"
    python3 DatasetInference_v2.py "${split_args[@]}" --model "${!stolen_model}" "${run_args[@]}"
    echo "finish DI for stolen ${stolen_model}"
  done
done


```


Run evaluation:

```shell

# Timestamp with nanoseconds (%N is a GNU date extension) for the log file name.
timestamp=$(date +%Y-%m-%d-%H-%M-%S-%N)
log_file="log_${timestamp}.log"

# Append the project root to PYTHONPATH. The ${VAR:+...} form adds the ':'
# separator only when PYTHONPATH is already non-empty, so no empty leading
# entry is created when it is unset.
export PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}/code/SimCSE-Steal"

# Run the evaluation in the background, detached from the terminal, with
# stdout and stderr appended to the log file.
CUDA_VISIBLE_DEVICES=0,1,2,3 nohup python evaluation.py \
  --model_name_or_path="princeton-nlp/sup-simcse-roberta-large" \
  --pooler="cls" \
  --mode="test" \
  >>"$log_file" 2>&1 &

# Print the log file name so the run can be followed, e.g. with tail -f.
echo "$log_file"

```

