
# rsync -ra -e ssh \
#     "m@mango.cs.unc.edu:/fruitbasket/users/m/project_data/extract_merge1/ll1/per_example_fishers/feather_berts_0.hans_lone.no_embeddings.5k.32k.h5" \
#     "$HOME/Desktop/projects_data/extract_merge1/"


# cp /fruitbasket/users/m/tmp/.bashrc ~/.bashrc 
# cp /fruitbasket/users/m/tmp/.bash_profile ~/.bash_profile 


# Project checkout on the /fruitbasket filesystem.
cd /fruitbasket/users/m/project_code/cuda_nmf1


# Project checkout under the home directory.
# NOTE(review): presumably only one of these two `cd` lines applies on a given
# machine; when both run, this second one determines the working directory.
cd ~/Desktop/projects/cuda_nmf1


# Build-and-run one-liners for the programs under dev_mains/.
# Each binary is launched only when its nvcc build succeeded (`&&`), so a
# failed compile can never silently re-run a stale executable left in build/.

# nvcc dev_mains/single_dense01.cu -I./src -lcublas -lcurand -o build/single_dense01; sudo nvprof ./build/single_dense01
nvcc dev_mains/single_dense01.cu -I./src -I/usr/local/cuda/include -L/usr/local/cuda/lib64 -lcublas -lcurand -o build/single_dense01 && ./build/single_dense01


# Same build twice: a plain run, then a run with NCCL_DEBUG=INFO in its environment.
nvcc dev_mains/multi_dense01.cu -I./src -lcublas -lcurand -lnccl -o build/multi_dense01 && ./build/multi_dense01
nvcc dev_mains/multi_dense01.cu -I./src -lcublas -lcurand -lnccl -o build/multi_dense01 && NCCL_DEBUG=INFO ./build/multi_dense01
# nvcc dev_mains/multi_dense01.cu -I./src -I/miniconda3/envs/wiseft/lib -L/fruitbasket/users/m/miniconda3/envs/wiseft/lib -lcublas -lcurand -lnccl -o build/multi_dense01; CUDA_VISIBLE_DEVICES=2,3 ./build/multi_dense01




# Additionally links cuSPARSE.
nvcc dev_mains/single_sparse_dense_factors1.cu -I./src -I/usr/local/cuda/include -L/usr/local/cuda/lib64 -lcublas -lcurand -lcusparse -o build/single_sparse_dense_factors1 && ./build/single_sparse_dense_factors1




# Additionally links NCCL and the HDF5 C/C++ libraries from the serial HDF5 install paths.
nvcc dev_mains/multi_sparse_dense_factors1.cu -I./src -I/usr/local/cuda/include -I/usr/lib/x86_64-linux-gnu/hdf5/serial/include -L/usr/lib/x86_64-linux-gnu/hdf5/serial/lib -L/usr/local/cuda/lib64 -lnccl -lcublas -lcurand -lcusparse -lhdf5_cpp -lhdf5 -o build/multi_sparse_dense_factors1 && ./build/multi_sparse_dense_factors1

# nvcc dev_mains/multi_sparse_dense_factors1.cu -I./src -I/usr/local/cuda/include -L/usr/local/cuda/lib64 -lnccl -lcublas -lcurand -lcusparse -o build/multi_sparse_dense_factors1;  CUDA_VISIBLE_DEVICES=0,3 ./build/multi_sparse_dense_factors1




# Build run_nmf_on_pefs and, only if the build succeeded (`&&`), run it on the
# locally copied 32k no_embeddings per-example Fishers file with 2048 Fisher
# values, 8 components, 12 iterations and --min_values_per_parameter=8.
# The result is written to /tmp/asdf.h5.
nvcc mains/em/run_nmf_on_pefs.cu -I./src -I/usr/local/cuda/include -I/usr/lib/x86_64-linux-gnu/hdf5/serial/include -L/usr/lib/x86_64-linux-gnu/hdf5/serial/lib -L/usr/local/cuda/lib64 -lnccl -lcublas -lcurand -lcusparse -lhdf5_cpp -lhdf5 -o build/run_nmf_on_pefs && \
./build/run_nmf_on_pefs \
    --n_fisher_values=2048 \
    --output_path=/tmp/asdf.h5 \
    --per_example_fishers="/home/owner/Desktop/projects_data/extract_merge1/feather_berts_0.hans_lone.no_embeddings.5k.32k.h5" \
    --nmf_n_components=8 \
    --nmf_max_iter=12 \
    --min_values_per_parameter=8

    # --sparsity_W=0.9
# Second run against the already-built binary: no --n_fisher_values flag,
# 2 components and --min_values_per_parameter=2. It overwrites the same
# /tmp/asdf.h5 output path as the run above.
./build/run_nmf_on_pefs \
    --output_path=/tmp/asdf.h5 \
    --per_example_fishers="/home/owner/Desktop/projects_data/extract_merge1/feather_berts_0.hans_lone.no_embeddings.5k.32k.h5" \
    --nmf_n_components=2 \
    --nmf_max_iter=12 \
    --min_values_per_parameter=2

# Out of 5k examples, 32k fishers per example
# 1:   2983482 [density=0.0109831]
# 2:   1976811 [density=0.0165762]
# 8:   943685  [density=0.0347235]
# 50:  337222  [density=0.0971704]
# 100: 221000
# 250: 119205
# 500: 69730
# 999: 38060

# 0: 85645056
# 1: 2983482
# 4: 1362012
# 8: 943685

# Rebuild build/run_nmf_on_pefs, linking NCCL, cuBLAS, cuRAND, cuSPARSE and the
# HDF5 C/C++ libraries. The three runs below all read the same 32k
# no_embeddings per-example Fishers file and differ in visible GPUs, component
# count, iteration count and output file.
nvcc mains/em/run_nmf_on_pefs.cu -I./src -I/usr/local/cuda/include -I/usr/lib/x86_64-linux-gnu/hdf5/serial/include -L/usr/lib/x86_64-linux-gnu/hdf5/serial/lib -L/usr/local/cuda/lib64 -lnccl -lcublas -lcurand -lcusparse -lhdf5_cpp -lhdf5 -o build/run_nmf_on_pefs




# Test 1: only GPU 0 visible, 256 components, 300 iterations.
CUDA_VISIBLE_DEVICES=0 ./build/run_nmf_on_pefs \
    --output_path=/fruitbasket/users/m/tmp/cuda_nmf_test1.h5 \
    --nmf_n_components=256 \
    --nmf_max_iter=300 \
    --per_example_fishers="/fruitbasket/users/m/project_data/extract_merge1/ll1/per_example_fishers/feather_berts_0.hans_lone.no_embeddings.5k.32k.h5"

# Test 2: GPUs 0-2 visible, 2048 components, 3000 iterations.
CUDA_VISIBLE_DEVICES=0,1,2 ./build/run_nmf_on_pefs \
    --output_path=/fruitbasket/users/m/tmp/cuda_nmf_test2.h5 \
    --nmf_n_components=2048 \
    --nmf_max_iter=3000 \
    --per_example_fishers="/fruitbasket/users/m/project_data/extract_merge1/ll1/per_example_fishers/feather_berts_0.hans_lone.no_embeddings.5k.32k.h5"


# Test 3: GPUs 0-2 visible, 1024 components, 2000 iterations.
CUDA_VISIBLE_DEVICES=0,1,2 ./build/run_nmf_on_pefs \
    --output_path=/fruitbasket/users/m/tmp/cuda_nmf_test3.h5 \
    --nmf_n_components=1024 \
    --nmf_max_iter=2000 \
    --per_example_fishers="/fruitbasket/users/m/project_data/extract_merge1/ll1/per_example_fishers/feather_berts_0.hans_lone.no_embeddings.5k.32k.h5"


# CUDA_VISIBLE_DEVICES=0,1,2,3 ./build/run_nmf_on_pefs \
#     --output_path=/fruitbasket/users/m/tmp/cuda_nmf_test4.h5 \
#     --nmf_n_components=1024 \
#     --nmf_max_iter=3000 \
#     --per_example_fishers="/fruitbasket/users/m/project_data/extract_merge1/ll1/per_example_fishers/bert_mini_mnli.hans_lone.no_embeddings.5k.131k.h5"

# Handle cases where the number of devices does not divide the number of columns.

# CUDA_VISIBLE_DEVICES=0,1,2,3 ./build/run_nmf_on_pefs \
#     --output_path="/fruitbasket/users/m/project_data/extract_merge1/ll1/per_example_fishers/nmf_decomp.c1024_2kIters.feather_berts_0.hans_lone.all_vars.5k.262144.h5" \
#     --nmf_n_components=1024 \
#     --nmf_max_iter=2000 \
#     --per_example_fishers="/fruitbasket/users/m/project_data/extract_merge1/ll1/per_example_fishers/feather_berts_0.hans_lone.all_vars.5k.262144.h5"


#######################################
# Runs ./build/run_nmf_on_pefs (1024 components, 2000 iterations) on the
# hans_lone all_vars 5k.262144 per-example Fishers file of one feather_berts
# model, writing the result next to the input file.
# Arguments:
#   $1 - value for CUDA_VISIBLE_DEVICES (e.g. 0,1,2,3)
#   $2 - feather_berts model number; selects the input and output file names
#   $3 - value passed as --n_fisher_values; also embedded in the output name
# Returns:
#   2 if fewer than three arguments are given, otherwise the exit status of
#   ./build/run_nmf_on_pefs.
#######################################
run_hans_lone_all_vars_nmf() {
    # Refuse to start a run with empty values spliced into flags and paths.
    if [ "$#" -lt 3 ]; then
        printf 'usage: %s DEVICES MODEL_NUM N_VALS_PE\n' "${FUNCNAME[0]}" >&2
        return 2
    fi

    local devices=$1
    local model_num=$2
    local n_vals_pe=$3
    local data_dir="/fruitbasket/users/m/project_data/extract_merge1/ll1/per_example_fishers"
    local pef_name="feather_berts_${model_num}.hans_lone.all_vars.5k.262144.h5"

    CUDA_VISIBLE_DEVICES="$devices" ./build/run_nmf_on_pefs \
        --output_path="${data_dir}/nmf_decomp.c1024_2kIters_${n_vals_pe}pe.${pef_name}" \
        --nmf_n_components=1024 \
        --n_fisher_values="$n_vals_pe" \
        --nmf_max_iter=2000 \
        --per_example_fishers="${data_dir}/${pef_name}"
}

# n_vals_pe=65536
n_vals_pe=131072

# One run per feather_berts model (0, 15, 1, 25), each on GPUs 0-3.
run_hans_lone_all_vars_nmf 0,1,2,3 0 "$n_vals_pe"
run_hans_lone_all_vars_nmf 0,1,2,3 15 "$n_vals_pe"
run_hans_lone_all_vars_nmf 0,1,2,3 1 "$n_vals_pe"
run_hans_lone_all_vars_nmf 0,1,2,3 25 "$n_vals_pe"







# Three rebuild-then-run experiments on the feather_berts_0 all_vars
# 5k.262144 Fishers file. Every run uses GPUs 0-3 and discards its output
# (--output_path=/dev/null); they differ in component count and
# --n_fisher_values. The nvcc line is repeated so each experiment can be
# pasted on its own.

# Experiment 1: 1024 components, 131072 Fisher values, plus
# --nmf_max_output_magnitude=1e20.
nvcc mains/em/run_nmf_on_pefs.cu -I./src -I/usr/local/cuda/include -I/usr/lib/x86_64-linux-gnu/hdf5/serial/include -L/usr/lib/x86_64-linux-gnu/hdf5/serial/lib -L/usr/local/cuda/lib64 -lnccl -lcublas -lcurand -lcusparse -lhdf5_cpp -lhdf5 -o build/run_nmf_on_pefs
CUDA_VISIBLE_DEVICES=0,1,2,3 ./build/run_nmf_on_pefs \
    --output_path="/dev/null" \
    --nmf_n_components=1024 \
    --n_fisher_values=131072 \
    --nmf_max_iter=2000 \
    --nmf_max_output_magnitude=1e20 \
    --per_example_fishers="/fruitbasket/users/m/project_data/extract_merge1/ll1/per_example_fishers/feather_berts_0.hans_lone.all_vars.5k.262144.h5"


# Experiment 2: 512 components, 65536 Fisher values.
nvcc mains/em/run_nmf_on_pefs.cu -I./src -I/usr/local/cuda/include -I/usr/lib/x86_64-linux-gnu/hdf5/serial/include -L/usr/lib/x86_64-linux-gnu/hdf5/serial/lib -L/usr/local/cuda/lib64 -lnccl -lcublas -lcurand -lcusparse -lhdf5_cpp -lhdf5 -o build/run_nmf_on_pefs
CUDA_VISIBLE_DEVICES=0,1,2,3 ./build/run_nmf_on_pefs \
    --output_path="/dev/null" \
    --nmf_n_components=512 \
    --n_fisher_values=65536 \
    --nmf_max_iter=2000 \
    --per_example_fishers="/fruitbasket/users/m/project_data/extract_merge1/ll1/per_example_fishers/feather_berts_0.hans_lone.all_vars.5k.262144.h5"


# Experiment 3: 1024 components, 32768 Fisher values.
nvcc mains/em/run_nmf_on_pefs.cu -I./src -I/usr/local/cuda/include -I/usr/lib/x86_64-linux-gnu/hdf5/serial/include -L/usr/lib/x86_64-linux-gnu/hdf5/serial/lib -L/usr/local/cuda/lib64 -lnccl -lcublas -lcurand -lcusparse -lhdf5_cpp -lhdf5 -o build/run_nmf_on_pefs
CUDA_VISIBLE_DEVICES=0,1,2,3 ./build/run_nmf_on_pefs \
    --output_path="/dev/null" \
    --nmf_n_components=1024 \
    --n_fisher_values=32768 \
    --nmf_max_iter=2000 \
    --per_example_fishers="/fruitbasket/users/m/project_data/extract_merge1/ll1/per_example_fishers/feather_berts_0.hans_lone.all_vars.5k.262144.h5"



# Build run_nmf_on_pefs and, only if the build succeeded (`&&`), run it on
# GPUs 0-3 against the 32k no_embeddings Fishers file with 1024 components and
# 2000 iterations. Output is discarded (/dev/null). Gating on the build keeps a
# failed compile from re-running a stale binary.
nvcc mains/em/run_nmf_on_pefs.cu -I./src -I/usr/local/cuda/include -I/usr/lib/x86_64-linux-gnu/hdf5/serial/include -L/usr/lib/x86_64-linux-gnu/hdf5/serial/lib -L/usr/local/cuda/lib64 -lnccl -lcublas -lcurand -lcusparse -lhdf5_cpp -lhdf5 -o build/run_nmf_on_pefs && \
CUDA_VISIBLE_DEVICES=0,1,2,3 ./build/run_nmf_on_pefs \
    --output_path="/dev/null" \
    --nmf_n_components=1024 \
    --nmf_max_iter=2000 \
    --per_example_fishers="/fruitbasket/users/m/project_data/extract_merge1/ll1/per_example_fishers/feather_berts_0.hans_lone.no_embeddings.5k.32k.h5"




# n_vals_pe=65536
# Rebuild, then decompose the feather_berts_0 all_vars 5k.262144 Fishers file on
# GPUs 0-3 with --nmf_max_output_magnitude=1e20. The output name carries a
# "nmf_decomp.2." prefix, so it does not collide with a "nmf_decomp.c1024..."
# file for the same input.
nvcc mains/em/run_nmf_on_pefs.cu -I./src -I/usr/local/cuda/include -I/usr/lib/x86_64-linux-gnu/hdf5/serial/include -L/usr/lib/x86_64-linux-gnu/hdf5/serial/lib -L/usr/local/cuda/lib64 -lnccl -lcublas -lcurand -lcusparse -lhdf5_cpp -lhdf5 -o build/run_nmf_on_pefs
# Used both as the --n_fisher_values argument and inside the output file name.
n_vals_pe=131072
CUDA_VISIBLE_DEVICES=0,1,2,3 ./build/run_nmf_on_pefs \
    --output_path="/fruitbasket/users/m/project_data/extract_merge1/ll1/per_example_fishers/nmf_decomp.2.c1024_2kIters_${n_vals_pe}pe.feather_berts_0.hans_lone.all_vars.5k.262144.h5" \
    --nmf_n_components=1024 \
    --n_fisher_values=$n_vals_pe \
    --nmf_max_output_magnitude=1e20 \
    --nmf_max_iter=2000 \
    --per_example_fishers="/fruitbasket/users/m/project_data/extract_merge1/ll1/per_example_fishers/feather_berts_0.hans_lone.all_vars.5k.262144.h5"


# Rebuild, then run 3000 iterations with 1024 components on the 32k
# no_embeddings Fishers file; output is discarded (/dev/null).
nvcc mains/em/run_nmf_on_pefs.cu -I./src -I/usr/local/cuda/include -I/usr/lib/x86_64-linux-gnu/hdf5/serial/include -L/usr/lib/x86_64-linux-gnu/hdf5/serial/lib -L/usr/local/cuda/lib64 -lnccl -lcublas -lcurand -lcusparse -lhdf5_cpp -lhdf5 -o build/run_nmf_on_pefs
CUDA_VISIBLE_DEVICES=0,1,2,3 ./build/run_nmf_on_pefs \
    --output_path=/dev/null \
    --nmf_n_components=1024 \
    --nmf_max_iter=3000 \
    --per_example_fishers="/fruitbasket/users/m/project_data/extract_merge1/ll1/per_example_fishers/feather_berts_0.hans_lone.no_embeddings.5k.32k.h5"


# Rebuild, then run the same small problem (2048 Fisher values, 8 components,
# 120 iterations, output discarded) twice on the 32k no_embeddings Fishers
# file: first with GPUs 0-3 visible, then with only GPU 0 visible.
nvcc mains/em/run_nmf_on_pefs.cu \
    -I./src \
    -I/usr/local/cuda/include \
    -I/usr/lib/x86_64-linux-gnu/hdf5/serial/include \
    -L/usr/lib/x86_64-linux-gnu/hdf5/serial/lib \
    -L/usr/local/cuda/lib64 \
    -lnccl -lcublas -lcurand -lcusparse -lhdf5_cpp -lhdf5 \
    -o build/run_nmf_on_pefs
CUDA_VISIBLE_DEVICES=0,1,2,3 ./build/run_nmf_on_pefs \
    --n_fisher_values=2048 \
    --output_path=/dev/null \
    --nmf_n_components=8 \
    --nmf_max_iter=120 \
    --per_example_fishers="/fruitbasket/users/m/project_data/extract_merge1/ll1/per_example_fishers/feather_berts_0.hans_lone.no_embeddings.5k.32k.h5"
CUDA_VISIBLE_DEVICES=0 ./build/run_nmf_on_pefs \
    --n_fisher_values=2048 \
    --output_path=/dev/null \
    --nmf_n_components=8 \
    --nmf_max_iter=120 \
    --per_example_fishers="/fruitbasket/users/m/project_data/extract_merge1/ll1/per_example_fishers/feather_berts_0.hans_lone.no_embeddings.5k.32k.h5"



#######################################
# Runs ./build/run_nmf_on_pefs (1024 components, 2000 iterations) on the
# hans_lone_with_flipped all_vars 10k.131072 per-example Fishers file of one
# feather_berts model, writing the result next to the input file.
# Arguments:
#   $1 - value for CUDA_VISIBLE_DEVICES (e.g. 1,2,3)
#   $2 - feather_berts model number; selects the input and output file names
#   $3 - value passed as --n_fisher_values; also embedded in the output name
# Returns:
#   2 if fewer than three arguments are given, otherwise the exit status of
#   ./build/run_nmf_on_pefs.
#######################################
run_hans_lone_with_flipped_all_vars_nmf() {
    # Refuse to start a run with empty values spliced into flags and paths.
    if [ "$#" -lt 3 ]; then
        printf 'usage: %s DEVICES MODEL_NUM N_VALS_PE\n' "${FUNCNAME[0]}" >&2
        return 2
    fi

    local devices=$1
    local model_num=$2
    local n_vals_pe=$3
    # Input and output live in the same directory; name it once.
    local data_dir="/fruitbasket/users/m/project_data/extract_merge1/ll1/per_example_fishers"
    local pef_name="feather_berts_${model_num}.hans_lone_with_flipped.all_vars.10k.131072.h5"

    CUDA_VISIBLE_DEVICES="$devices" ./build/run_nmf_on_pefs \
        --output_path="${data_dir}/nmf_decomp.c1024_2kIters_${n_vals_pe}pe.${pef_name}" \
        --nmf_n_components=1024 \
        --n_fisher_values="$n_vals_pe" \
        --nmf_max_iter=2000 \
        --per_example_fishers="${data_dir}/${pef_name}"
}

# Process feather_berts models 0, 15, 1 and 25 in that order, each on GPUs 1-3
# with 65536 as the n_vals_pe argument.
for model_num in 0 15 1 25; do
    run_hans_lone_with_flipped_all_vars_nmf 1,2,3 "$model_num" 65536
done



#############################################################################


# QQP run: work from the /fruitbasket checkout and rebuild the binary first.
cd /fruitbasket/users/m/project_code/cuda_nmf1

nvcc mains/em/run_nmf_on_pefs.cu -I./src -I/usr/local/cuda/include -I/usr/lib/x86_64-linux-gnu/hdf5/serial/include -L/usr/lib/x86_64-linux-gnu/hdf5/serial/lib -L/usr/local/cuda/lib64 -lnccl -lcublas -lcurand -lcusparse -lhdf5_cpp -lhdf5 -o build/run_nmf_on_pefs

# Run settings; each is passed as a flag below and also embedded in the output
# file name (n_vals_pe -> --n_fisher_values, n_comps -> --nmf_n_components,
# n_ex -> --n_examples).
n_vals_pe=32768
n_comps=768
n_ex=5000
# Runs on GPUs 0-3 for 2000 iterations against the bert_base_qqp qqp_val
# first_20k Fishers file under pi1/per_example_fishers.
CUDA_VISIBLE_DEVICES=0,1,2,3 ./build/run_nmf_on_pefs \
    --output_path="/fruitbasket/users/m/project_data/extract_merge1/pi1/per_example_fishers/nmf_decomp.c${n_comps}_2kIters_${n_vals_pe}pe_${n_ex}ex.bert_base_qqp.qqp_val.all_vars.first_20k.131072.h5" \
    --nmf_n_components=$n_comps \
    --n_fisher_values=$n_vals_pe \
    --nmf_max_iter=2000 \
    --n_examples=$n_ex \
    --per_example_fishers="/fruitbasket/users/m/project_data/extract_merge1/pi1/per_example_fishers/bert_base_qqp.qqp_val.all_vars.first_20k.131072.h5"




#############################################################################


# QQP run with --min_values_per_parameter: work from the /fruitbasket checkout
# and rebuild the binary first.
cd /fruitbasket/users/m/project_code/cuda_nmf1


nvcc mains/em/run_nmf_on_pefs.cu -I./src -I/usr/local/cuda/include -I/usr/lib/x86_64-linux-gnu/hdf5/serial/include -L/usr/lib/x86_64-linux-gnu/hdf5/serial/lib -L/usr/local/cuda/lib64 -lnccl -lcublas -lcurand -lcusparse -lhdf5_cpp -lhdf5 -o build/run_nmf_on_pefs


# Run settings; each is passed as a flag below and also embedded in the output
# file name (n_vals_pe -> --n_fisher_values, n_comps -> --nmf_n_components,
# n_ex -> --n_examples).
n_vals_pe=65536
n_comps=1024
n_ex=20000

# Exactly one min_values_per_parameter assignment is active (8); the other
# candidate values are kept commented out for quick switching.
# min_values_per_parameter=1
# min_values_per_parameter=32
# min_values_per_parameter=4
min_values_per_parameter=8
# min_values_per_parameter=16

# Runs on GPUs 0-3 for 2000 iterations; the chosen min_values_per_parameter
# appears in the output name as the "mvpp" suffix.
CUDA_VISIBLE_DEVICES=0,1,2,3 ./build/run_nmf_on_pefs \
    --output_path="/fruitbasket/users/m/project_data/extract_merge1/pi1/per_example_fishers/nmf_decomp.c${n_comps}_2kIters_${n_vals_pe}pe_${n_ex}ex_mvpp${min_values_per_parameter}.bert_base_qqp.qqp_val.all_vars.first_20k.131072.h5" \
    --nmf_n_components=$n_comps \
    --n_fisher_values=$n_vals_pe \
    --nmf_max_iter=2000 \
    --n_examples=$n_ex \
    --min_values_per_parameter=$min_values_per_parameter \
    --per_example_fishers="/fruitbasket/users/m/project_data/extract_merge1/pi1/per_example_fishers/bert_base_qqp.qqp_val.all_vars.first_20k.131072.h5"


#  0: 109483778
#  1: 39093667
#  4: 14646051
#  8: 7998520
# 32: 2366619