
# rsync -ra -e ssh \
#     "m@mango.cs.unc.edu:/fruitbasket/users/m/project_data/extract_merge1/ll1/per_example_fishers/spH.nmf_decomp.full.5k.32k.256.feather_berts_0.hans_lone.no_embeddings.5k.32k.h5" \
#     "$HOME/Desktop/projects_data/extract_merge1/"


# Move into the cuda_nmf1 project root. Every relative path used by the
# commands in this file (mains/, src/, build/, build_scripts/) is resolved
# from here.
# Location of the checkout under /fruitbasket:
cd '/fruitbasket/users/m/project_code/cuda_nmf1'


# Location of the checkout under the home directory
# (presumably the local workstation; only one of the two exists per machine).
cd ~/Desktop/projects/cuda_nmf1


# Compile mains/em/run_nmf_on_pefs.cu and run the resulting binary.
# The two steps are chained with `&&` so the run is skipped when nvcc fails;
# with a plain `;` a stale build/run_nmf_on_pefs left over from an earlier
# build would be executed and its output mistaken for the current source.
# NOTE(review): --initial_H_path and --output_path name the same file
# (/tmp/asdf.h5) - confirm that using one file for both is intended.
nvcc mains/em/run_nmf_on_pefs.cu \
    -I./src \
    -I/usr/local/cuda/include \
    -I/usr/lib/x86_64-linux-gnu/hdf5/serial/include \
    -L/usr/lib/x86_64-linux-gnu/hdf5/serial/lib \
    -L/usr/local/cuda/lib64 \
    -lnccl -lcublas -lcurand -lcusparse -lhdf5_cpp -lhdf5 \
    -o build/run_nmf_on_pefs &&
./build/run_nmf_on_pefs \
    --n_fisher_values=2048 \
    --output_path=/tmp/asdf.h5 \
    --per_example_fishers="/home/owner/Desktop/projects_data/extract_merge1/feather_berts_0.hans_lone.no_embeddings.5k.32k.h5" \
    --nmf_n_components=8 \
    --nmf_max_iter=12 \
    --min_values_per_parameter=8 \
    --initial_H_path=/tmp/asdf.h5



# Compile dev_mains/spgemm_example.cu and run the resulting binary.
# `&&` (rather than `;`) ensures the example is only executed after a
# successful compile, so a stale build/spgemm_example is never run by mistake.
nvcc dev_mains/spgemm_example.cu \
    -I./src \
    -I/usr/local/cuda/include \
    -I/usr/lib/x86_64-linux-gnu/hdf5/serial/include \
    -L/usr/lib/x86_64-linux-gnu/hdf5/serial/lib \
    -L/usr/local/cuda/lib64 \
    -lnccl -lcublas -lcurand -lcusparse -lhdf5_cpp -lhdf5 \
    -o build/spgemm_example &&
./build/spgemm_example






# Build step only (no run) for fit_coeffs_to_sparse_H.
# Commented-out inline nvcc invocations, kept for reference; the build script
# on the last line is what actually runs.
# nvcc mains/em/fit_coeffs_to_sparse_H.cu -O2 -I./src -I/usr/local/cuda/include -I/usr/lib/x86_64-linux-gnu/hdf5/serial/include -L/usr/lib/x86_64-linux-gnu/hdf5/serial/lib -L/usr/local/cuda/lib64 -lnccl -lcublas -lcurand -lcusparse -lhdf5_cpp -lhdf5 -o build/fit_coeffs_to_sparse_H
# /usr/local/cuda/bin/nvcc mains/em/fit_coeffs_to_sparse_H.cu -gencode arch=compute_75,code=sm_75 -rdc=true -O2 -I./src -I/usr/local/cuda/include -I/usr/lib/x86_64-linux-gnu/hdf5/serial/include -L/usr/lib/x86_64-linux-gnu/hdf5/serial/lib -L/usr/local/cuda/lib64 -lnccl -lcublas -lcurand -lcusparse -lcudadevrt -lhdf5_cpp -lhdf5 -o build/fit_coeffs_to_sparse_H
sh './build_scripts/em/fit_coeffs_to_sparse_H.sh'


# Build fit_coeffs_to_sparse_H through its build script, then run it on the
# .h5 files stored under /home/owner/Desktop/projects_data/extract_merge1.
# The build and the run are joined with `&&` so that a failed build does not
# fall through to executing a stale build/fit_coeffs_to_sparse_H binary.
# Commented-out inline nvcc invocations, kept for reference:
# nvcc mains/em/fit_coeffs_to_sparse_H.cu -O2 -I./src -I/usr/local/cuda/include -I/usr/lib/x86_64-linux-gnu/hdf5/serial/include -L/usr/lib/x86_64-linux-gnu/hdf5/serial/lib -L/usr/local/cuda/lib64 -lnccl -lcublas -lcurand -lcusparse -lhdf5_cpp -lhdf5 -o build/fit_coeffs_to_sparse_H; \
# /usr/local/cuda/bin/nvcc mains/em/fit_coeffs_to_sparse_H.cu -gencode arch=compute_75,code=sm_75 -rdc=true -O2 -I./src -I/usr/local/cuda/include -I/usr/lib/x86_64-linux-gnu/hdf5/serial/include -L/usr/lib/x86_64-linux-gnu/hdf5/serial/lib -L/usr/local/cuda/lib64 -lnccl -lcublas -lcurand -lcusparse -lcudadevrt -lhdf5_cpp -lhdf5 -o build/fit_coeffs_to_sparse_H; \
sh ./build_scripts/em/fit_coeffs_to_sparse_H.sh &&
./build/fit_coeffs_to_sparse_H \
    --output_path=/tmp/asdf.h5 \
    --pef_path="/home/owner/Desktop/projects_data/extract_merge1/feather_berts_0.hans_lone.no_embeddings.5k.32k.h5" \
    --H_path="/home/owner/Desktop/projects_data/extract_merge1/spH.nmf_decomp.full.5k.32k.256.feather_berts_0.hans_lone.no_embeddings.5k.32k.h5" \
    --nmf_max_iter=256 \
    --n_examples=2500 \
    --n_fisher_values=1024 \
    --DEV_n_cols=100000 \
    --n_splits_sparse_matmul=10 \
    --n_row_splits_pefs=25






# Build step only (no run) for fit_coeffs_to_sparse_H, via the build script.
# Commented-out inline nvcc invocation, kept for reference:
# nvcc mains/em/fit_coeffs_to_sparse_H.cu -O2 -I./src -I/usr/local/cuda/include -I/usr/lib/x86_64-linux-gnu/hdf5/serial/include -L/usr/lib/x86_64-linux-gnu/hdf5/serial/lib -L/usr/local/cuda/lib64 -lnccl -lcublas -lcurand -lcusparse -lhdf5_cpp -lhdf5 -o build/fit_coeffs_to_sparse_H
sh './build_scripts/em/fit_coeffs_to_sparse_H.sh'



# Directory layout used by the commands that reference these variables.
# Every path is derived from DATA_DIR, so relocating the data only requires
# changing the first assignment. (Assignments are not word-split, so the
# unquoted expansions on the right-hand side are safe.)
DATA_DIR='/fruitbasket/users/m/project_data/extract_merge1'
EXPS_DIR=$DATA_DIR/pi1
DATASETS_DIR=$EXPS_DIR/datasets
MODELS_DIR=$EXPS_DIR/models
FISHER_DIR=$EXPS_DIR/fishers
PER_EXAMPLE_FISHERS_DIR=$EXPS_DIR/per_example_fishers

# Build fit_coeffs_to_sparse_H, then run it with only GPU 0 visible, writing
# to spH.fit_coeffs_to_sparse_H.test1.h5. Input paths are built from
# PER_EXAMPLE_FISHERS_DIR, which must already be set in the current shell.
# The build and the run are joined with `&&` so that a failed build does not
# start a (potentially very long, --nmf_max_iter=25000) run of a stale binary.
#
# Alternative first line of the run command, exposing GPUs 0 and 2 instead:
# CUDA_VISIBLE_DEVICES=0,2 ./build/fit_coeffs_to_sparse_H \

# Commented-out inline nvcc invocation, kept for reference:
# nvcc mains/em/fit_coeffs_to_sparse_H.cu -O2 -I./src -I/usr/local/cuda/include -I/usr/lib/x86_64-linux-gnu/hdf5/serial/include -L/usr/lib/x86_64-linux-gnu/hdf5/serial/lib -L/usr/local/cuda/lib64 -lnccl -lcublas -lcurand -lcusparse -lhdf5_cpp -lhdf5 -o build/fit_coeffs_to_sparse_H; \
sh ./build_scripts/em/fit_coeffs_to_sparse_H.sh &&
CUDA_VISIBLE_DEVICES=0 ./build/fit_coeffs_to_sparse_H \
    --output_path=/fruitbasket/users/m/tmp/spH.fit_coeffs_to_sparse_H.test1.h5 \
    --pef_path="${PER_EXAMPLE_FISHERS_DIR}/feather_berts_0.snli_train.all_vars.50000ex.65536.h5" \
    --H_path="${PER_EXAMPLE_FISHERS_DIR}/spH.nmf_decomp.c512_1250Iters_65536pe_mvpp10_50000ex.feather_berts_0.snli_train.all_vars.50000ex.65536.h5" \
    --n_examples=10000 \
    --nmf_max_iter=25000 \
    --n_fisher_values=65536 \
    --n_splits_sparse_matmul=2048


# Build fit_coeffs_to_sparse_H, then run it with only GPU 0 visible on the
# snli_validation per-example file and the nmf_decomp2 H file, writing to
# spH.fit_coeffs_to_sparse_H.test2.h5. Input paths are built from
# PER_EXAMPLE_FISHERS_DIR, which must already be set in the current shell.
# The build and the run are joined with `&&` so that a failed build does not
# start a run of a stale binary.
sh ./build_scripts/em/fit_coeffs_to_sparse_H.sh &&
CUDA_VISIBLE_DEVICES=0 ./build/fit_coeffs_to_sparse_H \
    --output_path=/fruitbasket/users/m/tmp/spH.fit_coeffs_to_sparse_H.test2.h5 \
    --pef_path="${PER_EXAMPLE_FISHERS_DIR}/feather_berts_0.snli_validation.all_vars.10000ex.65536.h5" \
    --H_path="${PER_EXAMPLE_FISHERS_DIR}/spH.nmf_decomp2.c512_1250Iters_65536pe_mvpp10_50000ex.feather_berts_0.snli_train.all_vars.50000ex.65536.h5" \
    --nmf_max_iter=12000 \
    --n_splits_sparse_matmul=2048


# - Split stuff so that no int64s are needed.
# - OpenMP splitting.
# - Support factors with nnz = 0.

# - Split to get roughly even nnzs (maybe across both matrices combined in some way).




# DATA_DIR=/fruitbasket/users/m/project_data/extract_merge1
# EXPS_DIR="${DATA_DIR}/pi1"
# DATASETS_DIR="${EXPS_DIR}/datasets"
# MODELS_DIR="${EXPS_DIR}/models"
# FISHER_DIR="${EXPS_DIR}/fishers"
# PER_EXAMPLE_FISHERS_DIR="${EXPS_DIR}/per_example_fishers"

# sparsify_mnli_snli_nmf() {
#     local model_num=$1
#     local n_comps=$2
#     local n_vals_pe=$3
#     local min_values_per_parameter=$4
#     local n_examples=$5

#     local pef_name="feather_berts_${model_num}.snli_train.all_vars.50000ex.65536.h5"
#     local nmf_name="nmf_decomp.c${n_comps}_1250Iters_${n_vals_pe}pe_mvpp${min_values_per_parameter}_${n_examples}ex.${pef_name}"

#     local output_name="spH.${nmf_name}"

#     CUDA_VISIBLE_DEVICES= python scripts1/sparse/sparsify_nmf.py \
#         --nmf_path="${PER_EXAMPLE_FISHERS_DIR}/${nmf_name}" \
#         --output_path="${PER_EXAMPLE_FISHERS_DIR}/${output_name}" \
#         --H_threshold=1e-10
# }

# sparsify_mnli_snli_nmf 0 512 65536 10 50000

