
# Sync step: presumably copies the working copy to /fruitbasket (going by the
# script name) -- TODO confirm. Kept as its own command because it is addressed
# through the local ~/Desktop checkout.
sh ~/Desktop/projects/cuda_m_npeff/dev_scripts/move_code_to_fruit.sh


# Prepend the user-local bin directory so binaries installed there win.
export PATH="/home/m/.local/bin:${PATH}"

# Configure and build for CUDA architecture 86. The steps are chained with &&
# so that a failed cd cannot configure/build in whatever directory the shell
# happens to be in, and a failed configure cannot fall through to a build of
# a stale tree.
cd /fruitbasket/users/m/project_code/cuda_m_npeff \
    && cmake -DCMAKE_CUDA_ARCHITECTURES=86 -S . -B build \
    && cmake --build build

# Compile on mango/banana, then can run on guava/watermelon.

###########################################################


# Full-size run_stiefel_m_npeff runs with GPUs 0-3 visible.
# Input PEFs are read from the /playpen directory; outputs go to /fruitbasket
# and (for runs 003 and 004) are copied back next to the inputs afterwards.
playpen_dir=/playpen/users/m/project_data/m_npeff1/per_example_fishers
fruit_dir=/fruitbasket/users/m/project_data/extract_merge1/m_npeff1/per_example_fishers
train_pefs="${playpen_dir}/feather_berts_0.train.50000ex.65536.h5"

# Run 002: 256 components, only --learning_rate_G given.
CUDA_VISIBLE_DEVICES=0,1,2,3 ./build/mains/run_stiefel_m_npeff \
    --pef_filepath="${train_pefs}" \
    --min_nonzero_per_col=8 \
    --n_components=256 \
    --learning_rate_G=1e-3 \
    --output_filepath="${fruit_dir}/test_stiefel_m_npeff002.h5" \
    --n_preprocess_cpu_threads=32 \
    --n_iters_G_only=650 \
    --n_iters_joint=1500


# Run 003: 64 components, separate learning rates for the G-only and joint flags.
CUDA_VISIBLE_DEVICES=0,1,2,3 ./build/mains/run_stiefel_m_npeff \
    --pef_filepath="${train_pefs}" \
    --min_nonzero_per_col=8 \
    --n_components=64 \
    --learning_rate_G_G_only=1e-3 \
    --learning_rate_G=1e-4 \
    --output_filepath="${fruit_dir}/test_stiefel_m_npeff003.h5" \
    --n_preprocess_cpu_threads=32 \
    --n_iters_G_only=650 \
    --n_iters_joint=1500

cp "${fruit_dir}/test_stiefel_m_npeff003.h5" "${playpen_dir}"


# Run 004: same settings as run 003 but with 256 components.
CUDA_VISIBLE_DEVICES=0,1,2,3 ./build/mains/run_stiefel_m_npeff \
    --pef_filepath="${train_pefs}" \
    --min_nonzero_per_col=8 \
    --n_components=256 \
    --learning_rate_G_G_only=1e-3 \
    --learning_rate_G=1e-4 \
    --output_filepath="${fruit_dir}/test_stiefel_m_npeff004.h5" \
    --n_preprocess_cpu_threads=32 \
    --n_iters_G_only=650 \
    --n_iters_joint=1500

cp "${fruit_dir}/test_stiefel_m_npeff004.h5" "${playpen_dir}"


###########################################################


# Stiefel run 001 plus a coefficient fit against its decomposition.
playpen_dir=/playpen/users/m/project_data/m_npeff1/per_example_fishers
fruit_dir=/fruitbasket/users/m/project_data/extract_merge1/m_npeff1/per_example_fishers
stiefel_001="${fruit_dir}/test_stiefel_m_npeff001.h5"

# NOTE(review): this copy is listed ahead of the run that writes the file;
# the order is kept exactly as it was recorded.
cp "${stiefel_001}" "${playpen_dir}"


# Decomposition run: 256 components, GPUs 0-3 visible.
CUDA_VISIBLE_DEVICES=0,1,2,3 ./build/mains/run_stiefel_m_npeff \
    --pef_filepath="${playpen_dir}/feather_berts_0.train.50000ex.65536.h5" \
    --min_nonzero_per_col=8 \
    --n_components=256 \
    --learning_rate_G_G_only=1e-2 \
    --learning_rate_G=1e-4 \
    --output_filepath="${stiefel_001}" \
    --n_preprocess_cpu_threads=32 \
    --n_iters_G_only=150 \
    --n_iters_joint=1000



# Coefficient fit on the train_skip_50k PEFs, with only GPU 2 visible.
CUDA_VISIBLE_DEVICES=2 ./build/mains/fit_m_npeff_coeffs \
    --pef_filepath="${playpen_dir}/feather_berts_0.train_skip_50k.50000ex.65536.h5" \
    --decomposition_filepath="${stiefel_001}" \
    --output_filepath="${fruit_dir}/test_stiefel_m_npeff001.coeffs_fit001.h5" \
    --n_preprocess_cpu_threads=32 \
    --n_iters=2000 \
    --n_columns_per_chunk=2500000 \
    --n_examples_per_chunk=20000


###########################################################


# Rebuild, then run run_stiefel_m_npeff on the small_cifar 512-example PEFs
# with only GPU 3 visible. The build and the run are joined with && (not ;) so
# a failed build does not silently run the previously built binary.
cmake --build build && CUDA_VISIBLE_DEVICES=3 ./build/mains/run_stiefel_m_npeff \
    --pef_filepath="/fruitbasket/users/m/project_data/extract_merge1/m_npeff1/per_example_fishers/small_cifar_lrm_pefs.512ex.h5" \
    --min_nonzero_per_col=1 \
    --n_components=16 \
    --learning_rate_G=1e-3 \
    --output_filepath=/fruitbasket/users/m/project_data/extract_merge1/m_npeff1/per_example_fishers/test_stiefel_m_npeff.small_cifar001.h5 \
    --n_preprocess_cpu_threads=32 \
    --n_iters_G_only=500 \
    --n_iters_joint=400 \
    --n_examples=512


# Rebuild, then run against the 100-example PEFs under ${HOME}/Desktop,
# writing a throwaway result to /tmp/blah.h5. Same && guard as above.
cmake --build build && ./build/mains/run_stiefel_m_npeff \
    --pef_filepath="${HOME}/Desktop/projects_data/extract_merge1/feather_berts_0.train.100ex.65536.h5" \
    --min_nonzero_per_col=16 \
    --n_components=8 \
    --learning_rate_G=1e-3 \
    --output_filepath=/tmp/blah.h5 \
    --n_preprocess_cpu_threads=6 \
    --n_iters_G_only=500 \
    --n_iters_joint=250

# CUDA_VISIBLE_DEVICES=1,2 ./build/mains/run_stiefel_m_npeff \
#      --pef_filepath="/playpen/users/m/project_data/m_npeff1/per_example_fishers/feather_berts_0.train.50000ex.65536.h5" \
#      --min_nonzero_per_col=8 \
#      --n_components=16 \
#      --learning_rate_G=1e-5 \
#      --output_filepath=/fruitbasket/users/m/project_data/extract_merge1/m_npeff1/per_example_fishers/test_stiefel_m_npeff001.h5 \
#      --n_preprocess_cpu_threads=32 \
#      --n_iters_G_only=500 \
#      --n_iters_joint=1000 \
#      --n_examples=512


###########################################################


# Rebuild, then run run_m_npeff on the 100-example PEFs under ${HOME}/Desktop.
# && (instead of ;) stops a failed build from running the stale binary.
cmake --build build && ./build/mains/run_m_npeff \
    --pef_filepath="${HOME}/Desktop/projects_data/extract_merge1/feather_berts_0.train.100ex.65536.h5" \
    --min_nonzero_per_col=8 \
    --n_components=16 \
    --learning_rate_G=1e-3 \
    --output_filepath=/tmp/blah.h5 \
    --n_preprocess_cpu_threads=6 \
    --n_iters_G_only=250 \
    --n_iters_joint=250

# Same run with --orthogonal_regularization_strength=1 added.
# Note that it overwrites the same /tmp/blah.h5 output.
cmake --build build && ./build/mains/run_m_npeff \
    --pef_filepath="${HOME}/Desktop/projects_data/extract_merge1/feather_berts_0.train.100ex.65536.h5" \
    --min_nonzero_per_col=8 \
    --n_components=16 \
    --learning_rate_G=1e-3 \
    --output_filepath=/tmp/blah.h5 \
    --n_preprocess_cpu_threads=6 \
    --n_iters_G_only=250 \
    --n_iters_joint=250 \
    --orthogonal_regularization_strength=1


# Rebuild, then run run_m_npeff_expansion: reads the decomposition in
# /tmp/blah.h5, asks for 4 additional components and writes /tmp/blah2.h5.
# && (instead of ;) stops a failed build from running the stale binary.
# The detached duplicate of the final flag that used to follow this command
# has been removed; on its own line the shell would try to execute it.
cmake --build build && ./build/mains/run_m_npeff_expansion \
    --pef_filepath="${HOME}/Desktop/projects_data/extract_merge1/feather_berts_0.train.100ex.65536.h5" \
    --decomposition_filepath=/tmp/blah.h5 \
    --n_additional_components=4 \
    --learning_rate_G=1e-3 \
    --output_filepath=/tmp/blah2.h5 \
    --n_preprocess_cpu_threads=6 \
    --n_iters_G_only=250 \
    --n_iters_joint_expansion_only=250 \
    --n_iters_joint=250 \
    --use_W_from_decomposition=false




# Rebuild, then fit coefficients for the 100-example PEFs under ${HOME}/Desktop
# against the decomposition in /tmp/blah.h5, writing /tmp/blah2.h5.
# && (instead of ;) stops a failed build from running the stale binary.
cmake --build build && ./build/mains/fit_m_npeff_coeffs \
    --pef_filepath="${HOME}/Desktop/projects_data/extract_merge1/feather_berts_0.train.100ex.65536.h5" \
    --decomposition_filepath=/tmp/blah.h5 \
    --output_filepath=/tmp/blah2.h5 \
    --n_preprocess_cpu_threads=6 \
    --n_iters=250 \
    --n_columns_per_chunk=10000 \
    --n_examples_per_chunk=10




# run_m_npeff on the 100-example PEFs stored under /fruitbasket, with only
# GPU 3 visible; the result is a throwaway file in /tmp.
fruit_dir=/fruitbasket/users/m/project_data/extract_merge1/m_npeff1/per_example_fishers
CUDA_VISIBLE_DEVICES=3 ./build/mains/run_m_npeff \
    --pef_filepath="${fruit_dir}/feather_berts_0.train.100ex.65536.h5" \
    --min_nonzero_per_col=64 \
    --n_components=16 \
    --learning_rate_G=1e-3 \
    --output_filepath="/tmp/blah.h5" \
    --n_preprocess_cpu_threads=6 \
    --n_iters_G_only=250 \
    --n_iters_joint=250








# Guava local PEFs folder (written as a comment so the bare path is never
# executed as a command):
#   /playpen/users/m/project_data/m_npeff1/per_example_fishers


# run_m_npeff runs 001 and 002, followed by a coefficient fit for run 002.
playpen_dir=/playpen/users/m/project_data/m_npeff1/per_example_fishers
fruit_dir=/fruitbasket/users/m/project_data/extract_merge1/m_npeff1/per_example_fishers

# NOTE(review): the two decomposition runs read from /playpen/m/... whereas
# every other command in these notes uses /playpen/users/m/... -- confirm which
# path is intended. The path is reproduced exactly as recorded.
train_pefs_alt="/playpen/m/project_data/m_npeff1/per_example_fishers/feather_berts_0.train.50000ex.65536.h5"

# Run 001: limited to 15000 examples, with --log_loss_frequency=1.
./build/mains/run_m_npeff \
    --pef_filepath="${train_pefs_alt}" \
    --min_nonzero_per_col=8 \
    --n_components=256 \
    --learning_rate_G=1e-4 \
    --output_filepath="${fruit_dir}/test_mnpeff_001.h5" \
    --n_preprocess_cpu_threads=32 \
    --n_iters_G_only=250 \
    --n_iters_joint=1000 \
    --n_examples=15000 \
    --log_loss_frequency=1



# Run 002: same settings without the example limit or loss-logging flag.
./build/mains/run_m_npeff \
    --pef_filepath="${train_pefs_alt}" \
    --min_nonzero_per_col=8 \
    --n_components=256 \
    --learning_rate_G=1e-4 \
    --output_filepath="${fruit_dir}/test_mnpeff_002.h5" \
    --n_preprocess_cpu_threads=32 \
    --n_iters_G_only=250 \
    --n_iters_joint=1000



# Coefficient fit on the train_skip_50k PEFs using the /playpen copy of the
# run 002 decomposition.
./build/mains/fit_m_npeff_coeffs \
    --pef_filepath="${playpen_dir}/feather_berts_0.train_skip_50k.50000ex.65536.h5" \
    --decomposition_filepath="${playpen_dir}/test_mnpeff_002.h5" \
    --output_filepath="${fruit_dir}/test_mnpeff_002.coeffs_fit001.h5" \
    --n_preprocess_cpu_threads=32 \
    --n_iters=2000 \
    --n_columns_per_chunk=2500000 \
    --n_examples_per_chunk=20000



# run_m_npeff experiments 003-007, all with GPUs 0-3 visible.
playpen_dir=/playpen/users/m/project_data/m_npeff1/per_example_fishers
fruit_dir=/fruitbasket/users/m/project_data/extract_merge1/m_npeff1/per_example_fishers
train_pefs="${playpen_dir}/feather_berts_0.train.50000ex.65536.h5"

# 003: shorter G-only phase, larger G learning rate for the joint phase.
CUDA_VISIBLE_DEVICES=0,1,2,3 ./build/mains/run_m_npeff \
    --pef_filepath="${train_pefs}" \
    --min_nonzero_per_col=8 \
    --n_components=256 \
    --learning_rate_G_G_only=1e-4 \
    --learning_rate_G=1e-3 \
    --output_filepath="${fruit_dir}/test_mnpeff_003.h5" \
    --n_preprocess_cpu_threads=32 \
    --n_iters_G_only=100 \
    --n_iters_joint=1000


# 004: 512 components.
CUDA_VISIBLE_DEVICES=0,1,2,3 ./build/mains/run_m_npeff \
    --pef_filepath="${train_pefs}" \
    --min_nonzero_per_col=8 \
    --n_components=512 \
    --learning_rate_G_G_only=1e-4 \
    --learning_rate_G=1e-2 \
    --output_filepath="${fruit_dir}/test_mnpeff_004.h5" \
    --n_preprocess_cpu_threads=32 \
    --n_iters_G_only=50 \
    --n_iters_joint=1250


# 005: orthogonal regularization, strength 1.
CUDA_VISIBLE_DEVICES=0,1,2,3 ./build/mains/run_m_npeff \
    --pef_filepath="${train_pefs}" \
    --min_nonzero_per_col=8 \
    --n_components=256 \
    --learning_rate_G_G_only=1e-4 \
    --learning_rate_G=1e-3 \
    --output_filepath="${fruit_dir}/test_mnpeff_005.h5" \
    --n_preprocess_cpu_threads=32 \
    --n_iters_G_only=50 \
    --n_iters_joint=1000 \
    --orthogonal_regularization_strength=1


# 006: orthogonal regularization with a much larger strength (1000).
CUDA_VISIBLE_DEVICES=0,1,2,3 ./build/mains/run_m_npeff \
    --pef_filepath="${train_pefs}" \
    --min_nonzero_per_col=8 \
    --n_components=256 \
    --learning_rate_G_G_only=1e-4 \
    --learning_rate_G=1e-4 \
    --output_filepath="${fruit_dir}/test_mnpeff_006.h5" \
    --n_preprocess_cpu_threads=32 \
    --n_iters_G_only=50 \
    --n_iters_joint=1000 \
    --orthogonal_regularization_strength=1000



# 007: reads the ".131072" PEFs file instead of the ".65536" one.
CUDA_VISIBLE_DEVICES=0,1,2,3 ./build/mains/run_m_npeff \
    --pef_filepath="${playpen_dir}/feather_berts_0.train.50000ex.131072.h5" \
    --min_nonzero_per_col=8 \
    --n_components=256 \
    --learning_rate_G_G_only=1e-4 \
    --learning_rate_G=1e-3 \
    --output_filepath="${fruit_dir}/test_mnpeff_007.h5" \
    --n_preprocess_cpu_threads=32 \
    --n_iters_G_only=50 \
    --n_iters_joint=1500



###############################################################################
###############################################################################


# Expansion 002: stage inputs, expand the run 002 decomposition on the
# wrongs_only PEFs, then fit coefficients on the train_skip_50k PEFs.
playpen_dir=/playpen/users/m/project_data/m_npeff1/per_example_fishers
fruit_dir=/fruitbasket/users/m/project_data/extract_merge1/m_npeff1/per_example_fishers

# Stage the wrongs_only PEFs and the base decomposition into /playpen.
cp "${fruit_dir}/feather_berts_0.train.50000ex.65536.wrongs_only.h5" "${playpen_dir}"
cp "${fruit_dir}/test_mnpeff_002.h5" "${playpen_dir}"

# Add 32 components; no --n_iters_joint_expansion_only flag in this variant.
CUDA_VISIBLE_DEVICES=0,1,2,3 ./build/mains/run_m_npeff_expansion \
    --pef_filepath="${playpen_dir}/feather_berts_0.train.50000ex.65536.wrongs_only.h5" \
    --decomposition_filepath="${playpen_dir}/test_mnpeff_002.h5" \
    --n_additional_components=32 \
    --learning_rate_G_G_only=1e-4 \
    --learning_rate_G=1e-4 \
    --output_filepath="${fruit_dir}/test_mnpeff_002.expansion_002.h5" \
    --n_preprocess_cpu_threads=32 \
    --n_iters_G_only=200 \
    --n_iters_joint=1000 \
    --use_W_from_decomposition=false


# Stage the held-out PEFs and the expanded decomposition into /playpen.
cp "${fruit_dir}/feather_berts_0.train_skip_50k.50000ex.65536.h5" "${playpen_dir}"
cp "${fruit_dir}/test_mnpeff_002.expansion_002.h5" "${playpen_dir}"



# Coefficient fit for the expanded decomposition, with only GPU 3 visible.
CUDA_VISIBLE_DEVICES=3 ./build/mains/fit_m_npeff_coeffs \
    --pef_filepath="${playpen_dir}/feather_berts_0.train_skip_50k.50000ex.65536.h5" \
    --decomposition_filepath="${playpen_dir}/test_mnpeff_002.expansion_002.h5" \
    --output_filepath="${fruit_dir}/test_mnpeff_002.expansion_002.coeffs_fit001.h5" \
    --n_preprocess_cpu_threads=32 \
    --n_iters=1000 \
    --n_columns_per_chunk=2500000 \
    --n_examples_per_chunk=20000


# Copy the fitted coefficients back into /playpen.
cp "${fruit_dir}/test_mnpeff_002.expansion_002.coeffs_fit001.h5" "${playpen_dir}"


###############################################################################
###############################################################################



# Expansion 004: fit coefficients on the wrongs_only PEFs first, expand from
# that fit (with --use_W_from_decomposition=true), then fit coefficients for
# the expanded result on the train_skip_50k PEFs.
playpen_dir=/playpen/users/m/project_data/m_npeff1/per_example_fishers
fruit_dir=/fruitbasket/users/m/project_data/extract_merge1/m_npeff1/per_example_fishers
wrongs_pefs="${playpen_dir}/feather_berts_0.train.50000ex.65536.wrongs_only.h5"

# Coefficient fit of run 002 on the wrongs_only PEFs (no per-chunk example
# limit is passed here).
CUDA_VISIBLE_DEVICES=3 ./build/mains/fit_m_npeff_coeffs \
    --pef_filepath="${wrongs_pefs}" \
    --decomposition_filepath="${playpen_dir}/test_mnpeff_002.h5" \
    --output_filepath="${fruit_dir}/test_mnpeff_002.wrongs_only.coeffs_fit001.h5" \
    --n_preprocess_cpu_threads=32 \
    --n_iters=1000 \
    --n_columns_per_chunk=2500000


cp "${fruit_dir}/test_mnpeff_002.wrongs_only.coeffs_fit001.h5" "${playpen_dir}"


# Expansion by 32 components, starting from the wrongs_only coefficient fit.
CUDA_VISIBLE_DEVICES=0,1,2,3 ./build/mains/run_m_npeff_expansion \
    --pef_filepath="${wrongs_pefs}" \
    --decomposition_filepath="${playpen_dir}/test_mnpeff_002.wrongs_only.coeffs_fit001.h5" \
    --n_additional_components=32 \
    --learning_rate_G_G_only=1e-4 \
    --learning_rate_G=1e-4 \
    --output_filepath="${fruit_dir}/test_mnpeff_002.expansion_004.h5" \
    --n_preprocess_cpu_threads=32 \
    --n_iters_G_only=100 \
    --n_iters_joint_expansion_only=500 \
    --n_iters_joint=1000 \
    --use_W_from_decomposition=true


# Stage the held-out PEFs and the expanded decomposition into /playpen.
cp "${fruit_dir}/feather_berts_0.train_skip_50k.50000ex.65536.h5" "${playpen_dir}"
cp "${fruit_dir}/test_mnpeff_002.expansion_004.h5" "${playpen_dir}"



# Coefficient fit for expansion 004, with only GPU 3 visible.
CUDA_VISIBLE_DEVICES=3 ./build/mains/fit_m_npeff_coeffs \
    --pef_filepath="${playpen_dir}/feather_berts_0.train_skip_50k.50000ex.65536.h5" \
    --decomposition_filepath="${playpen_dir}/test_mnpeff_002.expansion_004.h5" \
    --output_filepath="${fruit_dir}/test_mnpeff_002.expansion_004.coeffs_fit001.h5" \
    --n_preprocess_cpu_threads=32 \
    --n_iters=1000 \
    --n_columns_per_chunk=2500000 \
    --n_examples_per_chunk=25000


cp "${fruit_dir}/test_mnpeff_002.expansion_004.coeffs_fit001.h5" "${playpen_dir}"


###############################################################################
###############################################################################

# Expansion 005: like expansion 004 but with 64 additional components and
# longer expansion-only / joint phases, followed by a 2000-iteration fit.
playpen_dir=/playpen/users/m/project_data/m_npeff1/per_example_fishers
fruit_dir=/fruitbasket/users/m/project_data/extract_merge1/m_npeff1/per_example_fishers

# Stage the wrongs_only PEFs and the wrongs_only coefficient fit into /playpen.
cp "${fruit_dir}/feather_berts_0.train.50000ex.65536.wrongs_only.h5" "${playpen_dir}"
cp "${fruit_dir}/test_mnpeff_002.wrongs_only.coeffs_fit001.h5" "${playpen_dir}"

CUDA_VISIBLE_DEVICES=0,1,2,3 ./build/mains/run_m_npeff_expansion \
    --pef_filepath="${playpen_dir}/feather_berts_0.train.50000ex.65536.wrongs_only.h5" \
    --decomposition_filepath="${playpen_dir}/test_mnpeff_002.wrongs_only.coeffs_fit001.h5" \
    --n_additional_components=64 \
    --learning_rate_G_G_only=1e-4 \
    --learning_rate_G=1e-4 \
    --output_filepath="${fruit_dir}/test_mnpeff_002.expansion_005.h5" \
    --n_preprocess_cpu_threads=32 \
    --n_iters_G_only=100 \
    --n_iters_joint_expansion_only=1000 \
    --n_iters_joint=5000 \
    --use_W_from_decomposition=true



# Stage the held-out PEFs and the expanded decomposition into /playpen.
cp "${fruit_dir}/feather_berts_0.train_skip_50k.50000ex.65536.h5" "${playpen_dir}"
cp "${fruit_dir}/test_mnpeff_002.expansion_005.h5" "${playpen_dir}"


# Coefficient fit for expansion 005, with only GPU 3 visible.
CUDA_VISIBLE_DEVICES=3 ./build/mains/fit_m_npeff_coeffs \
    --pef_filepath="${playpen_dir}/feather_berts_0.train_skip_50k.50000ex.65536.h5" \
    --decomposition_filepath="${playpen_dir}/test_mnpeff_002.expansion_005.h5" \
    --output_filepath="${fruit_dir}/test_mnpeff_002.expansion_005.coeffs_fit001.h5" \
    --n_preprocess_cpu_threads=32 \
    --n_iters=2000 \
    --n_columns_per_chunk=2500000 \
    --n_examples_per_chunk=25000

cp "${fruit_dir}/test_mnpeff_002.expansion_005.coeffs_fit001.h5" "${playpen_dir}"



###############################################################################
###############################################################################


# Expansion 003: expand the plain run 002 decomposition by 32 components with
# an expansion-only joint phase, then fit coefficients on train_skip_50k.
playpen_dir=/playpen/users/m/project_data/m_npeff1/per_example_fishers
fruit_dir=/fruitbasket/users/m/project_data/extract_merge1/m_npeff1/per_example_fishers

# Stage the wrongs_only PEFs and the base decomposition into /playpen.
cp "${fruit_dir}/feather_berts_0.train.50000ex.65536.wrongs_only.h5" "${playpen_dir}"
cp "${fruit_dir}/test_mnpeff_002.h5" "${playpen_dir}"

CUDA_VISIBLE_DEVICES=0,1,2,3 ./build/mains/run_m_npeff_expansion \
    --pef_filepath="${playpen_dir}/feather_berts_0.train.50000ex.65536.wrongs_only.h5" \
    --decomposition_filepath="${playpen_dir}/test_mnpeff_002.h5" \
    --n_additional_components=32 \
    --learning_rate_G_G_only=1e-4 \
    --learning_rate_G=1e-4 \
    --output_filepath="${fruit_dir}/test_mnpeff_002.expansion_003.h5" \
    --n_preprocess_cpu_threads=32 \
    --n_iters_G_only=100 \
    --n_iters_joint_expansion_only=500 \
    --n_iters_joint=1000 \
    --use_W_from_decomposition=false


# Stage the held-out PEFs and the expanded decomposition into /playpen.
cp "${fruit_dir}/feather_berts_0.train_skip_50k.50000ex.65536.h5" "${playpen_dir}"
cp "${fruit_dir}/test_mnpeff_002.expansion_003.h5" "${playpen_dir}"



# Coefficient fit for expansion 003 (no per-chunk example limit is passed).
CUDA_VISIBLE_DEVICES=3 ./build/mains/fit_m_npeff_coeffs \
    --pef_filepath="${playpen_dir}/feather_berts_0.train_skip_50k.50000ex.65536.h5" \
    --decomposition_filepath="${playpen_dir}/test_mnpeff_002.expansion_003.h5" \
    --output_filepath="${fruit_dir}/test_mnpeff_002.expansion_003.coeffs_fit001.h5" \
    --n_preprocess_cpu_threads=32 \
    --n_iters=1000 \
    --n_columns_per_chunk=2500000


cp "${fruit_dir}/test_mnpeff_002.expansion_003.coeffs_fit001.h5" "${playpen_dir}"


# Try to add expanded_only stage after G_only stage.



###############################################################################
###############################################################################

# CUDA_VISIBLE_DEVICES=0,1,2,3 ./build/mains/run_m_npeff \
#     --pef_filepath="/playpen/users/m/project_data/m_npeff1/per_example_fishers/feather_berts_0.train.50000ex.65536.h5" \
#     --min_nonzero_per_col=8 \
#     --n_components=768 \
#     --learning_rate_G_G_only=1e-4 \
#     --learning_rate_G=1e-2 \
#     --output_filepath=/fruitbasket/users/m/project_data/extract_merge1/m_npeff1/per_example_fishers/test_mnpeff_005.h5 \
#     --n_preprocess_cpu_threads=32 \
#     --n_iters_G_only=50 \
#     --n_iters_joint=1500




# Coefficient fits for decompositions 004, 005 and 006, run one after another
# with only GPU 0 visible. The three invocations differ only in the run id, so
# they are expressed as a loop.
playpen_dir=/playpen/users/m/project_data/m_npeff1/per_example_fishers
fruit_dir=/fruitbasket/users/m/project_data/extract_merge1/m_npeff1/per_example_fishers

for run_id in 004 005 006; do
    CUDA_VISIBLE_DEVICES=0 ./build/mains/fit_m_npeff_coeffs \
        --pef_filepath="${playpen_dir}/feather_berts_0.train_skip_50k.50000ex.65536.h5" \
        --decomposition_filepath="${fruit_dir}/test_mnpeff_${run_id}.h5" \
        --output_filepath="${fruit_dir}/test_mnpeff_${run_id}.coeffs_fit001.h5" \
        --n_preprocess_cpu_threads=32 \
        --n_iters=2000 \
        --n_columns_per_chunk=2500000 \
        --n_examples_per_chunk=20000
done


# cp /fruitbasket/users/m/project_data/extract_merge1/m_npeff1/per_example_fishers/feather_berts_0.train.50000ex.65536.h5 /playpen/users/m/project_data/m_npeff1/per_example_fishers
# Copy the ".131072" PEFs file and the run 007 decomposition from /fruitbasket
# into the /playpen PEFs directory, one file per cp call.
fruit_dir=/fruitbasket/users/m/project_data/extract_merge1/m_npeff1/per_example_fishers
playpen_dir=/playpen/users/m/project_data/m_npeff1/per_example_fishers
for h5_name in feather_berts_0.train.50000ex.131072.h5 test_mnpeff_007.h5; do
    cp "${fruit_dir}/${h5_name}" "${playpen_dir}"
done

# cp /fruitbasket/users/m/project_data/extract_merge1/m_npeff1/per_example_fishers/test_mnpeff_002.h5 /playpen/users/m/project_data/m_npeff1/per_example_fishers
# cp /fruitbasket/users/m/project_data/extract_merge1/m_npeff1/per_example_fishers/test_mnpeff_002.coeffs_fit001.h5 /playpen/users/m/project_data/m_npeff1/per_example_fishers
# cp /fruitbasket/users/m/project_data/extract_merge1/m_npeff1/per_example_fishers/test_mnpeff_004.coeffs_fit001.h5 /playpen/users/m/project_data/m_npeff1/per_example_fishers
# cp /fruitbasket/users/m/project_data/extract_merge1/m_npeff1/per_example_fishers/test_mnpeff_005.coeffs_fit001.h5 /playpen/users/m/project_data/m_npeff1/per_example_fishers
# cp /fruitbasket/users/m/project_data/extract_merge1/m_npeff1/per_example_fishers/test_mnpeff_006.coeffs_fit001.h5 /playpen/users/m/project_data/m_npeff1/per_example_fishers





# TODOs:
# X Write outputs to disk.
# X Main loop and all that.
# X Figure out flags and all that.
# X Read n_examples instead of all.
# ~ Support of non-int32 indices. (make cleaner main file)
# - Store and save the losses that I compute.
# - Compilation flags (like -O3 and whatnot)
# - Add logging to the main file so we know what's happening as it is running.
# - [later] directly save as sparse representation if it is indeed sparse.
# - [later] Save at intermediate checkpoints.




# x See ratios of component top example PEF norms.
# - Perturbation grid searches (of the last method).
# - Perturbation methods latex.
# - Fit coeffs for LRM-NPEFF.
# - Touch ups to LRM_NPEFF factorization.
#     - Back to int32 indices.
#     - Logging.
#     - Some sort of L1/L2 regularization on the Gs.
# - Input feature salience via PEF factoring stuff.

