# Stub test entry (stub_model / stub_llm, fake output path) that uses the
# default_consultancy speech structure and a judge_preferences dataset.
Test - Consultancy:
    model_name: stub_model
    target: debater
    llm_type: stub_llm
    speech_structure: default_consultancy
    training_hyperparameters:
        num_train_epochs: 2
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 8
        optim: paged_adamw_32bit
        # Decimal point is required for YAML 1.1 loaders (e.g. PyYAML) to read
        # exponent notation as a float rather than a string.
        learning_rate: 1.0e-5
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /fake/file/path
    dataset:
        dataset_type: judge_preferences
        # Quoted so the colon-bearing identifier is always loaded as a string.
        full_dataset_file_path: '2024-03-15_16:08:55.960902'
# Stub test entry (stub_model / stub_llm, fake output path) with no explicit
# speech_structure, reading a judge_preferences dataset.
Test:
    model_name: stub_model
    target: debater
    llm_type: stub_llm
    training_hyperparameters:
        num_train_epochs: 2
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 8
        optim: paged_adamw_32bit
        # Decimal point is required for YAML 1.1 loaders (e.g. PyYAML) to read
        # exponent notation as a float rather than a string.
        learning_rate: 2.0e-6
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /fake/file/path
    dataset:
        dataset_type: judge_preferences
        # Quoted so the colon-bearing identifier is always loaded as a string.
        full_dataset_file_path: '2024-03-11_00:19:01.230232'
# Stub test entry (stub_model / stub_llm, fake output path) that uses the
# `quality` dataset type instead of a judge_preferences file.
Test - Iterative:
    model_name: stub_model
    target: debater
    llm_type: stub_llm
    training_hyperparameters:
        num_train_epochs: 2
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 8
        optim: paged_adamw_32bit
        # Decimal point is required for YAML 1.1 loaders (e.g. PyYAML) to read
        # exponent notation as a float rather than a string.
        learning_rate: 2.0e-6
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /fake/file/path
    dataset:
        dataset_type: quality
# llama3 run from the llama-3-mega-merged model on the `quality` train split,
# opening speeches only; no lora_rank / kl_penalty are set in this entry.
# NOTE(review): output_dir is identical to that of `Iterative - Experiment - 0`;
# confirm the two runs are not expected to coexist on disk.
Iterative - Experiment:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 2
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 8
        optim: paged_adamw_32bit
        # Decimal point is required for YAML 1.1 loaders (e.g. PyYAML) to read
        # exponent notation as a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        steps: 1
        supplemental:
            epoch_size: 1024
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-611-bon-test
    dataset:
        dataset_type: quality
        split_type: train
# Mixtral run (llm_type: mistral) from the mixtral-8x7b-unified-merged model,
# four epochs over a judge_preferences dataset.
Mixtral:
    model_name: /vast/spa9663/models/trained_models/mixtral-8x7b-unified-merged
    target: debater
    llm_type: mistral
    training_hyperparameters:
        num_train_epochs: 4
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 8
        optim: paged_adamw_32bit
        # Decimal point is required for YAML 1.1 loaders (e.g. PyYAML) to read
        # exponent notation as a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/mixtral-8x7b-dpo-326
    dataset:
        dataset_type: judge_preferences
        # Quoted so the colon-bearing identifier is always loaded as a string.
        full_dataset_file_path: '2024-03-11_00:19:01.230232'
# Mixtral run (llm_type: mistral) using the default_consultancy speech
# structure, four epochs over a judge_preferences dataset.
Mixtral - Consultant:
    model_name: /vast/spa9663/models/trained_models/mixtral-8x7b-unified-merged
    target: debater
    llm_type: mistral
    speech_structure: default_consultancy
    training_hyperparameters:
        num_train_epochs: 4
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 8
        optim: paged_adamw_32bit
        # Decimal point is required for YAML 1.1 loaders (e.g. PyYAML) to read
        # exponent notation as a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/mixtral-8x7b-dpo-41-consultant
    dataset:
        dataset_type: judge_preferences
        # Quoted so the colon-bearing identifier is always loaded as a string.
        full_dataset_file_path: '2024-03-15_16:08:55.960902'
# llama3 run from llama-3-mega-merged on the `quality` train split, opening
# speeches only, with lora_rank 64 and kl_penalty 0.5.
# NOTE(review): output_dir is identical to that of `Iterative - Experiment`;
# confirm the two runs are not expected to coexist on disk.
Iterative - Experiment - 0:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 2
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 8
        optim: paged_adamw_32bit
        # Decimal point is required for YAML 1.1 loaders (e.g. PyYAML) to read
        # exponent notation as a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        kl_penalty: 0.5
        lora_rank: 64
        steps: 1
        supplemental:
            epoch_size: 1024
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-611-bon-test
    dataset:
        dataset_type: quality
        split_type: train
# llama3 run from llama-3-mega-merged: one epoch, lora_rank 256, kl_penalty 0.1,
# over a judge_preferences dataset.
Iterative - R256 - 1:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 1
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 8
        optim: paged_adamw_32bit
        # Decimal point is required for YAML 1.1 loaders (e.g. PyYAML) to read
        # exponent notation as a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 256
        kl_penalty: 0.1
        steps: 1
        supplemental:
            epoch_size: 1024
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-612-r256-test
    dataset:
        dataset_type: judge_preferences
        # Quoted so the colon-bearing identifier is always loaded as a string.
        full_dataset_file_path: '2024-06-11_15:44:47.755094'
# llama3 run from llama-3-mega-merged: one epoch, lora_rank 1024,
# kl_penalty 0.1, over a judge_preferences dataset.
Iterative - R1024 - 2:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 1
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 8
        optim: paged_adamw_32bit
        # Decimal point is required for YAML 1.1 loaders (e.g. PyYAML) to read
        # exponent notation as a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 1024
        kl_penalty: 0.1
        steps: 1
        supplemental:
            epoch_size: 1024
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-612-r1024-test
    dataset:
        dataset_type: judge_preferences
        # Quoted so the colon-bearing identifier is always loaded as a string.
        full_dataset_file_path: '2024-06-11_15:44:47.755094'
# llama3 run from llama-3-mega-merged: one epoch, lora_rank 1024,
# kl_penalty 0.1, with a learning rate of 1e-3.
Iterative - HiLR - 3:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 1
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 8
        optim: paged_adamw_32bit
        # Decimal point is required for YAML 1.1 loaders (e.g. PyYAML) to read
        # exponent notation as a float rather than a string.
        learning_rate: 1.0e-3
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 1024
        kl_penalty: 0.1
        steps: 1
        supplemental:
            epoch_size: 1024
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-612-HiLR-test
    dataset:
        dataset_type: judge_preferences
        # Quoted so the colon-bearing identifier is always loaded as a string.
        full_dataset_file_path: '2024-06-11_15:44:47.755094'
# llama3 run from llama-3-mega-merged: one epoch, lora_rank 1024,
# kl_penalty 0.1, learning rate 1e-3.
# NOTE(review): no target_module is set here despite the "AllProj" name
# (`Iterative - AllProj - 5` sets target_module: all), and both entries share
# the same output_dir — confirm whether either is intentional.
Iterative - AllProj - 4:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 1
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 8
        optim: paged_adamw_32bit
        # Decimal point is required for YAML 1.1 loaders (e.g. PyYAML) to read
        # exponent notation as a float rather than a string.
        learning_rate: 1.0e-3
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 1024
        kl_penalty: 0.1
        steps: 1
        supplemental:
            epoch_size: 1024
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-612-AllProj-test
    dataset:
        dataset_type: judge_preferences
        # Quoted so the colon-bearing identifier is always loaded as a string.
        full_dataset_file_path: '2024-06-11_15:44:47.755094'
# llama3 run from llama-3-mega-merged: two epochs, lora_rank 256,
# kl_penalty 0.1, learning rate 1e-3, target_module: all.
# NOTE(review): output_dir is identical to that of `Iterative - AllProj - 4`;
# confirm the two runs are not expected to coexist on disk.
Iterative - AllProj - 5:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 2
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 8
        optim: paged_adamw_32bit
        # Decimal point is required for YAML 1.1 loaders (e.g. PyYAML) to read
        # exponent notation as a float rather than a string.
        learning_rate: 1.0e-3
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 256
        kl_penalty: 0.1
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-612-AllProj-test
    dataset:
        dataset_type: judge_preferences
        # Quoted so the colon-bearing identifier is always loaded as a string.
        full_dataset_file_path: '2024-06-11_15:44:47.755094'
# llama3 run from llama-3-mega-merged: three epochs, lora_rank 1024,
# kl_penalty 0.1, learning rate 1e-3, target_module: attention.
Iterative - AttentionProj - 6:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 3
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 8
        optim: paged_adamw_32bit
        # Decimal point is required for YAML 1.1 loaders (e.g. PyYAML) to read
        # exponent notation as a float rather than a string.
        learning_rate: 1.0e-3
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 1024
        kl_penalty: 0.1
        steps: 1
        target_module: attention
        supplemental:
            epoch_size: 1024
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-612-AttentionProj-test
    dataset:
        dataset_type: judge_preferences
        # Quoted so the colon-bearing identifier is always loaded as a string.
        full_dataset_file_path: '2024-06-11_15:44:47.755094'
# llama3 run from llama-3-mega-merged: three epochs, lora_rank 256,
# kl_penalty 0.1, learning rate 1e-4.
# NOTE(review): target_module is `attention` although the entry name says
# "AllProj" (`Iterative - AllProjLoLR - 8` uses `all`) — confirm intent.
Iterative - AllProjLoLR - 7:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 3
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 8
        optim: paged_adamw_32bit
        # Decimal point is required for YAML 1.1 loaders (e.g. PyYAML) to read
        # exponent notation as a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 256
        kl_penalty: 0.1
        steps: 1
        target_module: attention
        supplemental:
            epoch_size: 1024
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-612-AllProjLoLR-test
    dataset:
        dataset_type: judge_preferences
        # Quoted so the colon-bearing identifier is always loaded as a string.
        full_dataset_file_path: '2024-06-11_15:44:47.755094'
# llama3 run from llama-3-mega-merged: three epochs, lora_rank 128,
# kl_penalty 0.1, learning rate 1e-4, target_module: all.
Iterative - AllProjLoLR - 8:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 3
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 8
        optim: paged_adamw_32bit
        # Decimal point is required for YAML 1.1 loaders (e.g. PyYAML) to read
        # exponent notation as a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.1
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-613-AllProjLoLR-test
    dataset:
        dataset_type: judge_preferences
        # Quoted so the colon-bearing identifier is always loaded as a string.
        full_dataset_file_path: '2024-06-13_00:31:55.325622'
# llama3 run from llama-3-mega-merged: three epochs, lora_rank 256,
# kl_penalty 0.1, learning rate 1e-4, target_module: attention.
Iterative - AttentionProjLoLR - 9:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 3
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 8
        optim: paged_adamw_32bit
        # Decimal point is required for YAML 1.1 loaders (e.g. PyYAML) to read
        # exponent notation as a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 256
        kl_penalty: 0.1
        steps: 1
        target_module: attention
        supplemental:
            epoch_size: 1024
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-613-AttentionProjLoLR-test
    dataset:
        dataset_type: judge_preferences
        # Quoted so the colon-bearing identifier is always loaded as a string.
        full_dataset_file_path: '2024-06-13_00:31:55.325622'
# llama3 run from llama-3-mega-merged: three epochs, lora_rank 64,
# kl_penalty 0.5, learning rate 1e-4, target_module: attention.
# Several later entries in this file start from checkpoint-512 under this
# entry's output_dir.
Iterative - Replica - 10:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 3
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 8
        optim: paged_adamw_32bit
        # Decimal point is required for YAML 1.1 loaders (e.g. PyYAML) to read
        # exponent notation as a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 64
        kl_penalty: 0.5
        steps: 1
        target_module: attention
        supplemental:
            epoch_size: 1024
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-613-replica-test
    dataset:
        dataset_type: judge_preferences
        # Quoted so the colon-bearing identifier is always loaded as a string.
        full_dataset_file_path: '2024-06-13_00:31:55.325622'
# llama3 run from llama-3-mega-merged: three epochs, lora_rank 1024,
# kl_penalty 0.5, learning rate 1e-4, target_module: attention.
Iterative - Replica1024 - 11:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 3
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 8
        optim: paged_adamw_32bit
        # Decimal point is required for YAML 1.1 loaders (e.g. PyYAML) to read
        # exponent notation as a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 1024
        kl_penalty: 0.5
        steps: 1
        target_module: attention
        supplemental:
            epoch_size: 1024
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-614-replica1024-test
    dataset:
        dataset_type: judge_preferences
        # Quoted so the colon-bearing identifier is always loaded as a string.
        full_dataset_file_path: '2024-06-13_00:31:55.325622'
# llama3 run from llama-3-mega-merged: three epochs, lora_rank 128,
# kl_penalty 0.5, learning rate 1e-4, target_module: all.
Iterative - ReplicaAll - 12:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 3
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 8
        optim: paged_adamw_32bit
        # Decimal point is required for YAML 1.1 loaders (e.g. PyYAML) to read
        # exponent notation as a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-614-replicaAll-test
    dataset:
        dataset_type: judge_preferences
        # Quoted so the colon-bearing identifier is always loaded as a string.
        full_dataset_file_path: '2024-06-13_00:31:55.325622'
# llama3 run from llama-3-mega-merged: three epochs, lora_rank 64,
# kl_penalty 0.5, target_module: attention, with supplemental.alpha set to 0.0.
Iterative - NoSFT - 13:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 3
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 8
        optim: paged_adamw_32bit
        # Decimal point is required for YAML 1.1 loaders (e.g. PyYAML) to read
        # exponent notation as a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 64
        kl_penalty: 0.5
        steps: 1
        target_module: attention
        supplemental:
            epoch_size: 1024
            # NOTE(review): presumably zeroes an SFT term, given the entry
            # name — confirm against the trainer that reads `alpha`.
            alpha: 0.0
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-614-NoSFT-test
    dataset:
        dataset_type: judge_preferences
        # Quoted so the colon-bearing identifier is always loaded as a string.
        full_dataset_file_path: '2024-06-13_00:31:55.325622'
# Same hyperparameters and dataset as `Iterative - Replica1024 - 11`
# (lora_rank 1024, kl_penalty 0.5, attention), written to a different
# output_dir.
Iterative - Replica1024Again - 14:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 3
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 8
        optim: paged_adamw_32bit
        # Decimal point is required for YAML 1.1 loaders (e.g. PyYAML) to read
        # exponent notation as a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 1024
        kl_penalty: 0.5
        steps: 1
        target_module: attention
        supplemental:
            epoch_size: 1024
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-615-replica1024-test
    dataset:
        dataset_type: judge_preferences
        # Quoted so the colon-bearing identifier is always loaded as a string.
        full_dataset_file_path: '2024-06-13_00:31:55.325622'
# llama3 run from llama-3-mega-merged with gradient_accumulation_steps raised
# to 64; lora_rank 64, kl_penalty 0.5, target_module: attention.
Iterative - BigBatch - 15:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 3
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 64
        optim: paged_adamw_32bit
        # Decimal point is required for YAML 1.1 loaders (e.g. PyYAML) to read
        # exponent notation as a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 64
        kl_penalty: 0.5
        steps: 1
        target_module: attention
        supplemental:
            epoch_size: 1024
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-615-BigBatch-test
    dataset:
        dataset_type: judge_preferences
        # Quoted so the colon-bearing identifier is always loaded as a string.
        full_dataset_file_path: '2024-06-13_00:31:55.325622'
# llama3 run that starts from checkpoint-512 under the output_dir of
# `Iterative - Replica - 10`; gradient_accumulation_steps 64, lora_rank 64,
# kl_penalty 0.5, target_module: attention.
# NOTE(review): the "SecondRoung" spelling in output_dir is left unchanged
# because existing checkpoints may already live at that path.
Iterative - SecondRound - 16:
    model_name: /vast/spa9663/models/trained_models/llama-3-DPO-613-replica-test/checkpoint-512
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 3
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 64
        optim: paged_adamw_32bit
        # Decimal point is required for YAML 1.1 loaders (e.g. PyYAML) to read
        # exponent notation as a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 64
        kl_penalty: 0.5
        steps: 1
        target_module: attention
        supplemental:
            epoch_size: 1024
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-615-SecondRoung-test
    dataset:
        dataset_type: judge_preferences
        # Quoted so the colon-bearing identifier is always loaded as a string.
        full_dataset_file_path: '2024-06-14_23:59:32.271859'
# llama3 run that starts from checkpoint-512 under the output_dir of
# `Iterative - Replica - 10`; lora_rank 8, kl_penalty 0.5, attention.
# NOTE(review): gradient_accumulation_steps is still 64 despite the
# "SmallBatch" name (`Iterative - SecondRoundSmallBatchReal - 19` uses 8).
Iterative - SecondRoundSmallBatch - 17:
    model_name: /vast/spa9663/models/trained_models/llama-3-DPO-613-replica-test/checkpoint-512
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 3
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 64
        optim: paged_adamw_32bit
        # Decimal point is required for YAML 1.1 loaders (e.g. PyYAML) to read
        # exponent notation as a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 8
        kl_penalty: 0.5
        steps: 1
        target_module: attention
        supplemental:
            epoch_size: 1024
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-616-SecondRoundSmall-test
    dataset:
        dataset_type: judge_preferences
        # Quoted so the colon-bearing identifier is always loaded as a string.
        full_dataset_file_path: '2024-06-14_23:59:32.271859'
# llama3 run that starts from checkpoint-512 under the output_dir of
# `Iterative - Replica - 10`; gradient_accumulation_steps 64, learning rate
# 1e-3, lora_rank 8, kl_penalty 0.5, target_module: attention.
Iterative - BigBatchHiLR - 18:
    model_name: /vast/spa9663/models/trained_models/llama-3-DPO-613-replica-test/checkpoint-512
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 3
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 64
        optim: paged_adamw_32bit
        # Decimal point is required for YAML 1.1 loaders (e.g. PyYAML) to read
        # exponent notation as a float rather than a string.
        learning_rate: 1.0e-3
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 8
        kl_penalty: 0.5
        steps: 1
        target_module: attention
        supplemental:
            epoch_size: 1024
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-616-BigBatchHiLR-test
    dataset:
        dataset_type: judge_preferences
        # Quoted so the colon-bearing identifier is always loaded as a string.
        full_dataset_file_path: '2024-06-13_00:31:55.325622'
# llama3 run that starts from checkpoint-512 under the output_dir of
# `Iterative - Replica - 10`; gradient_accumulation_steps 8, lora_rank 64,
# kl_penalty 0.5, target_module: attention.
Iterative - SecondRoundSmallBatchReal - 19:
    model_name: /vast/spa9663/models/trained_models/llama-3-DPO-613-replica-test/checkpoint-512
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 3
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 8
        optim: paged_adamw_32bit
        # Decimal point is required for YAML 1.1 loaders (e.g. PyYAML) to read
        # exponent notation as a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 64
        kl_penalty: 0.5
        steps: 1
        target_module: attention
        supplemental:
            epoch_size: 1024
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-616-SecondRoundSmallReal-test
    dataset:
        dataset_type: judge_preferences
        # Quoted so the colon-bearing identifier is always loaded as a string.
        full_dataset_file_path: '2024-06-14_23:59:32.271859'
# llama3 run that starts from checkpoint-512 under the output_dir of
# `Iterative - Replica - 10`; gradient_accumulation_steps 64, learning rate
# 1e-3, lora_rank 128, kl_penalty 0.5, target_module: all.
Iterative - BigBatchHiLRReal - 20:
    model_name: /vast/spa9663/models/trained_models/llama-3-DPO-613-replica-test/checkpoint-512
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 3
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 64
        optim: paged_adamw_32bit
        # Decimal point is required for YAML 1.1 loaders (e.g. PyYAML) to read
        # exponent notation as a float rather than a string.
        learning_rate: 1.0e-3
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-616-BigBatchHiLRReal-test
    dataset:
        dataset_type: judge_preferences
        # Quoted so the colon-bearing identifier is always loaded as a string.
        full_dataset_file_path: '2024-06-13_00:31:55.325622'
# Same hyperparameters and dataset as `Iterative - BigBatchHiLRReal - 20`, but
# starting from llama-3-mega-merged instead of the replica checkpoint.
Iterative - BigBatchHiLRFixed - 21:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 3
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 64
        optim: paged_adamw_32bit
        # Decimal point is required for YAML 1.1 loaders (e.g. PyYAML) to read
        # exponent notation as a float rather than a string.
        learning_rate: 1.0e-3
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-617-BigBatchHiLRFixed-test
    dataset:
        dataset_type: judge_preferences
        # Quoted so the colon-bearing identifier is always loaded as a string.
        full_dataset_file_path: '2024-06-13_00:31:55.325622'
# llama3 run that starts from checkpoint-512 under the output_dir of
# `Iterative - Replica - 10`; gradient_accumulation_steps 8, lora_rank 128,
# kl_penalty 0.5, target_module: all.
# NOTE(review): the "Secound" spelling in the entry name is left unchanged
# because callers may select this configuration by its exact key.
Iterative - SecoundRoundFull - 22:
    model_name: /vast/spa9663/models/trained_models/llama-3-DPO-613-replica-test/checkpoint-512
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 3
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 8
        optim: paged_adamw_32bit
        # Decimal point is required for YAML 1.1 loaders (e.g. PyYAML) to read
        # exponent notation as a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-617-SecondRoundFull-test
    dataset:
        dataset_type: judge_preferences
        # Quoted so the colon-bearing identifier is always loaded as a string.
        full_dataset_file_path: '2024-06-14_23:59:32.271859'
# llama3 run from llama-3-mega-merged on the 2024-06-18 judge_preferences
# dataset: three epochs, lora_rank 128, kl_penalty 0.5, target_module: all.
Iterative - NewDataFull - 23:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 3
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 8
        optim: paged_adamw_32bit
        # Decimal point is required for YAML 1.1 loaders (e.g. PyYAML) to read
        # exponent notation as a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-619-NewDataFull-test
    dataset:
        dataset_type: judge_preferences
        # Quoted so the colon-bearing identifier is always loaded as a string.
        full_dataset_file_path: '2024-06-18_11:33:33.673488'
# llama3 run from llama-3-mega-merged on the 2024-06-18 judge_preferences
# dataset: three epochs, lora_rank 64, kl_penalty 0.5, target_module: attention.
Iterative - NewDataReplica - 24:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 3
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 8
        optim: paged_adamw_32bit
        # Decimal point is required for YAML 1.1 loaders (e.g. PyYAML) to read
        # exponent notation as a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 64
        kl_penalty: 0.5
        steps: 1
        target_module: attention
        supplemental:
            epoch_size: 1024
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-619-NewDataReplica-test
    dataset:
        dataset_type: judge_preferences
        # Quoted so the colon-bearing identifier is always loaded as a string.
        full_dataset_file_path: '2024-06-18_11:33:33.673488'
# llama3 run from llama-3-mega-merged: six epochs at learning rate 5e-5,
# lora_rank 64, kl_penalty 0.5, target_module: attention.
Iterative - LoLRLong - 25:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 6
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 8
        optim: paged_adamw_32bit
        # Decimal point is required for YAML 1.1 loaders (e.g. PyYAML) to read
        # exponent notation as a float rather than a string.
        learning_rate: 5.0e-5
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 64
        kl_penalty: 0.5
        steps: 1
        target_module: attention
        supplemental:
            epoch_size: 1024
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-619-LoLRLong-test
    dataset:
        dataset_type: judge_preferences
        # Quoted so the colon-bearing identifier is always loaded as a string.
        full_dataset_file_path: '2024-06-18_11:33:33.673488'
# llama3 run from llama-3-mega-merged: six epochs at learning rate 1e-4,
# lora_rank 64, kl_penalty 0.5, target_module: attention.
Iterative - NormalLRLong - 26:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 6
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 8
        optim: paged_adamw_32bit
        # Decimal point is required for YAML 1.1 loaders (e.g. PyYAML) to read
        # exponent notation as a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 64
        kl_penalty: 0.5
        steps: 1
        target_module: attention
        supplemental:
            epoch_size: 1024
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-619-NormalLong-test
    dataset:
        dataset_type: judge_preferences
        # Quoted so the colon-bearing identifier is always loaded as a string.
        full_dataset_file_path: '2024-06-18_11:33:33.673488'
# llama3 run from llama-3-mega-merged: six epochs, lora_rank 128,
# kl_penalty 0.5, target_module: all; the dataset is given as a full
# transcript path rather than a bare timestamp.
Iterative - CleanedAndFixed - 27:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 6
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 8
        optim: paged_adamw_32bit
        # Decimal point is required for YAML 1.1 loaders (e.g. PyYAML) to read
        # exponent notation as a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-623-CleanedAndFixed-test
    dataset:
        dataset_type: judge_preferences
        # Quoted so the colon-bearing path is always loaded as a string.
        full_dataset_file_path: '/vast/spa9663/outputs/transcripts/2024-06-22_18:02:18.766775'
# Same hyperparameters and dataset as `Iterative - CleanedAndFixed - 27`,
# written to a different output_dir.
Iterative - SharpPreference - 28:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 6
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 8
        optim: paged_adamw_32bit
        # Decimal point is required for YAML 1.1 loaders (e.g. PyYAML) to read
        # exponent notation as a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-623-SharpPreference-test
    dataset:
        dataset_type: judge_preferences
        # Quoted so the colon-bearing path is always loaded as a string.
        full_dataset_file_path: '/vast/spa9663/outputs/transcripts/2024-06-22_18:02:18.766775'
# llama3 run from llama-3-mega-merged with gradient_accumulation_steps 32 and
# supplemental.reward_type: logit; six epochs, lora_rank 128, kl_penalty 0.5,
# target_module: all.
Iterative - SharpPreferenceBigBatch - 29:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 6
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Decimal point is required for YAML 1.1 loaders (e.g. PyYAML) to read
        # exponent notation as a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            reward_type: logit
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-624-SharpPreferenceBigBatch-test
    dataset:
        dataset_type: judge_preferences
        # Quoted so the colon-bearing path is always loaded as a string.
        full_dataset_file_path: '/vast/spa9663/outputs/transcripts/2024-06-22_18:02:18.766775'
# llama3 run that starts from checkpoint-256 under the output_dir of
# `Iterative - SharpPreference - 28`; gradient_accumulation_steps 32,
# supplemental.save_steps 16 and reward_type: logit, on the 2024-06-24
# transcript dataset.
Iterative - SPR2 - 30:
    model_name: /vast/spa9663/models/trained_models/llama-3-DPO-623-SharpPreference-test/checkpoint-256
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 6
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Decimal point is required for YAML 1.1 loaders (e.g. PyYAML) to read
        # exponent notation as a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: logit
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-625-SPR2-test
    dataset:
        dataset_type: judge_preferences
        # Quoted so the colon-bearing path is always loaded as a string.
        full_dataset_file_path: '/vast/spa9663/outputs/transcripts/2024-06-24_11:47:00.207725'
# llama3 run from llama-3-mega-merged with supplemental.reward_type: prob;
# six epochs, gradient_accumulation_steps 8, lora_rank 128, kl_penalty 0.5,
# target_module: all.
Iterative - ProbReward - 31:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 6
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 8
        optim: paged_adamw_32bit
        # Decimal point is required for YAML 1.1 loaders (e.g. PyYAML) to read
        # exponent notation as a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            reward_type: prob
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-626-ProbReward-test
    dataset:
        dataset_type: judge_preferences
        # Quoted so the colon-bearing path is always loaded as a string.
        full_dataset_file_path: '/vast/spa9663/outputs/transcripts/2024-06-22_18:02:18.766775'
# Entry 32 (SigmoidReward): starts from llama-3-mega-merged and uses
# `sigmoid` rewards on the 2024-06-22_18:02:18 judge-preference transcripts.
Iterative - SigmoidReward - 32:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: True
    training_hyperparameters:
        num_train_epochs: 6
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 8
        optim: paged_adamw_32bit
        # Same value as 10e-5, spelled with a decimal point so YAML 1.1
        # loaders (e.g. PyYAML) resolve it to a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 64
            reward_type: sigmoid
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-628-SigmoidReward-test
    dataset:
        dataset_type: judge_preferences
        full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-06-22_18:02:18.766775
# Entry 33 (SPR3): starts from checkpoint-160 of the llama-3-DPO-625-SPR2-test
# output (entry 30's output_dir) and uses `logit` rewards on the
# 2024-06-27_18:00:48 judge-preference transcripts.
Iterative - SPR3 - 33:
    model_name: /vast/spa9663/models/trained_models/llama-3-DPO-625-SPR2-test/checkpoint-160
    target: debater
    llm_type: llama3
    opening_speeches_only: True
    training_hyperparameters:
        num_train_epochs: 6
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Same value as 10e-5, spelled with a decimal point so YAML 1.1
        # loaders (e.g. PyYAML) resolve it to a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: logit
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-625-SPR3-test
    dataset:
        dataset_type: judge_preferences
        full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-06-27_18:00:48.062786
# Entry 34 (LogProbMultiplier): starts from llama-3-mega-merged and uses
# `log_prob` rewards on the 2024-06-22_18:02:18 judge-preference transcripts.
Iterative - LogProbMultiplier - 34:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: True
    training_hyperparameters:
        num_train_epochs: 6
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Same value as 10e-5, spelled with a decimal point so YAML 1.1
        # loaders (e.g. PyYAML) resolve it to a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: log_prob
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-630-LogProbMultiplier-test
    dataset:
        dataset_type: judge_preferences
        full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-06-22_18:02:18.766775
# Entry 35 (ProbMultiplier): starts from llama-3-mega-merged and uses `prob`
# rewards on the 2024-06-22_18:02:18 judge-preference transcripts.
Iterative - ProbMultiplier - 35:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: True
    training_hyperparameters:
        num_train_epochs: 6
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Same value as 10e-5, spelled with a decimal point so YAML 1.1
        # loaders (e.g. PyYAML) resolve it to a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: prob
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-630-ProbMultiplier-test
    dataset:
        dataset_type: judge_preferences
        full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-06-22_18:02:18.766775
# Entry 36 (ProbLowTemp): starts from llama-3-mega-merged and uses `prob`
# rewards on the 2024-07-01_03:30:33 judge-preference transcripts.
# Entries 56 and 58 load checkpoints from this entry's output_dir.
Iterative - ProbLowTemp - 36:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: True
    training_hyperparameters:
        num_train_epochs: 6
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Same value as 10e-5, spelled with a decimal point so YAML 1.1
        # loaders (e.g. PyYAML) resolve it to a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: prob
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-701-ProbLowTemp-test
    dataset:
        dataset_type: judge_preferences
        full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-07-01_03:30:33.701924
# Entry 37 (LogProbLowTemp): starts from llama-3-mega-merged and uses
# `log_prob` rewards on the 2024-07-01_03:30:33 judge-preference transcripts.
Iterative - LogProbLowTemp - 37:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: True
    training_hyperparameters:
        num_train_epochs: 6
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Same value as 10e-5, spelled with a decimal point so YAML 1.1
        # loaders (e.g. PyYAML) resolve it to a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: log_prob
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-701-LogProbLowTemp-test
    dataset:
        dataset_type: judge_preferences
        full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-07-01_03:30:33.701924
# Entry 38 (LogitLowTemp): starts from llama-3-mega-merged and uses `logit`
# rewards on the 2024-07-01_03:30:33 judge-preference transcripts.
Iterative - LogitLowTemp - 38:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: True
    training_hyperparameters:
        num_train_epochs: 6
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Same value as 10e-5, spelled with a decimal point so YAML 1.1
        # loaders (e.g. PyYAML) resolve it to a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: logit
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-701-LogitLowTemp-test
    dataset:
        dataset_type: judge_preferences
        full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-07-01_03:30:33.701924
# Entry 39 (TraditionalDPO): starts from llama-3-mega-merged and uses `binary`
# rewards with alpha 0.0 on the 2024-07-01_03:30:33 judge-preference
# transcripts.
Iterative - TraditionalDPO - 39:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: True
    training_hyperparameters:
        num_train_epochs: 6
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Same value as 10e-5, spelled with a decimal point so YAML 1.1
        # loaders (e.g. PyYAML) resolve it to a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            alpha: 0.0
            reward_type: binary
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-702-TraditionalDPO-test
    dataset:
        dataset_type: judge_preferences
        full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-07-01_03:30:33.701924
# Entry 40 (ProbLRSchedule): starts from llama-3-mega-merged with a `linear`
# lr_scheduler_type and lr_multiplier 0.5, using `prob` rewards on the
# 2024-07-01_03:30:33 judge-preference transcripts.
Iterative - ProbLRSchedule - 40:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: True
    training_hyperparameters:
        num_train_epochs: 3
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Same value as 10e-5, spelled with a decimal point so YAML 1.1
        # loaders (e.g. PyYAML) resolve it to a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: linear
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: prob
            lr_multiplier: 0.5
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-704-ProbLRSchedule-test
    dataset:
        dataset_type: judge_preferences
        full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-07-01_03:30:33.701924
# Entry 41 (PassageProb): starts from llama-3-passage and uses `prob` rewards
# on the 2024-07-05_20:03:46 judge-preference transcripts.
# NOTE(review): lr_multiplier is set while lr_scheduler_type is `constant`;
# confirm the pairing is intended.
Iterative - PassageProb - 41:
    model_name: /vast/spa9663/models/trained_models/llama-3-passage
    target: debater
    llm_type: llama3
    opening_speeches_only: True
    training_hyperparameters:
        num_train_epochs: 3
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Same value as 10e-5, spelled with a decimal point so YAML 1.1
        # loaders (e.g. PyYAML) resolve it to a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: prob
            lr_multiplier: 0.5
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-706-PassageProb-test
    dataset:
        dataset_type: judge_preferences
        full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-07-05_20:03:46.570770
# Entry 42 (BonIPO): starts from llama-3-mega-merged with loss_type `bon-ipo`,
# a `linear` lr_scheduler_type and `prob` rewards on the 2024-07-01_03:30:33
# judge-preference transcripts.
Iterative - BonIPO - 42:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: True
    training_hyperparameters:
        num_train_epochs: 3
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Same value as 10e-5, spelled with a decimal point so YAML 1.1
        # loaders (e.g. PyYAML) resolve it to a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: linear
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: prob
            lr_multiplier: 0.5
            loss_type: bon-ipo
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-704-BonIPO-test
    dataset:
        dataset_type: judge_preferences
        full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-07-01_03:30:33.701924
# Entry 43 (HiSFT): starts from llama-3-mega-merged with loss_type `bon`,
# alpha 0.05, a `linear` lr_scheduler_type and `prob` rewards on the
# 2024-07-01_03:30:33 judge-preference transcripts.
Iterative - HiSFT - 43:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: True
    training_hyperparameters:
        num_train_epochs: 3
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Same value as 10e-5, spelled with a decimal point so YAML 1.1
        # loaders (e.g. PyYAML) resolve it to a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: linear
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: prob
            lr_multiplier: 0.5
            loss_type: bon
            alpha: 0.05
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-706-HiSFT-test
    dataset:
        dataset_type: judge_preferences
        full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-07-01_03:30:33.701924
# Entry 44 (MultiTurn): starts from llama-3-mega-merged with loss_type `bon`,
# alpha 0.05 and `prob` rewards on the 2024-07-08_02:57:32 judge-preference
# transcripts.
Iterative - MultiTurn - 44:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: True
    training_hyperparameters:
        num_train_epochs: 3
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Same value as 10e-5, spelled with a decimal point so YAML 1.1
        # loaders (e.g. PyYAML) resolve it to a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: prob
            loss_type: bon
            alpha: 0.05
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-709-MultiTurn-test
    dataset:
        dataset_type: judge_preferences
        full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-07-08_02:57:32.513524
# Entry 45 (MultiTurnClean): starts from llama-3-mega-merged with loss_type
# `bon` (no alpha set) and `prob` rewards on the 2024-07-08_02:57:32
# judge-preference transcripts.
Iterative - MultiTurnClean - 45:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: True
    training_hyperparameters:
        num_train_epochs: 3
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Same value as 10e-5, spelled with a decimal point so YAML 1.1
        # loaders (e.g. PyYAML) resolve it to a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: prob
            loss_type: bon
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-709-MultiTurnClean-test
    dataset:
        dataset_type: judge_preferences
        full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-07-08_02:57:32.513524
# Entry 46 (MultiTurnHighMultiplier): starts from llama-3-mega-merged with
# loss_type `bon`, multiplier 7 and `prob` rewards on the 2024-07-10_17:00:00
# judge-preference transcripts.
Iterative - MultiTurnHighMultiplier - 46:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: True
    training_hyperparameters:
        num_train_epochs: 3
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Same value as 10e-5, spelled with a decimal point so YAML 1.1
        # loaders (e.g. PyYAML) resolve it to a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: prob
            loss_type: bon
            multiplier: 7
    logging_and_saving_config:
        logging_steps: 1
        # The directory says "MultiTurnImproved", not the entry's name; keep
        # it as-is because later entries load checkpoints from this path.
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-710-MultiTurnImproved-test
    dataset:
        dataset_type: judge_preferences
        full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-07-10_17:00:00.00000
# Entry 47 (MultiTurnRound2): starts from checkpoint-144 of the
# llama-3-DPO-710-MultiTurnImproved-test output, with loss_type `bon`,
# multiplier 7 and `prob` rewards on the 2024-07-13_04:01:14 transcripts.
Iterative - MultiTurnRound2 - 47:
    model_name: /vast/spa9663/models/trained_models/llama-3-DPO-710-MultiTurnImproved-test/checkpoint-144
    target: debater
    llm_type: llama3
    opening_speeches_only: True
    training_hyperparameters:
        num_train_epochs: 3
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Same value as 10e-5, spelled with a decimal point so YAML 1.1
        # loaders (e.g. PyYAML) resolve it to a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: prob
            loss_type: bon
            multiplier: 7
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-710-MultiTurnRound2-test
    dataset:
        dataset_type: judge_preferences
        full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-07-13_04:01:14.617531
# Entry 48 (MTRound2Again): starts from checkpoint-144 of the
# llama-3-DPO-710-MultiTurnImproved-test output, with loss_type `bon`,
# multiplier 12 and `prob` rewards on the 2024-07-16_15:00:00 transcripts.
Iterative - MTRound2Again - 48:
    model_name: /vast/spa9663/models/trained_models/llama-3-DPO-710-MultiTurnImproved-test/checkpoint-144
    target: debater
    llm_type: llama3
    opening_speeches_only: True
    training_hyperparameters:
        num_train_epochs: 3
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Same value as 10e-5, spelled with a decimal point so YAML 1.1
        # loaders (e.g. PyYAML) resolve it to a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: prob
            loss_type: bon
            multiplier: 12
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-716-MTR2T2-test
    dataset:
        dataset_type: judge_preferences
        full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-07-16_15:00:00.00000
# Entry 49 (MTR2OldReference): starts from checkpoint-144 of the
# llama-3-DPO-710-MultiTurnImproved-test output, with loss_type `bon`,
# multiplier 12 and `prob` rewards on the 2024-07-16_13:28:48 transcripts.
# NOTE(review): no reference_model_name is set here, unlike entry 52, which
# reuses this model and dataset; confirm that is intended.
Iterative - MTR2OldReference - 49:
    model_name: /vast/spa9663/models/trained_models/llama-3-DPO-710-MultiTurnImproved-test/checkpoint-144
    target: debater
    llm_type: llama3
    opening_speeches_only: True
    training_hyperparameters:
        num_train_epochs: 3
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Same value as 10e-5, spelled with a decimal point so YAML 1.1
        # loaders (e.g. PyYAML) resolve it to a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: prob
            loss_type: bon
            multiplier: 12
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-717-MTR2OldReference-test
    dataset:
        dataset_type: judge_preferences
        full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-07-16_13:28:48.562547
# Entry 50 (FirstConsultancy): model_name is checkpoint-144 of the
# llama-3-DPO-710-MultiTurnImproved-test output, while reference_model_name
# is llama-3-mega-consultant; loss_type `bon`, multiplier 6, `prob` rewards
# on the 2024-07-17_03:22:31 transcripts.
# NOTE(review): entry 51 ("ConsultancyFixed") reuses this dataset with
# llama-3-mega-consultant as model_name; this pairing may be a mistake.
Iterative - FirstConsultancy - 50:
    model_name: /vast/spa9663/models/trained_models/llama-3-DPO-710-MultiTurnImproved-test/checkpoint-144
    reference_model_name: /vast/spa9663/models/trained_models/llama-3-mega-consultant
    target: debater
    llm_type: llama3
    opening_speeches_only: True
    training_hyperparameters:
        num_train_epochs: 3
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Same value as 10e-5, spelled with a decimal point so YAML 1.1
        # loaders (e.g. PyYAML) resolve it to a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: prob
            loss_type: bon
            multiplier: 6
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-717-FirstConsultancy-test
    dataset:
        dataset_type: judge_preferences
        full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-07-17_03:22:31.370623
# Entry 51 (ConsultancyFixed): starts from llama-3-mega-consultant with a
# `linear` lr_scheduler_type, loss_type `bon`, multiplier 6 and `prob`
# rewards on the 2024-07-17_03:22:31 transcripts (same dataset as entry 50).
Iterative - ConsultancyFixed - 51:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-consultant
    target: debater
    llm_type: llama3
    opening_speeches_only: True
    training_hyperparameters:
        num_train_epochs: 3
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Same value as 10e-5, spelled with a decimal point so YAML 1.1
        # loaders (e.g. PyYAML) resolve it to a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: linear
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: prob
            loss_type: bon
            multiplier: 6
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-718-ConsultancyFixed-test
    dataset:
        dataset_type: judge_preferences
        full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-07-17_03:22:31.370623
# Entry 52 (MTR2OldReferenceDoubleCheck): same model_name and dataset as
# entry 49, but with reference_model_name set to llama-3-mega-merged and
# force_sft_as_reference enabled; loss_type `bon`, multiplier 12.
Iterative - MTR2OldReferenceDoubleCheck - 52:
    model_name: /vast/spa9663/models/trained_models/llama-3-DPO-710-MultiTurnImproved-test/checkpoint-144
    reference_model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: True
    training_hyperparameters:
        num_train_epochs: 3
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Same value as 10e-5, spelled with a decimal point so YAML 1.1
        # loaders (e.g. PyYAML) resolve it to a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: prob
            loss_type: bon
            multiplier: 12
            force_sft_as_reference: True
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-718-MTR2OldReferenceDoubleCheck-test
    dataset:
        dataset_type: judge_preferences
        full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-07-16_13:28:48.562547
# Entry 53 (ConsultancyTrue): starts from llama-3-mega-consultant with
# loss_type `bon`, multiplier 10, force_sft_as_reference disabled and `prob`
# rewards on the 2024-07-18_23:42:01 transcripts.
# Entry 55 loads checkpoint-96 from this entry's output_dir.
Iterative - ConsultancyTrue - 53:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-consultant
    target: debater
    llm_type: llama3
    opening_speeches_only: True
    training_hyperparameters:
        num_train_epochs: 3
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Same value as 10e-5, spelled with a decimal point so YAML 1.1
        # loaders (e.g. PyYAML) resolve it to a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: prob
            loss_type: bon
            multiplier: 10
            force_sft_as_reference: False
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-719-ConsultancyTrue-test
    dataset:
        dataset_type: judge_preferences
        full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-07-18_23:42:01.582607
# Entry 54 (MTR2Real): starts from checkpoint-96 of the
# llama-3-DPO-710-MultiTurnImproved-test output with llama-3-mega-merged as
# reference_model_name; loss_type `bon`, multiplier 7,
# force_sft_as_reference enabled, 2024-07-19_16:41:44 transcripts.
Iterative - MTR2Real - 54:
    model_name: /vast/spa9663/models/trained_models/llama-3-DPO-710-MultiTurnImproved-test/checkpoint-96
    reference_model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: True
    training_hyperparameters:
        num_train_epochs: 2
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Same value as 10e-5, spelled with a decimal point so YAML 1.1
        # loaders (e.g. PyYAML) resolve it to a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: prob
            loss_type: bon
            multiplier: 7
            force_sft_as_reference: True
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-720-MTR2Real
    dataset:
        dataset_type: judge_preferences
        full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-07-19_16:41:44.376974
# Entry 55 (MTR2ConsultancyReal): starts from checkpoint-96 of the
# llama-3-DPO-719-ConsultancyTrue-test output with llama-3-mega-consultant as
# reference_model_name; loss_type `bon`, multiplier 10,
# force_sft_as_reference enabled, 2024-07-19_23:55:33 transcripts.
Iterative - MTR2ConsultancyReal - 55:
    model_name: /vast/spa9663/models/trained_models/llama-3-DPO-719-ConsultancyTrue-test/checkpoint-96
    reference_model_name: /vast/spa9663/models/trained_models/llama-3-mega-consultant
    target: debater
    llm_type: llama3
    opening_speeches_only: True
    training_hyperparameters:
        num_train_epochs: 2
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Same value as 10e-5, spelled with a decimal point so YAML 1.1
        # loaders (e.g. PyYAML) resolve it to a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: prob
            loss_type: bon
            multiplier: 10
            force_sft_as_reference: True
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-720-MTR2ConsultancyReal
    dataset:
        dataset_type: judge_preferences
        full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-07-19_23:55:33.046278
# Entry 56 (1TR2Real): starts from checkpoint-96 of the
# llama-3-DPO-701-ProbLowTemp-test output with llama-3-mega-merged as
# reference_model_name; loss_type `bon`, multiplier 7,
# force_sft_as_reference enabled, 2024-07-20_03:22:25 transcripts.
# Entry 58 carries the same "1TR2Real" label but starts from checkpoint-64.
Iterative - 1TR2Real - 56:
    model_name: /vast/spa9663/models/trained_models/llama-3-DPO-701-ProbLowTemp-test/checkpoint-96
    reference_model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: True
    training_hyperparameters:
        num_train_epochs: 2
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Same value as 10e-5, spelled with a decimal point so YAML 1.1
        # loaders (e.g. PyYAML) resolve it to a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: prob
            loss_type: bon
            multiplier: 7
            force_sft_as_reference: True
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-720-1TR2Real
    dataset:
        dataset_type: judge_preferences
        full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-07-20_03:22:25.791278
# Entry 57 (STR1ConsultancyReal): starts from llama-3-mega-consultant with
# loss_type `bon`, multiplier 10 and `prob` rewards on the
# 2024-07-21_14:40:47 transcripts. Entry 59 uses this output_dir as its
# model_name.
Iterative - STR1ConsultancyReal - 57:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-consultant
    target: debater
    llm_type: llama3
    opening_speeches_only: True
    training_hyperparameters:
        num_train_epochs: 2
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Same value as 10e-5, spelled with a decimal point so YAML 1.1
        # loaders (e.g. PyYAML) resolve it to a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: prob
            loss_type: bon
            multiplier: 10
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-722-STR1ConsultancyReal
    dataset:
        dataset_type: judge_preferences
        full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-07-21_14:40:47.736895
# Entry 58 (1TR2Real): same label as entry 56, but starts from checkpoint-64
# of the llama-3-DPO-701-ProbLowTemp-test output and writes to a
# "1TR2Actual" output_dir; llama-3-mega-merged is reference_model_name,
# loss_type `bon`, multiplier 7, 2024-07-21_15:56:19 transcripts.
Iterative - 1TR2Real - 58:
    model_name: /vast/spa9663/models/trained_models/llama-3-DPO-701-ProbLowTemp-test/checkpoint-64
    reference_model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: True
    training_hyperparameters:
        num_train_epochs: 2
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Same value as 10e-5, spelled with a decimal point so YAML 1.1
        # loaders (e.g. PyYAML) resolve it to a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: prob
            loss_type: bon
            multiplier: 7
            force_sft_as_reference: True
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-722-1TR2Actual
    dataset:
        dataset_type: judge_preferences
        full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-07-21_15:56:19.232053
# Entry 59 (STR2ConsultancyReal): model_name is the
# llama-3-DPO-722-STR1ConsultancyReal output directory itself (no checkpoint
# suffix) with llama-3-mega-consultant as reference_model_name; loss_type
# `bon`, multiplier 10, force_sft_as_reference enabled,
# 2024-07-22_22:22:13 transcripts.
Iterative - STR2ConsultancyReal - 59:
    model_name: /vast/spa9663/models/trained_models/llama-3-DPO-722-STR1ConsultancyReal
    reference_model_name: /vast/spa9663/models/trained_models/llama-3-mega-consultant
    target: debater
    llm_type: llama3
    opening_speeches_only: True
    training_hyperparameters:
        num_train_epochs: 2
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Same value as 10e-5, spelled with a decimal point so YAML 1.1
        # loaders (e.g. PyYAML) resolve it to a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: prob
            loss_type: bon
            multiplier: 10
            force_sft_as_reference: True
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-723-STR2ConsultancyReal
    dataset:
        dataset_type: judge_preferences
        full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-07-22_22:22:13.291225
# Entry 60 (ConsultancyAlt): starts from llama-3-consultant-alt with loss_type
# `bon`, multiplier 10, force_sft_as_reference disabled and `prob` rewards on
# the 2024-07-25_00:44:24 transcripts.
# The output_dir is spelled "ConsultantAlt", unlike the entry's name.
Iterative - ConsultancyAlt - 60:
    model_name: /vast/spa9663/models/trained_models/llama-3-consultant-alt
    target: debater
    llm_type: llama3
    opening_speeches_only: True
    training_hyperparameters:
        num_train_epochs: 3
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Same value as 10e-5, spelled with a decimal point so YAML 1.1
        # loaders (e.g. PyYAML) resolve it to a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: prob
            loss_type: bon
            multiplier: 10
            force_sft_as_reference: False
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-725-ConsultantAlt-test
    dataset:
        dataset_type: judge_preferences
        full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-07-25_00:44:24.863819
# Entry 61 (Accumulated): starts from llama-3-mega-merged for a single epoch
# with loss_type `bon` and multiplier 7. `dataset` is a list here: the
# 2024-07-01_03:30:33 and 2024-07-19_16:41:44 judge-preference transcripts.
Iterative - Accumulated - 61:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: True
    training_hyperparameters:
        num_train_epochs: 1
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Same value as 10e-5, spelled with a decimal point so YAML 1.1
        # loaders (e.g. PyYAML) resolve it to a float rather than a string.
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: prob
            loss_type: bon
            multiplier: 7
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-727-Accumulated-test
    dataset:
        - dataset_type: judge_preferences
          full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-07-01_03:30:33.701924
        - dataset_type: judge_preferences
          full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-07-19_16:41:44.376974
# Experiment 62: same model, hyperparameters and two transcript sets as
# "Iterative - Accumulated - 61"; only the output_dir differs.
Iterative - AccumulatedAgain - 62:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 1
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Needs the decimal point to resolve as a float under YAML 1.1 (PyYAML).
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: prob
            loss_type: bon
            multiplier: 7
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-729-AccumulatedAgain-test
    dataset:
        - dataset_type: judge_preferences
          full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-07-01_03:30:33.701924
        - dataset_type: judge_preferences
          full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-07-19_16:41:44.376974
# Experiment 63: model_name points at checkpoint-96 of an earlier run, with
# llama-3-mega-merged named separately as the reference model and
# continue_training switched on.
Iterative - SavedOptimizer - 63:
    model_name: /vast/spa9663/models/trained_models/llama-3-DPO-710-MultiTurnImproved-test/checkpoint-96
    reference_model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 1
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Needs the decimal point to resolve as a float under YAML 1.1 (PyYAML).
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: prob
            loss_type: bon
            multiplier: 7
            continue_training: true
            force_sft_as_reference: true
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-730-SavedOptimizer-test
    dataset:
        dataset_type: judge_preferences
        full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-07-19_16:41:44.376974
# Experiment 64: model_name points at checkpoint-96 of an earlier
# consultancy run, with llama-3-mega-consultant named as the reference model
# and continue_training switched on.
Iterative - ConsultancySavedOptimizer - 64:
    model_name: /vast/spa9663/models/trained_models/llama-3-DPO-719-ConsultancyTrue-test/checkpoint-96
    reference_model_name: /vast/spa9663/models/trained_models/llama-3-mega-consultant
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 1
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Needs the decimal point to resolve as a float under YAML 1.1 (PyYAML).
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: prob
            loss_type: bon
            multiplier: 10
            continue_training: true
            force_sft_as_reference: true
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-730-ConsultancySavedOptimizer
    dataset:
        dataset_type: judge_preferences
        full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-07-19_23:55:33.046278
# Experiment 65: LoRA (rank 128) run on llama-3-mega-consultant over two
# judge_preferences transcript sets listed together under `dataset`.
Iterative - ConsultancyAccumulated - 65:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-consultant
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 1
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Needs the decimal point to resolve as a float under YAML 1.1 (PyYAML).
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: prob
            loss_type: bon
            multiplier: 10
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-731-ConsultancyAccumulated
    dataset:
        - dataset_type: judge_preferences
          full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-07-19_23:55:33.046278
        - dataset_type: judge_preferences
          full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-07-18_23:42:01.582607
# Experiment 66: LoRA (rank 128) run on llama-3-mega-merged with
# reward_type "prob" on the 2024-08-02 judge_preferences transcript set.
Iterative - FullTrainDebate - 66:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 1
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Needs the decimal point to resolve as a float under YAML 1.1 (PyYAML).
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: prob
            loss_type: bon
            multiplier: 7
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-805-FullTrainDebate-test
    dataset:
        dataset_type: judge_preferences
        full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-08-02_23:27:58.793623
# Experiment 67: same model and transcript set as
# "Iterative - FullTrainDebate - 66", but with reward_type "binary".
Iterative - FullTrainDebateBinary - 67:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 1
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Needs the decimal point to resolve as a float under YAML 1.1 (PyYAML).
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: binary
            loss_type: bon
            multiplier: 7
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-805-FullTrainDebateBinary-test
    dataset:
        dataset_type: judge_preferences
        full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-08-02_23:27:58.793623
# Experiment 68: same model and transcript set as
# "Iterative - FullTrainDebate - 66", but with reward_type "log_prob" and a
# multiplier of 2.5.
Iterative - FullTrainDebateLogProb - 68:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 1
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Needs the decimal point to resolve as a float under YAML 1.1 (PyYAML).
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: log_prob
            multiplier: 2.5
            loss_type: bon
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-805-FullTrainDebateLogProb-test
    dataset:
        dataset_type: judge_preferences
        full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-08-02_23:27:58.793623
# Experiment 69: same model and transcript set as
# "Iterative - FullTrainDebate - 66", but with reward_type "logit".
Iterative - FullTrainDebateLogit - 69:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 1
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Needs the decimal point to resolve as a float under YAML 1.1 (PyYAML).
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: logit
            loss_type: bon
            # NOTE(review): no `multiplier` is set here, unlike the sibling
            # prob/binary/log_prob runs; confirm the consumer's default is
            # what this run is meant to use.
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-805-FullTrainDebateLogit-test
    dataset:
        dataset_type: judge_preferences
        full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-08-02_23:27:58.793623
# Experiment 70: same model and transcript set as
# "Iterative - FullTrainDebate - 66", but with gradient_accumulation_steps
# raised from 32 to 64.
Iterative - FullTrainDebateBigBatch - 70:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 1
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 64
        optim: paged_adamw_32bit
        # Needs the decimal point to resolve as a float under YAML 1.1 (PyYAML).
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: prob
            loss_type: bon
            multiplier: 7
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-806-FullTrainDebateBigBatch-test
    dataset:
        dataset_type: judge_preferences
        full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-08-02_23:27:58.793623
# Experiment 71: three-epoch LoRA (rank 128) run on llama-3-mega-consultant
# over a single judge_preferences transcript set.
Iterative - ConsultancyFullTrain - 71:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-consultant
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 3
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Needs the decimal point to resolve as a float under YAML 1.1 (PyYAML).
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: prob
            loss_type: bon
            multiplier: 10
            force_sft_as_reference: false
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-808-ConsultantFullTrain
    dataset:
        dataset_type: judge_preferences
        full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-08-06_10:22:01.958734
# Experiment 72: LoRA (rank 128) run on llama-3-mega-merged over two
# judge_preferences transcript sets (2024-08-02 and 2024-08-06).
Iterative - FullTrainDebateAccumulated - 72:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 1
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Needs the decimal point to resolve as a float under YAML 1.1 (PyYAML).
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: prob
            loss_type: bon
            multiplier: 7
    logging_and_saving_config:
        logging_steps: 1
        # NOTE(review): this is the exact output_dir used by
        # "Iterative - FullTrainDebate - 66", so the two runs write to the
        # same directory. Looks like a copy-paste leftover; confirm and give
        # this run its own directory before re-running either one.
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-805-FullTrainDebate-test
    dataset:
        - dataset_type: judge_preferences
          full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-08-02_23:27:58.793623
        - dataset_type: judge_preferences
          full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-08-06_00:57:26.964112
# Experiment 73: gradient_accumulation_steps 64 run on llama-3-mega-merged
# over two judge_preferences transcript sets (2024-08-02 and 2024-08-06).
Iterative - FullTrainDebateBigBatchR2 - 73:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 1
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 64
        optim: paged_adamw_32bit
        # Needs the decimal point to resolve as a float under YAML 1.1 (PyYAML).
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: prob
            loss_type: bon
            multiplier: 7
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-810-FullTrainDebateBigBatch-test
    dataset:
        - dataset_type: judge_preferences
          full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-08-02_23:27:58.793623
        - dataset_type: judge_preferences
          full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-08-06_00:57:26.964112
# Experiment 74: three-epoch LoRA (rank 128) run on llama-3-mega-consultant
# over two judge_preferences transcript sets (2024-08-06 and 2024-08-09).
Iterative - ConsultancyFullTrainR2 - 74:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-consultant
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 3
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Needs the decimal point to resolve as a float under YAML 1.1 (PyYAML).
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: prob
            loss_type: bon
            multiplier: 10
            force_sft_as_reference: false
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-811-ConsultantFullTrainR2
    dataset:
        - dataset_type: judge_preferences
          full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-08-06_10:22:01.958734
        - dataset_type: judge_preferences
          full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-08-09_19:53:24.862884
# Experiment 75: same model and two transcript sets as
# "Iterative - ConsultancyFullTrainR2 - 74", but with
# gradient_accumulation_steps raised from 32 to 64.
Iterative - ConsultancyFullTrainR2BigBatch - 75:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-consultant
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 3
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 64
        optim: paged_adamw_32bit
        # Needs the decimal point to resolve as a float under YAML 1.1 (PyYAML).
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: prob
            loss_type: bon
            multiplier: 10
            force_sft_as_reference: false
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-811-ConsultantFullTrainR2BigBatch
    dataset:
        - dataset_type: judge_preferences
          full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-08-06_10:22:01.958734
        - dataset_type: judge_preferences
          full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-08-09_19:53:24.862884
# Experiment 76: LoRA (rank 128) run on the
# llama-3-mega-merged-no-judge-speeches-31.07 model over a single
# judge_preferences transcript directory (2025-08-04).
Iterative - FullTrainDebateR2Only - 76:
    model_name: /home/ubuntu/mars-arnesen-gh/{anonymised}/models/trained_models/llama-3-mega-merged-no-judge-speeches-31.07
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 1
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Needs the decimal point to resolve as a float under YAML 1.1 (PyYAML).
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: prob
            loss_type: bon
            multiplier: 7
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /home/ubuntu/mars-arnesen-gh/models/trained_models/llama-3-DPO-0731-FullTrainDebateR2Only-test
    dataset:
        dataset_type: judge_preferences
        full_dataset_file_path: /home/ubuntu/mars-arnesen-gh/{anonymised}/outputs/DataGenerationLlama3MultiRoundHalfBranchedFullTrainDPO-0731/outputs/transcripts/2025-08-04
# Round one of the nano-judge series: LoRA (rank 128) run on the
# llama-3-mega-merged-no-judge-speeches-31.07 model over a single
# judge_preferences transcript directory (2025-09-16).
iterative-llama-debaters-nano-judge-dpo-round-one:
    model_name: /home/ubuntu/mars-arnesen-gh/{anonymised}/models/trained_models/llama-3-mega-merged-no-judge-speeches-31.07
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 1
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Needs the decimal point to resolve as a float under YAML 1.1 (PyYAML).
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: prob
            loss_type: bon
            multiplier: 7
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /home/ubuntu/mars-arnesen-gh/models/trained_models/llama-3-dpo-round-1-nano-judge-0917
    dataset:
        dataset_type: judge_preferences
        full_dataset_file_path: /home/ubuntu/mars-arnesen-gh/{anonymised}/outputs/data-generation-llama-sfted-gpt-41-nano-judge-sfted/outputs/transcripts/2025-09-16
# Round two of the nano-judge series: runs on the round-1 model directory
# over the round-one and round-two judge_preferences transcript directories.
iterative-llama-debaters-nano-judge-dpo-round-two:
    # NOTE(review): "iterative-llama-debaters-nano-judge-dpo-round-one" writes
    # to /home/ubuntu/mars-arnesen-gh/models/... (no extra path segment after
    # mars-arnesen-gh), whereas this path has one; confirm both resolve to
    # the same directory.
    model_name: /home/ubuntu/mars-arnesen-gh/{anonymised}/models/trained_models/llama-3-dpo-round-1-nano-judge-0917
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 1
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Needs the decimal point to resolve as a float under YAML 1.1 (PyYAML).
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: prob
            loss_type: bon
            multiplier: 7
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /home/ubuntu/mars-arnesen-gh/models/trained_models/llama-3-dpo-round-2-nano-judge-0917
    dataset:
        - dataset_type: judge_preferences
          full_dataset_file_path: /home/ubuntu/mars-arnesen-gh/{anonymised}/outputs/data-generation-llama-sfted-gpt-41-nano-judge-sfted/outputs/transcripts/2025-09-16
        - dataset_type: judge_preferences
          full_dataset_file_path: /home/ubuntu/mars-arnesen-gh/{anonymised}/outputs/data-generation-llama-sfted-gpt-41-nano-judge-sfted-round-two/outputs/transcripts/2025-09-17
# LoRA (rank 128) run on the llama-3-mega-merged-no-judge-speeches-31.07
# model over the "-significant" judge_preferences transcript directory.
# NOTE(review): the number 76 is also used by
# "Iterative - FullTrainDebateR2Only - 76". The keys are still distinct, so
# nothing is shadowed, but confirm whether this run should carry its own
# number.
Iterative - FullTrainRoundOneSignificant - 76:
    model_name: /home/ubuntu/mars-arnesen-gh/{anonymised}/models/trained_models/llama-3-mega-merged-no-judge-speeches-31.07
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 1
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Needs the decimal point to resolve as a float under YAML 1.1 (PyYAML).
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: prob
            loss_type: bon
            multiplier: 7
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /home/ubuntu/mars-arnesen-gh/models/trained_models/llama-3-DPO-0808-FullTrainRoundOne-significant
    dataset:
        dataset_type: judge_preferences
        full_dataset_file_path: /home/ubuntu/mars-arnesen-gh/{anonymised}/outputs/DataGenerationLlama3MultiRoundHalfBranchedFullTrainDPO-0731-significant/outputs/transcripts/2025-08-04
# Experiment 77: runs on the llama-3-DPO-0731-FullTrainDebateR2Only-test
# model directory over the round-one (2025-08-04) and RoundTwo (2025-08-07)
# judge_preferences transcript directories, at a learning rate of 5e-5.
Iterative - FullTrainDebateRoundTwo - 77:
    # NOTE(review): "Iterative - FullTrainDebateR2Only - 76" writes this
    # model under /home/ubuntu/mars-arnesen-gh/models/..., while this path is
    # rooted at /lambda/nfs/mars-arnesen-gh/ with an extra segment; confirm
    # both resolve to the same directory.
    model_name: /lambda/nfs/mars-arnesen-gh/{anonymised}/models/trained_models/llama-3-DPO-0731-FullTrainDebateR2Only-test
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 1
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Needs the decimal point to resolve as a float under YAML 1.1 (PyYAML).
        learning_rate: 5.0e-5
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: prob
            loss_type: bon
            multiplier: 7
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /home/ubuntu/mars-arnesen-gh/models/trained_models/llama-3-DPO-811-FullTrainDebateRoundTwo-full-trained
    dataset:
        - dataset_type: judge_preferences
          full_dataset_file_path: /home/ubuntu/mars-arnesen-gh/{anonymised}/outputs/DataGenerationLlama3MultiRoundHalfBranchedFullTrainDPO-0731/outputs/transcripts/2025-08-04
        - dataset_type: judge_preferences
          full_dataset_file_path: /home/ubuntu/mars-arnesen-gh/{anonymised}/outputs/DataGenerationLlama3MultiRoundHalfBranchedFullTrainDPO-0731-RoundTwo/outputs/transcripts/2025-08-07
# Experiment 78: LoRA (rank 128) run on llama-3-mega-merged over the
# 2024-09-02 judge_preferences transcript set.
Iterative - FullTrainDebateSingleTurnOneIter - 78:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 1
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Needs the decimal point to resolve as a float under YAML 1.1 (PyYAML).
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: prob
            loss_type: bon
            multiplier: 7
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-902-OneIterOneTurnDebate
    dataset:
        dataset_type: judge_preferences
        full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-09-02_01:08:10.171512
# Experiment 79: one-epoch LoRA (rank 128) run on llama-3-mega-consultant
# over the 2024-09-04 judge_preferences transcript set. Shares its name stem
# with "Iterative - ConsultancyFullTrain - 71"; only the trailing number keeps
# the two keys distinct.
Iterative - ConsultancyFullTrain - 79:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-consultant
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 1
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Needs the decimal point to resolve as a float under YAML 1.1 (PyYAML).
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: prob
            loss_type: bon
            multiplier: 10
            force_sft_as_reference: false
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-903-ConsultantSingleTrainFull
    dataset:
        dataset_type: judge_preferences
        full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-09-04_05:53:06.602017
# Experiment 80: LoRA (rank 128) run on llama-3-mega-merged over two
# judge_preferences transcript sets (2024-09-02 and 2024-09-03), at a
# learning rate of 5e-5.
Iterative - FullTrainDebateSingleTurnTwoIter - 80:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 1
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Needs the decimal point to resolve as a float under YAML 1.1 (PyYAML).
        learning_rate: 5.0e-5
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: prob
            loss_type: bon
            multiplier: 7
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-904-OneTurnDebate
    dataset:
        - dataset_type: judge_preferences
          full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-09-02_01:08:10.171512
        - dataset_type: judge_preferences
          full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-09-03_16:30:52.595352
# Experiment 81: same model and two transcript sets as
# "Iterative - FullTrainDebateSingleTurnTwoIter - 80", but with the learning
# rate at 1e-4 instead of 5e-5.
Iterative - FullTrainDebateSingleTurnTwoIterHighLR - 81:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 1
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Needs the decimal point to resolve as a float under YAML 1.1 (PyYAML).
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: prob
            loss_type: bon
            multiplier: 7
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-904-OneTurnDebateHighLR
    dataset:
        - dataset_type: judge_preferences
          full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-09-02_01:08:10.171512
        - dataset_type: judge_preferences
          full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-09-03_16:30:52.595352
# Experiment 82: same model, hyperparameters and transcript set as
# "Iterative - FullTrainDebate - 66"; only the output_dir differs.
Iterative - FullTrainDebateRedux - 82:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-merged
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 1
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Needs the decimal point to resolve as a float under YAML 1.1 (PyYAML).
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: prob
            loss_type: bon
            multiplier: 7
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-904-FullTrainDebateRedux-test
    dataset:
        dataset_type: judge_preferences
        full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-08-02_23:27:58.793623
# Experiment 83: one-epoch LoRA (rank 128) run on llama-3-mega-consultant
# over two judge_preferences transcript sets (2024-09-04 and 2024-09-05).
Iterative - ConsultancyFullTrainOneTurnIterTwo - 83:
    model_name: /vast/spa9663/models/trained_models/llama-3-mega-consultant
    target: debater
    llm_type: llama3
    opening_speeches_only: true
    training_hyperparameters:
        num_train_epochs: 1
        per_device_train_batch_size: 1
        gradient_accumulation_steps: 32
        optim: paged_adamw_32bit
        # Needs the decimal point to resolve as a float under YAML 1.1 (PyYAML).
        learning_rate: 1.0e-4
        max_grad_norm: 0.3
        warmup_ratio: 0.03
        lr_scheduler_type: constant
        peft_type: lora
        lora_rank: 128
        kl_penalty: 0.5
        steps: 1
        target_module: all
        supplemental:
            epoch_size: 1024
            save_steps: 16
            reward_type: prob
            loss_type: bon
            multiplier: 10
            force_sft_as_reference: false
    logging_and_saving_config:
        logging_steps: 1
        output_dir: /vast/spa9663/models/trained_models/llama-3-DPO-905-ConsultantSingleTrainFullFinal
    dataset:
        - dataset_type: judge_preferences
          full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-09-04_05:53:06.602017
        - dataset_type: judge_preferences
          full_dataset_file_path: /vast/spa9663/outputs/transcripts/2024-09-05_07:04:33.478265
