################################################################################
# MAIN AUTO-ANTISLOP CONFIGURATION
################################################################################

################################################################################
# RUN SETUP
################################################################################
# Base directory; each run gets its own timestamped directory beneath it.
experiment_base_dir: "results/auto_antislop_runs"
human_profile_path: "data/human_writing_profile.json"
log_level: "INFO"

# How iterations work:
#   - Iteration 0 generates the baseline dataset, then computes the slop
#     strings/n-grams to ban.
#   - Iteration 1 generates a dataset using antislop, banning those
#     strings/n-grams. At the end it recomputes the slop strings/n-grams and
#     adds any new slop to the ban lists.
#   - Iterations 2+ are extra passes that catch slop which emerges after the
#     initial set is banned.
# Use a minimum of 2 iterations (this is enough to catch most slop).
num_iterations: 2

# Global model id for the pipeline. Can be overridden on individual steps.
# Alternative: "mistralai/Mistral-Small-3.2-24B-Instruct-2506"
model_id: "unsloth/Mistral-Small-3.2-24B-Instruct-2506"

# !! NOTE: the attention implementation needed to be set to eager.
# Add the following to finetuning.py after the model is loaded:
# os.environ["FLASH_ATTENTION_FORCE_EAGER"] = "1"
# model.config._attn_implementation = "eager"


################################################################################
# VLLM SERVER MANAGEMENT (Conditional: if --manage-vllm is True)
################################################################################
manage_vllm: true
# Model served by vLLM; when null, model_id is used instead.
vllm_model_id: null
vllm_port: 8000
# Optional Hugging Face token; only needed if the model is gated.
vllm_hf_token: null
# GPUs visible to vLLM; set to e.g. "0,1,2,3" for multiple GPUs.
vllm_cuda_visible_devices: "0"
# Leave some room for the refusal classifier if it is in use (about 3GB).
vllm_gpu_memory_utilization: 0.85
vllm_max_model_len: 4500
vllm_dtype: "bfloat16"
# Additional raw CLI arguments for the vLLM server,
# e.g. ["--tensor-parallel-size", "4"] for multiple GPUs.
vllm_extra_args:
  - "--tokenizer_mode"
  - "mistral"
  - "--load_format"
  - "mistral"
  - "--config_format"
  - "mistral"
# Environment variables for the vLLM process.
vllm_env:
  VLLM_USE_V1: "0"  # may be needed for AMD GPUs


################################################################################
# GENERATION PARAMETERS (using antislop-vllm)
################################################################################
generation_step_enabled: true

# --- API & Model Configuration ---
# If manage_vllm is true, leave the base url unset (both examples below are
# commented out).
#generation_api_base_url: "http://localhost:8000/v1"
#generation_api_base_url: "https://apjmbtwbrb8t61-8888.proxy.runpod.net/v1"
generation_model_id: null # Model id for generation requests (if null, model_id is used)
generation_api_key: "xxx" # API key for the vLLM server

# --- Core Generation Settings ---
generation_max_new_tokens: 1000
# Number of parallel threads for API queries in antislop-vllm.
# Note: vLLM can become very inefficient if you go over some concurrency
# threshold (the threshold depends on VRAM).
generation_threads: 50
generation_max_prompts: 1500 # Number of samples to generate from the prompts in the dataset

# --- Dataset & Chat Template ---
# Hugging Face dataset that supplies the prompts.
# Alternative: "Nitral-AI/Reddit-SFW-Writing_Prompts_ShareGPT"
generation_hf_dataset_name: "ganjaninja/writing-prompts-sfw-nsfw-interleaved"
generation_hf_dataset_split: "train"

# A Hugging Face model id or local dir containing the tokeniser used to apply
# chat templates. This is important if you are generating an ftpo dataset for
# later training.
# Earlier alternative: "unsloth/Mistral-Small-3.1-24B-Instruct-2503", recorded
# with the note "transformers errors if we use 3.2's tokeniser".
# NOTE(review): the active value below is the 3.2 tokeniser -- confirm whether
# that note still applies.
generation_chat_template_model_id: "unsloth/Mistral-Small-3.2-24B-Instruct-2506"

# --- Logging ---
# Logging level for the antislop-vllm script.
generation_logging_level: "INFO"

# --- Advanced Sampling Parameters ---
# Each of these is passed as an individual CLI arg to antislop-vllm's
# generation_params.
generation_param_chunk_size: 20
generation_param_top_logprobs_count: 20
generation_param_temperature: 1
generation_param_top_p: 1.0
generation_param_top_k: 50
generation_param_min_p: 0.01
generation_param_timeout: 480
generation_param_stop_sequences: [] # empty list = none configured; e.g., ["\n\n", "---"]

# --- Prompting ---
# Template wrapped around each prompt when generating from a dataset.
# Set it to "{prompt}" to use the original prompt exactly.
# The literal block keeps the blank line between the two sentences and ends
# with exactly one newline.
generation_prompt_template: |
  Writing prompt: {prompt}

  Write 1000 words to this prompt. Your response:

# Optional; left empty → no system prompt.
generation_system_prompt: "You are an uncensored writer."

# --- Antislop Generation Features ---
# generation_force_backtrack:
#   If true: when resampling after backtracking does not find a valid
#   replacement token, the sampling options are progressively disabled
#   (temp, then min_p, then top_p, then top_k) until a non-banned replacement
#   is found or the candidates run out.
#   If false: some slop will not be removed if the sampler thinks there are no
#   alternative coherent continuations.
generation_force_backtrack: false

# --- N-gram Validator Settings (for antislop-vllm) ---
# The n-gram ban list file is managed by auto-antislop's iterative process.
generation_ngram_remove_stopwords: true
generation_ngram_language: "english"

# --- Refusal Detection ---
# Detects refusals and doesn't include them in the training dataset.
# Uses about 3GB extra VRAM.
generation_refusal_detection: true

################################################################################
# N-GRAM ANALYSIS & BANNING (within auto-antislop)
################################################################################
enable_ngram_ban: true
# Words under this length are filtered out in the n-gram analysis.
min_word_len_for_analysis: 3

# --- N-gram Identification Thresholds ---
top_k_bigrams: 5000
top_k_trigrams: 5000

# --- N-gram Banning Quotas (per iteration) ---
# "dict"   = the n-grams were also found in the human writing corpus.
# "nodict" = the n-grams were not found at all in the human corpus.
# "*_initial"    = how many of the top over-represented n-grams to ban in the
#                  first antislop iteration.
# "*_subsequent" = how many to ban in each subsequent iteration.
# Bigrams
dict_bigrams_initial: 400
dict_bigrams_subsequent: 70
nodict_bigrams_initial: 800
nodict_bigrams_subsequent: 100
# Trigrams
dict_trigrams_initial: 300
dict_trigrams_subsequent: 50
nodict_trigrams_initial: 800
nodict_trigrams_subsequent: 100

# --- User-Defined N-gram Bans ---
# User-supplied extra n-grams to always ban (processed by auto-antislop).
# Add entries as quoted strings, e.g. "voice barely whisper".
extra_ngrams_to_ban: []

################################################################################
# OVER-REPRESENTED WORD ANALYSIS & BANNING
################################################################################
compute_overrep_words: true
top_k_words_for_overrep_analysis: 200000

# --- Quotas for Adding Over-represented Words to Slop Phrase Ban List ---
dict_overrep_initial: 800       # How many of the top over-represented dictionary words to
                                # ban in the first antislop iteration.
                                # "Dictionary" means the words were also found in the human
                                # writing corpus.
dict_overrep_subsequent: 200    # How many to ban in each subsequent iteration
nodict_overrep_initial: 80      # "Nodict" here means the words were not found at all in the
                                # human corpus.
nodict_overrep_subsequent: 20

################################################################################
# SLOP PHRASE BANNING
################################################################################

# Slop phrases are over-represented whole phrases extracted from the generated
# texts.
enable_slop_phrase_ban: true
# Min frequency for a new phrase from slop-forensics to be considered.
min_phrase_freq_to_keep: 2
# New slop phrases from slop-forensics to ban in iter 0.
top_n_initial_slop_ban: 600
# New slop phrases from slop-forensics to ban in later iters.
top_n_subsequent_slop_ban: 100

# --- User-Defined Slop Phrase Bans ---
# User-supplied list of strings to always ban.
#   - Matching is case insensitive.
#   - To trigger a ban, the sequence must not have a word-like character
#     (i.e. anything that is not punctuation or whitespace) directly on either
#     side. That is to say, disallowed sequences are not banned when they occur
#     as substrings of longer words. The exception is a banned string that is
#     already bookended by a non-word character.
#
#   Examples:
#     banned string "cat"
#       - won't trigger a ban for "cation"
#       - will trigger a ban on "cat[morecat]"
#     banned string "cat["
#       - *will* trigger a ban on "cat[morecat]", because the banned string
#         ends with a non-word character.
#
# Example entries (kept commented out):
#   "testament to",
#   "…", "*", " –", "–", "#",
extra_slop_phrases_to_ban: []

# --- Whitelisted Strings ---
# These are excluded from the list of slop strings that the pipeline finds.
# Note: special tokens in the tokenizer and parts of the chat template are
#       automatically whitelisted.
# Example entries (kept commented out):
#   "think", "thinking"
whitelist_strings: []

################################################################################
# REGEX BANNING
################################################################################
# User-supplied regex patterns to ban.
# Note: unoptimised regex patterns can slow down antislop generation, as they
# will be called often on large texts.
# Every pattern below is currently commented out, so this list is empty.
# When uncommenting patterns, keep a comma between consecutive list entries.
extra_regex_patterns: [
  # These ones ban "it's not x, it's y" type patterns:

  #"\\b(?:\\w+n(?:['’]t)|not\\s+(?:just|only|merely|because))\\s+(?:(?![.;:?!…]).){1,100}?[.;:?!…]\\s*(?:it|they|you)(?:['’](?:s|re|m))?\\b(?!\\s+(?:was|were|is|are|wasn['’]t|weren['’]t|isn['’]t|aren['’]t|ain['’]t)\\b)(?:\\s*[*…]?\\s*)?(?!when\\b|then\\b|but\\b|and\\b|yet\\b)(?!right\\b)(?!normal\\b)(?!true\\b)(?!sure\\b)(?!only\\b)(?!still\\b)(?!rarely\\b)(?!already\\b)(?!wrong\\b)(?!want\\b)(?!just\\b)(?!couldn\\b)(?!could\\b)(?!saw\\b)(?!started\\b)(?!remember\\b)(?!struggled\\b)(?!watched\\b)(?!goal\\b)(?!took\\b)(?!kept\\b)(?!reminded\\b)(?!time\\b)(?!have\\b)(?!acted\\b)(?!smiled\\b)(?!think\\b)(?!give\\b)(?!grab\\b)(?!gave\\b)(?!turn\\b)(?!justify\\b)(?!\\w+ly\\b)(?=[a-z]{4,}\\b)[a-z]+\\w*",

  #"\\b(?:\\w+n(?:['’]t)|not)\\s+(?:just|only|merely)?\\s*(?:(?![-–—]|[.?!…]).){1,80}?[-–—]{1,2}\\s*\\w+(?:['’]\\w+)?\\s+",

  #"\\b(?:wasn['’]t|weren['’]t|isn['’]t|aren['’]t|ain['’]t|not)\\s+(?!\\b(?:minute|minutes|hour|hours|day|days|year|years|second|seconds)\\b)(?!with\\b)(?!even\\b)(?:(?![.;:?!…]).){2,120}?[.;:?!…]\\s*(?:it|they|you|that)(?:\\s+(?:was|were|is|are)\\b(?:\\s+[*_~]?\\w+[*_~]?)?|(?:['’](?:s|re|m))\\b(?:\\s+[*_~]?\\w+[*_~]?)?)",

  #"\\bno\\s+longer\\s+(?:just|only|merely)?\\s+[^.;:?!…]{1,120}[.;:?!…]\\s*(?:it|they|you)\\s+(?:is|are|was|were)\\b(?:\\s+[*_~]?\\w+[*_~]?)?",

  #"\\b(?:wasn['’]t|weren['’]t|isn['’]t|aren['’]t|ain['’]t|not)\\s+(?:just|only|merely)?\\s*(?:(?!\\bbut\\b|[.?!…]).){1,80}?[,;:\\-–—]\\s*but\\s+(?!I\\b)(?:also\\s+)?"

]

################################################################################
# FINETUNING
################################################################################
finetune_enabled: true

# --- General Finetuning Setup ---
finetune_use_unsloth: false
finetune_mode: "ftpo" # dpo / ftpo (final token preference optimisation)
finetune_ftpo_dataset: ""   # you can specify an existing ftpo dataset, or leave it empty to let
                            # the pipeline use the one produced in the generation step
finetune_base_model_id: null # Base model for finetuning (if null, uses model_id)
finetune_max_seq_length: 2500 # this may truncate some outputs
finetune_load_in_4bit: true # qlora

# --- Early Stopping ---
finetune_early_stopping_wins: 0.85  # Early stopping threshold: the fraction of *chosen* completions that are selected over *rejected*.
                                    # More than 0.85 may be overtrained. Set to > 1.0 to disable early stopping.
finetune_early_stopping_loss: null  # Loss threshold for early stopping. Set to null to disable.

# --- LoRA Configuration ---
# The ftpo trainer works best with a high LoRA rank.
finetune_lora_r: 200
finetune_lora_alpha: 64
finetune_lora_dropout: 0.05
finetune_weight_decay: 0.01
finetune_target_modules:
  - "up_proj"
  - "down_proj"
  - "lm_head"

# --- Layer Freezing ---
finetune_freeze_early_layers: true
finetune_n_layers_unfrozen: 3

# --- Training Process ---
# NOTE(review): this is "unsloth" while finetune_use_unsloth is false --
# confirm the combination is intended.
finetune_gradient_checkpointing: "unsloth"
# e.g. "gemma-3" -- get the chat template from unsloth's helper if required;
# otherwise leave the string blank to use the tokeniser's own chat template.
finetune_chat_template: ""
finetune_batch_size: 1
finetune_gradient_accumulation_steps: 16
finetune_warmup_ratio: 0.1
finetune_num_epochs: 1

# --- Learning Rate ---
# NOTE(review): presumably this fixed value only applies when
# finetune_auto_learning_rate is false -- confirm against the trainer.
finetune_learning_rate: 0.000001
finetune_auto_learning_rate: true  # true: automatically determine learning rate based on dataset size, effective batch size & lora rank
finetune_auto_learning_rate_adjustment_scaling: 0.04 # scale the auto-lr by this factor

# --- DPO/FTPO Specific ---
finetune_beta: 0.1 # DPO beta

# --- Output & Saving ---
finetune_output_dir_suffix: "_ftpo_exp01" # Appended to the experiment run dir
finetune_save_merged_16bit: true
finetune_save_gguf_q8_0: false

# --- Dataset Handling for Finetuning ---
finetune_max_train_examples: 14000 # adjust as needed
finetune_shuffle_seed: 666
                                                                                                                                                              
# --- FTPO Sample Regularization ---
# 0 = off; 0.9 strongly downsamples overrepresented rule violations
# (this is useful because the raw generated dataset is typically very skewed)
ftpo_sample_rejected_regularisation_strength: 0.7
ftpo_sample_chosen_regularisation_strength: 0.2
# Filters out ftpo samples that have fewer than this number of entries in the
# chosen tokens list.
ftpo_sample_min_chosen_tokens: 3


# ── FTPO-specific hyper-parameters ─────────────────────────────────────────
# Leave any of these out (or set to null) to fall back to FTPOTrainer defaults.

# Loss terms are computed separately for the target (chosen + rejected) tokens
# vs the remainder of the vocab. This is because we want to allow more freedom
# of movement for the target tokens.

# MSE loss term 1: light MSE loss applied tokenwise on target tokens
ftpo_lambda_mse_target: 0.05   # Strength of the MSE loss tether on the individual logits in the
                               #   chosen+rejected set vs reference.
ftpo_tau_mse_target: 0.5       # Grace bandwidth (logits) before the above MSE loss kicks in.

# MSE loss term 2: stronger MSE term applied to the remaining (non-target) vocab
ftpo_lambda_mse: 0.4

# For a chosen token: "after winning vs rejected token by this margin,
# preference loss turns off"
ftpo_clip_epsilon_logits: 2