metadata:
  # Name of this experiment configuration.
  name: 'WVS_HK_resplit'
  # Free-text summary; folded into a single line with no trailing newline.
  description: >-
    Generate 300 active endowments for WVS W7 HK survey
    using gemini-2.0-flash agents.
  # Quoted so the version tag is always loaded as a string.
  version: 'v2'
  # Integer seed (presumably for random number generation; confirm in loader).
  seed: 101

# Input paths
paths:
  # Survey inputs: a CSV data file and a YAML survey definition.
  survey_csv: 'data/WVS_HK/info_resplit.csv'
  survey_yaml: 'config/surveys/wvs.yaml'
  # Attribute bank definition file.
  attribute_bank: 'config/attribute_banks/attribute_bank.yaml'
  # Aggregate results file for the HK survey.
  aggregate_json: 'data/WVS_HK/HK_aggregate_results.json'

# --- REPORT STYLING ---
# Custom CSS stylesheets applied to the HTML output reports.
report:
  # Style for the endowment generation report.
  endowment: 'styles/generation_report.css'
  # Style for the regression experiment report.
  regression: 'styles/lasso_report.css'


# ============================================================
#           [STAGE 1] ACTIVE ENDOWMENT GENERATION
# ============================================================

# --- RESPONSE AGENT CONFIGURATION ---
# Settings for the LLM agent that simulates survey responses.
agent:
  type: 'gemini'
  model_name: 'gemini-2.0-flash'
  # System prompt detailing formality requirements for the response.
  # Folded scalar (`>`): the lines below are joined with single spaces and
  # the value ends with one newline.
  formality: >
    You are completing a survey. Your answer should reflect the person described in the profile above, using their preferences, beliefs and experiences.
    Respond with only the final answer string, not the code or label in brackets.
    Do not include any reasoning, explanation, or commentary.
    Do not preface your answer with phrases like 'I would choose'.
    Just return the answer text exactly as it appears in the options.
  kwargs:
    # Deterministic responses.
    temperature: 0
    # Upper limit for response length.
    max_tokens: 128

# --- ENDOWMENT MODEL SETTINGS ---
# Controls how synthetic persona endowments are generated.
endowment_model:
  # LLM backend and model used for endowment generation.
  backend_type: gemini
  model_name: gemini-2.0-flash
  # Higher temperature to encourage diversity.
  temperature: 0.9
  # Number of persona endowments generated per call.
  batch_size: 5
  # Optional delay (in seconds) between API calls.
  delay: 0.5
  # Retry if an API request fails.
  retry_failed: true
  # Cap on the number of attributes per endowment.
  max_attributes: 10
  # Randomly select attributes from the target mode, capped at max_attributes.
  randomize_attributes: true
  # Literal scalar (`|`): line breaks are preserved exactly, so no line may
  # carry trailing whitespace; it would become part of the prompt.
  system_prompt: |
    You are an expert assistant trained to generate realistic, diverse, and demographically plausible personas for social science surveys.

    The surveys concern the population of Hong Kong SAR.

    Each persona should include:
    - `eid`: a short, lowercase, variable-safe identifier that encodes key traits (e.g., urban_liberal_30s_female). No punctuation or spaces.
    - `endow_text`: a natural language description of the persona, written as if describing a survey respondent.

    Instructions:
    - Represent a wide range of age, gender, race/ethnicity, education, region, and political ideology.
    - Avoid repetition of phrasing or demographic combinations across personas.
    - Do not include commentary, explanations or formatting outside of the JSON array.
    - All personas should be contextually appropriate for Hong Kong SAR.
# --- ATTRIBUTE LEARNER SETTINGS ---
# Optional post-hoc tool that infers attributes from the survey and its questions.
attribute_learner:
  # LLM backend and model used for attribute generation.
  backend_type: 'gemini'
  model_name: 'gemini-2.0-flash'
  max_tokens: 1024

# --- GENERATION STRATEGY ---
# Overall experiment parameters and sampling policies.
generation:
  # Total number of synthetic endowments.
  target_n: 300
  # Initial endowments sampled per preset mode before adaptation begins.
  initial_n: 10
  # Number of adaptive/expansion iterations.
  num_update_steps: 10
  # Whether to use multithreading (recommended for faster generation).
  parallel: true
  # Number of parallel threads.
  max_workers: 20
  question_patch:
    # Proportion of endowments assigned to low-entropy questions.
    fraction: 0.75
    # Number of lowest-entropy questions tracked per round.
    top_k: 3
    # Reallocate patching to a mixed mode if a question appears in top_k
    # this many times.
    min_repeats: 2
  # Print detailed logs during generation.
  verbose: true

# ============================================================
#           [STAGE 2] LASSO REGRESSION APPROXIMATION
# ============================================================

# --- REGRESSION METHODS ---

lasso:
  # Lasso tuning grid, stored as a string holding a NumPy expression
  # (presumably evaluated by the pipeline; confirm against the loader).
  alpha_expr: 'np.logspace(-5, 1, 20)'
  max_iter: 10000
  # Cross-validation strategy.
  validation:
    # Either "holdout" or "cv" (cross-validation).
    strategy: 'cv'
    # Used only if strategy == "cv".
    cv_folds: 5
  # Refit OLS after selection (default: true); disabled here.
  post_selection_refit: false

elasticnet:
  # ElasticNet alpha grid and l1_ratio grid, stored as strings holding NumPy
  # expressions (presumably evaluated by the pipeline; confirm against the
  # loader).
  alpha_expr: 'np.logspace(-5, 1, 20)'
  l1_ratio_expr: 'np.linspace(0.1, 1, 10)'
  max_iter: 10000
  # Cross-validation strategy.
  validation:
    # Only "cv" is supported.
    strategy: 'cv'
    # Used only if strategy == "cv".
    cv_folds: 5
  # Plot style for the diagnostics: "2D" (default) or "3D".
  plot_style: '2D'
  post_selection_refit: false


# --- SPLIT SETTINGS ---
split_settings:
  # Use only proxy agents.
  use_proxy_only: true
  # Training and validation splits.
  train_val_split:
    - 'train'
    - 'valid'
  # Test split.
  test_split: 'test'

# --- MODEL WEIGHT USAGE ---
# Which model's weights to assign to endowments.
weight_assignment_model: 'elasticnet'

# ============================================================
# NOTES:
# - You can specify "lasso", "elasticnet", or both.
# - Post-selection refit via OLS is optional and enabled by default; this
#   file disables it for both models (`post_selection_refit: false`).
# - Weights for endowments will be assigned using `weight_assignment_model`.
# ============================================================