import os
import sys
import requests
import json
# Import necessary libraries
from vllm import LLM, SamplingParams
from modelscope import AutoTokenizer
import pandas as pd
from datasets import load_dataset
from openai import OpenAI
from datetime import datetime
import multiprocessing as mp
from functools import partial
import time
import multiprocessing
# Select the 'spawn' start method for every process created through
# multiprocessing; force=True overrides a start method that was already set.
# NOTE(review): presumably required so child processes do not inherit
# GPU/CUDA state through fork when vLLM is used — confirm.
multiprocessing.set_start_method('spawn', force=True)

# --- Configuration ---

# Local vLLM runs expect VLLM_USE_V1 to be exported before launch.
# It can also be forced from here with: os.environ['VLLM_USE_V1'] = '0'

# The API key for the remote backend is taken from the environment, e.g.
#   export OPENROUTER_API_KEY='your_actual_api_key_here'
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")


def _env_flag(name):
    """Return True when environment variable *name* is 'true' (case-insensitive)."""
    return os.environ.get(name, '').lower() == 'true'


USE_OPENROUTER = _env_flag("USE_OPENROUTER")
USE_INTERVENTION_PROMPT = _env_flag("USE_INTERVENTION_PROMPT")

# Instruction text prepended to every prompt; empty unless
# USE_INTERVENTION_PROMPT is enabled.
intervention_prefix = (
    """Language confusion in the context of large language models (LLMs) refers to the phenomenon where a model mixes or confuses elements of multiple languages inappropriately during generation. Pay attention to prevent it. For example, "곧 방호복을 입은 경찰관들이 yard에 들어와 타격 가스로 수감자들을 몰아넣었다." is incorrect. It should be "곧 방호복을 입은 경찰관들이 마당에 들어와 최루가스로 수감자들을 몰아넣었다." Pay attention to avoid language confusion during generation.\n\n"""
    if USE_INTERVENTION_PROMPT
    else ''
)


# --- Unified Model Paths ---
# Maps the model name given on the command line to its backend configuration.
#   'path'             -> local checkpoint location, loaded through vLLM.
#   'openrouter_alias' -> model identifier sent to the remote chat-completions API.
# run_test() looks for 'path' first and for 'openrouter_alias' second; an entry
# with neither key is rejected.
model_dict = {
    # --- Local Models (vLLM) ---
    'qwen3-4b-nogate': {
        'path': './cs_gate_train/models/qwen3-4b-2507'
    },
    'qwen3-4b-nonorm': {
        'path': './cs_gate_train/models/gate-qwen3-4b-2507-nonorm-20k_95p_2025-09-02-02:49:58_plugged'
    },
    'qwen3-4b-norm': {
        'path': './cs_gate_train/models/gate-qwen3-4b-2507-20k_95p_2025-08-25-13:27:50_plugged'
    },
    'qwen3-8b-nogate': {
        'path': './models/qwen3-8b'
    },
    'qwen3-8b-nonorm': {
        'path': './cs_gate_train/models/gate-qwen3-8b-nonorm-20k_95p_2025-09-02-02:50:25_plugged'
    },
    'qwen3-8b-norm': {
        'path': './cs_gate_train/models/gate-qwen3-8b-20k_95p_2025-08-25-13:26:19_plugged'
    },
    '30b-nogate': {
        'path': '/cpfs01/user/jiawei.lyt/ckpt/verl_checkpoints/lyt-rl-gen/qwen3-tpp-nothink-0721-distilled-data0706-recitex1-bothtrans-mixlangx2-GenRM-32B-sentcs-GSPO-ref-turbopp-LENGTH_FLIP_THRESHOLD1.3-LENGTH_FLIP_PROB0.75-REF_ANSWER_POSITION-A-expert-12k_bs512_minibs128_n8/global_step_60/actor_hf'
    },
    '30b-gate-inner': {
        'path': './codeswitch/cs_gate/models/gate-qwen3-20k_95p_flores_2025-07-28-09:02:44'
    },
    '30b-gate-inner-think': {
        'path': './cs_gate_train/models/30b_think-gate-qwen3-controlfix-20k_95p_flores_2025-08-19-05:21:18_plugged'
    },
    '30b-think-nogate': {
        'path': '/cpfs01/user/jiawei.lyt/ckpt/verl_checkpoints/lyt-rl-gen/qwen3-tpp-thinking-fh0723-mkd035-distilled-data0706-recitex1-bothtrans-mixlangx2-GenRM-32B-sentcs-GSPO-ref-turbopp-THINK-FLIP1-2.4-LENGTH_FLIP_THRESHOLD1.3-LENGTH_FLIP_PROB0.75-REF_ANSWER_POSITION-A-expert-12k_bs512_minibs128_n8/global_step_90/actor_hf'
    },
    '30b-think-norm': {
        'path': './cs_gate_train/models/gate-qwen-30b-think-norm-20k_95p_2025-09-01-12:07:35_plugged'
    },
    '30b-gate-oss-old': {
        'path': './cs_gate_train/models/opensource-turbo-nothink-gate-qwen3-controlfix-20k_95p_flores_2025-08-18-11:06:45_plugged'
    },
    '30b-gate-oss': {
        'path': './cs_gate_train/models/opensource-turbo-nothink-gate-qwen3-controlfix-20k_95p_flores_2025-08-22-15:21:43_plugged'
    },
    '30b-gate-oss-norm': {
        'path': './cs_gate_train/models/gate-qwen-30b-norm-20k_95p_2025-08-28-08:59:51_plugged'
    },
    'llama-8b-oss': {
        'path': './cs_gate_train/models/gate-llama3-8b-20k_95p_norm_2025-08-25-03:50:13_plugged'
    },
    'llama-8b-oss_nonorm': {
        'path': './cs_gate_train/models/gate-llama3-8b-20k_95p_2025-08-24-11:41:02_plugged'
    },
    'llama-8b-nogate': {
        'path': './cs_gate_train/models/llama3-8b'
    },
    'gemma-4b-oss': {
        'path': './cs_gate_train/models/gate-gemma3-4b-20k_95p_2025-08-25-07:50:43_plugged'
    },
    'gemma-4b-oss-nonorm': {
        'path': './cs_gate_train/models/gate-gemma3-4b-nonorm-20k_95p_2025-08-26-12:21:17_plugged'
    },
    'gemma-4b-nogate': {
        'path': './cs_gate_train/models/gemma3-4b'
    },
    'gemma-12b-oss': {
        'path': './cs_gate_train/models/gate-gemma3-12b-20k_95p_2025-08-26-02:18:32_plugged'
    },
    'gemma-12b-nogate': {
        'path': './cs_gate_train/models/gemma3-12b'
    },
    'gemma-12b-oss-nonorm': {
        'path': './cs_gate_train/models/gate-gemma3-12b-nonorm-20k_95p_2025-08-28-08:33:06_plugged'
    },
    'aya-expanse-8b-nogate': {
        'path': './cs_gate_train/models/aya-expanse-8b'
    },
    'gpt-oss-20b-nogate-local': { # Renamed key to distinguish from the remote 'gpt-oss-20b-nogate' entry
        'path': './cs_gate_train/models/gpt-oss-20b'
    },
    'gpt-oss-20b-norm': {
        'path': './cs_gate_train/models/gate-gpt-oss-20b-norm-20k_95p_2025-08-29-12:31:23_plugged'
    },
    'gpt-oss-20b-nonorm': {
        'path': './cs_gate_train/models/gate-gpt-oss-20b-nonorm-20k_95p_2025-08-29-12:17:02_plugged'
    },
    'seed-oss-36b': {
        'path': './models/seed-oss-36b'
    },
    'olmo-32b': {
        'path': './cs_gate_train/models/OLMo-2-0325-32B-Instruct'
    },
    'olmo-32b-oss-norm': {
        'path': './cs_gate_train/models/gate-olmo-32b-norm-20k_95p_2025-08-29-07:43:20_plugged'
    },
    'olmo-32b-oss-nonorm': {
        'path': './cs_gate_train/models/gate-olmo-32b-nonorm-20k_95p_2025-08-29-07:50:05_plugged'
    },
    # --- Remote Models (chat-completions API) ---
    # NOTE(review): several aliases below carry no 'vendor/' prefix (e.g.
    # 'gpt-5-chat-2025-08-07', 'glm-4.5-inner'); presumably they are meant for
    # the non-OpenRouter endpoint used when USE_OPENROUTER is not 'true' — confirm.
    'gpt-oss-20b-nogate': { # This key maps to the remote version
        'openrouter_alias': 'openai/gpt-oss-20b'
    },
    'gpt-oss-120b-nogate': { # This key maps to the remote version
        'openrouter_alias': 'openai/gpt-oss-120b'
    },
    'gpt-5-chat': { # This key maps to the remote version
        'openrouter_alias': 'gpt-5-chat-2025-08-07'
    },
    'gpt-5': { # This key maps to the remote version
        'openrouter_alias': 'openai/gpt-5'
    },
    '30b-think-nogate-or': { # This key maps to the remote version
        'openrouter_alias': 'qwen/qwen3-235b-a22b-thinking-2507'
    },
    'deepseek-v3.1': { # This key maps to the remote version
        'openrouter_alias': 'deepseek-v3.1'
    },
    'claude-sonnet-4': { # This key maps to the remote version
        'openrouter_alias': 'anthropic/claude-sonnet-4'
    },
    'gemini-2.5-flash': { # This key maps to the remote version
        'openrouter_alias': 'google/gemini-2.5-flash'
    },
    'kimi-k2': {
        'openrouter_alias': 'moonshotai/kimi-k2'
    },
    'qwen3-235b-instruct': {
        'openrouter_alias': 'qwen/qwen3-235b-a22b-2507'
    },
    'gemini-2.5-pro': {
        'openrouter_alias': 'gemini-2.5-pro'
    },
    'deepseek-v3-0324': {
        'openrouter_alias': 'deepseek-v3-250324-inner'
    },
    'doubao-1-6': {
        'openrouter_alias': 'doubao-seed-1-6-250615'
    },
    'grok-4': {
        'openrouter_alias': 'grok-4-0709'
    },
    'glm-4.5': {
        'openrouter_alias': 'glm-4.5-inner'
    },
    'glm-4.5-air': {
        'openrouter_alias': 'glm-4.5-air-inner'
    },
    'gpt-5-nano': {
        'openrouter_alias': 'gpt-5-nano-2025-08-07'
    },
    'gpt-5-mini': {
        'openrouter_alias': 'gpt-5-mini-2025-08-07'
    },
    'qwen3-coder': {
        'openrouter_alias': 'qwen3-coder-plus-2025-07-22'
    }
    # Further remote models can be added in the same shape, for example:
    # 'llama-8b-nogate-or': {
    #     'openrouter_alias': 'meta-llama/llama-3.1-8b-instruct:free'
    # },
}

# --- Data Preparation ---

def prepare_include(tokenizer=None, think=False):
    """Build prompts and gold answers for the INCLUDE multiple-choice benchmark.

    Args:
        tokenizer: Optional tokenizer exposing ``apply_chat_template``. When
            given, each prompt is rendered through the chat template; when
            ``None`` the raw user text is returned (remote API path).
        think: When true, ``'  <think>'`` is appended after the rendered chat
            template so generation starts inside a thinking block.

    Returns:
        ``(prompts, answers)`` — two parallel lists, one entry per test row,
        languages in the order Arabic, Hebrew, Greek, Korean, Russian,
        Vietnamese.
    """
    # Localized word for "Answer" used in the required final line of the reply.
    # Dict order also defines the order in which the languages are processed.
    answer_trans = {
        'Arabic': 'إجابة',
        'Hebrew': 'תשובה',
        'Greek': 'Απάντηση',
        'Korean': '답변',
        'Russian': 'Ответ',
        'Vietnamese': 'Câu trả lời'
    }

    def get_prefix(lang):
        return f"Answer the following multiple choice question. The last line of your response should be of the following format: '{answer_trans[lang]}: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering, reply in {lang}."

    answers = []
    prompts = []
    for lang in answer_trans:
        lang_ds = load_dataset("./cs_gate_train/eval/data/include-base-44", lang)
        prefix = get_prefix(lang)  # identical for every row of this language
        for row in lang_ds['test']:
            # NOTE(review): the instruction and the question are concatenated
            # without any separator; kept as-is so prompts stay comparable
            # with earlier runs — confirm whether a blank line was intended.
            prompt = (
                prefix + row['question'] + '\n\n'
                + f"A) {row['option_a']}\n"
                + f"B) {row['option_b']}\n"
                + f"C) {row['option_c']}\n"
                + f"D) {row['option_d']}"
            )
            prompt = intervention_prefix + prompt
            if tokenizer:
                prompt = tokenizer.apply_chat_template([
                    {'role': 'user', 'content': prompt}
                ], add_generation_prompt=True, tokenize=False)
                if think:
                    # The thinking tag is a generation prefix: it only makes
                    # sense after the chat template. Without a tokenizer the
                    # prompt is sent as the user message itself, where the tag
                    # would become part of the question text.
                    prompt += '  <think>'
            prompts.append(prompt)
            answers.append(row['answer'])
    return prompts, answers

def prepare_flores(tokenizer=None, think=False):
    """Build prompts and references from the FLORES+ devtest file.

    For each selected script (``iso_15924``) and then each selected language
    (``glottocode``), 500 rows are sampled with ``random_state=42``. The
    ``zh_query`` column supplies the prompt text and ``src_zh`` the reference.

    Args:
        tokenizer: Optional tokenizer exposing ``apply_chat_template``; when
            ``None`` the raw prompt text is returned.
        think: Accepted for signature parity with the other loaders; unused.

    Returns:
        ``(prompts, answers)`` as two parallel lists.
    """
    def render(text):
        text = intervention_prefix + text
        if not tokenizer:
            # No tokenizer: the raw text is passed on unchanged.
            return text
        return tokenizer.apply_chat_template([{'role': 'user', 'content': text}], add_generation_prompt=True, tokenize=False)

    frame = pd.read_json('./cs_gate_train/open_source_dataset/flores_plus_devtest.jsonl', lines=True)

    # (column, value) filters, in processing order: scripts first, then languages.
    selectors = [('iso_15924', code) for code in ('Cyrl', 'Hebr', 'Hang', 'Grek', 'Arab', 'Thai')]
    selectors += [('glottocode', code) for code in ('viet1252', 'stan1290')]

    prompts, answers = [], []
    for column, code in selectors:
        subset = frame[frame[column] == code].sample(n=500, random_state=42)
        prompts.extend(render(query) for query in subset['zh_query'])
        answers.extend(subset['src_zh'])
    return prompts, answers

def prepare_flores_latin(tokenizer=None, think=False, add_prompt=False):
    """Build prompts and references from ``flores_no_latin.jsonl``.

    Every row contributes its ``en_query`` as prompt text and its ``text``
    column as the reference.

    Args:
        tokenizer: Optional tokenizer exposing ``apply_chat_template``; the
            template is rendered with ``enable_thinking=False``. When ``None``
            the raw prompt text is returned.
        think: Unused by this loader.
        add_prompt: Unused by this loader.

    Returns:
        ``(prompts, answers)`` as two parallel lists in file order.
    """
    frame = pd.read_json('./cs_gate_train/eval/data/flores_no_latin.jsonl', lines=True)

    def render(text):
        text = intervention_prefix + text
        if not tokenizer:
            # No tokenizer: the raw text is passed on unchanged.
            return text
        return tokenizer.apply_chat_template([{'role': 'user', 'content': text}], add_generation_prompt=True, tokenize=False, enable_thinking=False)

    prompts = [render(query) for query in frame['en_query']]
    answers = list(frame['text'])
    return prompts, answers

def prepare_humaneval_xl(tokenizer=None, think=False):
    """Build repeated prompts from the HumanEval-XL "python" configuration.

    Each sample of the Arabic and Hebrew splits is emitted five times in a
    row, and the whole sweep over both splits is performed twice.

    Args:
        tokenizer: Optional tokenizer exposing ``apply_chat_template``; when
            ``None`` the raw prompt text is returned.
        think: Accepted for signature parity with the other loaders; unused.

    Returns:
        ``(prompts, answers)`` — parallel lists where each answer is the
        ``task_id`` of the sample the prompt was built from.
    """
    print('start prepare_humaneval_xl')
    # os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
    langs = ['Arabic', 'Hebrew']
    trials_per_prompt = 5
    # NOTE(review): only the "python" configuration is loaded and it is swept
    # twice. This looks like the remnant of a loop over programming languages
    # (e.g. python and perl); left unchanged because it determines how many
    # samples are produced — confirm the intent.
    sweeps = 2
    prompts = []
    answers = []

    def format_prompt(prompt):
        prompt = intervention_prefix + prompt
        if tokenizer:
            return tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], add_generation_prompt=True, tokenize=False)
        # No tokenizer: the raw text is passed on unchanged.
        return prompt

    # Loaded once: every sweep reads exactly the same dataset.
    dataset = load_dataset("floatai/HumanEval-XL", "python", trust_remote_code=True)
    for _sweep in range(sweeps):
        for lang in langs:
            for sample in dataset[lang]:
                # The rendered prompt is the same for every trial, so format once.
                rendered = format_prompt(sample['prompt'])
                prompts.extend([rendered] * trials_per_prompt)
                answers.extend([sample['task_id']] * trials_per_prompt)
    print('finish prepare_humaneval_xl')
    return prompts, answers

# --- Generation Logic ---

def generate_with_vllm(llm, prompts, sampling_params):
    """Run ``llm.generate`` on *prompts* with *sampling_params* and return its result."""
    generations = llm.generate(prompts, sampling_params)
    return generations

def _generate_single_query(client_config, model_name_or_alias, sampling_params_dict, prompt_data):
    """Generate a single query using OpenRouter - worker function for multiprocessing."""
    prompt_index, prompt = prompt_data
    # retry for at most 10 times
    for _ in range(3):
        try:
            # Construct messages
            messages = [{"role": "user", "content": prompt}]
            # Corrected URL (removed trailing space)
            url = client_config['base_url']
            headers = {
                "Authorization": f"Bearer {client_config['api_key']}", # Use the key from outer scope or config
                "Content-Type": "application/json"
            }
            if 'gpt-oss' in model_name_or_alias:
                payload = {
                    "model": model_name_or_alias,
                    "messages": messages,
                    "reasoning": {
                        "effort": "medium",
                    },
                    'provider': {
                        'sort': 'throughput'
                    }
                }
            else:
                payload = {
                    "model": model_name_or_alias,
                    "messages": messages,
                    'provider': {
                        'sort': 'throughput'
                    }
                }
            # # Add sampling parameters from the dict
            # payload.update(sampling_params_dict)

            response = requests.post(url, headers=headers, data=json.dumps(payload), timeout=300)
            response.raise_for_status() # Raise an exception for bad status codes

            response_data = response.json()
            # print(f"DEBUG: Full Response for prompt {prompt_index}: {response_data}") # Debug line

            # Safely extract content and reasoning
            message = response_data.get('choices', [{}])[0].get('message', {})
            generated_text = message.get('content', 'No content returned')
            # Extract reasoning
            thinking = message.get('reasoning', '') # Default to empty string if not present

            # Return structured result
            result = {
                'index': prompt_index,
                'prompt': prompt,
                'generated_text': generated_text,
                'thinking': thinking, # Include thinking in the result
                'error': None
            }
            print(f"Generated {prompt_index+1} (first 100 chars): {generated_text[:100]}...")
            return result

        except Exception as e:
            print(e)
            time.sleep(10)
            pass
    result = {
        'index': prompt_index,
        'prompt': prompt,
        'generated_text': f"<<ERROR>>",
        'thinking': '', # Include empty thinking on error
        'error': 'error'
    }
    return result

# Alternative version with better error handling and progress tracking
def generate_with_openrouter_parallel_v2(model_name_or_alias, prompts, sampling_params_dict, max_workers=10, chunk_size=1):
    """Query the remote chat-completions API for every prompt in parallel.

    Args:
        model_name_or_alias: Model identifier sent to the API.
        prompts: List of prompt strings.
        sampling_params_dict: Forwarded to the worker function.
        max_workers: Number of concurrent workers; ``None`` means one per CPU.
            The value is capped at the number of prompts.
        chunk_size: ``chunksize`` passed to ``pool.map``.

    Returns:
        A list aligned with *prompts*. Each element exposes ``.prompt`` and
        ``.outputs[0].text`` / ``.outputs[0].thinking``, mirroring the
        attributes that ``run_test`` reads from vLLM outputs.
    """
    # Function-scope imports keep this block self-contained.
    from multiprocessing.pool import ThreadPool
    from types import SimpleNamespace

    if not prompts:
        # Nothing to send; also avoids creating a pool with zero workers.
        return []

    if max_workers is None:
        max_workers = mp.cpu_count()
    max_workers = max(1, min(max_workers, len(prompts)))

    # The endpoint depends on USE_OPENROUTER; the same API key variable is
    # used for both endpoints.
    if USE_OPENROUTER:
        base_url = 'https://openrouter.ai/api/v1/chat/completions'
    else:
        base_url = 'https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions'
    client_config = {'base_url': base_url, 'api_key': OPENROUTER_API_KEY}

    worker_func = partial(
        _generate_single_query,
        client_config,
        model_name_or_alias,
        sampling_params_dict
    )

    try:
        # The work is HTTP-bound, so threads give the same concurrency as
        # processes without re-importing this module in every worker.
        with ThreadPool(processes=max_workers) as pool:
            results = pool.map(worker_func, list(enumerate(prompts)), chunksize=chunk_size)
    except Exception as e:
        print(f"Error in parallel processing: {e}")
        raise  # Re-raise the error for visibility

    # pool.map returns results in input order, so the outputs stay aligned
    # with the prompts (and with the answers the caller zips them against).
    return [
        SimpleNamespace(
            prompt=result['prompt'],
            outputs=[SimpleNamespace(
                text=result['generated_text'],
                thinking=result.get('thinking', '')
            )]
        )
        for result in results
    ]

# --- Unified Main Execution Logic ---

def run_test(dataset_name, model_name, num_workers=10):
    """
    Run one benchmark for one model and save the generations as JSON lines.

    The backend is chosen from the model's entry in ``model_dict``: an entry
    with ``'path'`` is run locally through vLLM, an entry with
    ``'openrouter_alias'`` is sent to the remote chat-completions API.

    Args:
        dataset_name: One of 'include', 'flores', 'flores-latin', 'humaneval'.
        model_name: A key of ``model_dict``.
        num_workers: Number of parallel workers for the remote backend.

    Raises:
        SystemExit: For an unknown model, unknown dataset or invalid model entry.
        RuntimeError: When VLLM_USE_V1 is not set as the local backend requires.
        ValueError: When the remote backend is selected without an API key.
    """
    print(f"Running '{dataset_name}' test for model: {model_name}...")

    # --- Input Validation ---
    if model_name not in model_dict:
        print(f"Error: Model '{model_name}' not found in model_dict.")
        sys.exit(1)

    prepare_fns = {
        'include': prepare_include,
        'flores': prepare_flores,
        'flores-latin': prepare_flores_latin,
        'humaneval': prepare_humaneval_xl,
    }
    if dataset_name not in prepare_fns:
        supported = ', '.join(f"'{name}'" for name in prepare_fns)
        print(f"Error: Unsupported dataset name '{dataset_name}'. Please use one of: {supported}.")
        sys.exit(1)

    model_config = model_dict[model_name]
    prepare_fn = prepare_fns[dataset_name]
    default_max_tokens = 8000  # Same generation budget for every dataset
    should_think = 'think' in model_name  # Heuristic based on the model key

    # --- Determine Backend and Prepare Data/Sampling ---
    if 'path' in model_config:
        # --- Use vLLM ---
        print("Detected local model (vLLM)...")
        # Explicit check instead of assert, so it also runs under `python -O`.
        if 'gpt' not in model_name.lower():
            if os.environ.get('VLLM_USE_V1') != '0':
                raise RuntimeError("Please set VLLM_USE_V1=0 for vLLM")
        elif os.environ.get('VLLM_USE_V1') != '1':
            raise RuntimeError("Please set VLLM_USE_V1=1 for vLLM for GPTOSS")

        model_path = model_config['path']
        # TOK_PATH overrides the tokenizer location; default to the model itself.
        tokenizer = AutoTokenizer.from_pretrained(os.environ.get('TOK_PATH', model_path))

        prompts, answers = prepare_fn(tokenizer, should_think)
        # Translation outputs are short, so flores runs use a smaller budget.
        max_tokens = 512 if 'flores' in dataset_name else default_max_tokens

        # Setup LLM and Sampling Params
        if 'olmo' in model_name:
            llm = LLM(model=model_path, dtype="bfloat16", max_model_len=4096, tensor_parallel_size=1)
        else:
            llm = LLM(model=model_path, dtype="bfloat16", max_model_len=8192, tensor_parallel_size=2)

        # One exclusive chain: previously the gemma settings were overwritten
        # by the default branch that followed the separate 'gpt' check.
        if 'gemma' in model_name:
            sampling_params = SamplingParams(temperature=1, top_p=0.95, top_k=64, max_tokens=max_tokens)
        elif 'gpt' in model_name:
            # p should be 1 and k should be none, but this doesn't fit to the intervention impl
            sampling_params = SamplingParams(temperature=1, top_p=0.9999, top_k=100, max_tokens=max_tokens)
        else:
            sampling_params = SamplingParams(temperature=0.7, top_p=0.8, top_k=20, max_tokens=max_tokens)

        # Generate
        outputs = generate_with_vllm(llm, prompts, sampling_params)

    elif 'openrouter_alias' in model_config:
        # --- Use the remote API ---
        print("Detected OpenRouter model...")
        if not OPENROUTER_API_KEY:
            raise ValueError("OPENROUTER_API_KEY environment variable is not set.")

        openrouter_model_alias = model_config['openrouter_alias']

        # No tokenizer: the API applies its own chat formatting.
        prompts, answers = prepare_fn(tokenizer=None, think=should_think)
        max_tokens = default_max_tokens

        # Sampling parameters in OpenAI-style naming; top_k is omitted because
        # it is not a standard field.
        sampling_params_dict = {
            "max_tokens": max_tokens,
            "temperature": 0.7,
            "top_p": 0.8,
        }
        if 'gemma' in model_name:
            sampling_params_dict.update({
                "temperature": 1,
                "top_p": 0.95,
            })

        # Generate
        outputs = generate_with_openrouter_parallel_v2(openrouter_model_alias, prompts, sampling_params_dict, max_workers=num_workers)

    else:
        print(f"Error: Model configuration for '{model_name}' is invalid. It must have either 'path' or 'openrouter_alias'.")
        sys.exit(1)

    # --- Process and Save Results (Common for both backends) ---
    if len(outputs) != len(answers):
        # zip() below would silently truncate; make the mismatch visible.
        print(f"Warning: got {len(outputs)} outputs for {len(answers)} answers; extra items are dropped.")

    res_arr = []
    for output, answer in zip(outputs, answers):
        first = output.outputs[0]
        res_arr.append({
            'query': output.prompt,
            'query_response': first.text,
            'answer': answer,
            # vLLM outputs carry no 'thinking' attribute; default to empty.
            'thinking': getattr(first, 'thinking', '')
        })
    res_df = pd.DataFrame(res_arr)
    timestamp = datetime.now().strftime("%Y-%m-%d-%H:%M:%S")
    safe_model_name = model_name.replace("/", "_").replace(":", "_")  # Sanitize filename
    prompt_note = 'useprompt_' if USE_INTERVENTION_PROMPT else ''
    # Create the output directory so a finished run is not lost at save time.
    os.makedirs('./data', exist_ok=True)
    out_path = f'./data/{safe_model_name}_{dataset_name}_res_{prompt_note}{timestamp}.jsonl'
    res_df.to_json(out_path, orient='records', lines=True)
    print("Saved to:", out_path)


if __name__ == '__main__':
    # Command-line entry point: <dataset_name> <model_name>.
    if len(sys.argv) < 3:
        print("Usage: python script.py <dataset_name> <model_name>")
        print("       dataset_name: 'include', 'flores', 'flores-latin' or 'humaneval'")
        print("       model_name: A key from the model_dict (e.g., '30b-nogate', 'gpt-oss-20b-nogate')")
        print("Example: python script.py include 30b-nogate")
        print("Example: python script.py flores gpt-oss-20b-nogate")
        print("Note: Set OPENROUTER_API_KEY environment variable if testing OpenRouter models.")
        sys.exit(1)

    dataset_name = sys.argv[1].lower()
    model_name = sys.argv[2]

    # 50 parallel workers for remote API requests (run_test itself defaults to 10).
    run_test(dataset_name, model_name, 50)