# cirbench/utils/api/qwen.py
from __future__ import annotations
import os, time, json
from typing import List, Dict, Any, Optional

try:
    # Requires: pip install openai>=1.0
    from openai import OpenAI
except Exception:
    OpenAI = None


class _Out:
    """Result record for one prompt: the generated text plus its metadata dict."""

    def __init__(self, text: str, meta: Dict[str, Any]):
        # Both values are stored as given (no copy, no validation).
        self.text, self.meta = text, meta


def _safe_get_usage_tokens(usage: Any) -> tuple[Optional[int], Optional[int], Optional[int]]:
    """
    Read token counts out of a usage payload from an OpenAI-compatible server.

    `usage` may be an object exposing attributes or a plain dict. A falsy
    payload yields three `None` values; any individual count that is absent
    is reported as `None`.

    Returns (prompt_tokens, completion_tokens, total_tokens).
    """
    if not usage:
        return (None, None, None)

    is_mapping = isinstance(usage, dict)
    counts = []
    for field in ("prompt_tokens", "completion_tokens", "total_tokens"):
        # Attribute access first; for dicts the key lookup takes precedence
        # and falls back to whatever the attribute lookup produced.
        value = getattr(usage, field, None)
        if is_mapping:
            value = usage.get(field, value)
        counts.append(value)
    return tuple(counts)


def _finish_reason(choice: Any) -> str:
    """
    Return the choice's finish_reason as a stripped, lower-case keyword.

    `choice` may be an attribute-style object or a dict. A missing or empty
    reason becomes 'unknown'; 'MAX_TOKENS' (any casing) is reported as
    'length' for consistency.
    """
    if choice is None:
        return "unknown"

    raw = getattr(choice, "finish_reason", None)
    if isinstance(choice, dict):
        raw = choice.get("finish_reason", raw)
    if not raw:
        return "unknown"

    keyword = str(raw).strip().lower()
    if keyword == "max_tokens":
        return "length"
    return keyword


class QwenRunner:
    """
    Qwen (DashScope OpenAI-compatible) runner.
    Expected by cirbench: generate(prompts: List[str]) -> List[_Out]

    Features:
      - Respects temperature/top_p and user-specified max_tokens/max_output_tokens.
      - Auto stop on IR tags and lenient continuations when cut by length.
      - Merges multiple continuations and stops as soon as closing tags appear.
    """

    def __init__(self, model: str, params: Dict[str, Any] | None = None):
        if OpenAI is None:
            raise RuntimeError("Missing dependency: `openai` package is required.")
        ps = dict(params or {})
        api_key = ps.get("api_key") or os.getenv("DASHSCOPE_API_KEY")
        base_url = ps.get("base_url") or os.getenv("DASHSCOPE_BASE_URL") or "https://dashscope.aliyuncs.com/compatible-mode/v1"
        if not api_key:
            raise RuntimeError("DASHSCOPE_API_KEY not set (and no `params.api_key` provided).")

        self.client = OpenAI(api_key=api_key, base_url=base_url)
        self.model = model
        self.system_prompt = ps.get("system") or "You are a helpful assistant."

        # Optional generation params
        self.temperature: Optional[float] = ps.get("temperature", None)
        self.top_p: Optional[float] = ps.get("top_p", None)

        # User-specified output limit (OpenAI-style or Gemini-style)
        self.user_max_tokens: Optional[int] = ps.get("max_tokens", None)
        if self.user_max_tokens is None:
            mo = ps.get("max_output_tokens")
            if mo is not None:
                try:
                    self.user_max_tokens = int(mo)
                except Exception:
                    pass

        # Stop sequences: allow 'stop' or 'stop_sequences'
        self.stop: Optional[List[str]] = None
        _stop = ps.get("stop", None)
        if _stop is None:
            _stop = ps.get("stop_sequences", None)
        if _stop is not None:
            if isinstance(_stop, (list, tuple)):
                self.stop = [str(s) for s in _stop if s is not None]
            else:
                self.stop = [str(_stop)]

        # Auto add IR stops (and their half-closed variants) to be conservative
        self.auto_stop_irout: bool = bool(ps.get("auto_stop_irout", True))

        # Dynamic retry & continuation controls
        self.ctx_limit: Optional[int] = ps.get("ctx_limit", None)     # e.g., 32000/131072 if you know it
        self.safety_tokens: int = int(ps.get("safety_tokens", 128))   # budget cushion
        self.min_gen_tokens: int = int(ps.get("min_gen_tokens", 64))
        self.retry_on_length: bool = bool(ps.get("retry_on_length", True))
        self.retry_cap_tokens: int = int(ps.get("retry_cap_tokens", 1024))
        self.max_continuations: int = int(ps.get("max_continuations", 0))

        # Request timeout (seconds)
        self.timeout: Optional[float] = ps.get("timeout", None)

        # Provider-specific passthrough: extra_body (OpenAI SDK supports this)
        self.extra_body: Optional[Dict[str, Any]] = None
        eb = ps.get("extra_body", None)
        if eb is not None:
            if isinstance(eb, dict):
                self.extra_body = eb
            elif isinstance(eb, str):
                # allow YAML users to pass a JSON string; ignore parse errors silently
                try:
                    self.extra_body = json.loads(eb)
                except Exception:
                    self.extra_body = None

        # Convenience flag to surface in meta for easy auditing
        self.enable_thinking_flag: Optional[bool] = None
        if isinstance(self.extra_body, dict):
            v = self.extra_body.get("enable_thinking")
            if isinstance(v, bool):
                self.enable_thinking_flag = v
            elif isinstance(v, str):
                self.enable_thinking_flag = (v.strip().lower() == "true")

        # Optional thinking budget (limits length of reasoning phase)
        # Priority (to match official API habit): extra_body.thinking_budget > params.thinking_budget > env CIRBENCH_QWEN_THINK_BUDGET
        tb_param = ps.get("thinking_budget", None)
        tb_eb = None
        if isinstance(self.extra_body, dict):
            tb_eb = self.extra_body.get("thinking_budget", None)
        tb_env = os.getenv("CIRBENCH_QWEN_THINK_BUDGET")
        try:
            tb_env_val = int(tb_env) if tb_env is not None and str(tb_env).strip() != "" else None
        except Exception:
            tb_env_val = None
        src = None
        if isinstance(tb_eb, int):
            eff_tb = tb_eb; src = "extra_body"
        elif isinstance(tb_param, int):
            eff_tb = tb_param; src = "params"
        else:
            eff_tb = tb_env_val; src = "env" if tb_env_val is not None else None
        self.thinking_budget: Optional[int] = eff_tb
        self._thinking_budget_source = src

        # Ensure extra_body carries the effective thinking_budget if set (so the outgoing request matches the official API)
        if isinstance(self.thinking_budget, int) and self.thinking_budget > 0:
            if self.extra_body is None:
                self.extra_body = {}
            self.extra_body.setdefault("thinking_budget", self.thinking_budget)

        # If thinking is enabled but caller did not explicitly ask to expose reasoning,
        # proactively request it (many providers require include_reasoning true).
        if self.enable_thinking_flag:
            if self.extra_body is None:
                self.extra_body = {}
            # include_reasoning (DashScope & some gateways)
            self.extra_body.setdefault("include_reasoning", True)
            # Some self-hosted gateways require putting flags under chat_template_kwargs
            ctk = self.extra_body.get("chat_template_kwargs")
            if not isinstance(ctk, dict):
                ctk = {}
            # Make sure both enable_thinking and include_reasoning are mirrored here too
            ctk.setdefault("enable_thinking", True)
            ctk.setdefault("include_reasoning", True)
            # Mirror thinking_budget as well for gateways that read it there
            if isinstance(self.thinking_budget, int) and self.thinking_budget > 0:
                ctk.setdefault("thinking_budget", self.thinking_budget)
            self.extra_body["chat_template_kwargs"] = ctk

    # ---- Internal helpers -------------------------------------------------

    @staticmethod
    def _default_stops() -> List[str]:
        # Include half-closed variants to stop early and avoid double-closing.
        return ["</CIR_JSON>", "</IR_OUT>", "</CIR_JSON", "</IR_OUT"]

    def _compose_stops(self, prompt: str) -> List[str]:
        base = list(self.stop or [])
        if self.auto_stop_irout:
            base += self._default_stops()
        # dedupe while preserving order
        seen, out = set(), []
        for s in base:
            if s not in seen:
                out.append(s); seen.add(s)
        return out

    def _chat_stream(self, prompt: str, *, max_tokens: Optional[int] = None, prior: Optional[str] = None) -> tuple[str, Dict[str, Any]]:
        """
        Streaming path for Qwen3 thinking mode (per provider docs: thinking only supports streaming).
        Accumulates content pieces; fetches usage from the final chunk when include_usage is enabled.

        Args:
            prompt: User message for this request.
            max_tokens: Optional output limit forwarded as `max_tokens`.
            prior: Optional partial assistant text; when given, it is appended as an
                assistant turn followed by a "continue" instruction.

        Returns:
            (text, meta). `text` is the concatenation of all streamed content deltas.
            `meta` carries usage, finish_reason, latency and reasoning statistics.
            Any exception raised while building or consuming the request is caught
            and reported as ("", meta) with finish_reason "exception" and an "error" entry.

        Environment:
            CIRBENCH_QWEN_LOG_REASONING enables printing of reasoning deltas and
            section banners; CIRBENCH_DEBUG=1 prints the request kwargs keys;
            CIRBENCH_QWEN_SILENCE_WARN set to 0/false/no/off enables the
            "no reasoning_content observed" warning.
        """
        t0 = time.time()
        # Default: DO NOT print reasoning logs unless explicitly enabled by env.
        _lr_env = os.getenv("CIRBENCH_QWEN_LOG_REASONING", "0").strip().lower()
        log_reasoning = _lr_env in ("1", "true", "yes", "on")
        # 'full' logs follow global debug switches; if log_reasoning is False, we still allow normal debug printing elsewhere
        # NOTE(review): log_full is computed but never read in this method.
        log_full = (os.getenv("CIRBENCH_DEBUG_FULL", "0") == "1" or os.getenv("CIRBENCH_DEBUG", "0") == "1")
        reasoning_text_parts: list[str] = []
        reasoning_chunk_count = 0
        # Flips to True on the first content delta (only when log_reasoning is on);
        # after that, reasoning deltas are no longer printed.
        is_answering = False
        if log_reasoning:
            print("\n" + "=" * 20 + "Thinking: " + "=" * 20 + "\n")
        try:
            messages = [
                {"role": "system", "content": self.system_prompt},
                {"role": "user", "content": prompt},
            ]
            # Continuation request: replay the partial answer and ask the model to carry on.
            if prior:
                messages.append({"role": "assistant", "content": prior})
                messages.append({
                    "role": "user",
                    "content": (
                        "Continue exactly where you left off. "
                        "Do not repeat earlier text. "
                        "Stop immediately after you emit the closing tag (e.g., </CIR_JSON> or </IR_OUT>)."
                    ),
                })

            kwargs: Dict[str, Any] = {
                "model": self.model,
                "messages": messages,
                "stream": True,
                "stream_options": {"include_usage": True},
            }
            if self.temperature is not None:
                kwargs["temperature"] = self.temperature
            if self.top_p is not None:
                kwargs["top_p"] = self.top_p
            if max_tokens is not None:
                kwargs["max_tokens"] = int(max_tokens)
            if self.timeout is not None:
                kwargs["timeout"] = float(self.timeout)
            # Stops are sent only when the user configured some; in that case the
            # composed list (user stops plus auto IR stops, if enabled) is used.
            if self.stop:
                kwargs["stop"] = self._compose_stops(prompt)

            attached_extra = None
            # Note: this updates self.extra_body in place (setdefault), so the added
            # defaults persist on the instance for later requests.
            if self.enable_thinking_flag:
                if self.extra_body is None:
                    self.extra_body = {}
                self.extra_body.setdefault("enable_thinking", True)
                if isinstance(self.thinking_budget, int) and self.thinking_budget > 0:
                    self.extra_body.setdefault("thinking_budget", self.thinking_budget)
            if self.extra_body:
                kwargs["extra_body"] = self.extra_body
                # Shallow snapshot used only to report which keys were sent.
                attached_extra = dict(self.extra_body)

            if os.getenv("CIRBENCH_DEBUG") == "1":
                print("[DEBUG.qwen.stream] kwargs keys:", sorted(list(kwargs.keys())))

            # Start streaming
            stream = self.client.chat.completions.create(**kwargs)
            parts: list[str] = []
            finish = "unknown"
            last_usage = None
            reasoning_tokens = None

            for chunk in stream:
                # Usage handling is best-effort: any error here is swallowed so the
                # content handling below still runs for this chunk.
                try:
                    # usage may be available only on the final chunk
                    u = getattr(chunk, "usage", None)
                    if u is not None:
                        last_usage = u
                        # Try to extract reasoning tokens if provided
                        try:
                            ctd = getattr(u, "completion_tokens_details", None)
                            if ctd is not None:
                                reasoning_tokens = getattr(ctd, "reasoning_tokens", reasoning_tokens)
                            otd = getattr(u, "output_tokens_details", None)
                            if otd is not None:
                                reasoning_tokens = getattr(otd, "reasoning_tokens", reasoning_tokens)
                            if isinstance(u, dict):
                                if isinstance(u.get("completion_tokens_details"), dict):
                                    reasoning_tokens = u["completion_tokens_details"].get("reasoning_tokens", reasoning_tokens)
                                if isinstance(u.get("output_tokens_details"), dict):
                                    reasoning_tokens = u["output_tokens_details"].get("reasoning_tokens", reasoning_tokens)
                        except Exception:
                            pass
                    # If this chunk has no choices (usage-only), print token usage as in the official sample
                    if log_reasoning and (getattr(chunk, "choices", None) in (None, [], ())):
                        print("\n" + "=" * 20 + "Token Consumption" + "=" * 20 + "\n")
                        try:
                            # chunk.usage may be an object or dict
                            print(getattr(chunk, "usage", None) or {})
                        except Exception:
                            print("{}")
                except Exception:
                    pass

                # choices / deltas
                # Also best-effort: a failure part-way through keeps whatever was
                # already appended for this chunk and moves on to the next one.
                try:
                    chs = getattr(chunk, "choices", None)
                    if not chs and isinstance(chunk, dict):
                        chs = chunk.get("choices")
                    if chs:
                        # Only the first choice is consumed.
                        c0 = chs[0]
                        # finish_reason may be present on the terminal chunk
                        fr = getattr(c0, "finish_reason", None)
                        if fr is None and isinstance(c0, dict):
                            fr = c0.get("finish_reason")
                        if fr:
                            finish = _finish_reason(c0)

                        delta = getattr(c0, "delta", None)
                        if delta is None and isinstance(c0, dict):
                            delta = c0.get("delta")

                        if delta is not None:
                            content = getattr(delta, "content", None)
                            if content is None and isinstance(delta, dict):
                                content = delta.get("content")
                            if content:
                                parts.append(content)
                                if not is_answering and log_reasoning:
                                    print("\n" + "=" * 20 + "Full resp" + "=" * 20 + "\n")
                                    is_answering = True

                            # Reasoning content
                            _rc = getattr(delta, "reasoning_content", None)
                            if _rc is None and isinstance(delta, dict):
                                _rc = delta.get("reasoning_content")
                            if _rc:
                                reasoning_text_parts.append(_rc)
                                reasoning_chunk_count += 1
                                if log_reasoning and not is_answering:
                                    print(_rc, end="", flush=True)
                except Exception:
                    pass

            text = "".join(parts)
            reasoning_text = "".join(reasoning_text_parts)
            reasoning_len = len(reasoning_text) if reasoning_text else 0
            # Only the first 240 characters of the reasoning are kept in meta.
            reasoning_preview = (reasoning_text[:240] + ("..." if reasoning_len > 240 else "")) if reasoning_text else None
            ptoks, ctoks, ttoks = _safe_get_usage_tokens(last_usage or {})

            meta = {
                "provider": "qwen",
                "model": self.model,
                "latency_ms": int(round((time.time() - t0) * 1000)),
                "finish_reason": finish,
                "prompt_tokens": ptoks,
                "out_tokens": ctoks,
                "total_tokens": ttoks,
                "max_tokens_used": max_tokens,
                "thinking_enabled": bool(self.enable_thinking_flag),
                "reasoning_tokens": reasoning_tokens,
                "thinking_budget": self.thinking_budget,
                "thinking_budget_source": self._thinking_budget_source,
                "reasoning_text_len": reasoning_len,
                "reasoning_text_preview": reasoning_preview,
                "had_reasoning_stream": bool(reasoning_len > 0),
                "reasoning_chunk_count": int(reasoning_chunk_count),
                "extra_body_keys": list(attached_extra.keys()) if isinstance(attached_extra, dict) else None,
                "streaming": True,
            }
            if bool(self.enable_thinking_flag) and reasoning_chunk_count == 0:
                # Default to silent; print only if explicitly allowed
                if os.getenv("CIRBENCH_QWEN_SILENCE_WARN", "1").strip().lower() in ("0", "false", "no", "off"):
                    print("[WARN.qwen] thinking enabled but no reasoning_content observed. Check: (1) model supports thinking (e.g., qwen3-max-preview), (2) base_url & API key region match, (3) gateway returns reasoning_content.")
            return text, meta
        except Exception as ex:
            # Failure path: report the error in meta instead of raising. Any content
            # streamed before the failure is discarded (empty text is returned).
            return "", {
                "provider": "qwen",
                "model": self.model,
                "error": f"{type(ex).__name__}:{str(ex)[:300]}",
                "latency_ms": int(round((time.time() - t0) * 1000)),
                "finish_reason": "exception",
                "prompt_tokens": None,
                "out_tokens": None,
                "total_tokens": None,
                "max_tokens_used": max_tokens,
                "reasoning_tokens": None,
                "thinking_enabled": bool(self.enable_thinking_flag),
                "thinking_budget": self.thinking_budget,
                "thinking_budget_source": self._thinking_budget_source,
                "streaming": True,
            }

    def _chat_once(self, prompt: str, *, max_tokens: Optional[int] = None, prior: Optional[str] = None) -> tuple[str, Dict[str, Any]]:
        """
        Make a single non-streaming chat call. Return (text, meta).
        `meta` includes usage, finish_reason, and low-level details.
        """
        # For Qwen3 thinking mode, provider docs state only streaming is supported
        if bool(self.enable_thinking_flag):
            return self._chat_stream(prompt, max_tokens=max_tokens, prior=prior)
        t0 = time.time()
        try:
            messages = [
                {"role": "system", "content": self.system_prompt},
                {"role": "user", "content": prompt},
            ]
            # If we have partial assistant content, ask the model to continue exactly
            if prior:
                messages.append({"role": "assistant", "content": prior})
                messages.append({
                    "role": "user",
                    "content": (
                        "Continue exactly where you left off. "
                        "Do not repeat earlier text. "
                        "Stop immediately after you emit the closing tag (e.g., </CIR_JSON> or </IR_OUT>)."
                    ),
                })

            kwargs: Dict[str, Any] = {
                "model": self.model,
                "messages": messages,
                "stream": False,
            }
            if self.temperature is not None:
                kwargs["temperature"] = self.temperature
            if self.top_p is not None:
                kwargs["top_p"] = self.top_p
            if max_tokens is not None:
                kwargs["max_tokens"] = int(max_tokens)
            if self.timeout is not None:
                kwargs["timeout"] = float(self.timeout)

            attached_extra = None
            if self.extra_body:
                kwargs["extra_body"] = self.extra_body
                attached_extra = dict(self.extra_body)

            if os.getenv("CIRBENCH_DEBUG") == "1":
                print("[DEBUG.qwen.nostream] kwargs keys:", sorted(list(kwargs.keys())))
            resp = self.client.chat.completions.create(**kwargs)

            text = ""
            finish = "unknown"
            if resp and getattr(resp, "choices", None):
                choice0 = resp.choices[0]
                finish = _finish_reason(choice0)
                msg = getattr(choice0, "message", None)
                if msg is None and isinstance(choice0, dict):
                    msg = choice0.get("message")
                if msg is not None:
                    content = getattr(msg, "content", None)
                    if content is None and isinstance(msg, dict):
                        content = msg.get("content", "")
                    text = content or ""

            ptoks, ctoks, ttoks = _safe_get_usage_tokens(getattr(resp, "usage", None) or {})

            # Optional: some OpenAI-compatible servers expose detailed token info
            # Try both completion_tokens_details.reasoning_tokens and output_tokens_details.reasoning_tokens
            reasoning_tokens = None
            try:
                usage = getattr(resp, "usage", None)
                # Attribute style
                ctd = getattr(usage, "completion_tokens_details", None)
                if ctd is not None:
                    reasoning_tokens = getattr(ctd, "reasoning_tokens", reasoning_tokens)
                otd = getattr(usage, "output_tokens_details", None)
                if otd is not None:
                    reasoning_tokens = getattr(otd, "reasoning_tokens", reasoning_tokens)
                # Dict style
                if isinstance(usage, dict):
                    if isinstance(usage.get("completion_tokens_details"), dict):
                        reasoning_tokens = usage["completion_tokens_details"].get("reasoning_tokens", reasoning_tokens)
                    if isinstance(usage.get("output_tokens_details"), dict):
                        reasoning_tokens = usage["output_tokens_details"].get("reasoning_tokens", reasoning_tokens)
            except Exception:
                pass

            meta = {
                "provider": "qwen",
                "model": self.model,
                "latency_ms": int(round((time.time() - t0) * 1000)),
                "finish_reason": finish,
                "prompt_tokens": ptoks,
                "out_tokens": ctoks,
                "total_tokens": ttoks,
                "max_tokens_used": max_tokens,
                "thinking_enabled": bool(self.enable_thinking_flag),
                "reasoning_tokens": reasoning_tokens,
                "thinking_budget": self.thinking_budget,
                "thinking_budget_source": self._thinking_budget_source,
                "extra_body_keys": list(attached_extra.keys()) if isinstance(attached_extra, dict) else None,
                "streaming": False,
            }
            return text, meta
        except Exception as ex:
            return "", {
                "provider": "qwen",
                "model": self.model,
                "error": f"{type(ex).__name__}:{str(ex)[:300]}",
                "latency_ms": int(round((time.time() - t0) * 1000)),
                "finish_reason": "exception",
                "prompt_tokens": None,
                "out_tokens": None,
                "total_tokens": None,
                "max_tokens_used": max_tokens,
                "reasoning_tokens": reasoning_tokens,
                "thinking_enabled": bool(self.enable_thinking_flag),
                "thinking_budget": self.thinking_budget,
                "thinking_budget_source": self._thinking_budget_source,
                "streaming": False,
            }

    # ---- Public API -------------------------------------------------------

    def generate(self, prompts: List[str]) -> List[_Out]:
        outs: List[_Out] = []

        def _has_stop(full_text: str, prompt: str) -> bool:
            if not full_text:
                return False
            stops = self._compose_stops(prompt)
            return any(s in full_text for s in stops)

        for p in prompts:
            # First attempt (respect user-provided max_tokens if any)
            text, meta = self._chat_once(p, max_tokens=self.user_max_tokens)

            # Decide whether to retry due to LENGTH/MAX_TOKENS
            fr = str(meta.get("finish_reason", "")).lower()
            pt = meta.get("prompt_tokens")
            ct = meta.get("out_tokens")

            need_retry = False
            context_overflow = False
            retry_allowed = None  # how many max_tokens we plan to give on retry

            if self.retry_on_length:
                # Trigger 1: explicit finish_reason indicates length bound
                if fr in {"length", "max_tokens"}:
                    need_retry = True
                # Trigger 2: zero completion with known large prompt and we have ctx_limit hint
                elif (ct in (0, None)) and (self.ctx_limit is not None) and (pt is not None):
                    need_retry = True

            if need_retry and self.ctx_limit is not None and (pt is not None):
                allowed = int(self.ctx_limit) - int(pt) - int(self.safety_tokens)
                if allowed <= 0:
                    context_overflow = True
                    need_retry = False
                else:
                    retry_allowed = max(self.min_gen_tokens, min(allowed, self.retry_cap_tokens))

            if need_retry and (retry_allowed is None):
                retry_allowed = min(self.retry_cap_tokens, max(self.min_gen_tokens, 512))

            if need_retry and (retry_allowed is not None) and (retry_allowed > 0) and (not _has_stop(text, p)):
                text2, meta2 = self._chat_once(p, max_tokens=int(retry_allowed))
                text = text2 or text
                meta.update({
                    "retry": True,
                    "retry_allowed_tokens": int(retry_allowed),
                    "finish_reason": meta2.get("finish_reason", meta.get("finish_reason")),
                    "prompt_tokens": meta2.get("prompt_tokens", meta.get("prompt_tokens")),
                    "out_tokens": meta2.get("out_tokens", meta.get("out_tokens")),
                    "total_tokens": meta2.get("total_tokens", meta.get("total_tokens")),
                    "max_tokens_used": meta2.get("max_tokens_used", meta.get("max_tokens_used")),
                    "latency_ms": meta.get("latency_ms", 0) + meta2.get("latency_ms", 0),
                })
            else:
                meta.update({
                    "retry": False,
                    "retry_allowed_tokens": retry_allowed,
                    "context_overflow": context_overflow,
                    "ctx_limit": self.ctx_limit,
                })

            # Continuations: only if we have not yet seen a stop
            continued = 0
            need_more = not _has_stop(text, p)
            while need_more and (continued < self.max_continuations):
                # Compute a safe token budget for the continuation
                cont_tokens = min(self.retry_cap_tokens, max(self.min_gen_tokens, 512))
                if (self.ctx_limit is not None) and (meta.get("prompt_tokens") is not None):
                    allowed = int(self.ctx_limit) - int(meta["prompt_tokens"]) - int(self.safety_tokens)
                    if allowed <= 0:
                        break
                    cont_tokens = max(self.min_gen_tokens, min(cont_tokens, allowed))

                # Use tail of existing text as prior to help the model continue without repeating
                tail = text[-4000:] if text else ""
                t_more, m_more = self._chat_once(p, max_tokens=int(cont_tokens), prior=tail)
                if not t_more:
                    break

                text += t_more
                continued += 1

                # Accumulate meta
                meta["latency_ms"] = int(meta.get("latency_ms", 0)) + int(m_more.get("latency_ms", 0))
                meta["out_tokens"] = (meta.get("out_tokens") or 0) + (m_more.get("out_tokens") or 0)
                meta["finish_reason"] = m_more.get("finish_reason", meta.get("finish_reason"))

                # Stop once we detect any stop sequence
                if _has_stop(text, p):
                    break
                need_more = not _has_stop(text, p)

            meta["continued"] = bool(continued)
            meta["continued_steps"] = continued

            outs.append(_Out(text, meta))
        return outs


def make(model_cfg: Dict[str, Any]):
    """
    Build a QwenRunner from a model config mapping.

    Factory entrypoint for cirbench.utils.api.base.make_runner().
    Expects: model_cfg = {"kind": "qwen", "name": "<model>", "params": {...}}
    A missing or empty "name" falls back to "qwen3-max"; a missing or empty
    "params" falls back to an empty dict.
    """
    return QwenRunner(
        model_cfg.get("name") or "qwen3-max",
        model_cfg.get("params") or {},
    )