# cirbench/utils/api/grok.py
from __future__ import annotations
import os, time, json
from typing import List, Dict, Any, Optional

def _env_on(name: str, default: str = "0") -> bool:
    """
    Interpret an environment variable as a boolean switch.

    Returns True when the value of `name` (or `default` if the variable is unset),
    stripped and lower-cased, is one of "1", "true", "yes", "on"; False otherwise.
    """
    raw = os.getenv(name, default)
    # A non-string can only come from a non-string `default`; treat it as "off".
    if not isinstance(raw, str):
        return False
    return raw.strip().lower() in {"1", "true", "yes", "on"}

try:
    # Requires: pip install openai>=1.0
    from openai import OpenAI
except Exception:
    OpenAI = None


class _Out:
    """Result container pairing the generated text with its metadata dict."""

    def __init__(self, text: str, meta: Dict[str, Any]):
        # Both values are stored as given; nothing is copied or validated.
        self.text, self.meta = text, meta


def _safe_get_usage_tokens(usage: Any) -> tuple[Optional[int], Optional[int], Optional[int]]:
    """
    Extract token counts from a usage payload of an OpenAI-compatible server.

    Accepts either an attribute-style object or a dict and returns
    (prompt_tokens, completion_tokens, total_tokens). A falsy payload, or any
    field that is absent, yields None in the corresponding position.
    """
    if not usage:
        return (None, None, None)

    fields = ("prompt_tokens", "completion_tokens", "total_tokens")
    # Attribute access first; for dict payloads this simply produces None.
    counts = [getattr(usage, field, None) for field in fields]
    if isinstance(usage, dict):
        # Dict entries take precedence, falling back to the attribute value.
        counts = [usage.get(field, seen) for field, seen in zip(fields, counts)]
    return (counts[0], counts[1], counts[2])


def _finish_reason(choice: Any) -> str:
    """
    Return the choice's finish_reason as a stripped, lower-case keyword.

    Works for attribute-style objects and dicts. A missing or falsy value gives
    "unknown"; 'MAX_TOKENS' (any casing) is reported as "length".
    """
    # getattr on None (or on a dict) just falls through to the default.
    raw = getattr(choice, "finish_reason", None)
    if isinstance(choice, dict):
        raw = choice.get("finish_reason", raw)
    if not raw:
        return "unknown"

    keyword = str(raw).strip().lower()
    aliases = {"max_tokens": "length"}
    return aliases.get(keyword, keyword)


class GrokRunner:
    """
    xAI Grok runner (OpenAI-compatible).
    Expected by cirbench: generate(prompts: List[str]) -> List[_Out]

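    Features:
      - Respects temperature/top_p and user-specified max_tokens/max_output_tokens.
      - Detects stop sequences (user-provided plus the IR closing tags) in the returned
        text; they are matched client-side and are not sent to the API as `stop`.
      - Can retry once when a reply is cut by length, and can request up to
        `max_continuations` follow-up continuations (default 0, i.e. disabled).
      - Merges multiple continuations and stops as soon as closing tags appear.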
    """

    def __init__(self, model: str, params: Dict[str, Any] | None = None):
        if OpenAI is None:
            raise RuntimeError("Missing dependency: `openai` package is required.")
        ps = dict(params or {})
        api_key = ps.get("api_key") or os.getenv("OPENAI_API_KEY")
        base_url = ps.get("base_url") or os.getenv("OPENAI_BASE_URL")
        if not api_key:
            raise RuntimeError("OPENAI_API_KEY not set (and no `params.api_key` provided).")

        if base_url:
            self.client = OpenAI(api_key=api_key, base_url=base_url)
        else:
            self.client = OpenAI(api_key=api_key)
        self.model = model
        self.system_prompt = ps.get("system") or "You are a helpful assistant."

        # Optional generation params
        self.temperature: Optional[float] = ps.get("temperature", None)
        self.top_p: Optional[float] = ps.get("top_p", None)

        # User-specified output limit (OpenAI-style or Gemini-style)
        self.user_max_tokens: Optional[int] = ps.get("max_tokens", None)
        if self.user_max_tokens is None:
            mo = ps.get("max_output_tokens")
            if mo is not None:
                try:
                    self.user_max_tokens = int(mo)
                except Exception:
                    pass

        # Stop sequences: allow 'stop' or 'stop_sequences'
        self.stop: Optional[List[str]] = None
        _stop = ps.get("stop", None)
        if _stop is None:
            _stop = ps.get("stop_sequences", None)
        if _stop is not None:
            if isinstance(_stop, (list, tuple)):
                self.stop = [str(s) for s in _stop if s is not None]
            else:
                self.stop = [str(_stop)]

        # Auto add IR stops (and their half-closed variants) to be conservative
        self.auto_stop_irout: bool = bool(ps.get("auto_stop_irout", True))

        # Dynamic retry & continuation controls
        self.ctx_limit: Optional[int] = ps.get("ctx_limit", None)     # e.g., 32000/131072 if you know it
        self.safety_tokens: int = int(ps.get("safety_tokens", 128))   # budget cushion
        self.min_gen_tokens: int = int(ps.get("min_gen_tokens", 64))
        self.retry_on_length: bool = bool(ps.get("retry_on_length", True))
        self.retry_cap_tokens: int = int(ps.get("retry_cap_tokens", 1024))
        self.max_continuations: int = int(ps.get("max_continuations", 0))

        # Request timeout (seconds)
        self.timeout: Optional[float] = ps.get("timeout", None)

        # Provider-specific passthrough: extra_body (OpenAI SDK supports this)
        self.extra_body: Optional[Dict[str, Any]] = None
        eb = ps.get("extra_body", None)
        if eb is not None:
            if isinstance(eb, dict):
                self.extra_body = eb
            elif isinstance(eb, str):
                # allow YAML users to pass a JSON string; ignore parse errors silently
                try:
                    self.extra_body = json.loads(eb)
                except Exception:
                    self.extra_body = None

        # Convenience flag to surface in meta for easy auditing
        self.enable_thinking_flag: Optional[bool] = None
        if isinstance(self.extra_body, dict):
            v = self.extra_body.get("enable_thinking")
            if isinstance(v, bool):
                self.enable_thinking_flag = v
            elif isinstance(v, str):
                self.enable_thinking_flag = (v.strip().lower() == "true")

    # ---- Internal helpers -------------------------------------------------

    @staticmethod
    def _default_stops() -> List[str]:
        # Include half-closed variants to stop early and avoid double-closing.
        return ["</CIR_JSON>", "</IR_OUT>", "</CIR_JSON", "</IR_OUT"]

    def _compose_stops(self, prompt: str) -> List[str]:
        base = list(self.stop or [])
        if self.auto_stop_irout:
            base += self._default_stops()
        # dedupe while preserving order
        seen, out = set(), []
        for s in base:
            if s not in seen:
                out.append(s); seen.add(s)
        return out

    def _chat_stream(self, prompt: str, *, max_tokens: Optional[int] = None, prior: Optional[str] = None) -> tuple[str, Dict[str, Any]]:
        """
        Make a single streaming chat call. Return (text, meta).

        Used by `_chat_once` when `enable_thinking_flag` is truthy. The request asks
        for `stream_options={"include_usage": True}`; `delta.content` pieces are
        concatenated into the returned text, `delta.reasoning_content` pieces are
        collected separately (surfaced in meta only), and token usage is taken from
        the last chunk that carried a `usage` payload.

        When `prior` is given, it is appended as an assistant message followed by a
        user message asking the model to continue where it left off.

        Any exception is caught and reported through meta
        (finish_reason="exception", with an "error" string) together with empty text.

        NOTE(review): the previous docstring described this as the path for
        "Qwen3 thinking mode" (thinking only supports streaming, per provider docs).
        That wording was presumably carried over from another runner; confirm
        whether it applies to Grok.
        """
        t0 = time.time()
        # Default: DO NOT print reasoning logs unless explicitly enabled by env.
        _lr_env = os.getenv("CIRBENCH_GPT_LOG_REASONING", "0").strip().lower()
        log_reasoning = _lr_env in ("1", "true", "yes", "on")
        # 'full' logs follow global debug switches; if log_reasoning is False, we still allow normal debug printing elsewhere
        # NOTE(review): log_full is computed here but not read anywhere else in this method.
        log_full = (os.getenv("CIRBENCH_DEBUG_FULL", "0") == "1" or os.getenv("CIRBENCH_DEBUG", "0") == "1")
        reasoning_text_parts: list[str] = []
        reasoning_chunk_count = 0
        # Flips to True on the first content delta (only when reasoning logging is on);
        # from then on reasoning deltas are no longer echoed to stdout.
        is_answering = False

        # Additional diagnostics controls
        log_chunks = _env_on("CIRBENCH_GPT_LOG_CHUNKS")
        log_summary = _env_on("CIRBENCH_GPT_LOG_STREAM_SUMMARY")

        # Streaming counters & timing
        chunks_total = 0
        chunks_with_choices = 0
        chunks_with_delta_content = 0
        chunks_with_delta_reasoning = 0
        chunks_usage_only = 0
        finish_reason_set_count = 0
        first_token_ms = None  # Time-to-first-token (TTFT)
        if log_reasoning:
            print("\n" + "=" * 20 + "Thinking:" + "=" * 20 + "\n")
        try:
            messages = [
                {"role": "system", "content": self.system_prompt},
                {"role": "user", "content": prompt},
            ]
            # If we have partial assistant content, ask the model to continue exactly
            if prior:
                messages.append({"role": "assistant", "content": prior})
                messages.append({
                    "role": "user",
                    "content": (
                        "Continue exactly where you left off. "
                        "Do not repeat earlier text. "
                        "Stop immediately after you emit the closing tag (e.g., </CIR_JSON> or </IR_OUT>)."
                    ),
                })

            kwargs: Dict[str, Any] = {
                "model": self.model,
                "messages": messages,
                "stream": True,
                "stream_options": {"include_usage": True},
            }
            # Optional parameters are only sent when explicitly configured.
            if self.temperature is not None:
                kwargs["temperature"] = self.temperature
            if self.top_p is not None:
                kwargs["top_p"] = self.top_p
            if max_tokens is not None:
                kwargs["max_tokens"] = int(max_tokens)
            if self.timeout is not None:
                kwargs["timeout"] = float(self.timeout)

            # Shallow copy kept only so the keys can be reported in meta.
            attached_extra = None
            if self.extra_body:
                kwargs["extra_body"] = self.extra_body
                attached_extra = dict(self.extra_body)

            if os.getenv("CIRBENCH_DEBUG") == "1":
                print("[DEBUG.grok.stream] kwargs keys:", sorted(list(kwargs.keys())))

            # Start streaming
            stream = self.client.chat.completions.create(**kwargs)
            parts: list[str] = []
            finish = "unknown"
            last_usage = None
            reasoning_tokens = None

            for chunk in stream:
                # Per-chunk flags/counters
                chunks_total += 1
                had_usage = False
                had_choices = False
                had_content = False
                content_len = 0
                had_reasoning = False
                reasoning_len_chunk = 0
                had_finish = False

                # Usage handling is best-effort: any error here is swallowed so the
                # stream keeps being consumed.
                try:
                    # usage may be available only on the final chunk
                    u = getattr(chunk, "usage", None)
                    if u is not None:
                        last_usage = u
                        had_usage = True
                        # Try to extract reasoning tokens if provided
                        # (attribute-style first, then dict-style; later hits override earlier ones)
                        try:
                            ctd = getattr(u, "completion_tokens_details", None)
                            if ctd is not None:
                                reasoning_tokens = getattr(ctd, "reasoning_tokens", reasoning_tokens)
                            otd = getattr(u, "output_tokens_details", None)
                            if otd is not None:
                                reasoning_tokens = getattr(otd, "reasoning_tokens", reasoning_tokens)
                            if isinstance(u, dict):
                                if isinstance(u.get("completion_tokens_details"), dict):
                                    reasoning_tokens = u["completion_tokens_details"].get("reasoning_tokens", reasoning_tokens)
                                if isinstance(u.get("output_tokens_details"), dict):
                                    reasoning_tokens = u["output_tokens_details"].get("reasoning_tokens", reasoning_tokens)
                        except Exception:
                            pass
                    # If this chunk has no choices (usage-only), print token usage as in the official sample
                    if log_reasoning and (getattr(chunk, "choices", None) in (None, [], ())):
                        print("\n" + "=" * 20 + "Token Consumption" + "=" * 20 + "\n")
                        try:
                            # chunk.usage may be an object or dict
                            print(getattr(chunk, "usage", None) or {})
                        except Exception:
                            print("{}")
                except Exception:
                    pass

                # choices / deltas (only the first choice is inspected)
                try:
                    chs = getattr(chunk, "choices", None)
                    if not chs and isinstance(chunk, dict):
                        chs = chunk.get("choices")
                    if chs:
                        had_choices = True
                        c0 = chs[0]
                        # finish_reason may be present on the terminal chunk
                        fr = getattr(c0, "finish_reason", None)
                        if fr is None and isinstance(c0, dict):
                            fr = c0.get("finish_reason")
                        if fr:
                            finish = _finish_reason(c0)
                            had_finish = True
                            finish_reason_set_count += 1

                        delta = getattr(c0, "delta", None)
                        if delta is None and isinstance(c0, dict):
                            delta = c0.get("delta")

                        if delta is not None:
                            content = getattr(delta, "content", None)
                            if content is None and isinstance(delta, dict):
                                content = delta.get("content")
                            if content:
                                parts.append(content)
                                content_len = len(content or "")
                                had_content = bool(content)
                                # TTFT is measured to the first non-empty content delta
                                # (reasoning deltas do not count).
                                if had_content and first_token_ms is None:
                                    first_token_ms = int(round((time.time() - t0) * 1000))
                                if not is_answering and log_reasoning:
                                    print("\n" + "=" * 20 + "Full resp:" + "=" * 20 + "\n")
                                    is_answering = True

                            _rc = getattr(delta, "reasoning_content", None)
                            if _rc is None and isinstance(delta, dict):
                                _rc = delta.get("reasoning_content")
                            if _rc:
                                reasoning_text_parts.append(_rc)
                                reasoning_chunk_count += 1
                                reasoning_len_chunk = len(_rc or "")
                                had_reasoning = bool(_rc)
                                if log_reasoning and not is_answering:
                                    print(_rc, end="", flush=True)
                except Exception:
                    pass

                # Post-chunk counters & optional logging
                if had_usage and not had_choices:
                    chunks_usage_only += 1
                if had_choices:
                    chunks_with_choices += 1
                if had_content:
                    chunks_with_delta_content += 1
                if had_reasoning:
                    chunks_with_delta_reasoning += 1

                if log_chunks:
                    try:
                        print(
                            f"[grok.stream.chunk #{chunks_total}] usage={had_usage} choices={had_choices} "
                            f"finish={had_finish} content_len={content_len} reasoning_len={reasoning_len_chunk}"
                        )
                    except Exception:
                        pass

            text = "".join(parts)
            reasoning_text = "".join(reasoning_text_parts)
            reasoning_len = len(reasoning_text) if reasoning_text else 0
            # Only the first 240 characters of the reasoning text are kept in meta.
            reasoning_preview = (reasoning_text[:240] + ("..." if reasoning_len > 240 else "")) if reasoning_text else None
            ptoks, ctoks, ttoks = _safe_get_usage_tokens(last_usage or {})

            if log_summary or os.getenv("CIRBENCH_DEBUG") == "1":
                try:
                    print(
                        "[grok.stream.summary] "
                        f"chunks_total={chunks_total}, with_choices={chunks_with_choices}, "
                        f"with_content={chunks_with_delta_content}, with_reasoning={chunks_with_delta_reasoning}, "
                        f"usage_only={chunks_usage_only}, finish_reason='{finish}', "
                        f"ttft_ms={first_token_ms}, usage_present={last_usage is not None}"
                    )
                except Exception:
                    pass

            meta = {
                "provider": "grok",
                "model": self.model,
                "latency_ms": int(round((time.time() - t0) * 1000)),
                "finish_reason": finish,
                "prompt_tokens": ptoks,
                "out_tokens": ctoks,
                "total_tokens": ttoks,
                "max_tokens_used": max_tokens,
                "thinking_enabled": bool(self.enable_thinking_flag),
                "reasoning_tokens": reasoning_tokens,
                "reasoning_text_len": reasoning_len,
                "reasoning_text_preview": reasoning_preview,
                "had_reasoning_stream": bool(reasoning_len > 0),
                "reasoning_chunk_count": int(reasoning_chunk_count),
                "extra_body_keys": list(attached_extra.keys()) if isinstance(attached_extra, dict) else None,
                "streaming": True,
                "ttft_ms": first_token_ms,
                "chunks_total": chunks_total,
                "chunks_with_choices": chunks_with_choices,
                "chunks_with_delta_content": chunks_with_delta_content,
                "chunks_with_delta_reasoning": chunks_with_delta_reasoning,
                "chunks_usage_only": chunks_usage_only,
                "finish_reason_events": finish_reason_set_count,
            }
            if bool(self.enable_thinking_flag) and reasoning_chunk_count == 0:
                # Default to silent; print only if explicitly allowed
                # (i.e. CIRBENCH_GPT_SILENCE_WARN is set to 0/false/no/off)
                if os.getenv("CIRBENCH_GPT_SILENCE_WARN", "1").strip().lower() in ("0", "false", "no", "off"):
                    print("[WARN.grok] thinking enabled but no reasoning_content observed. Check model support and SDK/gateway configuration.")
            return text, meta
        except Exception as ex:
            # Failure path: empty text plus a reduced meta carrying a truncated error string.
            return "", {
                "provider": "grok",
                "model": self.model,
                "error": f"{type(ex).__name__}:{str(ex)[:300]}",
                "latency_ms": int(round((time.time() - t0) * 1000)),
                "finish_reason": "exception",
                "prompt_tokens": None,
                "out_tokens": None,
                "total_tokens": None,
                "max_tokens_used": max_tokens,
                "reasoning_tokens": None,
                "thinking_enabled": bool(self.enable_thinking_flag),
                "streaming": True,
            }

    def _chat_once(self, prompt: str, *, max_tokens: Optional[int] = None, prior: Optional[str] = None) -> tuple[str, Dict[str, Any]]:
        """
        Make a single non-streaming chat call. Return (text, meta).
        `meta` includes usage, finish_reason, and low-level details.
        """
        # If thinking mode is requested by caller, prefer streaming for better token accounting
        if bool(self.enable_thinking_flag):
            return self._chat_stream(prompt, max_tokens=max_tokens, prior=prior)
        t0 = time.time()
        try:
            messages = [
                {"role": "system", "content": self.system_prompt},
                {"role": "user", "content": prompt},
            ]
            # If we have partial assistant content, ask the model to continue exactly
            if prior:
                messages.append({"role": "assistant", "content": prior})
                messages.append({
                    "role": "user",
                    "content": (
                        "Continue exactly where you left off. "
                        "Do not repeat earlier text. "
                        "Stop immediately after you emit the closing tag (e.g., </CIR_JSON> or </IR_OUT>)."
                    ),
                })

            kwargs: Dict[str, Any] = {
                "model": self.model,
                "messages": messages,
                "stream": False,
            }
            if self.temperature is not None:
                kwargs["temperature"] = self.temperature
            if self.top_p is not None:
                kwargs["top_p"] = self.top_p
            if max_tokens is not None:
                kwargs["max_tokens"] = int(max_tokens)
            if self.timeout is not None:
                kwargs["timeout"] = float(self.timeout)

            attached_extra = None
            if self.extra_body:
                kwargs["extra_body"] = self.extra_body
                attached_extra = dict(self.extra_body)

            if os.getenv("CIRBENCH_DEBUG") == "1":
                print("[DEBUG.grok.nostream] kwargs keys:", sorted(list(kwargs.keys())))

            resp = self.client.chat.completions.create(**kwargs)
            if _env_on("CIRBENCH_GPT_LOG_NOSTREAM_RAW"):
                try:
                    # Try structured dump first; fall back to string repr
                    dump = getattr(resp, "model_dump", None)
                    if callable(dump):
                        print("[grok.nostream.raw]", dump())
                    else:
                        print("[grok.nostream.raw]", getattr(resp, "__dict__", None) or str(resp))
                except Exception:
                    print("[grok.nostream.raw] <unserializable response>")

            text = ""
            finish = "unknown"
            if resp and getattr(resp, "choices", None):
                choice0 = resp.choices[0]
                finish = _finish_reason(choice0)
                msg = getattr(choice0, "message", None)
                if msg is None and isinstance(choice0, dict):
                    msg = choice0.get("message")
                if msg is not None:
                    content = getattr(msg, "content", None)
                    if content is None and isinstance(msg, dict):
                        content = msg.get("content", "")
                    text = content or ""

            ptoks, ctoks, ttoks = _safe_get_usage_tokens(getattr(resp, "usage", None) or {})

            # Optional: some OpenAI-compatible servers expose detailed token info
            # Try both completion_tokens_details.reasoning_tokens and output_tokens_details.reasoning_tokens
            reasoning_tokens = None
            try:
                usage = getattr(resp, "usage", None)
                # Attribute style
                ctd = getattr(usage, "completion_tokens_details", None)
                if ctd is not None:
                    reasoning_tokens = getattr(ctd, "reasoning_tokens", reasoning_tokens)
                otd = getattr(usage, "output_tokens_details", None)
                if otd is not None:
                    reasoning_tokens = getattr(otd, "reasoning_tokens", reasoning_tokens)
                # Dict style
                if isinstance(usage, dict):
                    if isinstance(usage.get("completion_tokens_details"), dict):
                        reasoning_tokens = usage["completion_tokens_details"].get("reasoning_tokens", reasoning_tokens)
                    if isinstance(usage.get("output_tokens_details"), dict):
                        reasoning_tokens = usage["output_tokens_details"].get("reasoning_tokens", reasoning_tokens)
            except Exception:
                pass

            meta = {
                "provider": "grok",
                "model": self.model,
                "latency_ms": int(round((time.time() - t0) * 1000)),
                "finish_reason": finish,
                "prompt_tokens": ptoks,
                "out_tokens": ctoks,
                "total_tokens": ttoks,
                "max_tokens_used": max_tokens,
                "thinking_enabled": bool(self.enable_thinking_flag),
                "reasoning_tokens": reasoning_tokens,
                "extra_body_keys": list(attached_extra.keys()) if isinstance(attached_extra, dict) else None,
                "streaming": False,
            }
            return text, meta
        except Exception as ex:
            return "", {
                "provider": "grok",
                "model": self.model,
                "error": f"{type(ex).__name__}:{str(ex)[:300]}",
                "latency_ms": int(round((time.time() - t0) * 1000)),
                "finish_reason": "exception",
                "prompt_tokens": None,
                "out_tokens": None,
                "total_tokens": None,
                "max_tokens_used": max_tokens,
                "reasoning_tokens": reasoning_tokens,
                "thinking_enabled": bool(self.enable_thinking_flag),
                "streaming": False,
            }

    # ---- Public API -------------------------------------------------------

    def generate(self, prompts: List[str]) -> List[_Out]:
        outs: List[_Out] = []

        def _has_stop(full_text: str, prompt: str) -> bool:
            if not full_text:
                return False
            stops = self._compose_stops(prompt)
            return any(s in full_text for s in stops)

        for p in prompts:
            # First attempt (respect user-provided max_tokens if any)
            text, meta = self._chat_once(p, max_tokens=self.user_max_tokens)

            if _env_on("CIRBENCH_GPT_LOG_STOP_MATCH"):
                try:
                    stops = self._compose_stops(p)
                    tail = (text or "")[-160:]
                    print(f"[grok.stop] detected={_has_stop(text, p)} stops={stops} tail=\"{tail}\"")
                except Exception:
                    pass

            # Decide whether to retry due to LENGTH/MAX_TOKENS
            fr = str(meta.get("finish_reason", "")).lower()
            pt = meta.get("prompt_tokens")
            ct = meta.get("out_tokens")

            need_retry = False
            context_overflow = False
            retry_allowed = None  # how many max_tokens we plan to give on retry

            if self.retry_on_length:
                # Trigger 1: explicit finish_reason indicates length bound
                if fr in {"length", "max_tokens"}:
                    need_retry = True
                # Trigger 2: zero completion with known large prompt and we have ctx_limit hint
                elif (ct in (0, None)) and (self.ctx_limit is not None) and (pt is not None):
                    need_retry = True

            if need_retry and self.ctx_limit is not None and (pt is not None):
                allowed = int(self.ctx_limit) - int(pt) - int(self.safety_tokens)
                if allowed <= 0:
                    context_overflow = True
                    need_retry = False
                else:
                    retry_allowed = max(self.min_gen_tokens, min(allowed, self.retry_cap_tokens))

            if need_retry and (retry_allowed is None):
                retry_allowed = min(self.retry_cap_tokens, max(self.min_gen_tokens, 512))

            if need_retry and (retry_allowed is not None) and (retry_allowed > 0) and (not _has_stop(text, p)):
                text2, meta2 = self._chat_once(p, max_tokens=int(retry_allowed))
                text = text2 or text
                meta.update({
                    "retry": True,
                    "retry_allowed_tokens": int(retry_allowed),
                    "finish_reason": meta2.get("finish_reason", meta.get("finish_reason")),
                    "prompt_tokens": meta2.get("prompt_tokens", meta.get("prompt_tokens")),
                    "out_tokens": meta2.get("out_tokens", meta.get("out_tokens")),
                    "total_tokens": meta2.get("total_tokens", meta.get("total_tokens")),
                    "max_tokens_used": meta2.get("max_tokens_used", meta.get("max_tokens_used")),
                    "latency_ms": meta.get("latency_ms", 0) + meta2.get("latency_ms", 0),
                })
            else:
                meta.update({
                    "retry": False,
                    "retry_allowed_tokens": retry_allowed,
                    "context_overflow": context_overflow,
                    "ctx_limit": self.ctx_limit,
                })

            # Continuations: only if we have not yet seen a stop
            continued = 0
            need_more = not _has_stop(text, p)
            while need_more and (continued < self.max_continuations):
                # Compute a safe token budget for the continuation
                cont_tokens = min(self.retry_cap_tokens, max(self.min_gen_tokens, 512))
                if (self.ctx_limit is not None) and (meta.get("prompt_tokens") is not None):
                    allowed = int(self.ctx_limit) - int(meta["prompt_tokens"]) - int(self.safety_tokens)
                    if allowed <= 0:
                        break
                    cont_tokens = max(self.min_gen_tokens, min(cont_tokens, allowed))

                # Use tail of existing text as prior to help the model continue without repeating
                tail = text[-4000:] if text else ""
                t_more, m_more = self._chat_once(p, max_tokens=int(cont_tokens), prior=tail)
                if not t_more:
                    break

                text += t_more
                continued += 1

                # Accumulate meta
                meta["latency_ms"] = int(meta.get("latency_ms", 0)) + int(m_more.get("latency_ms", 0))
                meta["out_tokens"] = (meta.get("out_tokens") or 0) + (m_more.get("out_tokens") or 0)
                meta["finish_reason"] = m_more.get("finish_reason", meta.get("finish_reason"))

                # Stop once we detect any stop sequence
                if _has_stop(text, p):
                    break
                need_more = not _has_stop(text, p)

            meta["continued"] = bool(continued)
            meta["continued_steps"] = continued

            outs.append(_Out(text, meta))
        return outs


def make(model_cfg: Dict[str, Any]):
    """
    Factory entrypoint for cirbench.utils.api.base.make_runner().

    Expects: model_cfg = {"kind": "grok", "name": "<model>", "params": {...}}
    A missing or empty "name" falls back to "grok-4-0709"; a missing or empty
    "params" becomes an empty dict.
    """
    return GrokRunner(
        model_cfg.get("name") or "grok-4-0709",
        model_cfg.get("params") or {},
    )