# Event Ingestion Pipeline

Notes on how we ingest events from prediction markets and persist them.

## At a Glance

```
Exchange APIs ──▶ Discovery Jobs ──▶ Normalization & DB ──▶ Market Snapshots
                                          │
                                          ▼
                               Close-Time + Outcome Watchers
```


- **Discovery:** ranked ticker harvesting plus duplicate screening.
- **Normalization:** deterministic conversion from upstream payloads to event rows.
- **Snapshots:** periodic capture of liquidity/volume histories.
- **Close & Outcome:** watchers that keep close times honest and persist resolutions.

## Records (simplified)

```python
@dataclass
class EventRecord:
    """Normalized event as written to the ``events`` table (simplified)."""
    ticker: str                 # event identifier; persisted as "event_ticker"
    title: str
    category: str
    markets: list[str]          # ordered subtitles
    close_time: datetime        # tz-aware
    rules_text: str | None      # persisted as "rules"; None when absent
    updated_at: datetime        # NOTE(review): presumably tz-aware like close_time; confirm

@dataclass
class MarketSnapshot:
    """Point-in-time reading of a single market belonging to an event."""
    ticker: str                 # ticker of the parent event
    market: str                 # the market's "yes_sub_title" from the exchange payload
    volume: float
    liquidity: float
    last_price: float           # NOTE(review): price unit is not specified here; confirm
    yes_bid: float
    no_bid: float
    captured_at: datetime       # clock reading taken when the snapshot was built
```

These structs are the only payloads allowed to cross service boundaries.

## Guardrails

- **Runway:** ignore any candidate with <6h remaining.
- **Retries:** insert only if the ticker is absent; blind reruns stay idempotent.
- **Scheduling:** refresh close times before enqueuing follow-ups.
- **Ordering:** persist tz-aware `close_time` values so pagination stays deterministic.


## Representative Snippets

All examples below are Pythonic representations (pseudocode) of the Prophet Arena event-sourcing pipeline.


### Batch Discovery

```python
def ingest_batch(batch_size: int) -> None:
    """Run one discovery pass and store the events that still have runway.

    Tickers already known to the events repository are handed to discovery as
    exclusions. Every hydrated payload is normalized; an event is inserted
    only when its close time is strictly later than ``now + MIN_LOOKAHEAD``.
    """
    fresh_tickers = exchange_client.discover(
        batch_size,
        exclude=events_repo.all_tickers(),
    )
    for raw in exchange_client.hydrate(fresh_tickers):
        candidate = normalize(raw)
        # Too close to closing (or already closed): not worth tracking.
        has_runway = candidate.close_time > clock.now() + MIN_LOOKAHEAD
        if has_runway:
            events_repo.insert_if_missing(candidate)
```

### Persistence + Deduplication

```python
def persist_events(events: list[EventRecord], session: SessionLike) -> None:
    """Insert events whose ticker is not stored yet, committing per chunk of 100.

    A row is skipped when the session reports its ticker as already present in
    ``events`` or when the same ticker appeared earlier in this call. The
    second check matters because rows are only written at ``bulk_insert``
    time, so the existence probe alone cannot catch duplicates that share a
    chunk.

    Args:
        events: Normalized events to persist; may contain repeated tickers.
        session: Storage session providing ``exists``, ``bulk_insert`` and
            ``commit``.
    """
    seen: set[str] = set()
    for chunk in chunked(events, size=100):
        new_rows = []
        for row in chunk:
            if row.ticker in seen:
                continue  # duplicate within this call; first occurrence wins
            seen.add(row.ticker)
            # NOTE(review): the probe filters on ``ticker`` while the inserted
            # column is ``event_ticker``; confirm the session maps these.
            if not session.exists("events", ticker=row.ticker):
                new_rows.append(row)

        if not new_rows:
            # Nothing to write for this chunk; skip the empty insert/commit.
            continue

        session.bulk_insert("events", [
            {
                "event_ticker": row.ticker,
                "title": row.title,
                "markets": json.dumps(row.markets),
                "category": row.category,
                "close_time": row.close_time,
                "rules": row.rules_text,
                "updated_at": row.updated_at,
            }
            for row in new_rows
        ])
        session.commit()
```

### Scheduling the Next Harvest

```python
def enqueue_refresh(event: EventRecord, depth: int) -> None:
    """Schedule the next ``event_refresh`` job halfway to the event's close.

    Nothing is scheduled when the time left until ``event.close_time`` is at
    most ``MIN_CHAIN_WINDOW`` or when ``depth`` has reached
    ``MAX_CHAIN_DEPTH``. Otherwise a job is created that runs after half of
    the remaining time and carries the event snapshot with ``depth + 1``.
    """
    remaining = event.close_time - clock.now()
    if remaining <= MIN_CHAIN_WINDOW:
        return
    if depth >= MAX_CHAIN_DEPTH:
        return

    fire_at = clock.now() + remaining / 2
    next_payload = {"event": event.snapshot(), "depth": depth + 1}
    follow_up = SchedulerJob(
        kind="event_refresh",
        run_at=fire_at,
        payload=next_payload,
    )
    scheduler.create(follow_up, key=ENV["SCHEDULER_API_KEY"])
```

### Market Snapshot Capture

```python
def capture_markets(event: EventRecord) -> list[MarketSnapshot]:
    """Fetch the event's markets and turn each one into a ``MarketSnapshot``.

    Returns an empty list when the response has no ``"markets"`` key. Each
    snapshot is stamped with the clock reading taken while it is built.
    """
    response = exchange_client.fetch_markets(event.ticker)
    snapshots = []
    for raw in response.get("markets", []):
        snapshots.append(
            MarketSnapshot(
                ticker=event.ticker,
                market=raw["yes_sub_title"],
                volume=raw["volume"],
                liquidity=raw["liquidity"],
                last_price=raw["last_price"],
                yes_bid=raw["yes_bid"],
                no_bid=raw["no_bid"],
                captured_at=clock.now(),
            )
        )
    return snapshots
```

### Resolution Watcher

```python
def sync_outcome(event: EventRecord) -> None:
    """Store the resolution once every market is finalized, else push the close.

    While any market's status is not ``"finalized"``, the event's close time
    is extended to ``max_expected_close`` of its markets. Once all are
    finalized, each market's subtitle is mapped to 1 (result ``"yes"``) or 0
    and stored together with ``min_close_time`` of the markets.
    """
    markets = exchange_client.fetch_event(event.ticker)["markets"]

    still_open = any(m["status"] != "finalized" for m in markets)
    if still_open:
        extend_close_time(event.ticker, max_expected_close(markets))
        return

    outcome = {}
    for m in markets:
        outcome[m["yes_sub_title"]] = 1 if m["result"] == "yes" else 0
    store_resolution(event.ticker, outcome, min_close_time(markets))
```





## Ready-to-Run Script: Pull Events from Kalshi (see `get_kalshi_events.py` in this directory)
```python
#!/usr/bin/env python3
"""
Simple script to fetch the top N upcoming Kalshi events using the public trial API
(unauthenticated). It mirrors but simplifies the prophet arena ingestion pipeline.

Example:
    $ python3 get_kalshi_events.py --limit 1
"""


import argparse
import os
from datetime import datetime, timedelta, timezone
from typing import Any, Dict, Iterable, List
import requests  # type: ignore
BASE_URL = os.getenv("KALSHI_BASE_URL", "https://api.elections.kalshi.com")
MIN_RUNWAY = timedelta(hours=6)


def kalshi_request(path: str, *, params: Dict[str, Any] | None = None) -> Dict[str, Any]:
    """Issue a GET against ``BASE_URL + path`` and return the decoded JSON body.

    Args:
        path: URL path appended verbatim to ``BASE_URL``.
        params: Optional query-string parameters.

    Raises:
        requests.HTTPError: If the response status signals an error.
    """
    url = f"{BASE_URL}{path}"
    reply = requests.get(url, params=params, timeout=30)
    reply.raise_for_status()
    return reply.json()


def parse_close_time(value: Any) -> datetime | None:
    if not value:
        return None

    if isinstance(value, (int, float)):
        # Kalshi sometimes sends unix timestamps
        return datetime.fromtimestamp(float(value), tz=timezone.utc)

    if isinstance(value, str):
        text = value.strip()
        if not text:
            return None
        if text.endswith("Z"):
            text = text[:-1] + "+00:00"
        try:
            return datetime.fromisoformat(text)
        except ValueError:
            return None

    if isinstance(value, datetime):
        return value if value.tzinfo else value.replace(tzinfo=timezone.utc)

    return None


def collect_candidates(events: Iterable[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Aggregate raw event payloads into one summary per event ticker.

    Payloads without an ``event_ticker`` are skipped, as are those whose
    close time cannot be determined or lies less than ``MIN_RUNWAY`` ahead of
    the current UTC time. The close time comes from the event itself or, when
    that is missing, from the earliest parseable market time
    (``expected_expiration_time`` preferred over ``close_time``).

    Returns:
        One dict per ticker with ``event_ticker``, ``title``, ``close_time``
        (earliest seen), summed ``open_interest`` and ``volume``, and up to
        five market subtitles in sorted order.
    """
    reference = datetime.now(timezone.utc)
    by_ticker: Dict[str, Dict[str, Any]] = {}

    for raw in events:
        key = raw.get("event_ticker")
        if not key:
            continue

        # fallback: treat event as a single market with aggregate fields
        legs = raw.get("markets") or [raw]

        closes_at = parse_close_time(raw.get("close_time"))
        if not closes_at:
            parsed = (
                parse_close_time(
                    leg.get("expected_expiration_time") or leg.get("close_time")
                )
                for leg in legs
            )
            closes_at = min((when for when in parsed if when), default=None)

        if not closes_at or closes_at - reference < MIN_RUNWAY:
            continue

        bucket = by_ticker.get(key)
        if bucket is None:
            bucket = {
                "title": raw.get("title") or key,
                "close_time": closes_at,
                "open_interest": 0.0,
                "volume": 0.0,
                "markets": set(),
            }
            by_ticker[key] = bucket
        else:
            # Repeated ticker: keep the earliest close time seen so far.
            bucket["close_time"] = min(bucket["close_time"], closes_at)

        for leg in legs:
            bucket["open_interest"] += float(leg.get("open_interest") or 0)
            bucket["volume"] += float(leg.get("volume") or 0)
            label = leg.get("yes_sub_title") or leg.get("title")
            if label:
                bucket["markets"].add(label)

    return [
        {
            "event_ticker": key,
            "title": bucket["title"],
            "close_time": bucket["close_time"],
            "open_interest": bucket["open_interest"],
            "volume": bucket["volume"],
            "markets": sorted(bucket["markets"])[:5],
        }
        for key, bucket in by_ticker.items()
    ]


def fetch_top_events(limit: int) -> List[Dict[str, Any]]:
    """Return the ``limit`` highest-ranked open events.

    Events are ranked by open interest (descending), then volume
    (descending), then close time (earliest first).

    Args:
        limit: Maximum number of events to return. Values of zero or less
            yield an empty list without contacting the API.

    Returns:
        Event summaries as produced by ``collect_candidates``.
    """
    if limit <= 0:
        # A negative slice bound would silently return "all but the last N".
        return []

    # NOTE: a single request is made, so the ranking only covers the first
    # page of up to 200 open events returned by the API.
    payload = kalshi_request(
        "/trade-api/v2/events",
        params={
            "status": "open",
            "limit": 200,
            "with_nested_markets": "true",
        },
    )
    events = payload.get("events") or []
    candidates = collect_candidates(events)
    candidates.sort(
        key=lambda e: (
            -e["open_interest"],
            -e["volume"],
            e["close_time"],
        )
    )
    return candidates[:limit]


def main() -> None:
    """Parse ``--limit`` from the command line and print the ranked events.

    Each event is printed as a numbered header line, a line with its close
    time, open interest and volume, and one bullet per market subtitle.
    """
    cli = argparse.ArgumentParser(description="Fetch top Kalshi events.")
    cli.add_argument("--limit", type=int, default=5, help="number of events to show")
    options = cli.parse_args()

    ranked = fetch_top_events(options.limit)
    for rank, item in enumerate(ranked, start=1):
        print(f"{rank}. {item['event_ticker']} — {item['title']}")
        closes = item["close_time"].isoformat()
        print(
            f"   close: {closes} | "
            f"open interest: {item['open_interest']:.0f} | "
            f"volume: {item['volume']:.0f}"
        )
        for subtitle in item["markets"]:
            print(f"   • {subtitle}")


if __name__ == "__main__":
    main()
```