{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": 4,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "nGA7-0gbyEZE",
        "outputId": "a224fc9c-87b4-4264-9e70-4eddd556b110"
      },
      "outputs": [],
      "source": [
        "#!/usr/bin/env python\n",
        "# -*- coding: utf-8 -*-\n",
        "\n",
        "\"\"\"\n",
        "Mini CCNet-style pipeline, reusing cc_net functions where possible.\n",
        "\n",
        "Features:\n",
        "1. Read documents from final_uniform_replace.jsonl (one JSON per line = one document)\n",
        "2. Perform document-level deduplication using cc_net.normalize_for_dedup\n",
        "3. Perform LID using cc_net.split_by_lang.Classifier\n",
        "4. Compute perplexity + bucket with cc_net.perplexity.MultiSentencePiece + DocLM + PerplexityBucket\n",
        "5. Compute retention rate of zero-width characters / watermark characters\n",
        "\"\"\"\n",
        "\n",
        "# ============================================================\n",
        "# 0️⃣ Dependency installation (Colab; safe to re-run)\n",
        "# ============================================================\n",
        "# %pip installs into the environment of the running kernel; -q keeps the\n",
        "# output short without hiding errors.\n",
        "%pip install -q --upgrade pip\n",
        "\n",
        "# 1. Install kenlm (from GitHub; default branch is master)\n",
        "%pip install -q git+https://github.com/kpu/kenlm.git\n",
        "\n",
        "# 2. Install other dependencies\n",
        "# NOTE(review): func_argparse is assumed to be required when importing cc_net\n",
        "# modules (it is listed in cc_net's own requirements) — confirm.\n",
        "%pip install -q fasttext sentencepiece tqdm func_argparse\n",
        "\n",
        "# 3. Clone cc_net into the exact directory that is added to sys.path below.\n",
        "#    The explicit destination keeps clone and import in sync regardless of the\n",
        "#    current working directory; the directory test makes a re-run a no-op\n",
        "#    instead of a 'destination path already exists' failure.\n",
        "!test -d /content/cc_net || git clone -q https://github.com/facebookresearch/cc_net.git /content/cc_net\n",
        "\n",
        "# 4. Add cc_net to sys.path\n",
        "import sys, os\n",
        "if \"/content/cc_net\" not in sys.path:  # avoid duplicate entries on re-run\n",
        "    sys.path.append(\"/content/cc_net\")  # Default clone path in Colab\n",
        "\n",
        "# Verify import\n",
        "from cc_net import text_normalizer, split_by_lang, perplexity\n",
        "print(\"✅ cc_net imported OK\")\n",
        "\n",
        "\n",
        "import os\n",
        "import json\n",
        "import gzip\n",
        "import re\n",
        "import hashlib\n",
        "from pathlib import Path\n",
        "from typing import List, Dict, Any, Tuple\n",
        "\n",
        "import numpy as np\n",
        "from tqdm import tqdm\n",
        "\n",
        "# cc_net original functions\n",
        "from cc_net import text_normalizer\n",
        "from cc_net import split_by_lang, perplexity\n",
        "\n",
        "\n",
        "# ============================================================\n",
        "# 1️⃣ Path configuration\n",
        "# ============================================================\n",
        "# Directories first; both are created right away (parents included, no\n",
        "# error when they already exist).\n",
        "OUTPUT_DIR = Path(\"/content/cleaned_output\")\n",
        "SP_MODEL_DIR = Path(\"/content/ccnet_models\")\n",
        "for _directory in (OUTPUT_DIR, SP_MODEL_DIR):\n",
        "    _directory.mkdir(parents=True, exist_ok=True)\n",
        "\n",
        "# Pipeline input: one JSON document per line.\n",
        "INPUT_FILE = Path(\"/content/final_uniform_replace.jsonl\")\n",
        "\n",
        "# Pipeline outputs, all inside OUTPUT_DIR.\n",
        "CLEANED_FILE = OUTPUT_DIR.joinpath(\"cleaned_ccnet.jsonl.gz\")\n",
        "DROPPED_FILE = OUTPUT_DIR.joinpath(\"dropped_ccnet.jsonl\")\n",
        "PPL_FILE = OUTPUT_DIR.joinpath(\"cleaned_ccnet_with_ppl.jsonl.gz\")\n",
        "\n",
        "# Model files: fastText LID model, plus the SentencePiece model, the LM\n",
        "# binary and the perplexity cutoff table inside SP_MODEL_DIR.\n",
        "LID_MODEL_PATH = Path(\"/content/lid.176.bin\")\n",
        "SP_MODEL = SP_MODEL_DIR.joinpath(\"en.sp.model\")\n",
        "LM_MODEL = SP_MODEL_DIR.joinpath(\"en.arpa.bin\")\n",
        "CUTOFF_CSV = SP_MODEL_DIR.joinpath(\"cutoff.csv\")\n",
        "\n",
        "# Text file the invisible watermark characters are read from.\n",
        "MYTEXT_PATH = Path(\"/content/myText.txt\")\n",
        "\n",
        "print(f\"✅ Using input: {INPUT_FILE}\")\n",
        "print(f\"✅ Output: {CLEANED_FILE}\")\n",
        "\n",
        "\n",
        "# ============================================================\n",
        "# 2️⃣ fastText + NumPy 2 compatibility patch\n",
        "#    (Some fastText versions use np.assarray / np.array(copy=False))\n",
        "# ============================================================\n",
        "if not hasattr(np, \"assarray\"):\n",
        "    np.assarray = np.asarray  # compatibility patch\n",
        "\n",
        "# Always wrap the genuine np.array. When this cell is re-run, np.array is\n",
        "# already the wrapper defined below, so recover the function it recorded\n",
        "# instead of stacking one wrapper on top of another.\n",
        "_old_np_array = getattr(np.array, \"_ccnet_wrapped\", np.array)  # backup\n",
        "\n",
        "\n",
        "def _safe_array(obj, *args, **kwargs):\n",
        "    \"\"\"Call the original np.array, dropping an explicit copy=False.\n",
        "\n",
        "    The keyword is removed to avoid the NumPy 2.x incompatibility; np.array\n",
        "    then uses its default copy behaviour. Any other value of `copy` and all\n",
        "    other arguments are forwarded unchanged.\n",
        "    \"\"\"\n",
        "    if \"copy\" in kwargs and kwargs[\"copy\"] is False:\n",
        "        kwargs.pop(\"copy\")\n",
        "    return _old_np_array(obj, *args, **kwargs)\n",
        "\n",
        "\n",
        "_safe_array._ccnet_wrapped = _old_np_array  # lets a re-run find the original\n",
        "np.array = _safe_array  # patched version\n",
        "\n",
        "\n",
        "# ============================================================\n",
        "# 3️⃣ Zero-width character counting utility\n",
        "# ============================================================\n",
        "# Matches a single U+200B, U+200C, U+200D or U+FEFF character.\n",
        "ZWC_RE = re.compile(r'[\\u200b\\u200c\\u200d\\ufeff]')\n",
        "\n",
        "\n",
        "def count_zwc(text: str) -> int:\n",
        "    \"\"\"Return how many characters of `text` match ZWC_RE; falsy input counts as 0.\"\"\"\n",
        "    if not text:\n",
        "        return 0\n",
        "    return sum(1 for _ in ZWC_RE.finditer(text))\n",
        "\n",
        "\n",
        "# ============================================================\n",
        "# 4️⃣ Load documents (one JSON per line)\n",
        "#    Also assign stable wm_idx for watermarked docs (1-to-1 mapping)\n",
        "# ============================================================\n",
        "def load_docs(input_file: Path) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:\n",
        "    \"\"\"\n",
        "    Read a JSONL file (one JSON object per line) into pipeline documents.\n",
        "\n",
        "    The document text is taken from the `watermarked` field, falling back to\n",
        "    `text` when `watermarked` is missing, null or empty. Records flagged\n",
        "    `is_watermarked` receive a sequential `wm_idx` (0, 1, 2, ... in file order).\n",
        "\n",
        "    Blank lines are ignored. Lines that are not valid JSON, or whose JSON value\n",
        "    is not an object, are skipped and their number is reported.\n",
        "\n",
        "    Returns:\n",
        "    docs: every loaded document as a dict with keys text, is_watermarked, wm_idx\n",
        "    raw_wm_docs: one dict (wm_idx, watermarked) per is_watermarked=True document,\n",
        "        holding the same text as the matching entry in docs (for raw stats)\n",
        "    \"\"\"\n",
        "    docs = []\n",
        "    raw_wm_docs = []\n",
        "    skipped = 0\n",
        "\n",
        "    with input_file.open(\"r\", encoding=\"utf-8\") as f:\n",
        "        for line in f:\n",
        "            line = line.strip()\n",
        "            if not line:\n",
        "                continue\n",
        "            try:\n",
        "                raw = json.loads(line)\n",
        "            except json.JSONDecodeError:\n",
        "                skipped += 1\n",
        "                continue\n",
        "            if not isinstance(raw, dict):\n",
        "                # Valid JSON but not a document (e.g. a bare list or string).\n",
        "                skipped += 1\n",
        "                continue\n",
        "\n",
        "            # `or` rather than dict.get defaults, so an explicit null or empty\n",
        "            # `watermarked` also falls back to `text`.\n",
        "            text = raw.get(\"watermarked\") or raw.get(\"text\") or \"\"\n",
        "            if not isinstance(text, str):\n",
        "                # Later stages call str methods on the text.\n",
        "                text = str(text)\n",
        "            is_wm = bool(raw.get(\"is_watermarked\", False))\n",
        "\n",
        "            wm_idx = None\n",
        "            if is_wm:\n",
        "                # One entry is appended per watermarked doc, so the current\n",
        "                # length is the next free index.\n",
        "                wm_idx = len(raw_wm_docs)\n",
        "                raw_wm_docs.append({\"wm_idx\": wm_idx, \"watermarked\": text})\n",
        "\n",
        "            docs.append(\n",
        "                {\n",
        "                    \"text\": text,\n",
        "                    \"is_watermarked\": is_wm,\n",
        "                    \"wm_idx\": wm_idx,\n",
        "                }\n",
        "            )\n",
        "\n",
        "    print(f\"📥 Loaded {len(docs)} docs, {len(raw_wm_docs)} raw watermarked docs.\")\n",
        "    if skipped:\n",
        "        print(f\"⚠️ Skipped {skipped} malformed line(s) in {input_file}\")\n",
        "    return docs, raw_wm_docs\n",
        "\n",
        "\n",
        "# ============================================================\n",
        "# 5️⃣ Document-level dedup (using cc_net.normalize_for_dedup)\n",
        "# ============================================================\n",
        "def normalize_for_dedup(text: str) -> str:\n",
        "    \"\"\"Return cc_net's text_normalizer.normalize_for_dedup of `text`; falsy input is passed as an empty string.\"\"\"\n",
        "    safe_text = text if text else \"\"\n",
        "    return text_normalizer.normalize_for_dedup(safe_text)\n",
        "\n",
        "\n",
        "def ccnet_doc_dedup(\n",
        "    docs: List[Dict[str, Any]], field: str = \"text\"\n",
        ") -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:\n",
        "    \"\"\"\n",
        "    Document-level dedup:\n",
        "    the whole `field` value is normalized with normalize_for_dedup and hashed\n",
        "    (SHA1); only the first document seen for each hash is kept.\n",
        "\n",
        "    Every document is mutated in place: `is_duplicate` is set to False on kept\n",
        "    documents and True on dropped ones. Documents whose text is missing, not a\n",
        "    string, blank, or empty after normalization are dropped as well and are\n",
        "    also flagged is_duplicate=True.\n",
        "\n",
        "    Returns:\n",
        "    (kept, dropped), each in input order.\n",
        "    \"\"\"\n",
        "    seen_hashes = set()\n",
        "    kept, dropped = [], []\n",
        "\n",
        "    for doc in tqdm(docs, desc=\"🔹 CCNet-style doc dedup\"):\n",
        "        raw = doc.get(field)\n",
        "        # A null or non-string value used to crash on .strip(); treat it as empty.\n",
        "        has_text = isinstance(raw, str) and bool(raw.strip())\n",
        "        norm = normalize_for_dedup(raw) if has_text else \"\"\n",
        "\n",
        "        # No usable text -> no hash; such documents are dropped unconditionally.\n",
        "        digest = hashlib.sha1(norm.encode(\"utf-8\")).hexdigest() if norm else None\n",
        "        is_dropped = digest is None or digest in seen_hashes\n",
        "\n",
        "        doc[\"is_duplicate\"] = is_dropped\n",
        "        if is_dropped:\n",
        "            dropped.append(doc)\n",
        "        else:\n",
        "            seen_hashes.add(digest)\n",
        "            kept.append(doc)\n",
        "\n",
        "    return kept, dropped\n",
        "\n",
        "\n",
        "# ============================================================\n",
        "# 6️⃣ Language identification using cc_net.split_by_lang.Classifier\n",
        "# ============================================================\n",
        "def load_lid_classifier(model_path: Path) -> split_by_lang.Classifier:\n",
        "    \"\"\"\n",
        "    Build the cc_net language-ID classifier, first downloading the fastText\n",
        "    lid.176.bin model to `model_path` when that file is missing or empty.\n",
        "\n",
        "    The download is written to a sibling '<name>.part' file and renamed only\n",
        "    after wget exits successfully, so an interrupted or failed download is\n",
        "    never mistaken for a usable model on the next run.\n",
        "\n",
        "    Returns:\n",
        "    A Classifier reading `text` and writing `language` (top=1, threshold=0.0).\n",
        "\n",
        "    Raises:\n",
        "    RuntimeError: if the model cannot be downloaded.\n",
        "    \"\"\"\n",
        "    import subprocess  # local import: only needed for the one-off download\n",
        "\n",
        "    url = \"https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin\"\n",
        "    # A zero-byte file is what a failed `wget -O` leaves behind.\n",
        "    if not model_path.exists() or model_path.stat().st_size == 0:\n",
        "        partial = model_path.with_name(model_path.name + \".part\")\n",
        "        try:\n",
        "            # Argument list instead of a shell string: no quoting issues with the path.\n",
        "            subprocess.run([\"wget\", \"-q\", \"-O\", str(partial), url], check=True)\n",
        "        except (OSError, subprocess.CalledProcessError) as err:\n",
        "            if partial.exists():\n",
        "                partial.unlink()\n",
        "            raise RuntimeError(f\"Failed to download {url} to {model_path}\") from err\n",
        "        partial.replace(model_path)\n",
        "\n",
        "    return split_by_lang.Classifier(\n",
        "        model=model_path,\n",
        "        field=\"text\",\n",
        "        out_field=\"language\",\n",
        "        top=1,\n",
        "        threshold=0.0,\n",
        "    )\n",
        "\n",
        "\n",
        "def run_lid(docs: List[Dict[str, Any]], classifier: split_by_lang.Classifier) -> None:\n",
        "    \"\"\"\n",
        "    Annotate every document in place with its language via the cc_net classifier.\n",
        "\n",
        "    The classifier is entered as a context manager for the duration of the loop.\n",
        "    NOTE(review): cc_net's jsonql.Transformer is understood to load its model in\n",
        "    __enter__ and to assert readiness in __call__, so calling it without\n",
        "    entering would fail on every document - confirm against the cloned cc_net.\n",
        "\n",
        "    A document whose classification raises gets language 'unk'. The number of\n",
        "    such failures and the first error are printed, so a systematic failure is\n",
        "    visible instead of silently labelling everything 'unk'.\n",
        "    \"\"\"\n",
        "    failures = 0\n",
        "    first_error = None\n",
        "\n",
        "    with classifier:\n",
        "        for doc in tqdm(docs, desc=\"🌐 LID via cc_net.Classifier\"):\n",
        "            try:\n",
        "                classifier(doc)  # Classifier mutates the doc dict\n",
        "            except Exception as err:\n",
        "                doc[\"language\"] = \"unk\"\n",
        "                failures += 1\n",
        "                if first_error is None:\n",
        "                    first_error = err\n",
        "\n",
        "    if failures:\n",
        "        print(f\"⚠️ LID failed for {failures}/{len(docs)} docs (first error: {first_error!r})\")\n",
        "\n",
        "\n",
        "# ============================================================\n",
        "# 7️⃣ Perplexity: MultiSentencePiece + DocLM + PerplexityBucket\n",
        "# ============================================================\n",
        "def ensure_lm_files() -> None:\n",
        "    \"\"\"\n",
        "    Make sure SP_MODEL (en.sp.model) and LM_MODEL (en.arpa.bin) exist locally,\n",
        "    downloading whichever is missing or empty.\n",
        "\n",
        "    Each download is written to a sibling '<name>.part' file and renamed only\n",
        "    after wget exits successfully, so an interrupted or failed download is\n",
        "    never mistaken for a usable model on the next run.\n",
        "\n",
        "    Raises:\n",
        "    RuntimeError: if a file cannot be downloaded.\n",
        "    \"\"\"\n",
        "    import subprocess  # local import: only needed for the one-off downloads\n",
        "\n",
        "    downloads = (\n",
        "        (SP_MODEL, \"https://dl.fbaipublicfiles.com/cc_net/lm/en.sp.model\"),\n",
        "        (LM_MODEL, \"https://dl.fbaipublicfiles.com/cc_net/lm/en.arpa.bin\"),\n",
        "    )\n",
        "    for target, url in downloads:\n",
        "        # A zero-byte file is what a failed `wget -O` leaves behind.\n",
        "        if target.exists() and target.stat().st_size > 0:\n",
        "            continue\n",
        "        partial = target.with_name(target.name + \".part\")\n",
        "        try:\n",
        "            # Argument list instead of a shell string: no quoting issues with the path.\n",
        "            subprocess.run([\"wget\", \"-q\", \"-O\", str(partial), url], check=True)\n",
        "        except (OSError, subprocess.CalledProcessError) as err:\n",
        "            if partial.exists():\n",
        "                partial.unlink()\n",
        "            raise RuntimeError(f\"Failed to download {url} to {target}\") from err\n",
        "        partial.replace(target)\n",
        "\n",
        "\n",
        "def build_cutoff_csv(docs: List[Dict[str, Any]], cutoff_csv: Path = None) -> None:\n",
        "    \"\"\"\n",
        "    Build a percentile-based cutoff CSV from the current sample's perplexity distribution.\n",
        "    Note: Unlike official global CCNet cutoff, but compatible with PerplexityBucket logic.\n",
        "\n",
        "    File layout: a header line ',en' followed by 101 rows 'p,value' for\n",
        "    p = 0..100, i.e. the first column is the percentile index and `en` holds\n",
        "    the perplexity at that percentile.\n",
        "    NOTE(review): the index column is written because cc_net's PerplexityBucket\n",
        "    is understood to read the file with the first column as index; a file with\n",
        "    only the `en` column would leave it without any language column - confirm.\n",
        "\n",
        "    Args:\n",
        "    docs: documents; only `perplexity` values > 0 are used.\n",
        "    cutoff_csv: output path; defaults to the module-level CUTOFF_CSV.\n",
        "\n",
        "    With 10 or fewer usable values a fixed fallback table (101 evenly spaced\n",
        "    values from 10 to 100) is written instead.\n",
        "    \"\"\"\n",
        "    target = CUTOFF_CSV if cutoff_csv is None else cutoff_csv\n",
        "\n",
        "    ppl_values = [d.get(\"perplexity\", -1.0) for d in docs if d.get(\"perplexity\", -1.0) > 0]\n",
        "    enough_samples = len(ppl_values) > 10\n",
        "    if enough_samples:\n",
        "        cutoffs = np.percentile(ppl_values, np.arange(101))\n",
        "    else:\n",
        "        # Fallback\n",
        "        cutoffs = np.linspace(10, 100, 101)\n",
        "\n",
        "    # Column 0: percentile index (written as an integer); column 1: cutoff value.\n",
        "    table = np.column_stack([np.arange(101), cutoffs])\n",
        "    np.savetxt(\n",
        "        target,\n",
        "        table,\n",
        "        fmt=[\"%d\", \"%.6f\"],\n",
        "        delimiter=\",\",\n",
        "        header=\",en\",\n",
        "        comments=\"\",\n",
        "    )\n",
        "\n",
        "    if enough_samples:\n",
        "        print(f\"✅ Created percentile-based cutoff.csv at {target}\")\n",
        "    else:\n",
        "        print(f\"⚠️ Too few PPL samples, wrote fallback cutoff.csv at {target}\")\n",
        "\n",
        "\n",
        "def run_perplexity_and_bucket(docs: List[Dict[str, Any]]) -> None:\n",
        "    \"\"\"\n",
        "    Add `perplexity` and `bucket` to every document, in place.\n",
        "\n",
        "    Steps:\n",
        "    1. ensure_lm_files() fetches the SentencePiece and LM model files if needed.\n",
        "    2. Documents whose `language` is 'en' are tokenized (MultiSentencePiece,\n",
        "       text -> tokenized) and scored (DocLM, tokenized -> perplexity); all\n",
        "       other documents, and documents whose scoring raises, get -1.0.\n",
        "    3. build_cutoff_csv() writes CUTOFF_CSV from the resulting distribution.\n",
        "    4. PerplexityBucket assigns `bucket`; a document whose bucketing raises\n",
        "       gets 'all'.\n",
        "    5. The temporary `tokenized` field is removed.\n",
        "\n",
        "    The cc_net transformers are entered as context managers while in use.\n",
        "    NOTE(review): cc_net's jsonql.Transformer is understood to prepare itself in\n",
        "    __enter__ and to assert readiness in __call__, so calling one without\n",
        "    entering would fail on every document - confirm against the cloned cc_net.\n",
        "    A model that cannot be prepared therefore raises here instead of silently\n",
        "    yielding -1.0 for every document.\n",
        "    \"\"\"\n",
        "    ensure_lm_files()\n",
        "\n",
        "    sp = perplexity.MultiSentencePiece(\n",
        "        {\"en\": SP_MODEL},\n",
        "        field=\"text\",\n",
        "        output_field=\"tokenized\",\n",
        "        normalize=True,\n",
        "    )\n",
        "\n",
        "    lm = perplexity.DocLM(\n",
        "        {\"en\": LM_MODEL},\n",
        "        field=\"tokenized\",\n",
        "        output_field=\"perplexity\",\n",
        "        normalize=False,\n",
        "    )\n",
        "\n",
        "    failures = 0\n",
        "    first_error = None\n",
        "\n",
        "    # Apply SP + LM to English-language docs\n",
        "    with sp, lm:\n",
        "        for doc in tqdm(docs, desc=\"🧠 Perplexity via cc_net.MultiSentencePiece + DocLM\"):\n",
        "            lang = doc.get(\"language\", \"unk\")\n",
        "            if lang != \"en\":\n",
        "                doc[\"perplexity\"] = -1.0\n",
        "                continue\n",
        "\n",
        "            try:\n",
        "                sp(doc)\n",
        "                lm(doc)\n",
        "            except Exception as err:\n",
        "                doc[\"perplexity\"] = -1.0\n",
        "                failures += 1\n",
        "                if first_error is None:\n",
        "                    first_error = err\n",
        "\n",
        "    if failures:\n",
        "        print(f\"⚠️ Perplexity failed for {failures} docs (first error: {first_error!r})\")\n",
        "\n",
        "    # Build cutoff.csv from distribution\n",
        "    build_cutoff_csv(docs)\n",
        "\n",
        "    # Bucket assignment via PerplexityBucket\n",
        "    bucketizer = perplexity.PerplexityBucket(CUTOFF_CSV)\n",
        "    with bucketizer:\n",
        "        for doc in docs:\n",
        "            try:\n",
        "                bucketizer(doc)\n",
        "            except Exception:\n",
        "                doc[\"bucket\"] = \"all\"\n",
        "\n",
        "    # Remove tokenized field (similar to cc_net.DropKeys)\n",
        "    for doc in docs:\n",
        "        doc.pop(\"tokenized\", None)\n",
        "\n",
        "\n",
        "# ============================================================\n",
        "# 8️⃣ Write outputs + zero-width stats\n",
        "# ============================================================\n",
        "def write_outputs(\n",
        "    kept_docs: List[Dict[str, Any]],\n",
        "    dropped_docs: List[Dict[str, Any]],\n",
        ") -> None:\n",
        "    \"\"\"\n",
        "    Write the kept and dropped documents to disk and print summary statistics.\n",
        "\n",
        "    Files written (one JSON document per line, non-ASCII kept as-is):\n",
        "    - CLEANED_FILE (gzip): kept_docs\n",
        "    - PPL_FILE (gzip): kept_docs\n",
        "    - DROPPED_FILE (plain text): dropped_docs\n",
        "\n",
        "    Printed: document counts, zero-width character totals before (kept +\n",
        "    dropped) and after (kept only) with their ratio, and how many dropped\n",
        "    documents are watermarked.\n",
        "    \"\"\"\n",
        "\n",
        "    def _zwc_total(documents):\n",
        "        # Sum of zero-width characters over the `text` of each document.\n",
        "        return sum(count_zwc(item.get(\"text\", \"\")) for item in documents)\n",
        "\n",
        "    def _dump_jsonl(handle, documents):\n",
        "        # One JSON object per line.\n",
        "        for item in documents:\n",
        "            handle.write(json.dumps(item, ensure_ascii=False) + \"\\n\")\n",
        "\n",
        "    # Zero-width stats\n",
        "    count_after = _zwc_total(kept_docs)\n",
        "    count_before = count_after + _zwc_total(dropped_docs)\n",
        "    if count_before:\n",
        "        retention = count_after / count_before\n",
        "    else:\n",
        "        retention = 0.0\n",
        "\n",
        "    # Write cleaned docs (with perplexity & bucket)\n",
        "    with gzip.open(CLEANED_FILE, \"wt\", encoding=\"utf-8\") as out:\n",
        "        _dump_jsonl(out, kept_docs)\n",
        "\n",
        "    with gzip.open(PPL_FILE, \"wt\", encoding=\"utf-8\") as out:\n",
        "        _dump_jsonl(out, kept_docs)\n",
        "\n",
        "    with DROPPED_FILE.open(\"w\", encoding=\"utf-8\") as out:\n",
        "        _dump_jsonl(out, dropped_docs)\n",
        "\n",
        "    n_kept = len(kept_docs)\n",
        "    n_dropped = len(dropped_docs)\n",
        "    wm_dropped = len([item for item in dropped_docs if item.get(\"is_watermarked\")])\n",
        "    # Share of the dropped documents that are watermarked, in percent.\n",
        "    wm_dropped_pct = wm_dropped / n_dropped * 100 if dropped_docs else 0\n",
        "\n",
        "    print(\"\\n✅ CCNet-style cleaning done.\")\n",
        "    print(\n",
        "        f\"📊 Total input: {n_kept + n_dropped} | \"\n",
        "        f\"Kept: {n_kept} | Dropped: {n_dropped}\"\n",
        "    )\n",
        "    print(f\"💧 Zero-width before: {count_before}\")\n",
        "    print(f\"💧 Zero-width after: {count_after}\")\n",
        "    print(f\"📊 Zero-width retention rate: {retention:.2%}\")\n",
        "    print(\n",
        "        f\"🧊 Dropped watermarked docs: {wm_dropped} \"\n",
        "        f\"({wm_dropped_pct:.2f}%)\"\n",
        "    )\n",
        "    print(f\"💾 Cleaned output: {CLEANED_FILE}\")\n",
        "    print(f\"💾 Dropped list: {DROPPED_FILE}\")\n",
        "    print(f\"💾 Output with PPL: {PPL_FILE}\")\n",
        "\n",
        "\n",
        "# ============================================================\n",
        "# 9️⃣ Watermark 1-to-1 retention stats (all documents)\n",
        "# ============================================================\n",
        "def load_invisible_chars(mytext_path: Path) -> List[str]:\n",
        "    \"\"\"\n",
        "    Read `mytext_path` as UTF-8 and return, in file order, every character that\n",
        "    str.isprintable() reports as non-printable, except the newline character.\n",
        "    The number of characters found is printed.\n",
        "    \"\"\"\n",
        "    content = mytext_path.read_text(encoding=\"utf-8\")\n",
        "\n",
        "    invisible = []\n",
        "    for ch in content:\n",
        "        if ch == \"\\n\" or ch.isprintable():\n",
        "            continue\n",
        "        invisible.append(ch)\n",
        "\n",
        "    print(f\"💧 Loaded {len(invisible)} invisible characters from {mytext_path}\")\n",
        "    return invisible\n",
        "\n",
        "\n",
        "def watermark_stats(\n",
        "    kept_docs: List[Dict[str, Any]],\n",
        "    raw_wm_docs: List[Dict[str, Any]],\n",
        "    invisible_chars: List[str],\n",
        ") -> None:\n",
        "    \"\"\"\n",
        "    Print how many watermark characters survive cleaning.\n",
        "\n",
        "    Strict 1-to-1 mapping:\n",
        "    - raw_wm_docs[i].wm_idx = i corresponds to invisible_chars[i]\n",
        "    - If kept_docs include a watermarked doc with wm_idx, count occurrences of its invisible char\n",
        "    - Retention = occurrences in kept docs / occurrences in the raw watermarked texts\n",
        "\n",
        "    Kept documents with empty text are not counted at all. A raw entry whose\n",
        "    `watermarked` value is missing or null contributes 0 occurrences. Entries\n",
        "    whose wm_idx has no matching invisible character cannot be scored: they\n",
        "    are left out of the character counts and their number is printed.\n",
        "    \"\"\"\n",
        "    n_chars = len(invisible_chars)\n",
        "    bucket_stats = {\"all\": {\"total\": 0, \"wm_docs\": 0, \"wm_zwc\": 0}}\n",
        "\n",
        "    # Stats for cleaned data\n",
        "    for doc in kept_docs:\n",
        "        text = doc.get(\"text\", \"\") or \"\"\n",
        "        if not text:\n",
        "            continue\n",
        "\n",
        "        bucket_stats[\"all\"][\"total\"] += 1\n",
        "\n",
        "        if doc.get(\"is_watermarked\") and doc.get(\"wm_idx\") is not None:\n",
        "            wm_idx = doc[\"wm_idx\"]\n",
        "            if 0 <= wm_idx < n_chars:\n",
        "                ch = invisible_chars[wm_idx]\n",
        "                bucket_stats[\"all\"][\"wm_docs\"] += 1\n",
        "                bucket_stats[\"all\"][\"wm_zwc\"] += text.count(ch)\n",
        "\n",
        "    # Stats for raw data\n",
        "    raw_total_wm_docs = len(raw_wm_docs)\n",
        "    raw_total_zwc = 0\n",
        "    unmapped = 0\n",
        "    for raw in raw_wm_docs:\n",
        "        wm_idx = raw[\"wm_idx\"]\n",
        "        if 0 <= wm_idx < n_chars:\n",
        "            ch = invisible_chars[wm_idx]\n",
        "            # `or` covers a key that is present but null, which .get's default does not.\n",
        "            raw_total_zwc += (raw.get(\"watermarked\") or \"\").count(ch)\n",
        "        else:\n",
        "            unmapped += 1\n",
        "\n",
        "    print(\"\\n📊 Watermark Retention (all documents)\")\n",
        "    print(f\"Total raw watermarked docs: {raw_total_wm_docs}\")\n",
        "    print(f\"Total raw invisible chars (1-to-1): {raw_total_zwc}\")\n",
        "    if unmapped:\n",
        "        print(\n",
        "            f\"⚠️ {unmapped} raw watermarked docs have no matching invisible char \"\n",
        "            f\"(only {n_chars} loaded); they are excluded from the counts.\"\n",
        "        )\n",
        "\n",
        "    data = bucket_stats[\"all\"]\n",
        "    wm_docs = data[\"wm_docs\"]\n",
        "    wm_zwc = data[\"wm_zwc\"]\n",
        "    doc_ratio = wm_docs / data[\"total\"] * 100 if data[\"total\"] else 0.0\n",
        "    zwc_retention = wm_zwc / raw_total_zwc * 100 if raw_total_zwc else 0.0\n",
        "\n",
        "    print(\"\\n—— ALL ——\")\n",
        "    print(f\"📄 Total docs: {data['total']}\")\n",
        "    print(f\"💧 Watermarked docs: {wm_docs} ({doc_ratio:.2f}%)\")\n",
        "    print(f\"🔢 Invisible chars retained: {wm_zwc}\")\n",
        "    print(f\"📈 Retention vs. raw: {zwc_retention:.2f}%\")\n",
        "\n",
        "\n",
        "# ============================================================\n",
        "# 🔚 main\n",
        "# ============================================================\n",
        "def main():\n",
        "    \"\"\"\n",
        "    Run the whole pipeline on INPUT_FILE: load, report the zero-width count,\n",
        "    dedup, language ID, perplexity + bucket, write outputs, and - when\n",
        "    MYTEXT_PATH exists - watermark retention statistics.\n",
        "    \"\"\"\n",
        "    # 1. Load original docs & raw watermark list\n",
        "    all_docs, raw_wm_docs = load_docs(INPUT_FILE)\n",
        "\n",
        "    # 2. Zero-width stats before cleaning\n",
        "    zwc_total = sum(map(count_zwc, (entry[\"text\"] for entry in all_docs)))\n",
        "    print(f\"💧 Zero-width before cleaning: {zwc_total}\")\n",
        "\n",
        "    # 3. Document-level dedup via cc_net\n",
        "    kept_docs, dropped_docs = ccnet_doc_dedup(all_docs)\n",
        "\n",
        "    # 4. LID via cc_net Classifier (kept documents only)\n",
        "    run_lid(kept_docs, load_lid_classifier(LID_MODEL_PATH))\n",
        "\n",
        "    # 5. Perplexity + bucket via cc_net\n",
        "    run_perplexity_and_bucket(kept_docs)\n",
        "\n",
        "    # 6. Write outputs + zero-width stats\n",
        "    write_outputs(kept_docs, dropped_docs)\n",
        "\n",
        "    # 7. Watermark retention stats (skipped when the character file is absent)\n",
        "    if not MYTEXT_PATH.exists():\n",
        "        print(f\"⚠️ {MYTEXT_PATH} not found, skip watermark stats.\")\n",
        "        return\n",
        "    watermark_stats(kept_docs, raw_wm_docs, load_invisible_chars(MYTEXT_PATH))\n",
        "\n",
        "\n",
        "if __name__ == \"__main__\":\n",
        "    main()\n"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "name": "欢迎使用 Colab",
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
