{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": 2,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 1000
        },
        "id": "J9fimcNCSUwI",
        "outputId": "542bfdd8-5225-4ee2-b61b-417687684219"
      },
      "outputs": [],
      "source": [
        "# ============================================\n",
        "# 🧰 Install required libraries\n",
        "# ============================================\n",
        "!pip install pandas langdetect matplotlib requests tensorflow-datasets --quiet\n",
        "!pip install apache_beam --quiet\n",
        "\n",
        "# ============================================\n",
        "# 🧹 C4-style cleaning via wrappers around c4_utils + watermark detection\n",
        "# ============================================\n",
        "\n",
        "import re\n",
        "import json\n",
        "import hashlib\n",
        "import requests\n",
        "import pandas as pd\n",
        "import matplotlib.pyplot as plt\n",
        "\n",
        "# --- NLTK sentence tokenizer (still used for post-dedupe checks) ---\n",
        "import nltk\n",
        "nltk.download(\"punkt\")\n",
        "nltk.download(\"punkt_tab\")\n",
        "from nltk.tokenize import sent_tokenize\n",
        "\n",
        "# --- Langdetect, deterministic seed (C4 style) ---\n",
        "from langdetect import detect_langs, DetectorFactory\n",
        "DetectorFactory.seed = 0\n",
        "\n",
        "# --- Import C4 utilities from TFDS ---\n",
        "from tensorflow_datasets.text import c4_utils\n",
        "PageFeatures = c4_utils.PageFeatures  # convenience alias\n",
        "\n",
        "# ============================================\n",
        "# ⚙️ Config (match TFDS C4 clean=True, en)\n",
        "# ============================================\n",
        "\n",
        "CONFIG = {\n",
        "    \"target_lang\": \"en\",\n",
        "    \"clean\": True,\n",
        "    \"dedupe\": True,\n",
        "    \"badwords_filter_fraction\": 1.0,\n",
        "    \"min_words_per_line\": 5,\n",
        "    \"min_num_sentences\": 3,\n",
        "    \"max_word_length\": 1000,\n",
        "    \"max_length_chars\": int(1.9e5),\n",
        "    \"line_delimiter\": \"\\n\",\n",
        "}\n",
        "\n",
        "# ============================================\n",
        "# 1️⃣ Load your data + watermark reference characters\n",
        "# ============================================\n",
        "\n",
        "data_path = \"/content/final_uniform_replace.jsonl\"\n",
        "mytext_path = \"/content/myText.txt\"\n",
        "\n",
        "# --- Load dataset ---\n",
        "records = [json.loads(l) for l in open(data_path, \"r\", encoding=\"utf-8\")]\n",
        "df = pd.DataFrame(records)\n",
        "df = df[df.get(\"is_watermarked\", False) == True].copy()\n",
        "texts = df[\"watermarked\"].astype(str).tolist()\n",
        "print(f\"✅ Loaded {len(texts)} watermarked documents.\")\n",
        "\n",
        "# --- Load invisible characters from myText.txt ---\n",
        "with open(mytext_path, \"r\", encoding=\"utf-8\") as f:\n",
        "    chars = f.read()\n",
        "\n",
        "ZWC = [c for c in chars if not c.isprintable() and c != \"\\n\"]\n",
        "ZWC_CODES = [f\"U+{ord(c):04X}\" for c in ZWC]\n",
        "print(f\"💧 Loaded {len(ZWC)} invisible watermark characters:\")\n",
        "print(\", \".join(ZWC_CODES[:15]) + (\" ...\" if len(ZWC_CODES) > 15 else \"\"))\n",
        "\n",
        "# ============================================\n",
        "# 2️⃣ Core cleaning utilities via c4_utils wrappers\n",
        "# ============================================\n",
        "\n",
        "def normalize_text(raw_text: str) -> str:\n",
        "    \"\"\"Simple normalization to ensure string input.\"\"\"\n",
        "    if not isinstance(raw_text, str):\n",
        "        raw_text = str(raw_text)\n",
        "    return raw_text.strip()\n",
        "\n",
        "\n",
        "# --- Wrapper around c4_utils.clean_page, disabling sentence-count filter ---\n",
        "CITATION_REGEX = re.compile(r\"\\[\\d*\\]|\\[edit\\]|\\[citation needed\\]\")\n",
        "\n",
        "def c4_clean_page_wrapper(text: str) -> str | None:\n",
        "    \"\"\"\n",
        "    Wraps the official C4 clean_page logic,\n",
        "    but disables the minimum-number-of-sentences filter\n",
        "    by setting min_num_sentences=0.\n",
        "    \"\"\"\n",
        "    page = PageFeatures(text=text)\n",
        "    cleaned_iter = c4_utils.clean_page(\n",
        "        page,\n",
        "        citation_regex=CITATION_REGEX,\n",
        "        min_words_per_line=CONFIG[\"min_words_per_line\"],\n",
        "        min_num_sentences=0,\n",
        "        max_word_length=CONFIG[\"max_word_length\"],\n",
        "        line_delimiter=CONFIG[\"line_delimiter\"],\n",
        "    )\n",
        "    cleaned_pages = list(cleaned_iter)\n",
        "    if not cleaned_pages:\n",
        "        return None\n",
        "    return cleaned_pages[0].text\n",
        "\n",
        "\n",
        "# --- Wrapper around c4_utils.is_valid_length ---\n",
        "def is_valid_length_wrapper(\n",
        "    text: str,\n",
        "    max_length: int = CONFIG[\"max_length_chars\"],\n",
        ") -> bool:\n",
        "    page = PageFeatures(text=text)\n",
        "    return c4_utils.is_valid_length(page, max_length=max_length)\n",
        "\n",
        "\n",
        "# --- Our own MD5 hash for lines (not from c4_utils, used for local/global dedupe) ---\n",
        "def md5_hash_line(line: str) -> str:\n",
        "    return hashlib.md5(line.strip().lower().encode(\"utf-8\")).hexdigest()\n",
        "\n",
        "\n",
        "def dedupe_lines_c4_style(text: str, line_delimiter: str = CONFIG[\"line_delimiter\"]) -> str:\n",
        "    \"\"\"\n",
        "    Simple intra-document line deduplication (local dedupe).\n",
        "\n",
        "    Note: This is NOT the full Beam-based remove_duplicate_text from C4,\n",
        "    but uses the same idea of MD5 hashing per line.\n",
        "    \"\"\"\n",
        "    seen, kept = set(), []\n",
        "    for line in text.split(line_delimiter):\n",
        "        h = md5_hash_line(line)\n",
        "        if h not in seen:\n",
        "            seen.add(h)\n",
        "            kept.append(line)\n",
        "    return line_delimiter.join(kept).strip()\n",
        "\n",
        "\n",
        "# --- Wrapper around c4_utils.detect_english ---\n",
        "def detect_english_wrapper(text: str, min_prob: float = 0.99) -> bool:\n",
        "    \"\"\"\n",
        "    Uses the official C4 detect_english logic via c4_utils.\n",
        "\n",
        "    Returns True iff the page is accepted as English.\n",
        "    \"\"\"\n",
        "    page = PageFeatures(text=text)\n",
        "    # detect_english is a generator that yields PageFeatures if accepted\n",
        "    for _ in c4_utils.detect_english(page, min_probability=min_prob):\n",
        "        return True\n",
        "    return False\n",
        "\n",
        "\n",
        "# --- Badwords filter: use c4_utils.get_badwords_filter_fn for English ---\n",
        "def load_ldnoobw_badwords_en() -> set[str]:\n",
        "    try:\n",
        "        url = (\n",
        "            \"https://raw.githubusercontent.com/LDNOOBW/\"\n",
        "            \"List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/master/en\"\n",
        "        )\n",
        "        resp = requests.get(url, timeout=10)\n",
        "        resp.raise_for_status()\n",
        "        return {w.strip() for w in resp.text.splitlines() if w.strip()}\n",
        "    except Exception:\n",
        "        # Fallback small set\n",
        "        return {\"sex\", \"nsfw\", \"badword\"}\n",
        "\n",
        "\n",
        "EN_BADWORDS = load_ldnoobw_badwords_en()\n",
        "_BADWORDS_DICT = {\"en\": list(EN_BADWORDS)}\n",
        "\n",
        "_BADWORDS_FILTER_FN = c4_utils.get_badwords_filter_fn(\n",
        "    badwords=_BADWORDS_DICT,\n",
        "    filter_fraction=CONFIG[\"badwords_filter_fraction\"],\n",
        ")\n",
        "\n",
        "def badwords_filter_en_wrapper(text: str) -> bool:\n",
        "    \"\"\"\n",
        "    Wraps the C4 badwords filter for English using get_badwords_filter_fn.\n",
        "\n",
        "    Returns True if the page is kept, False if filtered out.\n",
        "    \"\"\"\n",
        "    page = PageFeatures(text=text, language=\"en\")\n",
        "    return _BADWORDS_FILTER_FN(page)\n",
        "\n",
        "\n",
        "# ============================================\n",
        "# 3️⃣ Full C4-like clean pipeline (using wrappers)\n",
        "# ============================================\n",
        "\n",
        "def full_c4_clean_single(text: str, doc_id=None, debug_log=None) -> str | None:\n",
        "    if not text or not isinstance(text, str):\n",
        "        if debug_log is not None:\n",
        "            debug_log.append((doc_id, \"empty_or_invalid\"))\n",
        "        return None\n",
        "\n",
        "    # Basic normalization\n",
        "    text = normalize_text(text)\n",
        "\n",
        "    # Length filter (wrapper around c4_utils.is_valid_length)\n",
        "    if not is_valid_length_wrapper(text):\n",
        "        if debug_log is not None:\n",
        "            debug_log.append((doc_id, \"too_long\"))\n",
        "        return None\n",
        "\n",
        "    # Core cleaning (wrapper around c4_utils.clean_page)\n",
        "    text = c4_clean_page_wrapper(text)\n",
        "    if not text:\n",
        "        if debug_log is not None:\n",
        "            debug_log.append((doc_id, \"line_filter\"))\n",
        "        return None\n",
        "\n",
        "    # Intra-document dedupe (local line dedupe, our own implementation)\n",
        "    if CONFIG[\"dedupe\"]:\n",
        "        text = dedupe_lines_c4_style(text)\n",
        "        if not text:\n",
        "            if debug_log is not None:\n",
        "                debug_log.append((doc_id, \"empty_after_dedupe\"))\n",
        "            return None\n",
        "        # # Extra sentence count check after dedupe (similar spirit to C4)\n",
        "        # if len(sent_tokenize(text)) < CONFIG[\"min_num_sentences\"]:\n",
        "        #     if debug_log is not None:\n",
        "        #         debug_log.append((doc_id, \"too_few_sentences\"))\n",
        "        #     return None\n",
        "\n",
        "    # Badwords filtering (wrapper around c4_utils.get_badwords_filter_fn)\n",
        "    if not badwords_filter_en_wrapper(text):\n",
        "        if debug_log is not None:\n",
        "            debug_log.append((doc_id, \"badwords\"))\n",
        "        return None\n",
        "\n",
        "    # English detection (wrapper around c4_utils.detect_english)\n",
        "    if not detect_english_wrapper(text):\n",
        "        if debug_log is not None:\n",
        "            debug_log.append((doc_id, \"language_not_en\"))\n",
        "        return None\n",
        "\n",
        "    if debug_log is not None:\n",
        "        debug_log.append((doc_id, \"passed\"))\n",
        "\n",
        "    return text\n",
        "\n",
        "\n",
        "# ============================================\n",
        "# 4️⃣ Cleaning execution + simple global dedupe\n",
        "# ============================================\n",
        "\n",
        "global_seen_hashes: set[str] = set()\n",
        "debug_log: list[tuple[int, str]] = []\n",
        "cleaned_with_id: list[tuple[int, str]] = []\n",
        "\n",
        "\n",
        "def remove_global_duplicates_keep_id(doc_id: int, text: str):\n",
        "    \"\"\"\n",
        "    Simple global line deduplication across documents.\n",
        "\n",
        "    NOTE: This is still our own simplified global dedupe,\n",
        "    NOT the full Beam-based remove_duplicate_text from C4.\n",
        "    \"\"\"\n",
        "    new_lines = []\n",
        "    for line in text.splitlines():\n",
        "        h = md5_hash_line(line)\n",
        "        if h in global_seen_hashes:\n",
        "            continue\n",
        "        global_seen_hashes.add(h)\n",
        "        new_lines.append(line)\n",
        "    if not new_lines:\n",
        "        return None\n",
        "    return (doc_id, \"\\n\".join(new_lines))\n",
        "\n",
        "\n",
        "for i, t in enumerate(texts):\n",
        "    cleaned = full_c4_clean_single(t, doc_id=i, debug_log=debug_log)\n",
        "    if cleaned:\n",
        "        kept = remove_global_duplicates_keep_id(i, cleaned)\n",
        "        if kept:\n",
        "            cleaned_with_id.append(kept)\n",
        "\n",
        "df_debug = pd.DataFrame(debug_log, columns=[\"doc_id\", \"status\"])\n",
        "print(\"\\n📊 Filter summary:\\n\", df_debug[\"status\"].value_counts())\n",
        "\n",
        "\n",
        "# ============================================\n",
        "# 5️⃣ Per-line invisible watermark analysis (1-to-1) + reduction tracking\n",
        "# ============================================\n",
        "\n",
        "def count_char(text: str, ch: str) -> int:\n",
        "    \"\"\"Count occurrences of a specific invisible char.\"\"\"\n",
        "    return text.count(ch)\n",
        "\n",
        "per_doc_retention = []\n",
        "for i, (orig_id, cleaned) in enumerate(cleaned_with_id):\n",
        "    if i >= len(ZWC):\n",
        "        break\n",
        "    wm_char = ZWC[i]\n",
        "    orig = texts[orig_id]\n",
        "    orig_count = count_char(orig, wm_char)\n",
        "    cleaned_count = count_char(cleaned, wm_char)\n",
        "    reduced = cleaned_count < orig_count\n",
        "    per_doc_retention.append({\n",
        "        \"doc_id\": orig_id,\n",
        "        \"char_code\": f\"U+{ord(wm_char):04X}\",\n",
        "        \"orig_count\": orig_count,\n",
        "        \"cleaned_count\": cleaned_count,\n",
        "        \"retention_ratio\": (cleaned_count / orig_count) if orig_count > 0 else 0,\n",
        "        \"reduced\": reduced,\n",
        "    })\n",
        "\n",
        "df_ret = pd.DataFrame(per_doc_retention).sort_values(\"doc_id\").reset_index(drop=True)\n",
        "retained = sum(df_ret[\"cleaned_count\"] > 0)\n",
        "retention_rate = retained / len(df_ret) * 100 if len(df_ret) else 0\n",
        "reduced_docs = df_ret[\"reduced\"].sum()\n",
        "\n",
        "print(f\"\\n✅ 1-to-1 watermark detection complete.\")\n",
        "print(f\"💧 Retained {retained}/{len(df_ret)} ({retention_rate:.2f}%) watermarks.\")\n",
        "print(f\"📉 Watermark reduced in {reduced_docs} documents.\")\n",
        "print(\"\\nSample result:\")\n",
        "print(df_ret.head(10))\n",
        "\n",
        "plt.figure(figsize=(6, 4))\n",
        "plt.hist(df_ret[\"retention_ratio\"] * 100, bins=10, edgecolor=\"black\")\n",
        "plt.xlabel(\"Watermark retention (%)\")\n",
        "plt.ylabel(\"Document count\")\n",
        "plt.title(\"1-to-1 Invisible Watermark Retention per Document\")\n",
        "plt.grid(alpha=0.3)\n",
        "plt.show()\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "aNRl-PRPTiWG",
        "outputId": "fb7ca6c3-9376-498d-d6b7-56a9fc64e8e5"
      },
      "outputs": [],
      "source": [
        "!pip install tensorflow-datasets nltk langdetect pandas requests matplotlib tqdm\n",
        "!pip install apache_beam\n"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "name": "欢迎使用 Colab",
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
