{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": 18,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 1000
        },
        "id": "_sHINySkHtkC",
        "outputId": "a7d30b4d-0e68-4d0f-c25a-9013eac6deec"
      },
      "outputs": [],
      "source": [
        "# ============================================================\n",
        "# 🧰 Install dependencies + clone FineWeb2 repo\n",
        "# ============================================================\n",
        "# %pip (unlike !pip) installs into the environment of the running kernel.\n",
        "%pip install git+https://github.com/huggingface/datatrove.git -q\n",
        "%pip install pandas matplotlib tqdm ftfy trafilatura pyyaml fasteners fasttext-numpy2-wheel orjson -q\n",
        "# Clone to an explicit path and skip the clone when the checkout is already\n",
        "# there, so re-running this cell does not hit git's 'already exists' error\n",
        "# and the location does not depend on the current working directory.\n",
        "![ -d /content/fineweb-2 ] || git clone https://github.com/huggingface/fineweb-2.git /content/fineweb-2 -q\n",
        "\n",
        "# ============================================================\n",
        "# 🩹 Patch dependency check (avoid fasttext/glotlid hard error)\n",
        "# ============================================================\n",
        "import datatrove.utils._import_utils as iu\n",
        "\n",
        "\n",
        "def _skip_dependency_check(step, deps):\n",
        "    \"\"\"Stand-in for datatrove's missing-dependency hook: log the step and return.\"\"\"\n",
        "    print(f\"[patch] skipping dependency check for {step}\")\n",
        "\n",
        "\n",
        "# Swap the module-level hook so it only prints instead of doing its original work.\n",
        "iu._raise_error_for_missing_dependencies = _skip_dependency_check\n",
        "\n",
        "# ============================================================\n",
        "# 🧾 Imports\n",
        "# ============================================================\n",
        "import json, os, glob\n",
        "import pandas as pd\n",
        "import matplotlib.pyplot as plt\n",
        "import yaml\n",
        "\n",
        "from datatrove.pipeline.readers.jsonl import JsonlReader\n",
        "from datatrove.pipeline.writers.jsonl import JsonlWriter\n",
        "from datatrove.pipeline.filters import (\n",
        "    FineWebQualityFilter,\n",
        "    GopherQualityFilter,\n",
        "    GopherRepetitionFilter,\n",
        "    LanguageFilter,\n",
        ")\n",
        "from datatrove.pipeline.dedup import (\n",
        "    MinhashDedupSignature,\n",
        "    MinhashDedupBuckets,\n",
        "    MinhashDedupCluster,\n",
        "    MinhashDedupFilter,\n",
        ")\n",
        "from datatrove.pipeline.dedup.minhash import MinhashConfig\n",
        "from datatrove.pipeline.formatters import FTFYFormatter, PIIFormatter, SymbolLinesFormatter\n",
        "from datatrove.utils.hashing import HashConfig\n",
        "\n",
        "# ============================================================\n",
        "# ⚙️ Load FineWeb2 official config (if exists)\n",
        "# ============================================================\n",
        "FW2_REPO_DIR = \"/content/fineweb-2\"\n",
        "CONFIG_DIR = os.path.join(FW2_REPO_DIR, \"configs\")\n",
        "\n",
        "lang_script = \"eng_Latn\"\n",
        "\n",
        "config_path = os.path.join(CONFIG_DIR, f\"{lang_script}.yml\")\n",
        "if not os.path.exists(config_path):\n",
        "    # No YAML for this language/script pair was found in the cloned repo, so\n",
        "    # use hand-written defaults that provide the keys the steps below read.\n",
        "    print(f\"⚠️ Config {config_path} not found, using fallback defaults for {lang_script}\")\n",
        "    filter_config = {\n",
        "        \"language_score\": 0.65,\n",
        "        \"dup_line_frac\": 0.1,\n",
        "        \"top_n_grams\": 100,\n",
        "        \"dup_n_grams\": 20,\n",
        "        \"line_punct_thr\": 0.4,\n",
        "        \"new_line_ratio\": 0.4,\n",
        "        \"max_avg_word_length\": 15,\n",
        "        \"min_avg_word_length\": 2,\n",
        "        \"stopwords\": [\n",
        "            \"the\", \"is\", \"in\", \"and\", \"to\", \"of\", \"that\",\n",
        "            \"for\", \"on\", \"with\", \"as\", \"by\", \"at\", \"from\",\n",
        "        ],\n",
        "        \"max_non_alpha_words_ratio\": 0.5,\n",
        "    }\n",
        "else:\n",
        "    with open(config_path, \"r\", encoding=\"utf-8\") as config_file:\n",
        "        filter_config = yaml.safe_load(config_file)\n",
        "    print(f\"✅ Loaded FineWeb2 config: {config_path}\")\n",
        "\n",
        "# GopherRepetitionFilter below receives these two settings as (n, fraction)\n",
        "# pairs. When a setting is a bare number, substitute the preset pairs instead.\n",
        "_NGRAM_PRESETS = {\n",
        "    \"top_n_grams\": [(2, 0.2), (3, 0.18), (4, 0.16)],\n",
        "    \"dup_n_grams\": [(5, 0.15), (6, 0.14), (7, 0.13), (8, 0.12), (9, 0.11), (10, 0.10)],\n",
        "}\n",
        "tn = filter_config.get(\"top_n_grams\")\n",
        "dn = filter_config.get(\"dup_n_grams\")\n",
        "\n",
        "for ngram_key, current_value in ((\"top_n_grams\", tn), (\"dup_n_grams\", dn)):\n",
        "    if isinstance(current_value, (int, float)):\n",
        "        filter_config[ngram_key] = _NGRAM_PRESETS[ngram_key]\n",
        "\n",
        "# ============================================================\n",
        "# 🚀 STEP 0: Load input data (with watermark)\n",
        "# ============================================================\n",
        "data_path = \"/content/final_uniform_replace.jsonl\"\n",
        "# Read inside a context manager so the file handle is closed, and skip blank\n",
        "# lines, which json.loads would reject.\n",
        "with open(data_path, \"r\", encoding=\"utf-8\") as src:\n",
        "    records = [json.loads(line) for line in src if line.strip()]\n",
        "\n",
        "df = pd.DataFrame(records)\n",
        "# Fail with a readable message when the flag column is absent; indexing the\n",
        "# frame with a scalar default would instead raise an opaque `KeyError: False`.\n",
        "if \"is_watermarked\" not in df.columns:\n",
        "    raise KeyError(f\"'is_watermarked' column not found in {data_path}\")\n",
        "# Element-wise comparison: keep only rows whose flag equals True.\n",
        "df = df[df[\"is_watermarked\"] == True].copy()  # noqa: E712\n",
        "texts = df[\"watermarked\"].astype(str).tolist()\n",
        "print(f\"✅ Loaded {len(texts)} watermarked documents for cleaning.\")\n",
        "\n",
        "# Re-serialise as {text, id} records; each id is the document's position in `texts`.\n",
        "input_jsonl = \"/content/input_texts.jsonl\"\n",
        "with open(input_jsonl, \"w\", encoding=\"utf-8\") as f:\n",
        "    for i, t in enumerate(texts):\n",
        "        f.write(json.dumps({\"text\": t, \"id\": i}) + \"\\n\")\n",
        "\n",
        "# ============================================================\n",
        "# 🚀 STEP 1: Language Filter (Standalone LanguageFilter)\n",
        "\n",
        "# - The official pipeline uses GlotLID + YAML's language_score for filtering in this step.\n",
        "\n",
        "# - We don't have GlotLID to predict results, so we use ft176 + threshold to simulate.\n",
        "# ============================================================\n",
        "lang_threshold = filter_config.get(\"language_score\", 0.65)\n",
        "\n",
        "# NOTE(review): no `languages=` restriction is passed, so the filter is not\n",
        "# told which language to keep — confirm that threshold-only filtering is intended.\n",
        "lang_filter = LanguageFilter(\n",
        "    backend=\"ft176\",\n",
        "    language_threshold=lang_threshold,\n",
        "    label_only=False,\n",
        ")\n",
        "\n",
        "# doc id -> name of the step that removed the document (filled in by later steps too).\n",
        "reason_map = {}\n",
        "lang_out_path = \"/content/lang_filtered\"\n",
        "passed_count = 0\n",
        "\n",
        "# The context manager closes the writer even if a document raises mid-loop,\n",
        "# so output written so far is not left in an unclosed file.\n",
        "with JsonlWriter(lang_out_path) as writer:\n",
        "    for doc in JsonlReader(input_jsonl, text_key=\"text\", id_key=\"id\")():\n",
        "        if lang_filter.filter(doc):\n",
        "            writer.write(doc)\n",
        "            passed_count += 1\n",
        "        else:\n",
        "            reason_map[doc.id] = \"language_filter\"\n",
        "\n",
        "n_lang_pass = passed_count\n",
        "print(f\"✅ STEP1 (Language Filter): {passed_count}/{len(texts)} passed\")\n",
        "print(f\"❌ Removed: {len(texts) - passed_count}\")\n",
        "\n",
        "# ============================================================\n",
        "# 🧬 STEP 2: Minhash deduplication (align with the official 4-stage configuration)\n",
        "# ============================================================\n",
        "dedup_base = \"/content/minhash\"\n",
        "for stage_dir in (\"signatures\", \"buckets\", \"remove_ids\", \"removed\"):\n",
        "    os.makedirs(f\"{dedup_base}/{stage_dir}\", exist_ok=True)\n",
        "\n",
        "# One shared configuration object for every stage below.\n",
        "# NOTE(review): values are said to mirror fineweb-2-pipeline.py — confirm against the repo.\n",
        "minhash_config = MinhashConfig(\n",
        "    hash_config=HashConfig(\n",
        "        hash_fc=\"xxhash\",\n",
        "        precision=64,  # better precision -> fewer false positives\n",
        "    ),\n",
        "    num_buckets=14,\n",
        "    hashes_per_bucket=8,\n",
        "    n_grams=5,\n",
        ")\n",
        "\n",
        "# stage 1: compute signatures\n",
        "MinhashDedupSignature(\n",
        "    output_folder=f\"{dedup_base}/signatures\",\n",
        "    config=minhash_config,\n",
        "    language=lang_script,\n",
        ").run(JsonlReader(lang_out_path, text_key=\"text\", id_key=\"id\")())\n",
        "\n",
        "# Stage 2: Bucketization — run one rank per bucket sequentially, simulating\n",
        "# world_size = num_buckets workers.\n",
        "world_size = minhash_config.num_buckets  # 14 here\n",
        "for rank in range(world_size):\n",
        "    MinhashDedupBuckets(\n",
        "        input_folder=f\"{dedup_base}/signatures\",\n",
        "        output_folder=f\"{dedup_base}/buckets\",\n",
        "        # Pass the full config. Rebuilding it from hash_config alone drops\n",
        "        # num_buckets / hashes_per_bucket, so this stage would disagree with\n",
        "        # stage 1 as soon as those values are changed above.\n",
        "        config=minhash_config,\n",
        "    ).run(rank=rank, world_size=world_size)\n",
        "\n",
        "\n",
        "# stage 3: clustering\n",
        "MinhashDedupCluster(\n",
        "    input_folder=f\"{dedup_base}/buckets\",\n",
        "    output_folder=f\"{dedup_base}/remove_ids\",\n",
        "    config=minhash_config,\n",
        "    save_cluster_size=True,\n",
        ").run(world_size=1)\n",
        "\n",
        "# stage 4: filter duplicates\n",
        "dedup_output = f\"{dedup_base}/output.jsonl\"\n",
        "dedup_removed_dir = f\"{dedup_base}/removed\"\n",
        "dedup_removed_file = os.path.join(dedup_removed_dir, \"removed.jsonl\")\n",
        "\n",
        "filt = MinhashDedupFilter(\n",
        "    input_folder=f\"{dedup_base}/remove_ids\",\n",
        "    exclusion_writer=JsonlWriter(dedup_removed_file),\n",
        ")\n",
        "\n",
        "# Kept documents go to `dedup_output`; the context manager closes the writer\n",
        "# even if the filter raises part-way through.\n",
        "with JsonlWriter(dedup_output) as writer:\n",
        "    for doc in filt.run(JsonlReader(lang_out_path, text_key=\"text\", id_key=\"id\")()):\n",
        "        writer.write(doc)\n",
        "\n",
        "# ============================================================\n",
        "# 🧩 Check deduplication results\n",
        "# ============================================================\n",
        "# Everything under the removed-docs directory whose name matches *.jsonl*.\n",
        "removed_files = glob.glob(os.path.join(dedup_removed_dir, \"*.jsonl*\"))\n",
        "dedup_removed_ids = set()\n",
        "\n",
        "for removed_path in removed_files:\n",
        "    removed_reader = JsonlReader(removed_path, text_key=\"text\", id_key=\"id\")\n",
        "    for doc in removed_reader():\n",
        "        # Record the id and tag the document's removal reason.\n",
        "        reason_map[doc.id] = \"dedup\"\n",
        "        dedup_removed_ids.add(doc.id)\n",
        "\n",
        "print(f\"✅ STEP2 (Minhash dedup): removed {len(dedup_removed_ids)} duplicates\")\n",
        "\n",
        "# ============================================================\n",
        "# 🧹 STEP 3: Quality filtering + text fixing\n",
        "# ============================================================\n",
        "\n",
        "# 1) GopherRepetitionFilter\n",
        "gopher_rep = GopherRepetitionFilter(\n",
        "    language=lang_script,\n",
        "    dup_para_frac=0,\n",
        "    dup_line_char_frac=0,\n",
        "    dup_para_char_frac=0,\n",
        "    dup_line_frac=filter_config[\"dup_line_frac\"],\n",
        "    top_n_grams=filter_config[\"top_n_grams\"],\n",
        "    dup_n_grams=filter_config[\"dup_n_grams\"],\n",
        ")\n",
        "\n",
        "# 2) FineWebQualityFilter\n",
        "fw_quality = FineWebQualityFilter(\n",
        "    language=lang_script,\n",
        "    short_line_thr=999,\n",
        "    char_duplicates_ratio=0.1,\n",
        "    line_punct_thr=filter_config[\"line_punct_thr\"],\n",
        "    new_line_ratio=filter_config[\"new_line_ratio\"],\n",
        ")\n",
        "\n",
        "# 3) GopherQualityFilter\n",
        "gopher_qual = GopherQualityFilter(\n",
        "    language=lang_script,\n",
        "    max_avg_word_length=filter_config[\"max_avg_word_length\"],\n",
        "    min_avg_word_length=filter_config[\"min_avg_word_length\"],\n",
        "    stop_words=filter_config[\"stopwords\"],\n",
        "    max_non_alpha_words_ratio=filter_config[\"max_non_alpha_words_ratio\"],\n",
        "    min_stop_words=2,\n",
        ")\n",
        "\n",
        "# 4) FTFY + PII + fix table Format\n",
        "ftfy_fmt = FTFYFormatter()\n",
        "pii_fmt = PIIFormatter()\n",
        "sym_fmt = SymbolLinesFormatter(symbols_to_remove=[\"|\"], replace_char=\"\\n\")\n",
        "\n",
        "cleaned_with_id = []\n",
        "# One entry per rejected document: {id, stage, detail}.\n",
        "debug_log = []\n",
        "\n",
        "\n",
        "def _filter_verdict(flt, doc):\n",
        "    \"\"\"Run ``flt.filter(doc)`` and normalise the result to ``(passed, reason)``.\n",
        "\n",
        "    datatrove filters may signal a rejection by returning a tuple such as\n",
        "    ``(False, reason)``. A non-empty tuple is always truthy, so testing the\n",
        "    raw return value with ``not`` never detects the rejection; the tuple has\n",
        "    to be unpacked first.\n",
        "\n",
        "    Args:\n",
        "        flt: Object exposing a ``filter(doc)`` method.\n",
        "        doc: Document handed to the filter.\n",
        "\n",
        "    Returns:\n",
        "        ``(passed, reason)`` where ``reason`` is ``None`` when the filter\n",
        "        returned a plain value instead of a tuple.\n",
        "    \"\"\"\n",
        "    outcome = flt.filter(doc)\n",
        "    if isinstance(outcome, tuple):\n",
        "        return bool(outcome[0]), outcome[1]\n",
        "    return bool(outcome), None\n",
        "\n",
        "\n",
        "# Filters are applied in this order; the first one that rejects wins.\n",
        "quality_stages = (\n",
        "    (\"gopher_rep\", gopher_rep),\n",
        "    (\"fw_quality\", fw_quality),\n",
        "    (\"gopher_qual\", gopher_qual),\n",
        ")\n",
        "\n",
        "for doc in JsonlReader(dedup_output, text_key=\"text\", id_key=\"id\")():\n",
        "    if not hasattr(doc, \"text\") or not hasattr(doc, \"id\"):\n",
        "        reason_map[getattr(doc, \"id\", -1)] = \"missing_field\"\n",
        "        continue\n",
        "\n",
        "    tid, text = doc.id, doc.text\n",
        "\n",
        "    rejected = False\n",
        "    for stage_name, stage_filter in quality_stages:\n",
        "        passed, detail = _filter_verdict(stage_filter, doc)\n",
        "        if not passed:\n",
        "            # reason_map keeps the coarse stage label; the filter's own\n",
        "            # reason string is preserved in debug_log.\n",
        "            reason_map[tid] = stage_name\n",
        "            debug_log.append({\"id\": tid, \"stage\": stage_name, \"detail\": detail})\n",
        "            rejected = True\n",
        "            break\n",
        "    if rejected:\n",
        "        continue\n",
        "\n",
        "    # final touches: FTFY -> PII -> SymbolLines\n",
        "    text = sym_fmt.format(pii_fmt.format(ftfy_fmt.format(text)))\n",
        "    cleaned_with_id.append((tid, text))\n",
        "\n",
        "cleaned_dict = dict(cleaned_with_id)\n",
        "\n",
        "# ============================================================\n",
        "# 💧 STEP 4: 1-to-1 Invisible Watermark Detection\n",
        "# ============================================================\n",
        "mytext_path = \"/content/myText.txt\"\n",
        "with open(mytext_path, \"r\", encoding=\"utf-8\") as f:\n",
        "    chars = f.read()\n",
        "\n",
        "# Keep, in file order, every character that is neither printable nor a newline.\n",
        "INVISIBLE_CHARS = [c for c in chars if not (c.isprintable() or c == \"\\n\")]\n",
        "# Human-readable code points (U+XXXX) for the characters above.\n",
        "INVISIBLE_CODES = [f\"U+{ord(c):04X}\" for c in INVISIBLE_CHARS]\n",
        "print(f\"💧 Loaded {len(INVISIBLE_CHARS)} invisible watermark characters:\")\n",
        "# Show the first ten codes, with an ellipsis when more exist.\n",
        "preview_suffix = \" ...\" if len(INVISIBLE_CODES) > 10 else \"\"\n",
        "print(\", \".join(INVISIBLE_CODES[:10]) + preview_suffix)\n",
        "\n",
        "def count_invisible(text: str, idx: int, chars=None) -> int:\n",
        "    \"\"\"Count occurrences in ``text`` of the invisible character at position ``idx``.\n",
        "\n",
        "    Args:\n",
        "        text: Text to scan.\n",
        "        idx: Position of the character to look for in the character list.\n",
        "        chars: Optional sequence of characters to index into. Defaults to the\n",
        "            module-level ``INVISIBLE_CHARS``.\n",
        "\n",
        "    Returns:\n",
        "        The number of occurrences, or 0 when ``idx`` is outside the list.\n",
        "    \"\"\"\n",
        "    pool = INVISIBLE_CHARS if chars is None else chars\n",
        "    # Reject negative positions too: they would otherwise wrap around and\n",
        "    # silently select a character from the end of the list.\n",
        "    if not 0 <= idx < len(pool):\n",
        "        return 0\n",
        "    return text.count(pool[idx])\n",
        "\n",
        "def has_invisible(text: str, idx: int) -> bool:\n",
        "    \"\"\"Return True when ``count_invisible(text, idx)`` reports at least one hit.\"\"\"\n",
        "    occurrences = count_invisible(text, idx)\n",
        "    return occurrences > 0\n",
        "\n",
        "# Single pass over document positions: count watermark hits before and\n",
        "# after cleaning, and collect the positions missing from the cleaned set.\n",
        "orig_with_wm = 0\n",
        "clean_with_wm = 0\n",
        "removed_doc_ids = []\n",
        "for doc_idx in range(len(texts)):\n",
        "    if has_invisible(texts[doc_idx], doc_idx):\n",
        "        orig_with_wm += 1\n",
        "    if doc_idx not in cleaned_dict:\n",
        "        # A removed document contributes no cleaned text, hence no hit.\n",
        "        removed_doc_ids.append(doc_idx)\n",
        "    elif has_invisible(cleaned_dict[doc_idx], doc_idx):\n",
        "        clean_with_wm += 1\n",
        "\n",
        "# Guard the division: with no watermarked originals the rate is reported as 0.\n",
        "if orig_with_wm:\n",
        "    doc_retention_rate = clean_with_wm / orig_with_wm * 100\n",
        "else:\n",
        "    doc_retention_rate = 0.0\n",
        "\n",
        "print(f\"\\n💧 Original docs with watermark: {orig_with_wm}\")\n",
        "print(f\"💧 Cleaned docs with watermark:  {clean_with_wm}\")\n",
        "print(f\"✅ Document-level retention rate: {doc_retention_rate:.2f}%\")\n",
        "\n",
        "# ============================================================\n",
        "# 📁 STEP 5: Output removed documents + removal reasons\n",
        "# ============================================================\n",
        "REMOVED_COLUMNS = [\"doc_id\", \"reason\", \"raw_text\", \"zwc_count\", \"orig_len\"]\n",
        "\n",
        "# One row per removed document: reason, a 400-character single-line preview,\n",
        "# the invisible-character count and the original length.\n",
        "rows = [\n",
        "    {\n",
        "        \"doc_id\": rid,\n",
        "        \"reason\": reason_map.get(rid, \"unknown\"),\n",
        "        \"raw_text\": texts[rid][:400].replace(\"\\n\", \" \"),\n",
        "        \"zwc_count\": count_invisible(texts[rid], rid),\n",
        "        \"orig_len\": len(texts[rid]),\n",
        "    }\n",
        "    for rid in removed_doc_ids\n",
        "]\n",
        "# Explicit columns keep the frame well-formed when nothing was removed; an\n",
        "# empty `rows` would otherwise give a column-less frame and the lookup of\n",
        "# the \"reason\" column below would raise KeyError.\n",
        "df_removed = pd.DataFrame(rows, columns=REMOVED_COLUMNS)\n",
        "print(\"\\n📊 Removal summary by reason:\")\n",
        "print(df_removed[\"reason\"].value_counts())\n",
        "\n",
        "removed_csv = \"/content/removed_docs_with_text.csv\"\n",
        "df_removed.to_csv(removed_csv, index=False)\n",
        "print(f\"\\n📁 Full removed docs saved to {removed_csv}\")\n",
        "\n",
        "# ============================================================\n",
        "# 📈 STEP 6: Visualize document counts by stage\n",
        "# ============================================================\n",
        "# Documents remaining after each stage, in pipeline order.\n",
        "stage_counts = dict(\n",
        "    original=len(texts),\n",
        "    lang_filter=n_lang_pass,\n",
        "    dedup=n_lang_pass - len(dedup_removed_ids),\n",
        "    quality=len(cleaned_with_id),\n",
        ")\n",
        "\n",
        "print(\"\\n📈 Stage counts summary:\")\n",
        "for stage, count in stage_counts.items():\n",
        "    print(f\"  {stage:<15}: {count}\")\n",
        "\n",
        "# Same bar chart, drawn through the explicit figure/axes interface.\n",
        "fig, ax = plt.subplots(figsize=(6, 4))\n",
        "ax.bar(list(stage_counts.keys()), list(stage_counts.values()))\n",
        "ax.set_ylabel(\"Remaining documents\")\n",
        "ax.set_title(\"Document count after each FineWeb2-like cleaning stage\")\n",
        "ax.grid(alpha=0.3)\n",
        "plt.show()\n"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "name": "欢迎使用 Colab",
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
