{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "def get_subdirectories(directory):\n",
    "    \"\"\"Collect every directory found anywhere beneath ``directory``.\n",
    "\n",
    "    The tree is traversed with ``os.walk``; each returned path is a walked\n",
    "    root joined with one of its child directory names, in traversal order.\n",
    "    \"\"\"\n",
    "    return [\n",
    "        os.path.join(parent, child)\n",
    "        for parent, children, _files in os.walk(directory)\n",
    "        for child in children\n",
    "    ]\n",
    "\n",
    "warmup_dir = 'seed_2023/data/embedded_warmup_10c_20/'\n",
    "\n",
    "# Every directory below the warm-up root is treated as one dataset.\n",
    "warmup_datasets = get_subdirectories(warmup_dir)\n",
    "\n",
    "# Reduce each dataset path to the text after its final '/'.\n",
    "# `field` is kept as a plain loop variable so it remains bound to the last\n",
    "# entry after the loop, since later cells may read it.\n",
    "fields = []\n",
    "for dataset_path in warmup_datasets:\n",
    "    field = dataset_path.rsplit('/', 1)[-1]\n",
    "    fields.append(field)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import glob\n",
    "paper_dir = 'data/arXiv/papers'\n",
    "\n",
    "# Pick the field explicitly instead of relying on a `field` loop variable\n",
    "# leaked from another cell; fields[-1] is the value such a loop ends on.\n",
    "selected_field = fields[-1]\n",
    "# All .txt files stored directly inside the selected field's directory.\n",
    "watermarked_txt_files = glob.glob(os.path.join(warmup_dir, selected_field, '*.txt'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "cond-mat.mes-hall\n",
      "0.9888507718696398\n",
      "gr-qc\n",
      "0.9779638131244937\n",
      "hep-th\n",
      "0.9881015878225906\n",
      "astro-ph\n",
      "0.9943455245428297\n",
      "cs.LG\n",
      "0.9848197343453511\n",
      "cond-mat.str-el\n",
      "0.9882740649908032\n",
      "quant-ph\n",
      "0.9850083287062743\n",
      "hep-ph\n",
      "0.9873105827276087\n",
      "cs.CV\n",
      "0.9933115570073107\n",
      "cond-mat.mtrl-sci\n",
      "0.9879328436516265\n",
      "Total\n",
      "0.9872873218880823\n"
     ]
    }
   ],
   "source": [
    "import pickle\n",
    "import nltk\n",
    "ZWSP = \"\\u200B\"\n",
    "ZWNJ = \"\\u200C\"\n",
    "ZWJ = \"\\u200D\"\n",
    "IT = \"\\u2062\"\n",
    "IS = \"\\u2063\"\n",
    "IP = \"\\u2064\"\n",
    "# Alternative marker set built from the code points above; not the active one.\n",
    "invisible_characters = [ZWSP, ZWNJ, ZWJ, IT, IS, IP]\n",
    "# Active marker. It must be wrapped in a list: iterating the bare string '[WTM]'\n",
    "# tests '[', 'W', 'T', 'M' and ']' one at a time, so a line containing any\n",
    "# single one of those characters would be counted as watermarked.\n",
    "# NOTE(review): saved cell output may predate this fix; re-run to refresh it.\n",
    "characters = ['[WTM]']\n",
    "\n",
    "total_file_len = 0\n",
    "total_watermark = 0\n",
    "for field in fields:\n",
    "    # NOTE(review): pickle.load can execute arbitrary code while loading;\n",
    "    # only open cache files that come from a trusted source.\n",
    "    with open(os.path.join(warmup_dir, field, 'cache_train.pkl'), 'rb') as f:\n",
    "        pickle_file = pickle.load(f)\n",
    "    # Count entries that contain at least one marker.\n",
    "    # Assumes each entry supports `marker in entry` (e.g. a str) - TODO confirm.\n",
    "    has_watermark = sum(\n",
    "        1 for line in pickle_file if any(ext in line for ext in characters)\n",
    "    )\n",
    "    total_file_len += len(pickle_file)\n",
    "    total_watermark += has_watermark\n",
    "    print(field)\n",
    "    # An empty cache has no defined ratio; print nan instead of dividing by zero.\n",
    "    print(has_watermark / len(pickle_file) if len(pickle_file) else float('nan'))\n",
    "print('Total')\n",
    "# Same guard for the overall ratio when no entries were loaded at all.\n",
    "print(total_watermark / total_file_len if total_file_len else float('nan'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "nlp",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
