{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "b06e594b",
   "metadata": {},
   "outputs": [],
   "source": [
    "#############################\n",
    "# Parameters – tweak freely #\n",
    "#############################\n",
    "GENERATE = True   # ⇠ set False if you already have creative_writing_generations.jsonl\n",
    "THREADS  = 80\n",
    "MAX_PROMPTS = 1000\n",
    "\n",
    "OUT_JSONL     = 'creative_writing_generations.jsonl'\n",
    "HUMAN_PROFILE = 'data/human_writing_profile.json'\n",
    "\n",
    "import json, subprocess, sys, math, re, itertools, collections, os, pathlib\n",
    "from pathlib import Path\n",
    "from collections import Counter\n",
    "import pandas as pd\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "7639d86d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Running: python3 main.py --output-jsonl creative_writing_generations.jsonl --input-hf-dataset Nitral-AI/Reddit-SFW-Writing_Prompts_ShareGPT --hf-dataset-split train --threads 80 --max-prompts 1000 --logging-level INFO\n",
      "INFO mode: Progress bar and ban events will be printed. Most logs suppressed. Effective script level: INFO\n",
      "Extracting HF prompts:   1%|          | 999/177477 [00:00<00:03, 48943.02prompt/s]\n",
      "Preparing to process 1000 new prompts in this run.\n",
      "Batch Generating: 100%|██████████| 1000/1000 [15:58<00:00,  1.04prompt/s, 1527.2 tok/s] \n",
      "Finished processing 1000 prompts in this run in 958.57s.\n",
      "Overall average throughput for this run: 1527.21 tok/s.\n",
      "Results appended to creative_writing_generations.jsonl\n"
     ]
    }
   ],
   "source": [
    "if GENERATE:\n",
    "    cmd = [\n",
    "        'python3', 'main.py',\n",
    "        '--output-jsonl', OUT_JSONL,\n",
    "        '--input-hf-dataset', 'Nitral-AI/Reddit-SFW-Writing_Prompts_ShareGPT',\n",
    "        '--hf-dataset-split', 'train',\n",
    "        '--threads', str(THREADS),\n",
    "        '--max-prompts', str(MAX_PROMPTS),\n",
    "        '--logging-level', 'INFO',\n",
    "        #'--regex-blocklist-file', 'regex_not_x_but_y.json',\n",
    "    ]\n",
    "    print('Running:', ' '.join(cmd))\n",
    "    subprocess.run(cmd, check=True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "b93e8241",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loaded 198 NLTK stopwords for 'english'.\n",
      "✔️  Finished.  CSVs written to /home/sam/code/ai/antislop-api/results/antislop_analysis\n"
     ]
    }
   ],
   "source": [
    "# ---------------------------------------------------------------------------\n",
    "#  🔍 1.  LOAD MATERIALS\n",
    "# ---------------------------------------------------------------------------\n",
    "from pathlib import Path\n",
    "import json\n",
    "from collections import Counter, defaultdict\n",
    "import math\n",
    "\n",
    "import nltk\n",
    "# Ensure nltk punkt tokenizer is downloaded for nltk.word_tokenize\n",
    "try:\n",
    "    nltk.data.find('tokenizers/punkt')\n",
    "except nltk.downloader.DownloadError:\n",
    "    print(\"NLTK 'punkt' tokenizer not found. Downloading...\")\n",
    "    nltk.download('punkt', quiet=True)\n",
    "    print(\"'punkt' tokenizer downloaded.\")\n",
    "except Exception as e:\n",
    "    print(f\"Warning: Could not automatically verify/download NLTK 'punkt' tokenizer: {e}. \"\n",
    "          \"Ensure it is installed for nltk.word_tokenize to function correctly.\")\n",
    "\n",
    "from nltk import ngrams\n",
    "from nltk.corpus import stopwords\n",
    "from slop_forensics.utils import load_jsonl_file, normalize_text, extract_words\n",
    "#from slop_forensics.analysis import STOP_WORDS          # already initialised in analysis.py\n",
    "\n",
    "try:\n",
    "    nltk.data.find('tokenizers/punkt')\n",
    "    nltk.data.find('corpora/stopwords')\n",
    "    STOP_WORDS = set(stopwords.words('english'))\n",
    "    print(f\"Loaded {len(STOP_WORDS)} NLTK stopwords for 'english'.\")\n",
    "except LookupError:\n",
    "    print(f\"NLTK 'punkt' or 'stopwords' not found. Run nltk.download('punkt') and nltk.download('stopwords').\")\n",
    "    STOP_WORDS = set()\n",
    "except ImportError:\n",
    "    print(\"NLTK not installed. Stopword filtering will be skipped.\")\n",
    "    STOP_WORDS = set()\n",
    "\n",
    "# ---------------------------------------------------------------------------\n",
    "#  🔧 2.  CONFIGURE PATHS / PARAMS\n",
    "# ---------------------------------------------------------------------------\n",
    "# Your freshly-generated LLM outputs               (adjust if you changed the dir or name pattern)\n",
    "GENERATED_FILE = \"creative_writing_generations.jsonl\"\n",
    "\n",
    "# A “human baseline” frequency file (whatever name/location you saved it under)\n",
    "HUMAN_PROFILE_FILE = Path(\"data\") / \"human_writing_profile.json\"\n",
    "\n",
    "TOP_K_WORDS      = 2_000      # how many over-represented unigrams to keep\n",
    "TOP_K_BIGRAMS    = 1_000\n",
    "TOP_K_TRIGRAMS   = 1_000\n",
    "MIN_WORD_LEN     = 4          # keep short contractions like \"it's\" even if <4\n",
    "FREQ_NORM_DENOM  = 100_000    # chars → “per 100 K characters”\n",
    "\n",
    "# ---------------------------------------------------------------------------\n",
    "#  📥 3.  PULL IN THE DATA\n",
    "# ---------------------------------------------------------------------------\n",
    "gen_rows  = load_jsonl_file(str(GENERATED_FILE))\n",
    "gen_texts = [row[\"generation\"] for row in gen_rows if isinstance(row.get(\"generation\"), str)]\n",
    "\n",
    "if not gen_texts:\n",
    "    raise ValueError(f\"No usable text in {GENERATED_FILE}\")\n",
    "\n",
    "with HUMAN_PROFILE_FILE.open(\"r\", encoding=\"utf-8\") as f:\n",
    "    human_profile_full = json.load(f)\n",
    "    human_profile = human_profile_full.get('human-authored')\n",
    "    if not human_profile:\n",
    "        raise ValueError(f\"Key 'human-authored' not found in {HUMAN_PROFILE_FILE}\")\n",
    "\n",
    "\n",
    "# Helper function to convert the ngram list format from the JSON\n",
    "# to the dictionary format {normalized_ngram_string: frequency}\n",
    "# and normalize the ngram string to match LLM ngram processing.\n",
    "def _convert_and_normalize_human_ngram_list(ngram_list_of_dicts, n_value: int):\n",
    "    if not isinstance(ngram_list_of_dicts, list):\n",
    "        print(f\"Warning: Expected a list for human {n_value}-grams, got {type(ngram_list_of_dicts)}. Returning empty dict.\")\n",
    "        return {}\n",
    "    \n",
    "    converted_dict = {}\n",
    "    skipped_count = 0\n",
    "    original_count = len(ngram_list_of_dicts)\n",
    "    processed_keys = set() # To track keys after normalization\n",
    "\n",
    "    for item in ngram_list_of_dicts:\n",
    "        ngram_str = item.get(\"ngram\")\n",
    "        frequency = item.get(\"frequency\")\n",
    "\n",
    "        if ngram_str is None or frequency is None:\n",
    "            skipped_count += 1\n",
    "            continue\n",
    "\n",
    "        # Normalize the human ngram string in a way that mirrors LLM token processing for ngrams\n",
    "        # 1. Apply the same base text normalization\n",
    "        # 2. Tokenize\n",
    "        # 3. Lowercase and filter for alphabetic tokens\n",
    "        # 4. Re-join if the number of tokens matches n_value\n",
    "        normalized_text_for_human_ngram = normalize_text(str(ngram_str)) # Ensure string\n",
    "        tokens = [t.lower() for t in nltk.word_tokenize(normalized_text_for_human_ngram) if t.isalpha()]\n",
    "        \n",
    "        if len(tokens) == n_value:\n",
    "            processed_ngram_key = \" \".join(tokens)\n",
    "            # Sum frequencies if different original ngrams normalize to the same key\n",
    "            converted_dict[processed_ngram_key] = converted_dict.get(processed_ngram_key, 0) + int(frequency)\n",
    "            processed_keys.add(processed_ngram_key)\n",
    "        else:\n",
    "            # This ngram from human profile does not conform to N-word alpha-only structure after processing\n",
    "            # Example: \"amp nbsp\" might become ['amp', 'nbsp'] (len 2) or just ['amp'] (len 1)\n",
    "            # depending on normalize_text and isalpha behavior for \"nbsp\".\n",
    "            # If it doesn't result in `n_value` alphabetic tokens, it's skipped.\n",
    "            skipped_count += 1\n",
    "            # print(f\"Debug: Skipping human {n_value}-gram '{ngram_str}' -> tokens {tokens} (len != {n_value})\")\n",
    "\n",
    "\n",
    "    if skipped_count > 0:\n",
    "        print(f\"INFO: Normalizing human {n_value}-grams: Processed {original_count} items. \"\n",
    "              f\"Resulted in {len(converted_dict)} unique normalized {n_value}-gram keys. \"\n",
    "              f\"{skipped_count} original items were skipped or merged due to normalization \"\n",
    "              f\"(e.g., non-alphabetic content, or length mismatch after tokenization).\")\n",
    "    return converted_dict\n",
    "\n",
    "human_bigrams_list  = human_profile.get(\"top_bigrams\", [])\n",
    "human_trigrams_list = human_profile.get(\"top_trigrams\", [])\n",
    "\n",
    "human_bigrams  = _convert_and_normalize_human_ngram_list(human_bigrams_list, 2)\n",
    "human_trigrams = _convert_and_normalize_human_ngram_list(human_trigrams_list, 3)\n",
    "\n",
    "required_keys = [\"num_texts_analyzed\", \"avg_length\"]\n",
    "for key in required_keys:\n",
    "    if key not in human_profile:\n",
    "        raise KeyError(\n",
    "            f\"Human profile JSON (under 'human-authored') is missing the required key: '{key}'. \"\n",
    "            f\"File: {HUMAN_PROFILE_FILE}\"\n",
    "        )\n",
    "h_chars_total  = human_profile[\"num_texts_analyzed\"] * human_profile[\"avg_length\"]\n",
    "if h_chars_total == 0:\n",
    "    print(f\"Warning: Total characters for human data (h_chars_total) is 0. Frequencies per 100k will be 0 or infinite.\")\n",
    "\n",
    "\n",
    "# ---------------------------------------------------------------------------\n",
    "#  🏗️ 4.  BUILD WORD COUNTS & N-GRAM COUNTS (LLM OUTPUT)\n",
    "# ---------------------------------------------------------------------------\n",
    "# ---- 4-a  unigrams ---------------------------------------------------------\n",
    "word_counter = Counter()\n",
    "total_chars  = 0\n",
    "\n",
    "for txt in gen_texts:\n",
    "    total_chars += len(txt)\n",
    "    norm_t = normalize_text(txt)\n",
    "    word_counter.update(\n",
    "        w for w in extract_words(norm_t, MIN_WORD_LEN)\n",
    "        if w not in STOP_WORDS\n",
    "    )\n",
    "\n",
    "# 4-b  bigrams / trigrams  ----------------------------------------------\n",
    "bigram_counter  = Counter()\n",
    "trigram_counter = Counter()\n",
    "\n",
    "for txt in gen_texts:\n",
    "    normalized_llm_text = normalize_text(txt)\n",
    "\n",
    "    # original → tokens = [...]\n",
    "    tokens_all = [t.lower() for t in nltk.word_tokenize(normalized_llm_text) if t.isalpha()]\n",
    "\n",
    "    # NEW: drop stop-words and very short tokens (unless you explicitly want contractions like “it's”)\n",
    "    tokens = [\n",
    "        tok for tok in tokens_all\n",
    "        if tok not in STOP_WORDS and (len(tok) >= MIN_WORD_LEN or tok in {\"it's\"})\n",
    "    ]\n",
    "\n",
    "    bigram_counter.update(\" \".join(bg) for bg in ngrams(tokens, 2))\n",
    "    trigram_counter.update(\" \".join(tg) for tg in ngrams(tokens, 3))\n",
    "\n",
    "\n",
    "# ---------------------------------------------------------------------------\n",
    "#  📊 5.  NORMALISE “PER 100 000 CHARS”\n",
    "# ---------------------------------------------------------------------------\n",
    "def norm_per_100k(raw_count: int, char_total: float) -> float:\n",
    "    if char_total == 0: # Avoid division by zero\n",
    "        return 0.0 if raw_count == 0 else math.inf # Or handle as error\n",
    "    return (raw_count / char_total) * FREQ_NORM_DENOM\n",
    "\n",
    "def build_norm_dict(counter: Counter, char_total: float, top_k: int):\n",
    "    return {\n",
    "        term: {\n",
    "            \"gen_count\": counter[term],\n",
    "            \"gen_freq_per_100k\": norm_per_100k(counter[term], char_total)\n",
    "        }\n",
    "        for term, _ in counter.most_common(top_k) # Only process top_k generated terms\n",
    "    }\n",
    "\n",
    "gen_words_norm   = build_norm_dict(word_counter,     float(total_chars), TOP_K_WORDS)\n",
    "gen_bigrams_norm = build_norm_dict(bigram_counter,   float(total_chars), TOP_K_BIGRAMS)\n",
    "gen_trigrams_norm= build_norm_dict(trigram_counter,  float(total_chars), TOP_K_TRIGRAMS)\n",
    "\n",
    "# ---------------------------------------------------------------------------\n",
    "#  🔗 6.  MERGE WITH HUMAN PROFILE ➜ DICTIONARY VS NON-DICTIONARY SPLIT\n",
    "# ---------------------------------------------------------------------------\n",
    "def compare_to_human(gen_norm: dict, human_counts: dict, human_total_chars: float):\n",
    "    both, gen_only = {}, {}\n",
    "\n",
    "    for term, data in gen_norm.items(): # Iterate over top_k generated n-grams\n",
    "        if term in human_counts:\n",
    "            h_raw_count = human_counts[term]\n",
    "            h_freq_norm = norm_per_100k(h_raw_count, human_total_chars)\n",
    "            \n",
    "            gen_freq = data[\"gen_freq_per_100k\"]\n",
    "            ratio = math.inf # Default for h_freq_norm == 0 and gen_freq > 0\n",
    "            if h_freq_norm > 0:\n",
    "                ratio = gen_freq / h_freq_norm\n",
    "            elif gen_freq == 0 and h_freq_norm == 0: # Both are zero\n",
    "                ratio = 1.0 # Or 0.0 or NaN, define based on desired interpretation\n",
    "                            # 1.0 implies they are equally (non-)frequent.\n",
    "                            # math.nan might be more semantically correct if gen_freq can be 0 here.\n",
    "                            # However, gen_freq comes from most_common, so it should be > 0.\n",
    "\n",
    "            both[term] = {\n",
    "                **data,\n",
    "                \"human_count\":       h_raw_count,\n",
    "                \"human_freq_per_100k\": h_freq_norm,\n",
    "                \"freq_ratio_gen/hu\": ratio\n",
    "            }\n",
    "        else:\n",
    "            gen_only[term] = {\n",
    "                **data,\n",
    "                \"human_count\": 0,\n",
    "                \"human_freq_per_100k\": 0.0,\n",
    "                \"freq_ratio_gen/hu\": math.inf # Gen has it, human doesn't (or not in profile)\n",
    "            }\n",
    "    return both, gen_only\n",
    "\n",
    "# Pass h_chars_total to the comparison function\n",
    "bigrams_dict,  bigrams_nondict  = compare_to_human(gen_bigrams_norm,  human_bigrams, h_chars_total)\n",
    "trigrams_dict, trigrams_nondict = compare_to_human(gen_trigrams_norm, human_trigrams, h_chars_total)\n",
    "\n",
    "# ---------------------------------------------------------------------------\n",
    "#  📤 7.  TIDY RESULTS → DataFrames (easy to inspect / export)\n",
    "# ---------------------------------------------------------------------------\n",
    "import pandas as pd\n",
    "\n",
    "# Create DataFrames\n",
    "df_bi_dict   = pd.DataFrame.from_dict(bigrams_dict,  orient=\"index\")\n",
    "df_bi_nondct = pd.DataFrame.from_dict(bigrams_nondict, orient=\"index\")\n",
    "\n",
    "df_tri_dict   = pd.DataFrame.from_dict(trigrams_dict,  orient=\"index\")\n",
    "df_tri_nondct = pd.DataFrame.from_dict(trigrams_nondict, orient=\"index\")\n",
    "\n",
    "# Sort the \"dictionary\" DataFrames by 'freq_ratio_gen/hu' descending\n",
    "# This column exists in *_dict DataFrames.\n",
    "if not df_bi_dict.empty and \"freq_ratio_gen/hu\" in df_bi_dict.columns:\n",
    "    df_bi_dict.sort_values(by=\"freq_ratio_gen/hu\", ascending=False, inplace=True)\n",
    "if not df_tri_dict.empty and \"freq_ratio_gen/hu\" in df_tri_dict.columns:\n",
    "    df_tri_dict.sort_values(by=\"freq_ratio_gen/hu\", ascending=False, inplace=True)\n",
    "\n",
    "# OPTIONAL:  save to disk\n",
    "out_dir = Path(\"results\") / \"antislop_analysis\"\n",
    "out_dir.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "df_bi_dict.to_csv(  out_dir / \"bigrams__dictionary_sorted.csv\") # Added _sorted to filename\n",
    "df_bi_nondct.to_csv(out_dir / \"bigrams__non_dictionary.csv\")\n",
    "df_tri_dict.to_csv( out_dir / \"trigrams__dictionary_sorted.csv\") # Added _sorted to filename\n",
    "df_tri_nondct.to_csv(out_dir / \"trigrams__non_dictionary.csv\")\n",
    "\n",
    "print(f\"✔️  Finished.  CSVs written to {out_dir.resolve()}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
