{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1874915d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from datasets import load_dataset\n",
    "from transformers import AutoTokenizer\n",
    "\n",
    "# Dataset under inspection: the \"forget_01\" configuration of WaterDrum-Ax.\n",
    "dataset_name = \"Glow-AI/WaterDrum-Ax\"\n",
    "print(f\"Loading {dataset_name}...\")\n",
    "dataset = load_dataset(path=dataset_name, name=\"forget_01\")\n",
    "\n",
    "# Tokenizer whose vocabulary is used to measure text length in tokens.\n",
    "model_id = \"meta-llama/Llama-2-7b-chat-hf\"\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
    "\n",
    "def get_token_length(batch):\n",
    "    \"\"\"Count tokens for every entry of ``batch['text']``.\n",
    "\n",
    "    Written for ``Dataset.map(..., batched=True)``, which passes a mapping of\n",
    "    column name -> list of values for each batch.\n",
    "\n",
    "    Args:\n",
    "        batch: Mapping with a ``'text'`` key holding the texts to measure.\n",
    "\n",
    "    Returns:\n",
    "        ``{\"length\": [...]}`` with one token count per input text, in order.\n",
    "    \"\"\"\n",
    "    # Tokenize the raw strings. Wrapping the single column in zip() yields\n",
    "    # 1-tuples, and formatting those produces the tuple repr (\"('...',)\"),\n",
    "    # which adds quote/paren/escape characters to every measured text.\n",
    "    texts = [str(text) for text in batch['text']]\n",
    "    tokens = tokenizer(texts, truncation=False, verbose=False)\n",
    "    # NOTE(review): counts presumably include any special tokens the tokenizer\n",
    "    # adds by default (e.g. BOS) - confirm that is the intended measure.\n",
    "    return {\"length\": [len(ids) for ids in tokens[\"input_ids\"]]}\n",
    "\n",
    "# Add a 'length' column to the 'full' split, processing 1000 rows per call.\n",
    "dataset_with_lengths = dataset['full'].map(get_token_length, batched=True, batch_size=1000)\n",
    "\n",
    "# Summary statistics over the per-sample token counts.\n",
    "lengths = dataset_with_lengths['length']\n",
    "min_len = np.min(lengths)\n",
    "max_len = np.max(lengths)\n",
    "avg_len = np.mean(lengths)\n",
    "\n",
    "# One print call, one report line per argument.\n",
    "print(\n",
    "    f\"Total Samples: {len(lengths)}\",\n",
    "    f\"Average Length: {avg_len:.2f} tokens\",\n",
    "    f\"Max Length:     {max_len} tokens\",\n",
    "    f\"Min Length:     {min_len} tokens\",\n",
    "    sep=\"\\n\",\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "44319b4e",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "rl",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
