{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "616a6062",
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "BFCL Dataset Preprocessing Notebook\n",
    "\n",
    "This notebook loads the BFCL v3 dataset from HuggingFace, processes it to extract\n",
    "complete text from the turns structure, and prepares it for instruction classification.\n",
    "\n",
    "WORKFLOW:\n",
    "1. Cell 0: Load the BFCL v3 dataset from HuggingFace\n",
    "2. Cell 1: Process the dataset to extract complete text and save to disk\n",
    "3. Cell 2: Load the processed dataset, sample it, and export to JSONL format\n",
    "\n",
    "IMPORTANT: Run cells in order (0 -> 1 -> 2) as each cell depends on the previous one.\n",
    "\"\"\"\n",
    "\n",
    "from datasets import load_dataset, DatasetDict, Dataset\n",
    "from typing import Union, Dict, Any, List\n",
    "\n",
    "# Authenticate first (e.g. `huggingface-cli login`) to access this dataset\n",
    "# Fetch the BFCL v3 function-calling benchmark dataset\n",
    "# Note: load_dataset's return type varies and may be an iterable variant\n",
    "ds = load_dataset(\"llamastack/bfcl_v3\")\n",
    "\n",
    "# Report the dataset layout and size\n",
    "dataset_columns: List[str] = []\n",
    "\n",
    "if isinstance(ds, Dataset):\n",
    "    # Single dataset: there are no splits to enumerate\n",
    "    dataset_columns = ds.column_names\n",
    "    print(\"Columns:\", dataset_columns)\n",
    "    print(\"Total datapoints:\", len(ds))\n",
    "\n",
    "elif isinstance(ds, DatasetDict):\n",
    "    # Multi-split dataset: print one size line per split\n",
    "    for split_name, split_ds in ds.items():\n",
    "        print(f\"{split_name}: {len(split_ds)} datapoints\")\n",
    "\n",
    "    # Column names are read from the first split\n",
    "    first_split_key: str = list(ds.keys())[0]\n",
    "    dataset_columns = ds[first_split_key].column_names\n",
    "    print(\"Columns:\", dataset_columns)\n",
    "\n",
    "    # Row count summed over every split\n",
    "    total_datapoints: int = sum(map(len, ds.values()))\n",
    "    print(\"Total datapoints:\", total_datapoints)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dafb7cd4",
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "Text Extraction and Dataset Processing Functions\n",
    "\n",
    "This cell defines functions to extract complete text from the BFCL dataset's nested\n",
    "turn structure and apply the processing to the entire dataset.\n",
    "\"\"\"\n",
    "\n",
    "import json\n",
    "from typing import Dict, Any, List, Union\n",
    "from datasets import Dataset, DatasetDict\n",
    "\n",
    "def extract_complete_text_fixed(turns_json_str: str) -> str:\n",
    "    \"\"\"\n",
    "    Extract and concatenate the content of every message in the turns structure.\n",
    "\n",
    "    'turns' is expected to be a JSON string holding a list of turns, where each\n",
    "    turn is a list of message dictionaries with a 'content' field. Content is\n",
    "    collected from every turn rather than only the first one, so an entry with\n",
    "    several turns is not truncated. A flat list of message dictionaries is\n",
    "    accepted as well.\n",
    "\n",
    "    Args:\n",
    "        turns_json_str (str): JSON string representing the turns data structure\n",
    "\n",
    "    Returns:\n",
    "        str: Content of all messages, in order, separated by newlines.\n",
    "             Returns an empty string if nothing can be extracted.\n",
    "\n",
    "    Note:\n",
    "        This function does not raise on bad input: a value that cannot be parsed\n",
    "        as JSON, or that does not parse to a list, is reported with print() and\n",
    "        yields an empty string.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        parsed: Any = json.loads(turns_json_str)\n",
    "    except (TypeError, ValueError) as e:\n",
    "        # json.JSONDecodeError subclasses ValueError; TypeError covers non-string input\n",
    "        print(f\"Error extracting complete_text: {e}\")\n",
    "        return \"\"\n",
    "\n",
    "    if not isinstance(parsed, list):\n",
    "        print(f\"Error extracting complete_text: expected a JSON list, got {type(parsed).__name__}\")\n",
    "        return \"\"\n",
    "\n",
    "    contents: List[str] = []\n",
    "    for turn in parsed:\n",
    "        # A turn is normally a list of messages; tolerate a bare message dict\n",
    "        messages = turn if isinstance(turn, list) else [turn]\n",
    "        for message in messages:\n",
    "            if not isinstance(message, dict):\n",
    "                continue\n",
    "            content = message.get(\"content\")\n",
    "            if content is None:\n",
    "                continue\n",
    "            # Coerce so one non-string payload cannot break the join and\n",
    "            # discard the text of the whole example\n",
    "            contents.append(content if isinstance(content, str) else str(content))\n",
    "    return \"\\n\".join(contents)\n",
    "        \n",
    "def add_complete_text_column_fixed(dataset: Union[Dataset, DatasetDict]) -> Union[Dataset, DatasetDict]:\n",
    "    \"\"\"\n",
    "    Return the dataset with a 'complete_text' column derived from each row's 'turns'.\n",
    "\n",
    "    Args:\n",
    "        dataset (Union[Dataset, DatasetDict]): Dataset whose rows carry a 'turns' field\n",
    "\n",
    "    Returns:\n",
    "        Union[Dataset, DatasetDict]: Result of mapping the extraction over the dataset\n",
    "\n",
    "    Note:\n",
    "        The mapping runs row by row (batched=False), so the extraction helper\n",
    "        receives a single 'turns' value per call.\n",
    "    \"\"\"\n",
    "    def _attach_complete_text(row: Dict[str, Any]) -> Dict[str, Any]:\n",
    "        \"\"\"\n",
    "        Store the extracted text on a single row under 'complete_text'.\n",
    "\n",
    "        Args:\n",
    "            row (Dict[str, Any]): One dataset row; its 'turns' value is read\n",
    "\n",
    "        Returns:\n",
    "            Dict[str, Any]: The same row object, now holding 'complete_text'\n",
    "        \"\"\"\n",
    "        extracted_text = extract_complete_text_fixed(row['turns'])\n",
    "        row[\"complete_text\"] = extracted_text\n",
    "        return row\n",
    "\n",
    "    # Row-wise map over the whole dataset\n",
    "    return dataset.map(_attach_complete_text, batched=False)\n",
    "\n",
    "# Run the extraction on the 'train' split only\n",
    "# Note: `ds` is assumed to be a DatasetDict, as produced when loading BFCL v3\n",
    "if not (isinstance(ds, DatasetDict) and 'train' in ds):\n",
    "    print(\"Error: Expected DatasetDict with 'train' split\")\n",
    "else:\n",
    "    train_dataset = ds['train']\n",
    "    train_dataset_processed = add_complete_text_column_fixed(train_dataset)\n",
    "\n",
    "    # Preview at most 10 processed rows as a sanity check\n",
    "    print(\"Processing completed. Showing first 10 examples:\")\n",
    "    for i, example in zip(range(10), train_dataset_processed):\n",
    "        # Rows are expected to be dicts carrying the new column\n",
    "        has_text = isinstance(example, dict) and 'complete_text' in example\n",
    "        if not has_text:\n",
    "            print(f\"Example {i+1}: Unexpected example format\")\n",
    "            continue\n",
    "        complete_text: str = example['complete_text']\n",
    "        print(f\"Example {i+1}: {complete_text}\")\n",
    "\n",
    "    # Persist the processed split so it can be loaded back later\n",
    "    output_path: str = \"../../data/bfcl\"\n",
    "    try:\n",
    "        train_dataset_processed.save_to_disk(output_path)\n",
    "    except AttributeError:\n",
    "        # Object without a save_to_disk method\n",
    "        print(\"IterableDataset detected - saving method not available. Dataset is processed in memory.\")\n",
    "    except Exception as e:\n",
    "        print(f\"Error saving dataset: {e}\")\n",
    "    else:\n",
    "        print(f\"Modified dataset with corrected 'complete_text' column saved to: {output_path}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a0fa1f4b",
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "Dataset Sampling and Export\n",
    "\n",
    "This cell loads the processed BFCL dataset, samples a specified number of datapoints,\n",
    "and exports them to a JSONL file for training the instruction classifier.\n",
    "\"\"\"\n",
    "\n",
    "from datasets import load_from_disk, DatasetDict, Dataset\n",
    "import json\n",
    "import os\n",
    "from typing import Dict, Any, List, Union\n",
    "\n",
    "# Configuration: Number of samples to extract\n",
    "NUM_SAMPLES: int = 2000\n",
    "\n",
    "# Location of the processed dataset saved by the processing cell\n",
    "load_path: str = \"../../data/bfcl\"\n",
    "\n",
    "# Fail fast with a clear message when the processed dataset is missing\n",
    "if not os.path.exists(load_path):\n",
    "    print(f\"Error: Dataset directory does not exist at {load_path}\")\n",
    "    print(\"Please run Cell 1 first to process and save the dataset.\")\n",
    "    raise FileNotFoundError(f\"Directory {load_path} not found. Run Cell 1 to create the processed dataset.\")\n",
    "\n",
    "try:\n",
    "    # Bound to its own name rather than `ds`: rebinding `ds` would replace the\n",
    "    # raw dataset loaded in Cell 0, and re-running Cell 1 afterwards would then\n",
    "    # fail its isinstance(ds, DatasetDict) check.\n",
    "    processed_ds: Union[DatasetDict, Dataset] = load_from_disk(load_path)\n",
    "    print(f\"Dataset loaded from: {load_path}\")\n",
    "except Exception as e:\n",
    "    print(f\"Error loading dataset from {load_path}: {e}\")\n",
    "    print(\"Please run Cell 1 first to process and save the dataset.\")\n",
    "    raise\n",
    "\n",
    "# Determine which dataset split to use for sampling\n",
    "sample_ds: Dataset\n",
    "if isinstance(processed_ds, DatasetDict):\n",
    "    # Use the first split (usually 'train')\n",
    "    first_split: str = list(processed_ds.keys())[0]\n",
    "    sample_ds = processed_ds[first_split]\n",
    "    print(f\"Using '{first_split}' split for sampling\")\n",
    "else:\n",
    "    # A single Dataset was saved, so it is used directly\n",
    "    sample_ds = processed_ds\n",
    "\n",
    "# Sample the specified number of datapoints and select only relevant columns\n",
    "# Clamp to the dataset size: select(range(NUM_SAMPLES)) raises IndexError when\n",
    "# the dataset holds fewer rows than requested\n",
    "num_sampled: int = min(NUM_SAMPLES, len(sample_ds))\n",
    "if num_sampled < NUM_SAMPLES:\n",
    "    print(f\"Warning: requested {NUM_SAMPLES} samples but only {num_sampled} are available\")\n",
    "\n",
    "# Use shuffle with fixed seed for reproducible sampling\n",
    "sample_dataset: Dataset = sample_ds.shuffle(seed=42).select(range(num_sampled))\n",
    "sample_dataset = sample_dataset.select_columns(['id', 'complete_text'])\n",
    "\n",
    "# Display the sampled data for verification\n",
    "print(f\"\\nSampled {num_sampled} datapoints:\")\n",
    "# Iterate rows instead of indexing sample_dataset['id'][i], which re-reads the\n",
    "# whole column on every iteration; zip stops after 10 rows or when rows run out\n",
    "for i, row in zip(range(10), sample_dataset):\n",
    "    id_val: str = row['id']\n",
    "    complete_text: str = row['complete_text']\n",
    "    print(f\"\\n--- Example {i+1} ---\")\n",
    "    print(f\"ID: {id_val}\")\n",
    "    print(f\"Complete Text: {complete_text[:200]}...\")  # Show first 200 chars for brevity\n",
    "\n",
    "# Export sampled data to JSONL format for training\n",
    "# The file name carries the number of rows actually written\n",
    "output_path: str = f\"../../data/bfcl/bfcl_{num_sampled}.jsonl\"\n",
    "with open(output_path, 'w', encoding='utf-8') as f:\n",
    "    for row in sample_dataset:\n",
    "        # Create example dictionary with required fields\n",
    "        example: Dict[str, str] = {\n",
    "            \"id\": row['id'],\n",
    "            \"complete_text\": row['complete_text']\n",
    "        }\n",
    "        # Write as JSONL (one JSON object per line)\n",
    "        json.dump(example, f, ensure_ascii=False)\n",
    "        f.write('\\n')\n",
    "\n",
    "print(f\"\\nSample saved to: {output_path}\")\n",
    "print(f\"Ready for instruction classification training with {num_sampled} examples\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "classifier",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
