{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "20f77317-f3bc-480b-a666-03f4b7261338",
   "metadata": {},
   "outputs": [],
   "source": [
    "import jsonlines\n",
    "from transformers import AutoTokenizer, LlamaForCausalLM, AutoConfig\n",
    "import torch\n",
    "from datasets import load_dataset\n",
    "import datasets\n",
    "import os\n",
    "from tqdm.notebook import tqdm\n",
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "735da781-ecbc-409b-ac61-5b5fb930d2e4",
   "metadata": {},
   "outputs": [],
   "source": [
    "!mkdir train\n",
    "!mkdir test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "36cf0eb2-2823-4a89-81c6-395e871318eb",
   "metadata": {},
   "outputs": [],
   "source": "### Process fine-tuning data"
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "ec4fcd1b-192e-48ea-8a91-579ab1f57380",
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_data(example):\n",
    "    context = f\"Instruction: {example['instruction']}\\nInput: {example['input']}\\nAnswer: \"\n",
    "    target = example['output']\n",
    "    return {'context': context, 'target': target}\n",
    "\n",
    "\n",
    "def process_data_chat(example):\n",
    "    context = f\"Instruction: {example['instruction']}\\nInput: {example['input']}\\nAnswer: {example['output']}\"\n",
    "    target = example['output']\n",
    "    return {\"chat\": [\n",
    "                {'role': 'system', 'content': \"You are a helpful assistant and expert in finance, you answer questions very concisely\"},\n",
    "                {'role': 'user', 'content': f\"Instruction: {example['instruction']}\\nInput: {example['input']}\"},\n",
    "                {'role': 'assistant', 'content': example['output']}\n",
    "           ]}\n",
    "        \n",
    "\n",
    "def process_save_data(name, file_name, split=\"train\"):\n",
    "    all_data = datasets.load_dataset(name)[split]\n",
    "    \n",
    "    processed_data_normal = all_data.map(process_data)\n",
    "    processed_data_normal = processed_data_normal.remove_columns([\"input\", \"output\", \"instruction\"])\n",
    "    print(\"normal example:\", processed_data_normal[0])\n",
    "   \n",
    "    with jsonlines.open(f\"{split}/{file_name}.jsonl\", 'w') as writer:\n",
    "      writer.write_all(processed_data_normal)\n",
    "\n",
    "    processed_data_chat = all_data.map(process_data_chat)\n",
    "    processed_data_chat = processed_data_chat.remove_columns([\"input\", \"output\", \"instruction\"])\n",
    "    print(\"chat format example:\", processed_data_chat[0])\n",
    "    with jsonlines.open(f\"{split}/{file_name}_chat.jsonl\", 'w') as writer:\n",
    "      writer.write_all(processed_data_chat)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "9e8a4051-cbff-4d56-939f-0177dd9e3e6e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normal example: {'context': 'Instruction: What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}.\\nInput: Teollisuuden Voima Oyj , the Finnish utility known as TVO , said it shortlisted Mitsubishi Heavy s EU-APWR model along with reactors from Areva , Toshiba Corp. , GE Hitachi Nuclear Energy and Korea Hydro & Nuclear Power Co. .\\nAnswer: ', 'target': 'neutral'}\n",
      "chat format example: {'chat': [{'content': 'You are a helpful assistant and expert in finance, you answer questions very concisely', 'role': 'system'}, {'content': 'Instruction: What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}.\\nInput: Teollisuuden Voima Oyj , the Finnish utility known as TVO , said it shortlisted Mitsubishi Heavy s EU-APWR model along with reactors from Areva , Toshiba Corp. , GE Hitachi Nuclear Energy and Korea Hydro & Nuclear Power Co. .', 'role': 'user'}, {'content': 'neutral', 'role': 'assistant'}]}\n",
      "normal example: {'context': 'Instruction: What is the entity type of \\'40 William St\\' in the input sentence.\\nOptions: person, location, organization\\nInput: This LOAN AND SECURITY AGREEMENT dated January 27 , 1999 , between SILICON VALLEY BANK (\" Bank \"), a California - chartered bank with its principal place of business at 3003 Tasman Drive , Santa Clara , California 95054 with a loan production office located at 40 William St ., Ste .\\nAnswer: ', 'target': 'location'}\n",
      "chat format example: {'chat': [{'content': 'You are a helpful assistant and expert in finance, you answer questions very concisely', 'role': 'system'}, {'content': 'Instruction: What is the entity type of \\'40 William St\\' in the input sentence.\\nOptions: person, location, organization\\nInput: This LOAN AND SECURITY AGREEMENT dated January 27 , 1999 , between SILICON VALLEY BANK (\" Bank \"), a California - chartered bank with its principal place of business at 3003 Tasman Drive , Santa Clara , California 95054 with a loan production office located at 40 William St ., Ste .', 'role': 'user'}, {'content': 'location', 'role': 'assistant'}]}\n",
      "normal example: {'context': 'Instruction: Does the news headline talk about price? Please choose an answer from {Yes/No}.\\nInput: april gold down 20 cents to settle at $1,116.10/oz\\nAnswer: ', 'target': 'No'}\n",
      "chat format example: {'chat': [{'content': 'You are a helpful assistant and expert in finance, you answer questions very concisely', 'role': 'system'}, {'content': 'Instruction: Does the news headline talk about price? Please choose an answer from {Yes/No}.\\nInput: april gold down 20 cents to settle at $1,116.10/oz', 'role': 'user'}, {'content': 'No', 'role': 'assistant'}]}\n"
     ]
    }
   ],
   "source": [
    "process_save_data(\"FinGPT/fingpt-sentiment-train\", \"fingpt_sentiment_train\")\n",
    "process_save_data(\"FinGPT/fingpt-ner-cls\", \"fingpt_ner_cls_train\")\n",
    "process_save_data(\"FinGPT/fingpt-headline\", \"fingpt_headline_train\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "09b8801b-4b2c-4600-8790-5d9a306e30b2",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ef9027e61f364ee48f30fc0fa4ad9b94",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/3502 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "example: {'context': \"Instruction: What is the entity type of 'EVERGREEN SOLAR' in the input sentence.\\nOptions: location, organization, person\\nInput: Subordinated Loan Agreement - Silicium de Provence SAS and Evergreen Solar Inc . 7 - December 2007 [ HERBERT SMITH LOGO ] ................................ 2007 SILICIUM DE PROVENCE SAS and EVERGREEN SOLAR , INC .\\nAnswer: \", 'target': 'organization'}\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "2229fa1a221c4c30b6bc82d09006e7e7",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/20547 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "example: {'context': \"Instruction: Does the news headline talk about price? Please choose an answer from {Yes/No}.\\nInput: Trading partners unlikely to take India's gold import curbs lightly\\nAnswer: \", 'target': 'No'}\n"
     ]
    }
   ],
   "source": [
    "# process_save_data(\"FinGPT/fingpt-sentiment-train\", \"fingpt_sentiment_test\", \"test\")\n",
    "process_save_data(\"FinGPT/fingpt-ner-cls\", \"fingpt_ner_cls_test\", \"test\")\n",
    "process_save_data(\"FinGPT/fingpt-headline\", \"fingpt_headline_test\", \"test\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3bff839a-3fb8-4987-9799-a5f438d432bc",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
