{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import json\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "hyperace05_processed_path = \"./datasets/hyperace05\"\n",
    "os.mkdir(hyperace05_processed_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "entity_set = {\"Entity\"}\n",
    "relation_set = set()\n",
    "qualifier_set = set()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "for dataset in {\"train\", \"dev\", \"test\"}:\n",
    "    hyperace05_raw_file = f\"./datasets/raw/hyperace05-raw/{dataset}_convert.json\"\n",
    "    hyperace05_processed_file = f\"./datasets/hyperace05/{dataset}.json\"\n",
    "    with open(hyperace05_processed_file, 'w', encoding='utf-8') as pf:\n",
    "        with open(hyperace05_raw_file, 'r', encoding='utf-8') as rf:\n",
    "            while (True) :\n",
    "                line = rf.readline()\n",
    "                if not line:\n",
    "                    break\n",
    "                hyperfact = json.loads(line)\n",
    "                sentence = hyperfact.get('sentence')\n",
    "                s_start = hyperfact.get('s_start')\n",
    "                ner = hyperfact.get(\"ner\")\n",
    "                event = hyperfact.get('event')\n",
    "                if len(event) > 0:\n",
    "                    for item in event:\n",
    "                        if len(item) >= 3:\n",
    "                            # construct relations\n",
    "                            relations = []\n",
    "                            eventType = '[r]' + item[0][1]\n",
    "                            relation_set.add(eventType)\n",
    "                            \n",
    "                            head = item[1]\n",
    "                            head[0] = head[0] - s_start\n",
    "                            head[1] = head[1] - s_start\n",
    "                            \n",
    "                            tail = item[2]\n",
    "                            tail[0] = tail[0] - s_start\n",
    "                            tail[1] = tail[1] - s_start\n",
    "                            \n",
    "                            attributes = item[3:]\n",
    "                            for attribute in attributes:\n",
    "                                attribute[0] = attribute[0] - s_start\n",
    "                                attribute[1] = attribute[1] - s_start\n",
    "                                attribute[2] = '[q]' + attribute[2]\n",
    "                                qualifier_set.add(attribute[2])\n",
    "                            \n",
    "                            relation = [head[0], head[1], tail[0], tail[1], eventType]\n",
    "                            relation.append(attributes)\n",
    "                            relations.append(relation)\n",
    "                            \n",
    "                            # construct ner\n",
    "                            ners = []\n",
    "                            for entity in ner:\n",
    "                                entity[0] = entity[0] - s_start\n",
    "                                entity[1] = entity[1] - s_start\n",
    "                                entity[2] = 'Entity'\n",
    "                            ners.append(entity)\n",
    "\n",
    "                            # construct output dict\n",
    "                            output = dict()\n",
    "                            output['sentences'] = [sentence]\n",
    "                            output['ner'] = [ners]\n",
    "                            output['relations'] = [relations]\n",
    "                            output['clusters'] = []\n",
    "                            output['doc_key'] = \"\"\n",
    "\n",
    "                            # output to file\n",
    "                            output_line = json.dumps(output, ensure_ascii=False)\n",
    "                            pf.write(output_line+'\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "label = dict()\n",
    "# construct label.id\n",
    "label['id'] = dict()\n",
    "label['symmetric'] = []\n",
    "label['asymmetric'] = []\n",
    "label['entity'] = []\n",
    "label['relation'] = []\n",
    "label['qualifier'] = []\n",
    "label['q_num_logit'] = 0\n",
    "number = 0\n",
    "label['id']['None'] = number\n",
    "for entity in entity_set:\n",
    "    number += 1\n",
    "    label['id'][entity] = number\n",
    "    label['entity'].append(number)\n",
    "for relation in relation_set:\n",
    "    number += 1\n",
    "    label['id'][relation] = number\n",
    "    label['relation'].append(number)\n",
    "for qualifier in qualifier_set:\n",
    "    number += 1\n",
    "    label['id'][qualifier] = number\n",
    "    label['qualifier'].append(number)\n",
    "label['q_num_logit'] = number + 1\n",
    "\n",
    "hyperace05_label_file = \"./datasets/hyperace05/label.json\"\n",
    "with open(hyperace05_label_file, 'w', encoding='utf-8') as lf:\n",
    "    json.dump(label, lf, ensure_ascii=False, indent=2, separators=(',', ':'))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "lhrexp",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.10 (default, Feb 26 2021, 18:47:35) \n[GCC 7.3.0]"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "ad805f37fb71c5678f695e84706f48aaf326c06b7d503063f1dc33702fc2ebd1"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
