{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json, os, yaml, glob, io\n",
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Root folder of the data; the source and target directories are resolved against it.\n",
    "root = '../data'\n",
    "src_datasets_dir = os.path.join(root, 'TPP')\n",
    "tag_datasets_dir = os.path.join(root, 'ehd_single_line')\n",
    "\n",
    "# length_of_x + length_of_h is the length of every window cut by the processing cell.\n",
    "length_of_x, length_of_h = 10, 25\n",
    "# Number of sequence indices visited in each data file.\n",
    "number_of_sequences = 100\n",
    "\n",
    "# Dataset folders (under src_datasets_dir) to process.\n",
    "# Alternatives used before, with their original trailing notes kept verbatim\n",
    "# (presumably length_of_x / length_of_h settings -- TODO confirm):\n",
    "#   os.listdir(src_datasets_dir)\n",
    "#   ['hawkes_1_v2', 'hawkes_2_v2', 'poisson_v2', 'self_correct_v2', 'stationary_renewal_v2']  # 24 40\n",
    "#   ['bookorder']      # 5 [15, 30, 5]\n",
    "#   ['stackoverflow']  # [15, 50], [15, 45], [15, 40], [20, 50], [25, 50]\n",
    "#   ['mooc']           # 15 [30, 50, 5]\n",
    "datasets = ['retweet']  # [10, 25], [10, 30], [10, 35], [15, 35], [20, 35]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Now processing selected_data.json of retweet.\n",
      "We extract 32 EHD sequences from the sequence indexed initially 0.\n",
      "We extract 188 EHD sequences from the sequence indexed initially 1.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 2.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 3.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 4.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 5.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 6.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 7.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 8.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 9.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 10.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 11.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 12.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 13.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 14.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 15.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 16.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 17.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 18.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 19.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 20.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 21.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 22.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 23.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 24.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 25.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 26.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 27.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 28.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 29.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 30.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 31.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 32.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 33.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 34.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 35.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 36.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 37.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 38.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 39.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 40.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 41.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 42.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 43.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 44.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 45.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 46.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 47.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 48.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 49.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 50.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 51.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 52.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 53.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 54.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 55.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 56.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 57.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 58.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 59.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 60.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 61.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 62.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 63.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 64.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 65.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 66.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 67.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 68.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 69.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 70.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 71.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 72.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 73.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 74.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 75.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 76.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 77.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 78.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 79.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 80.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 81.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 82.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 83.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 84.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 85.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 86.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 87.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 88.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 89.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 90.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 91.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 92.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 93.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 94.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 95.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 96.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 97.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 98.\n",
      "We extract 0 EHD sequences from the sequence indexed initially 99.\n"
     ]
    }
   ],
   "source": [
    "# Column order of every generated EHD file.\n",
    "EHD_COLUMNS = ('time_seq', 'event', 'score', 'intensity', 'mask')\n",
    "\n",
    "\n",
    "def extract_ehd_windows(record, start_time, length_of_x, length_of_h):\n",
    "    \"\"\"Cut one event sequence into sliding windows of length_of_x + length_of_h entries.\n",
    "\n",
    "    Args:\n",
    "        record: mapping (dict or DataFrame row) with the keys 'time_seq', 'event',\n",
    "            'intensity' and 'score'.\n",
    "        start_time: value prepended to 'time_seq' before differencing, so the first\n",
    "            entry of the differenced sequence is time_seq[0] - start_time.\n",
    "        length_of_x: first part of the window length.\n",
    "        length_of_h: second part of the window length.\n",
    "\n",
    "    Returns:\n",
    "        Dict mapping each name in EHD_COLUMNS to a list holding one entry per window.\n",
    "        Every list is empty when the sequence is shorter than a single window.\n",
    "\n",
    "    Raises:\n",
    "        AssertionError: if any value in 'event' is negative.\n",
    "    \"\"\"\n",
    "    window = length_of_x + length_of_h\n",
    "\n",
    "    time_seq = np.array(record['time_seq'], dtype=np.float32)\n",
    "    event_seq = np.array(record['event'], dtype=np.int32)\n",
    "    intensity = np.array(record['intensity'], dtype=np.float32)\n",
    "    score = np.array(record['score'], dtype=np.float32)\n",
    "\n",
    "    # Replace each timestamp by its difference to the previous one.\n",
    "    time_seq = np.diff(time_seq, prepend=start_time)\n",
    "    assert (event_seq >= 0).all(), 'negative in data!'\n",
    "\n",
    "    windows = {column: [] for column in EHD_COLUMNS}\n",
    "    seq_len = len(time_seq)\n",
    "\n",
    "    # Sequences shorter than one window are ignored.\n",
    "    if seq_len < window:\n",
    "        return windows\n",
    "\n",
    "    # NOTE(review): this visits max(seq_len - window, 1) start positions, so the last\n",
    "    # possible window (start = seq_len - window) is skipped whenever seq_len > window.\n",
    "    # Kept as is because changing it alters the generated dataset -- confirm intent.\n",
    "    for start_idx in range(max(seq_len - window, 1)):\n",
    "        stop_idx = start_idx + window\n",
    "        windows['time_seq'].append(time_seq[start_idx:stop_idx].tolist())\n",
    "        windows['event'].append(event_seq[start_idx:stop_idx].tolist())\n",
    "        windows['score'].append(score[start_idx:stop_idx].tolist())\n",
    "        windows['intensity'].append(intensity[start_idx:stop_idx].tolist())\n",
    "        windows['mask'].append(np.ones(window).tolist())\n",
    "\n",
    "    return windows\n",
    "\n",
    "\n",
    "for sample_dataset in datasets:\n",
    "    sample_dataset_path = os.path.join(src_datasets_dir, sample_dataset)\n",
    "    tag_dataset_path = os.path.join(tag_datasets_dir, f'{sample_dataset}_{length_of_x}_{length_of_h}')\n",
    "    os.makedirs(tag_dataset_path, exist_ok=True)\n",
    "\n",
    "    # Load the property card; the with-block closes the file even if parsing fails.\n",
    "    # It is read as UTF-8 to match the encoding used when the card is written below.\n",
    "    with open(os.path.join(sample_dataset_path, 'dataset_card.yml'), 'r', encoding='utf8') as f_property:\n",
    "        dataset_card = yaml.safe_load(f_property)\n",
    "\n",
    "    # Store the window configuration in the card that is copied next to every output file.\n",
    "    dataset_card['length_of_x'] = length_of_x\n",
    "    dataset_card['length_of_h'] = length_of_h\n",
    "    start_time = dataset_card['t_0']\n",
    "\n",
    "    # Only this file is processed; to process every JSON file of the dataset use\n",
    "    # [os.path.basename(item) for item in glob.glob(sample_dataset_path + '/*.json')]\n",
    "    sample_dataset_basename = ['selected_data.json']\n",
    "\n",
    "    for basename in sample_dataset_basename:\n",
    "        print(f'Now processing {basename} of {sample_dataset}.')\n",
    "\n",
    "        with open(os.path.join(sample_dataset_path, basename), 'r', encoding='utf8') as f_data:\n",
    "            dataset_json = json.load(f_data)\n",
    "        df_dataset = pd.DataFrame.from_dict(dataset_json)\n",
    "\n",
    "        for seq_index in range(number_of_sequences):\n",
    "            if seq_index < len(df_dataset):\n",
    "                ehd_data_dict = extract_ehd_windows(\n",
    "                    df_dataset.iloc[seq_index], start_time, length_of_x, length_of_h\n",
    "                )\n",
    "            else:\n",
    "                # Indices past the end of the dataset yield no windows; an empty file\n",
    "                # and a card are still written for them.\n",
    "                ehd_data_dict = {column: [] for column in EHD_COLUMNS}\n",
    "\n",
    "            ehd_df = pd.DataFrame.from_dict(ehd_data_dict)\n",
    "            print(f'We extract {ehd_df.shape[0]} EHD sequences from the sequence indexed initially {seq_index}.')\n",
    "\n",
    "            # One output folder per source sequence, named after its index.\n",
    "            ehd_dataset_from_one_seq_path = os.path.join(tag_dataset_path, str(seq_index))\n",
    "            os.makedirs(ehd_dataset_from_one_seq_path, exist_ok=True)\n",
    "            ehd_df.to_json(os.path.join(ehd_dataset_from_one_seq_path, f'one_seq_{basename}'))\n",
    "\n",
    "            with open(os.path.join(ehd_dataset_from_one_seq_path, 'dataset_card.yml'), 'w', encoding='utf8') as outfile:\n",
    "                yaml.dump(dataset_card, outfile, default_flow_style=False, allow_unicode=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "tpp",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
