{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "119e8b31-fd4c-426b-b0cf-0f30fde8a61f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import json\n",
    "import random\n",
    "import hashlib\n",
    "import jsonlines\n",
    "from pathlib import Path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3205e6c8-64d8-467a-a719-5c92a09bf059",
   "metadata": {},
   "outputs": [],
   "source": [
    "def str2hashkey(s, length=16):\n",
    "    return hashlib.md5(s.encode()).hexdigest()[:length]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bcb7661b-afeb-4ed0-996f-570126da14b2",
   "metadata": {},
   "source": [
    "## CrossTask"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8f6d6369-3fa2-4eee-b7d0-96c838aa47ea",
   "metadata": {},
   "outputs": [],
   "source": [
    "# List the contents of the CrossTask release directory.\n",
    "!ls /data/reason/CrossTask/crosstask_release/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "35767e72-daa6-4b3e-a411-5a3ef090a8b6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show the first lines of videos.csv to inspect its column layout.\n",
    "!head /data/reason/CrossTask/crosstask_release/videos.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "befefd44-99d3-46d0-8c31-8a8e6d707b2c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): scratch check left in the notebook. Without parentheses this\n",
    "# displays the bound `str.strip` method rather than the stripped string.\n",
    "\" asdf\".strip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8a39fb2c-974c-4e4c-add7-ba697f290279",
   "metadata": {},
   "outputs": [],
   "source": [
    "def parse_crosstask_tasks(path, task_type):\n",
    "    \"\"\"Parse one CrossTask task-description file into a list of task dicts.\n",
    "\n",
    "    The file is treated as blank-line-separated records. Within a record the\n",
    "    first three lines become `id`, `task` and `url`, and the fifth line is\n",
    "    split on commas to produce `steps`.\n",
    "\n",
    "    Args:\n",
    "        path: Location of the task file to read.\n",
    "        task_type: Value stored under `type` for every task from this file.\n",
    "\n",
    "    Returns:\n",
    "        A list of dicts with keys `id`, `task`, `url`, `steps` and `type`.\n",
    "\n",
    "    Raises:\n",
    "        IndexError: If a record with more than two lines has fewer than five.\n",
    "    \"\"\"\n",
    "    with open(path) as f:\n",
    "        lines = f.readlines()\n",
    "    # Rough estimate of the record count (presumably ~6 lines per record).\n",
    "    print(len(lines) // 6)\n",
    "    parsed = []\n",
    "    for record in \"\".join(lines).split(\"\\n\\n\"):\n",
    "        fields = record.split(\"\\n\")\n",
    "        if len(fields) <= 2:\n",
    "            # Too short to be a task record (e.g. a trailing blank chunk).\n",
    "            continue\n",
    "        try:\n",
    "            parsed.append({\n",
    "                \"id\": fields[0].strip(),\n",
    "                \"task\": fields[1].strip(),\n",
    "                \"url\": fields[2].strip(),\n",
    "                \"steps\": [x.strip() for x in fields[4].split(\",\")],\n",
    "                \"type\": task_type,\n",
    "            })\n",
    "        except IndexError:\n",
    "            # Show the offending record before propagating the error.\n",
    "            print(fields)\n",
    "            raise\n",
    "    return parsed\n",
    "\n",
    "\n",
    "samples = parse_crosstask_tasks(\n",
    "    \"/data/reason/CrossTask/crosstask_release/tasks_primary.txt\", \"primary\"\n",
    ")\n",
    "print(len(samples))\n",
    "# Tasks from tasks_related.txt are tagged \"related\" so they can be told apart\n",
    "# from the primary ones in the combined file.\n",
    "samples += parse_crosstask_tasks(\n",
    "    \"/data/reason/CrossTask/crosstask_release/tasks_related.txt\", \"related\"\n",
    ")\n",
    "print(len(samples))\n",
    "with jsonlines.open(\"/data/reason/CrossTask/crosstask_release/tasks.jsonl\", \"w\") as writer:\n",
    "    writer.write_all(samples)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "92e7cfc5-446d-4964-92f2-9a3f35f3f950",
   "metadata": {},
   "outputs": [],
   "source": [
    "cross_vtt = []\n",
    "with jsonlines.open(\"/data/reason/CrossTask/crosstask_release/tasks.jsonl\") as reader:\n",
    "    # Index the task descriptions by task id for step-name lookup.\n",
    "    tasks = {task[\"id\"]: task for task in reader}\n",
    "with open(\"/data/reason/CrossTask/crosstask_release/videos_val.csv\") as f:\n",
    "    # The second CSV column is used as the id of a validation video.\n",
    "    val_videos = {row.split(\",\")[1] for row in f}\n",
    "\n",
    "annotation_dir = Path(\"/data/reason/CrossTask/crosstask_release/annotations\")\n",
    "# glob() yields files in arbitrary order; sort so cross_vtt is reproducible.\n",
    "for path in sorted(annotation_dir.glob(\"*.csv\")):\n",
    "    # Stem is split at the first underscore: task id, then the video id\n",
    "    # (which may itself contain underscores).\n",
    "    task_id, _, youtube_id = path.stem.partition(\"_\")\n",
    "    sample_id = str2hashkey(path.name)\n",
    "    try:\n",
    "        with path.open() as f:\n",
    "            # One list of comma-separated fields per non-blank line.\n",
    "            rows = [line.strip().split(\",\") for line in f if line.strip()]\n",
    "        steps = tasks[task_id][\"steps\"]\n",
    "        cross_vtt.append({\n",
    "            \"id\": sample_id,\n",
    "            \"youtube_id\": youtube_id,\n",
    "            \"ori\": \"cross\",\n",
    "            # Videos listed in videos_val.csv are labelled as the test split.\n",
    "            \"split\": \"test\" if youtube_id in val_videos else \"train\",\n",
    "            # No duration is read here; -1 is stored instead.\n",
    "            \"duration\": -1,\n",
    "            \"annotation\": [\n",
    "                {\n",
    "                    \"clip_id\": f\"{sample_id}_{len(rows)}_{i}\",\n",
    "                    # Fields 1 and 2 are parsed as floats and form the segment.\n",
    "                    \"segment\": [float(x) for x in row[1:3]],\n",
    "                    # Field 0 is a 1-based index into the task's step list.\n",
    "                    \"label\": steps[int(row[0]) - 1],\n",
    "                } for i, row in enumerate(rows)\n",
    "            ],\n",
    "        })\n",
    "    except Exception:\n",
    "        print(path)\n",
    "        # .get() so an unknown task id does not raise a second KeyError here.\n",
    "        print(tasks.get(task_id))\n",
    "        raise"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3c150371-68bf-4e04-9d18-7a6cbc88cfb8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of CrossTask samples collected (one entry per annotation CSV).\n",
    "len(cross_vtt)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c82c7699-d444-488f-9a9e-0258f517c181",
   "metadata": {},
   "source": [
    "## VAR (ActivityNet)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5ba59bec-eb80-42ff-bcbd-0439aaa29ef8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse the VAR validation split into `data` for interactive inspection.\n",
    "var_val_path = Path(\"/data/reason/ActivityNet/var_val_v1.0.json\")\n",
    "data = json.loads(var_val_path.read_text())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ce82bb74",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pick one entry at random and dump it to see the record structure.\n",
    "key = random.choice(list(data))\n",
    "entry = data[key]\n",
    "print(key)\n",
    "print(entry.keys())\n",
    "print(json.dumps(entry, indent=2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fb838b64",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "65c6cf4c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity check: 16-character hash key of the entry currently selected by `key`.\n",
    "str2hashkey(json.dumps(data[key], indent=2), length=16)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "58f9f922",
   "metadata": {},
   "outputs": [],
   "source": [
    "def var2vtt(data):\n",
    "    \"\"\"Convert VAR items into the common sample format, dropping duplicates.\n",
    "\n",
    "    Each item is keyed by a 16-character hash of its JSON-serialised `events`\n",
    "    list; only the first item seen for a given key is kept.\n",
    "\n",
    "    Args:\n",
    "        data: Iterable of VAR items, each providing `events` and `split`.\n",
    "\n",
    "    Returns:\n",
    "        List of dicts with keys `id`, `youtube_id`, `ori`, `split`,\n",
    "        `duration` and `annotation`.\n",
    "    \"\"\"\n",
    "    converted = []\n",
    "    seen_ids = set()\n",
    "    for item in data:\n",
    "        events = item[\"events\"]\n",
    "        sample_id = str2hashkey(json.dumps(events, indent=2), length=16)\n",
    "        if sample_id in seen_ids:\n",
    "            continue\n",
    "        seen_ids.add(sample_id)\n",
    "        # Video id and duration are taken from the first event of the item.\n",
    "        first_event = events[0]\n",
    "        n_events = len(events)\n",
    "        annotation = []\n",
    "        for idx, event in enumerate(events):\n",
    "            annotation.append({\n",
    "                \"clip_id\": f\"{sample_id}_{n_events}_{idx}\",\n",
    "                \"segment\": event[\"timestamp\"],\n",
    "                \"label\": event[\"sentence\"],\n",
    "            })\n",
    "        converted.append({\n",
    "            \"id\": sample_id,\n",
    "            \"youtube_id\": first_event[\"video_id\"],\n",
    "            \"ori\": \"var\",\n",
    "            \"split\": item[\"split\"],\n",
    "            \"duration\": first_event[\"duration\"],\n",
    "            \"annotation\": annotation,\n",
    "        })\n",
    "    return converted"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "99c2683f",
   "metadata": {},
   "outputs": [],
   "source": [
    "def _read_json(path):\n",
    "    \"\"\"Load and return the JSON document stored at `path`.\"\"\"\n",
    "    with open(path) as handle:\n",
    "        return json.load(handle)\n",
    "\n",
    "\n",
    "train = _read_json(\"/data/reason/ActivityNet/var_train_v1.0.json\")\n",
    "val = _read_json(\"/data/reason/ActivityNet/var_val_v1.0.json\")\n",
    "test = _read_json(\"/data/reason/ActivityNet/var_test_v1.0.json\")\n",
    "# Total number of entries across the three splits (before de-duplication).\n",
    "print(sum(len(part) for part in (train, val, test)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a8b298d7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Concatenate the items of all three splits (train, val, test order) and convert.\n",
    "var_items = [item for part in (train, val, test) for item in part.values()]\n",
    "var_vtt = var2vtt(var_items)\n",
    "print(len(var_vtt))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "025f424f",
   "metadata": {},
   "source": [
    "## COIN"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "58318dcd",
   "metadata": {},
   "outputs": [],
   "source": [
    "with jsonlines.open(\"../../Real_TVR/data/videos.jsonl\") as reader:\n",
    "    # Materialise every JSON line of the video list into memory.\n",
    "    coin = [record for record in reader]\n",
    "print(len(coin))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "84c65a44",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Peek at one random COIN record to inspect its fields.\n",
    "random.choice(coin)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9e5ea407",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scratch check: removing the substring `ing` maps `training` to `train`\n",
    "# (the same transformation coin2vtt applies to the `subset` field).\n",
    "\"training\".replace(\"ing\", \"\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "07c37aa6",
   "metadata": {},
   "outputs": [],
   "source": [
    "def coin2vtt(data):\n",
    "    \"\"\"Convert COIN records into the common sample format.\n",
    "\n",
    "    The sample id is a 16-character hash of the JSON-serialised record.\n",
    "    Unlike var2vtt, records are not de-duplicated by id.\n",
    "\n",
    "    Args:\n",
    "        data: Iterable of COIN records, each providing `id`, `subset`,\n",
    "            `duration` and `annotation`.\n",
    "\n",
    "    Returns:\n",
    "        List of dicts with keys `id`, `youtube_id`, `ori`, `split`,\n",
    "        `duration` and `annotation`, one per input record.\n",
    "    \"\"\"\n",
    "    def _to_sample(item):\n",
    "        sample_id = str2hashkey(json.dumps(item, indent=2), length=16)\n",
    "        steps = item[\"annotation\"]\n",
    "        n_steps = len(steps)\n",
    "        return {\n",
    "            \"id\": sample_id,\n",
    "            \"youtube_id\": item[\"id\"],\n",
    "            \"ori\": \"coin\",\n",
    "            # Drops every `ing` from the subset name, e.g. training -> train.\n",
    "            \"split\": item[\"subset\"].replace(\"ing\", \"\"),\n",
    "            \"duration\": item[\"duration\"],\n",
    "            \"annotation\": [\n",
    "                {\n",
    "                    \"clip_id\": f\"{sample_id}_{n_steps}_{idx}\",\n",
    "                    \"segment\": step[\"segment\"],\n",
    "                    \"label\": step[\"label\"],\n",
    "                }\n",
    "                for idx, step in enumerate(steps)\n",
    "            ],\n",
    "        }\n",
    "\n",
    "    return [_to_sample(item) for item in data]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2ad0e521-0d17-4587-8344-b77427ad733c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert every loaded COIN record and report how many samples were produced.\n",
    "coin_vtt = coin2vtt(coin)\n",
    "print(len(coin_vtt))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "127bb447-7a50-48f7-9fa7-b031ffc3d44f",
   "metadata": {},
   "source": [
    "## Health Samples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0a79bf78-959c-4d29-986c-59b19f645030",
   "metadata": {},
   "outputs": [],
   "source": [
    "# TODO: load the health samples here. This cell was left as an unfinished\n",
    "# `with` statement, which is a SyntaxError and stops Restart & Run All."
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "5f5495dfd954003066482b6b0e0738711e6ed3a3b4541589616cb58623de7cf9"
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
