{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Libraries "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import defaultdict\n",
    "import re\n",
    "import webvtt\n",
    "import os\n",
    "import extract_utils\n",
    "import insert_utils\n",
    "import networkx as nx\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "from get_variables import get_kukleva_merged_interactions, get_kukleva_merged_relationships\n",
    "import datetime\n",
    "import math\n",
    "import pickle"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Dataset import"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset = pd.read_pickle(\"../../../MovieGraphs_Data/Annotations/mg/py3loader/2017-11-02-51-7637_py3.pkl\")\n",
    "movies_data = open(\"../../../MovieGraphs_Data/Annotations/mg/dvds\", mode=\"r\", encoding=\"utf-8\").read().split(\"\\n\")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Extract all the movies basic info "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset_movies = list(dataset.keys())\n",
    "movies_info = extract_utils.extract_movies_info(dataset_movies,movies_data)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Sample results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ID: tt0147800 - Info : {'title': '10 Things I Hate About You', 'year': '1999'}\n",
      "ID: tt0988595 - Info : {'title': '27 Dresses', 'year': '2008'}\n",
      "ID: tt0119822 - Info : {'title': 'As Good as It Gets', 'year': '1997'}\n",
      "ID: tt0455824 - Info : {'title': 'Australia', 'year': '2008'}\n",
      "ID: tt0307987 - Info : {'title': 'Bad Santa', 'year': '2003'}\n",
      "ID: tt0388795 - Info : {'title': 'Brokeback Mountain', 'year': '2005'}\n",
      "ID: tt0118842 - Info : {'title': 'Chasing Amy', 'year': '1997'}\n",
      "ID: tt0375679 - Info : {'title': 'Crash', 'year': '2004'}\n",
      "ID: tt1570728 - Info : {'title': 'Crazy, Stupid, Love.', 'year': '2011'}\n",
      "ID: tt1499658 - Info : {'title': 'Horrible Bosses', 'year': '2011'}\n",
      "ID: tt0790636 - Info : {'title': 'Dallas Buyers Club', 'year': '2013'}\n",
      "ID: tt1907668 - Info : {'title': 'Flight', 'year': '2012'}\n",
      "ID: tt0109830 - Info : {'title': 'Forrest Gump', 'year': '1994'}\n",
      "ID: tt0109831 - Info : {'title': 'Four Weddings and a Funeral', 'year': '1994'}\n",
      "ID: tt1632708 - Info : {'title': 'Friends with Benefits', 'year': '2011'}\n",
      "ID: tt2267998 - Info : {'title': 'Gone Girl', 'year': '2014'}\n",
      "ID: tt1798709 - Info : {'title': 'Her', 'year': '2013'}\n",
      "ID: tt0146882 - Info : {'title': 'High Fidelity', 'year': '2000'}\n",
      "ID: tt0116695 - Info : {'title': 'Jerry Maguire', 'year': '1996'}\n",
      "ID: tt0467406 - Info : {'title': 'Juno', 'year': '2007'}\n",
      "ID: tt0478311 - Info : {'title': 'Knocked Up', 'year': '2007'}\n",
      "ID: tt0822832 - Info : {'title': 'Marley & Me', 'year': '2008'}\n",
      "ID: tt0416320 - Info : {'title': 'Match Point', 'year': '2005'}\n",
      "ID: tt0212338 - Info : {'title': 'Meet the Parents', 'year': '2000'}\n",
      "ID: tt1013753 - Info : {'title': 'Milk', 'year': '2008'}\n",
      "ID: tt0240772 - Info : {'title': \"Ocean's Eleven\", 'year': '2001'}\n",
      "ID: tt0073486 - Info : {'title': \"One Flew Over the Cuckoo's Nest\", 'year': '1975'}\n",
      "ID: tt0100405 - Info : {'title': 'Pretty Woman', 'year': '1990'}\n",
      "ID: tt0110912 - Info : {'title': 'Pulp Fiction', 'year': '1994'}\n",
      "ID: tt0286106 - Info : {'title': 'Signs', 'year': '2002'}\n",
      "ID: tt1045658 - Info : {'title': 'Silver Linings Playbook', 'year': '2012'}\n",
      "ID: tt1010048 - Info : {'title': 'Slumdog Millionaire', 'year': '2008'}\n",
      "ID: tt1385826 - Info : {'title': 'The Adjustment Bureau', 'year': '2011'}\n",
      "ID: tt0118715 - Info : {'title': 'The Big Lebowski', 'year': '1998'}\n",
      "ID: tt0970416 - Info : {'title': 'The Day the Earth Stood Still', 'year': '2008'}\n",
      "ID: tt0106918 - Info : {'title': 'The Firm', 'year': '1993'}\n",
      "ID: tt1568346 - Info : {'title': 'The Girl with the Dragon Tattoo', 'year': '2011'}\n",
      "ID: tt0068646 - Info : {'title': 'The Godfather', 'year': '1972'}\n",
      "ID: tt1454029 - Info : {'title': 'The Help', 'year': '2011'}\n",
      "ID: tt1189340 - Info : {'title': 'The Lincoln Lawyer', 'year': '2011'}\n",
      "ID: tt0167404 - Info : {'title': 'The Sixth Sense', 'year': '1999'}\n",
      "ID: tt1285016 - Info : {'title': 'The Social Network', 'year': '2010'}\n",
      "ID: tt1142988 - Info : {'title': 'The Ugly Truth', 'year': '2009'}\n",
      "ID: tt1193138 - Info : {'title': 'Up in the Air', 'year': '2009'}\n",
      "ID: tt0114924 - Info : {'title': 'While You Were Sleeping', 'year': '1995'}\n",
      "ID: tt0108160 - Info : {'title': 'Sleepless in Seattle', 'year': '1993'}\n",
      "ID: tt0120338 - Info : {'title': 'Titanic', 'year': '1997'}\n",
      "ID: tt0241527 - Info : {'title': \"Harry Potter and the Sorcerer's Stone\", 'year': '2001'}\n",
      "ID: tt0097576 - Info : {'title': 'Indiana Jones and the Last Crusade', 'year': '1989'}\n",
      "ID: tt0317198 - Info : {'title': 'Bridget Jones: The Edge of Reason', 'year': '2004'}\n",
      "ID: tt0037884 - Info : {'title': 'The Lost Weekend', 'year': '1945'}\n"
     ]
    }
   ],
   "source": [
    "# Initialize the counter\n",
    "i = 0\n",
    "# Loop through the elements\n",
    "for movie_id, info in movies_info.items():\n",
    "    # Display the movie and its info \n",
    "    print(f\"ID: {movie_id} - Info : {info}\")\n",
    "    # Increase the counter\n",
    "    i+=1\n",
    "    # Check if enough samples have been displayed \n",
    "    if i==500:\n",
    "        break"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Generate the movies insertions script"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "insert_utils.insert_movies(movies_info)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Movies and clip graphs structure (sample)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Movie : tt0988595 - Clip : (1, <GraphClasses.ClipGraph object at 0x169c1fdf0>)\n",
      "Movie : tt1285016 - Clip : (0, <GraphClasses.ClipGraph object at 0x16de5d3c0>)\n",
      "Movie : tt0167404 - Clip : (0, <GraphClasses.ClipGraph object at 0x2a04d1ba0>)\n",
      "Movie : tt0790636 - Clip : (3, <GraphClasses.ClipGraph object at 0x2a0c57160>)\n",
      "Movie : tt0100405 - Clip : (1, <GraphClasses.ClipGraph object at 0x2a33effd0>)\n"
     ]
    }
   ],
   "source": [
    "# Initialize the counter\n",
    "i = 0\n",
    "\n",
    "# Loop through the movies\n",
    "for movie in dataset:\n",
    "    # Loop through the clips\n",
    "    for clip in dataset[movie].clip_graphs.items():\n",
    "        print(f\"Movie : {movie} - Clip : {clip}\")\n",
    "        break\n",
    "        # Increase the counter\n",
    "    i+=1\n",
    "    # Check if enough samples have been displayed \n",
    "    if i==5:\n",
    "        break"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Get all the characters featured in each clip/scene\n",
    "**Output format : {movie_id : {clip_id: [characters_list]}}**\n",
    "\n",
    "Note : The characters list is a set since we should avoid repetition for a single clip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get the characters\n",
    "clip_characters = extract_utils.extract_characters(dataset)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Sample results (characters) for a movie\n",
    "\n",
    "Movie tt0037884, Clip 7"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Don Birnam', 'Wick Birnam', 'Helen St. James'}\n"
     ]
    }
   ],
   "source": [
    "# Print characters of a specific movie\n",
    "print(clip_characters[\"tt0037884\"][7])"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Extract all the chracters names\n",
    "The total text variable will be useful during the embedding phase, when we'll use the tokenizer \n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Generate the clip and featured characters insertions script"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Characters insertion\n",
    "\n",
    "Insert characters and their corresponding movie. \n",
    "\n",
    "**Waiting for the clips/scenes to be properly delimited to insert clip and scene features.**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "insert_utils.insert_characters_and_features(clip_characters)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Interactions processing for each clip\n",
    "\n",
    "- **Output format : {movie_id : {clip_id: [interactions]}}**\n",
    "- **Interactions format (as of now): {summary, start_time, end_time}**"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Clip structure sample"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'index_fname': '/static/indexed_data/video/tt0037884/elza/index.json',\n",
       " 'last_saved_str': '2017-04-24-16-21-09',\n",
       " 'image': 'offweb/raw_data/video_scenes/tt0037884/scene-002.ss-0007.es-0009.jpg',\n",
       " 'sentence_description': 'While Wick searches for the typewriter, Don tries to open an alcohol bottle, but quickly hides it when Wick returns.',\n",
       " 'scene': 'bedroom',\n",
       " 'depth': 1,\n",
       " 'edges': [{'source': 20, 'target': 22},\n",
       "  {'source': 22, 'target': 21},\n",
       "  {'source': 21, 'undirected': True, 'target': 23},\n",
       "  {'source': 21, 'undirected': True, 'target': 24},\n",
       "  {'source': 20, 'undirected': True, 'target': 25},\n",
       "  {'source': 21, 'target': 26},\n",
       "  {'source': 26, 'target': 20},\n",
       "  {'source': 24, 'undirected': True, 'target': 27},\n",
       "  {'source': 21, 'undirected': True, 'target': 28},\n",
       "  {'source': 28, 'undirected': True, 'target': 29},\n",
       "  {'source': 28, 'undirected': True, 'target': 30},\n",
       "  {'source': 21, 'undirected': True, 'target': 31},\n",
       "  {'source': 31, 'undirected': True, 'target': 32},\n",
       "  {'source': 31, 'undirected': True, 'target': 33},\n",
       "  {'source': 28, 'undirected': True, 'target': 34},\n",
       "  {'source': 21, 'target': 35},\n",
       "  {'source': 35, 'target': 20},\n",
       "  {'source': 35, 'undirected': True, 'target': 36},\n",
       "  {'source': 35, 'undirected': True, 'target': 37}],\n",
       " 'video': 'offweb/raw_data/video_scenes/tt0037884/scene-002.ss-0007.es-0009.mp4',\n",
       " 'sid': 1,\n",
       " 'situation': 'hiding an item',\n",
       " 'nodes': [{'index': 0,\n",
       "   'name': 'Wick Birnam',\n",
       "   'weight': 4,\n",
       "   'px': 253.25167477991093,\n",
       "   'py': 236.92789369539534,\n",
       "   'depth': 1,\n",
       "   'r': 12,\n",
       "   'y': 236.92789369539534,\n",
       "   'x': 253.25167477991093,\n",
       "   'fixed': True,\n",
       "   'type': 'entity',\n",
       "   'id': 20,\n",
       "   'node_id': 2},\n",
       "  {'index': 1,\n",
       "   'name': 'Don Birnam',\n",
       "   'weight': 7,\n",
       "   'px': 392.7334419103299,\n",
       "   'py': 215.76656735429125,\n",
       "   'depth': 1,\n",
       "   'r': 12,\n",
       "   'y': 215.76656735429125,\n",
       "   'x': 392.7334419103299,\n",
       "   'fixed': True,\n",
       "   'type': 'entity',\n",
       "   'id': 21,\n",
       "   'node_id': 0},\n",
       "  {'index': 2,\n",
       "   'name': 'inquires about location of item',\n",
       "   'weight': 2,\n",
       "   'px': 332.0927794544108,\n",
       "   'py': 172.15483495619915,\n",
       "   'depth': 1,\n",
       "   'r': 12,\n",
       "   'y': 172.15483495619915,\n",
       "   'x': 332.0927794544108,\n",
       "   'fixed': True,\n",
       "   'type': 'summary',\n",
       "   'id': 22,\n",
       "   'node_id': 29},\n",
       "  {'index': 3,\n",
       "   'name': 'emo:dishonest',\n",
       "   'weight': 1,\n",
       "   'px': 476.68719814725716,\n",
       "   'py': 252.20854423970133,\n",
       "   'depth': 1,\n",
       "   'r': 12,\n",
       "   'y': 252.20854423970133,\n",
       "   'x': 476.68719814725716,\n",
       "   'fixed': True,\n",
       "   'type': 'attribute',\n",
       "   'id': 23,\n",
       "   'node_id': 30},\n",
       "  {'index': 4,\n",
       "   'name': 'emo:nervous',\n",
       "   'weight': 2,\n",
       "   'px': 483.07775988518034,\n",
       "   'py': 287.4600656459193,\n",
       "   'depth': 1,\n",
       "   'r': 12,\n",
       "   'y': 287.4600656459193,\n",
       "   'x': 483.07775988518034,\n",
       "   'fixed': True,\n",
       "   'type': 'attribute',\n",
       "   'id': 24,\n",
       "   'node_id': 31},\n",
       "  {'index': 5,\n",
       "   'name': 'emo:patient',\n",
       "   'weight': 1,\n",
       "   'px': 204.6215219549399,\n",
       "   'py': 139.34178053350075,\n",
       "   'depth': 1,\n",
       "   'r': 12,\n",
       "   'y': 139.34178053350075,\n",
       "   'x': 204.6215219549399,\n",
       "   'fixed': True,\n",
       "   'type': 'attribute',\n",
       "   'id': 25,\n",
       "   'node_id': 32},\n",
       "  {'index': 6,\n",
       "   'name': 'hides item from',\n",
       "   'weight': 2,\n",
       "   'px': 286.618794671551,\n",
       "   'py': 290.77382190039606,\n",
       "   'depth': 1,\n",
       "   'r': 12,\n",
       "   'y': 290.77382190039606,\n",
       "   'x': 286.618794671551,\n",
       "   'fixed': True,\n",
       "   'type': 'interaction',\n",
       "   'id': 26,\n",
       "   'node_id': 33},\n",
       "  {'index': 7,\n",
       "   'name': 'tries to hide a drink',\n",
       "   'weight': 1,\n",
       "   'px': 447.68475820918167,\n",
       "   'py': 335.3554060734801,\n",
       "   'depth': 1,\n",
       "   'r': 12,\n",
       "   'y': 335.3554060734801,\n",
       "   'x': 447.68475820918167,\n",
       "   'fixed': True,\n",
       "   'type': 'reason',\n",
       "   'id': 27,\n",
       "   'node_id': 34},\n",
       "  {'index': 8,\n",
       "   'name': 'hides bottle',\n",
       "   'weight': 4,\n",
       "   'px': 251.28541240048213,\n",
       "   'py': 313.3986103923913,\n",
       "   'from_type': 'entity',\n",
       "   'depth': 1,\n",
       "   'r': 12,\n",
       "   'y': 313.3986103923913,\n",
       "   'x': 251.28541240048213,\n",
       "   'fixed': True,\n",
       "   'type': 'action',\n",
       "   'id': 28,\n",
       "   'node_id': 35},\n",
       "  {'index': 9,\n",
       "   'name': \"so other person can't see it\",\n",
       "   'weight': 1,\n",
       "   'px': 227.23089145272235,\n",
       "   'py': 411.94838442632056,\n",
       "   'from_type': 'action',\n",
       "   'depth': 1,\n",
       "   'r': 12,\n",
       "   'y': 411.94838442632056,\n",
       "   'x': 227.23089145272235,\n",
       "   'fixed': True,\n",
       "   'type': 'reason',\n",
       "   'id': 29,\n",
       "   'node_id': 36},\n",
       "  {'index': 10,\n",
       "   'name': 'other person would not approve taking the bottle',\n",
       "   'weight': 1,\n",
       "   'px': 167.16455630504896,\n",
       "   'py': 360.0021425150304,\n",
       "   'from_type': 'action',\n",
       "   'depth': 1,\n",
       "   'r': 12,\n",
       "   'y': 360.0021425150304,\n",
       "   'x': 167.16455630504896,\n",
       "   'fixed': True,\n",
       "   'type': 'reason',\n",
       "   'id': 30,\n",
       "   'node_id': 37},\n",
       "  {'index': 11,\n",
       "   'name': 'wraps bottle in cloth',\n",
       "   'weight': 3,\n",
       "   'px': 434.12280240263846,\n",
       "   'py': 150.37946198378563,\n",
       "   'from_type': 'entity',\n",
       "   'depth': 1,\n",
       "   'r': 12,\n",
       "   'y': 150.37946198378563,\n",
       "   'x': 434.12280240263846,\n",
       "   'fixed': True,\n",
       "   'type': 'action',\n",
       "   'id': 31,\n",
       "   'node_id': 38},\n",
       "  {'index': 12,\n",
       "   'name': '0:04.5-0:07.5',\n",
       "   'weight': 1,\n",
       "   'px': 321.0365682869836,\n",
       "   'py': 109.1531387443168,\n",
       "   'from_type': 'action',\n",
       "   'depth': 1,\n",
       "   'r': 12,\n",
       "   't_start': 4.5,\n",
       "   'y': 109.1531387443168,\n",
       "   'x': 321.0365682869836,\n",
       "   'fixed': True,\n",
       "   'type': 'time',\n",
       "   'id': 32,\n",
       "   'node_id': 39,\n",
       "   't_end': 7.5},\n",
       "  {'index': 13,\n",
       "   'name': 'to hide it among clothes',\n",
       "   'weight': 1,\n",
       "   'px': 388.3456694219222,\n",
       "   'py': 82.58885788353257,\n",
       "   'from_type': 'action',\n",
       "   'depth': 1,\n",
       "   'r': 12,\n",
       "   'y': 82.58885788353257,\n",
       "   'x': 388.3456694219222,\n",
       "   'fixed': True,\n",
       "   'type': 'reason',\n",
       "   'id': 33,\n",
       "   'node_id': 40},\n",
       "  {'index': 14,\n",
       "   'name': '0:00-0:07.5',\n",
       "   'weight': 1,\n",
       "   'px': 152.9765009702503,\n",
       "   'py': 280.48652134588247,\n",
       "   'from_type': 'action',\n",
       "   'depth': 1,\n",
       "   'r': 12,\n",
       "   't_start': 0,\n",
       "   'y': 280.48652134588247,\n",
       "   'x': 152.9765009702503,\n",
       "   'fixed': True,\n",
       "   'type': 'time',\n",
       "   'id': 34,\n",
       "   'node_id': 41,\n",
       "   't_end': 7.5},\n",
       "  {'index': 15,\n",
       "   'name': 'deceives',\n",
       "   'weight': 4,\n",
       "   'px': 348.7900946699345,\n",
       "   'py': 316.79371203836024,\n",
       "   'depth': 1,\n",
       "   'r': 12,\n",
       "   'y': 316.79371203836024,\n",
       "   'x': 348.7900946699345,\n",
       "   'fixed': True,\n",
       "   'type': 'summary',\n",
       "   'id': 35,\n",
       "   'node_id': 42},\n",
       "  {'index': 16,\n",
       "   'name': 'to gain time',\n",
       "   'weight': 1,\n",
       "   'px': 315.22430046641676,\n",
       "   'py': 420.73147360574734,\n",
       "   'from_type': 'interaction',\n",
       "   'depth': 1,\n",
       "   'r': 12,\n",
       "   'y': 420.73147360574734,\n",
       "   'x': 315.22430046641676,\n",
       "   'fixed': True,\n",
       "   'type': 'reason',\n",
       "   'id': 36,\n",
       "   'node_id': 43},\n",
       "  {'index': 17,\n",
       "   'name': '0:10.5-0:21.5',\n",
       "   'weight': 1,\n",
       "   'px': 388.528186154822,\n",
       "   'py': 412.4655893301898,\n",
       "   'from_type': 'interaction',\n",
       "   'depth': 1,\n",
       "   'r': 12,\n",
       "   't_start': 10.5,\n",
       "   'y': 412.4655893301898,\n",
       "   'x': 388.528186154822,\n",
       "   'fixed': True,\n",
       "   'type': 'time',\n",
       "   'id': 37,\n",
       "   'node_id': 44,\n",
       "   't_end': 21.5}]}"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset[\"tt0037884\"].clip_graphs[1].orig_graph_json"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Extract interactions data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "%autoreload now"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get the equivalences \n",
    "equivalences = get_kukleva_merged_interactions()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Search interaction \n",
    "def get_interaction_class(summary, equivalences:dict): \n",
    "    \"\"\"_summary_\n",
    "\n",
    "    Args:\n",
    "        interaction_list (_type_): _description_\n",
    "    \"\"\"\n",
    "\n",
    "    # Loop through the elements\n",
    "    for i,key in enumerate(equivalences.keys()): \n",
    "        # Check if the interaction matches \n",
    "        if summary==key:\n",
    "            return i\n",
    "    \n",
    "    raise f\"Could not find the interaction class for {summary}\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "%autoreload now"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "ename": "FileNotFoundError",
     "evalue": "[Errno 2] No such file or directory: '/Volumes/maxone/Clips/tt1045658/scenes.txt'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mFileNotFoundError\u001b[0m                         Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[30], line 2\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[39m# Initialize the object \u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m clip_interactions, not_found, stats \u001b[39m=\u001b[39m extract_utils\u001b[39m.\u001b[39;49mextract_interactions(dataset)\n",
      "File \u001b[0;32m~/Desktop/PhD/Work/Programming/Dataset_processing/Scripts/extract_utils.py:637\u001b[0m, in \u001b[0;36mextract_interactions\u001b[0;34m(dataset)\u001b[0m\n\u001b[1;32m    635\u001b[0m clip_interactions[movie][clip[\u001b[39m0\u001b[39m]] \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m()\n\u001b[1;32m    636\u001b[0m \u001b[39m# Get the scene_boundaries \u001b[39;00m\n\u001b[0;32m--> 637\u001b[0m scene_boundaries \u001b[39m=\u001b[39m get_scene_boundaries(file_path)\n\u001b[1;32m    638\u001b[0m \u001b[39mif\u001b[39;00m scene_boundaries \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m    639\u001b[0m     \u001b[39m# Read the scene file at this interaction\u001b[39;00m\n\u001b[1;32m    640\u001b[0m     scene_boundary \u001b[39m=\u001b[39m scene_boundaries[clip[\u001b[39m0\u001b[39m]\u001b[39m-\u001b[39m\u001b[39m1\u001b[39m]\n",
      "File \u001b[0;32m~/Desktop/PhD/Work/Programming/Dataset_processing/Scripts/extract_utils.py:549\u001b[0m, in \u001b[0;36mget_scene_boundaries\u001b[0;34m(file_path)\u001b[0m\n\u001b[1;32m    546\u001b[0m \u001b[39mif\u001b[39;00m file_path\u001b[39m!=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mnone\u001b[39m\u001b[39m\"\u001b[39m:\n\u001b[1;32m    547\u001b[0m     \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m    548\u001b[0m         \u001b[39m# Open the file\u001b[39;00m\n\u001b[0;32m--> 549\u001b[0m         scene_boundaries \u001b[39m=\u001b[39m \u001b[39mopen\u001b[39;49m(\u001b[39mf\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39m{\u001b[39;49;00mfile_path\u001b[39m+\u001b[39;49m\u001b[39m'\u001b[39;49m\u001b[39mscenes.txt\u001b[39;49m\u001b[39m'\u001b[39;49m\u001b[39m}\u001b[39;49;00m\u001b[39m\"\u001b[39;49m, \u001b[39m\"\u001b[39;49m\u001b[39mr\u001b[39;49m\u001b[39m\"\u001b[39;49m, encoding\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mutf-8\u001b[39;49m\u001b[39m\"\u001b[39;49m)\n\u001b[1;32m    550\u001b[0m         \u001b[39m#print(file_path)\u001b[39;00m\n\u001b[1;32m    551\u001b[0m         \u001b[39m# Read the lines\u001b[39;00m\n\u001b[1;32m    552\u001b[0m         scene_boundaries \u001b[39m=\u001b[39m scene_boundaries\u001b[39m.\u001b[39mread()\u001b[39m.\u001b[39msplit(\u001b[39m\"\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m\"\u001b[39m)\n",
      "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/Volumes/maxone/Clips/tt1045658/scenes.txt'"
     ]
    }
   ],
   "source": [
    "# Initialize the object \n",
    "clip_interactions, not_found, stats = extract_utils.extract_interactions(dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Interaction</th>\n",
       "      <th>Count</th>\n",
       "      <th>Class</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>gives (to)</td>\n",
       "      <td>266</td>\n",
       "      <td>36</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>photographs</td>\n",
       "      <td>23</td>\n",
       "      <td>88</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>smiles (at)</td>\n",
       "      <td>61</td>\n",
       "      <td>75</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>helps</td>\n",
       "      <td>108</td>\n",
       "      <td>44</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>asks</td>\n",
       "      <td>2474</td>\n",
       "      <td>60</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Interaction  Count  Class\n",
       "0   gives (to)    266     36\n",
       "1  photographs     23     88\n",
       "2  smiles (at)     61     75\n",
       "3        helps    108     44\n",
       "4         asks   2474     60"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Create a dataframe containing the interaction stats\n",
    "stats_pd = pd.DataFrame()\n",
    "stats_pd[\"Interaction\"] = stats.keys()\n",
    "stats_pd[\"Count\"] = stats.values()\n",
    "stats_pd[\"Class\"] = stats_pd.Interaction.apply(lambda x: get_interaction_class(x, equivalences))\n",
    "# Display a sample \n",
    "stats_pd.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Interaction</th>\n",
       "      <th>Count</th>\n",
       "      <th>Class</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>asks</td>\n",
       "      <td>2474</td>\n",
       "      <td>60</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41</th>\n",
       "      <td>informs</td>\n",
       "      <td>1237</td>\n",
       "      <td>61</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>explains (to)</td>\n",
       "      <td>1129</td>\n",
       "      <td>37</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>watches (something/someone/with)</td>\n",
       "      <td>925</td>\n",
       "      <td>62</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>suggests/offers (to/something)/gives opinion</td>\n",
       "      <td>796</td>\n",
       "      <td>42</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>orders</td>\n",
       "      <td>726</td>\n",
       "      <td>39</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48</th>\n",
       "      <td>answers (to)</td>\n",
       "      <td>606</td>\n",
       "      <td>38</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>talks (to/with)</td>\n",
       "      <td>532</td>\n",
       "      <td>63</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>compliments/seduces</td>\n",
       "      <td>463</td>\n",
       "      <td>43</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>greets</td>\n",
       "      <td>462</td>\n",
       "      <td>40</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>reassures</td>\n",
       "      <td>331</td>\n",
       "      <td>41</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44</th>\n",
       "      <td>talks about (with someone)</td>\n",
       "      <td>313</td>\n",
       "      <td>64</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>yells (at)</td>\n",
       "      <td>312</td>\n",
       "      <td>91</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>hears/listens</td>\n",
       "      <td>302</td>\n",
       "      <td>65</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>agrees (with/to)</td>\n",
       "      <td>285</td>\n",
       "      <td>52</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>gives (to)</td>\n",
       "      <td>266</td>\n",
       "      <td>36</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35</th>\n",
       "      <td>teases/bullies/intimidates</td>\n",
       "      <td>231</td>\n",
       "      <td>96</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>53</th>\n",
       "      <td>kisses</td>\n",
       "      <td>215</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>apologizes</td>\n",
       "      <td>209</td>\n",
       "      <td>46</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>walks (with)</td>\n",
       "      <td>204</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                     Interaction  Count  Class\n",
       "4                                           asks   2474     60\n",
       "41                                       informs   1237     61\n",
       "25                                 explains (to)   1129     37\n",
       "8               watches (something/someone/with)    925     62\n",
       "12  suggests/offers (to/something)/gives opinion    796     42\n",
       "33                                        orders    726     39\n",
       "48                                  answers (to)    606     38\n",
       "13                               talks (to/with)    532     63\n",
       "11                           compliments/seduces    463     43\n",
       "16                                        greets    462     40\n",
       "10                                     reassures    331     41\n",
       "44                    talks about (with someone)    313     64\n",
       "18                                    yells (at)    312     91\n",
       "22                                 hears/listens    302     65\n",
       "36                              agrees (with/to)    285     52\n",
       "0                                     gives (to)    266     36\n",
       "35                    teases/bullies/intimidates    231     96\n",
       "53                                        kisses    215      1\n",
       "7                                     apologizes    209     46\n",
       "30                                  walks (with)    204      2"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Top 29 interactions \n",
    "stats_pd.sort_values(by=\"Count\", ascending=False).head(20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.1428077056177183"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "2476/sum(stats_pd[\"Count\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count     100.000000\n",
       "mean      173.380000\n",
       "std       323.052448\n",
       "min         1.000000\n",
       "25%        24.500000\n",
       "50%        69.000000\n",
       "75%       155.750000\n",
       "max      2474.000000\n",
       "Name: Count, dtype: float64"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Interaction stats\n",
    "stats_pd[\"Count\"].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save to CSV \n",
    "stats_pd.to_csv(\"interactions_stats.csv\",index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Interaction count : 6184\n",
      "Rejected interaction count: 1000\n"
     ]
    }
   ],
   "source": [
    "print(f\"Interaction count : {sum([len(value) for _,value in clip_interactions.items()])}\")\n",
    "print(f\"Rejected interaction count: {sum(not_found.values())}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Element: gives directions to -- Occurrences : 3\n",
      "Element: points out to -- Occurrences : 2\n",
      "Element: shakes head at -- Occurrences : 2\n",
      "Element: plays music for -- Occurrences : 2\n",
      "Element: kneels in front of -- Occurrences : 3\n",
      "Element: blocks -- Occurrences : 2\n",
      "Element: lifts -- Occurrences : 2\n",
      "Element: attends party with -- Occurrences : 3\n",
      "Element: prepares -- Occurrences : 2\n",
      "Element: manipulates -- Occurrences : 2\n",
      "Element: goes to -- Occurrences : 2\n",
      "Element: expresses doubts -- Occurrences : 2\n",
      "Element: worries for -- Occurrences : 4\n",
      "Element: examines -- Occurrences : 3\n",
      "Element: moves away from -- Occurrences : 2\n",
      "Element: carresses -- Occurrences : 3\n",
      "Element: hangs up on -- Occurrences : 2\n",
      "Element: forces -- Occurrences : 2\n",
      "Element: throws out -- Occurrences : 3\n",
      "Element: expresses regret -- Occurrences : 4\n",
      "Element: forgives -- Occurrences : 3\n",
      "Element: escorts out -- Occurrences : 2\n",
      "Element: undresses -- Occurrences : 3\n",
      "Element: wakes -- Occurrences : 2\n",
      "Element: takes away from -- Occurrences : 3\n",
      "Element: massages -- Occurrences : 2\n",
      "Element: live together -- Occurrences : 2\n",
      "Element: order -- Occurrences : 3\n",
      "Element: travels with -- Occurrences : 3\n",
      "Element: comments -- Occurrences : 2\n",
      "Element: claims -- Occurrences : 4\n",
      "Element: believes -- Occurrences : 2\n",
      "Element: orders food -- Occurrences : 2\n",
      "Element: judges -- Occurrences : 2\n",
      "Element: catches up with -- Occurrences : 4\n",
      "Element: take out -- Occurrences : 2\n",
      "Element: fires -- Occurrences : 2\n",
      "Element: lists -- Occurrences : 4\n",
      "Element: get out -- Occurrences : 2\n",
      "Element: cares for -- Occurrences : 3\n",
      "Element: opens door to -- Occurrences : 3\n",
      "Element: sends off -- Occurrences : 2\n",
      "Element: joins -- Occurrences : 2\n",
      "Element: arrives with -- Occurrences : 3\n",
      "Element: brags to -- Occurrences : 2\n",
      "Element: opens car door for -- Occurrences : 2\n",
      "Element: expresses -- Occurrences : 2\n",
      "Element: bets a dime -- Occurrences : 3\n",
      "Element: justifies -- Occurrences : 3\n",
      "Element: dares -- Occurrences : 2\n",
      "Element: takes him away -- Occurrences : 2\n",
      "Element: takes to the bench -- Occurrences : 3\n",
      "Element: take off -- Occurrences : 3\n",
      "Element: winks at -- Occurrences : 2\n",
      "Element: pours a drink -- Occurrences : 2\n",
      "Element: takes out -- Occurrences : 2\n",
      "Element: nods at -- Occurrences : 2\n",
      "Element: shops with -- Occurrences : 3\n",
      "Element: gloats to -- Occurrences : 2\n",
      "Element: cuts relations with -- Occurrences : 2\n",
      "Element: recounts to -- Occurrences : 3\n",
      "Element: pedicures -- Occurrences : 2\n",
      "Element: expresses wishes to -- Occurrences : 2\n",
      "Element: grumbles to -- Occurrences : 2\n",
      "Element: taps -- Occurrences : 2\n",
      "Element: sorts -- Occurrences : 3\n",
      "Element: drops -- Occurrences : 2\n",
      "Element: spits -- Occurrences : 2\n",
      "Element: moves -- Occurrences : 2\n",
      "Element: realizes -- Occurrences : 4\n",
      "Element: chases away -- Occurrences : 2\n",
      "Element: take away -- Occurrences : 3\n",
      "Element: expresses wish -- Occurrences : 2\n",
      "Element: compares -- Occurrences : 2\n",
      "Element: sits in lap -- Occurrences : 2\n",
      "Element: grabs hand -- Occurrences : 2\n",
      "Element: surprised by -- Occurrences : 2\n",
      "Element: escorts -- Occurrences : 2\n",
      "Element: defies -- Occurrences : 2\n",
      "Element: escape with -- Occurrences : 3\n",
      "Element: gives excuse -- Occurrences : 3\n",
      "Element: crashes into -- Occurrences : 2\n",
      "Element: pleads with -- Occurrences : 2\n",
      "Element: pours -- Occurrences : 2\n",
      "Element: decides -- Occurrences : 3\n",
      "Element: recites to -- Occurrences : 2\n",
      "Element: entertains -- Occurrences : 2\n",
      "Element: scares -- Occurrences : 4\n",
      "Element: retorts -- Occurrences : 3\n",
      "Element: confiscates -- Occurrences : 2\n",
      "Element: comments on -- Occurrences : 2\n",
      "Element: barks at -- Occurrences : 4\n",
      "Element: shocked by -- Occurrences : 3\n",
      "Element: mimics -- Occurrences : 4\n",
      "Element: commiserates with -- Occurrences : 2\n",
      "Element: wants to talk -- Occurrences : 2\n",
      "Element: brags -- Occurrences : 3\n",
      "Element: challenges -- Occurrences : 2\n",
      "Element: lay -- Occurrences : 2\n",
      "Element: prays with -- Occurrences : 2\n",
      "Element: hides -- Occurrences : 2\n",
      "Element: expresses doubts to -- Occurrences : 2\n",
      "Element: expresses fears to -- Occurrences : 2\n",
      "Element: finds -- Occurrences : 4\n",
      "Element: hurries -- Occurrences : 2\n",
      "Element: defends himself -- Occurrences : 3\n",
      "Element: worries -- Occurrences : 2\n",
      "Element: orders drink -- Occurrences : 2\n",
      "Element: impressed by -- Occurrences : 2\n",
      "Element: can't believe -- Occurrences : 3\n",
      "Element: wipes -- Occurrences : 3\n",
      "Element: stand -- Occurrences : 2\n",
      "Element: translates to -- Occurrences : 2\n",
      "Element: shares -- Occurrences : 2\n",
      "Element: protests to -- Occurrences : 2\n",
      "Element: likes -- Occurrences : 3\n",
      "Element: guesses -- Occurrences : 3\n",
      "Element: doubts -- Occurrences : 2\n",
      "Element: knocks on door -- Occurrences : 2\n",
      "Element: makes observation -- Occurrences : 2\n",
      "Element: grabs from -- Occurrences : 3\n",
      "Element: enter elevator -- Occurrences : 2\n",
      "Element: get in car -- Occurrences : 2\n",
      "Element: puts hand -- Occurrences : 2\n",
      "Element: push cart -- Occurrences : 2\n",
      "Element: commands -- Occurrences : 2\n",
      "Element: lay in bed -- Occurrences : 2\n",
      "Element: lies next to -- Occurrences : 2\n",
      "Element: make plans -- Occurrences : 2\n",
      "Element: mistakes -- Occurrences : 2\n",
      "Element: feels sorry for -- Occurrences : 2\n",
      "Element: justifies self -- Occurrences : 2\n",
      "Element: worries about -- Occurrences : 4\n",
      "Element: gropes -- Occurrences : 2\n",
      "Element: carry dishes -- Occurrences : 2\n",
      "Element: interviews -- Occurrences : 2\n",
      "Element: surprises -- Occurrences : 2\n",
      "Element: boos -- Occurrences : 2\n",
      "Element: offers condolences -- Occurrences : 2\n",
      "Element: box -- Occurrences : 2\n",
      "Element: referees -- Occurrences : 2\n",
      "Element: wishes well -- Occurrences : 2\n",
      "Element: makes observation to -- Occurrences : 2\n",
      "Element: gives up -- Occurrences : 2\n",
      "Element: expresses love for -- Occurrences : 2\n",
      "Element: struggles against -- Occurrences : 2\n",
      "Element: hides item from -- Occurrences : 2\n"
     ]
    }
   ],
   "source": [
    "for element,occurrences in not_found.items(): \n",
    "    if occurrences>=2:\n",
    "        print(f\"Element: {element} -- Occurrences : {occurrences}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"not_found.txt\", \"w+\") as file: \n",
    "    # Save the void interactions \n",
    "    file.write(str(not_found))"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Sample results (interactions) for a movie\n",
    "\n",
    "Movie tt0037884, Clip 3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'characters': defaultdict(set,\n",
       "              {'towards': {'Don Birnam'},\n",
       "               'performed_by': {'Helen St. James'}}),\n",
       "  'summary': 'gives (to)',\n",
       "  'start_time': 0,\n",
       "  'end_time': 6,\n",
       "  'frame_start': -1,\n",
       "  'frame_end': -1,\n",
       "  'image_files': []},\n",
       " {'characters': defaultdict(set,\n",
       "              {'towards': {'Don Birnam'},\n",
       "               'performed_by': {'Helen St. James'}}),\n",
       "  'summary': 'kisses',\n",
       "  'start_time': 17,\n",
       "  'end_time': 20.5,\n",
       "  'frame_start': -1,\n",
       "  'frame_end': -1,\n",
       "  'image_files': []},\n",
       " {'characters': defaultdict(set,\n",
       "              {'towards': {'Don Birnam'},\n",
       "               'performed_by': {'Helen St. James'}}),\n",
       "  'summary': 'suggests/offers (to/something)/gives opinion',\n",
       "  'start_time': 8,\n",
       "  'end_time': 10.5,\n",
       "  'frame_start': -1,\n",
       "  'frame_end': -1,\n",
       "  'image_files': []}]"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clip_interactions[\"tt0037884\"][3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Characters : defaultdict(<class 'set'>, {'towards': {'Don Birnam'}, 'performed_by': {'Helen St. James'}})\n",
      "- Summary : gives (to)           \n",
      "- Time stamps : [0;6] \n",
      "\n",
      "Characters : defaultdict(<class 'set'>, {'towards': {'Don Birnam'}, 'performed_by': {'Helen St. James'}})\n",
      "- Summary : kisses           \n",
      "- Time stamps : [17;20.5] \n",
      "\n",
      "Characters : defaultdict(<class 'set'>, {'towards': {'Don Birnam'}, 'performed_by': {'Helen St. James'}})\n",
      "- Summary : suggests/offers (to/something)/gives opinion           \n",
      "- Time stamps : [8;10.5] \n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Print interactions of a specific movie\n",
    "for interaction in clip_interactions[\"tt0037884\"][3]:\n",
    "    if \"reason\" in interaction.keys():\n",
    "        print(f\"Characters : {interaction['characters']}\\n- Summary : {interaction['summary']} \\\n",
    "          \\n- Time stamps : [{interaction['start_time']};{interaction['end_time']}] Reason : {interaction['reason']} \\n\")\n",
    "    else:\n",
    "        print(f\"Characters : {interaction['characters']}\\n- Summary : {interaction['summary']} \\\n",
    "          \\n- Time stamps : [{interaction['start_time']};{interaction['end_time']}] \\n\")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Generate the scenes (clips), places and contexts insertion script"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "insert_utils.insert_scenes_places_contexts(dataset)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Generate interactions insertion script"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "insert_utils.insert_interactions(clip_interactions)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Subtitles processing"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load the subtitle paths"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Intialize the object \n",
    "subtitle_paths = defaultdict(dict)\n",
    "# Loop through the movies IDs \n",
    "for movie in dataset_movies:\n",
    "    # Set the current folder name\n",
    "    current_folder = f\"../../../MovieGraphs_Data/Subtitles/clip_srt/{movie}/\"\n",
    "\n",
    "    # Check if the directory exists \n",
    "    if os.path.exists(current_folder):\n",
    "        # Get all the filenames in the current folder (subtitle files)\n",
    "        current_subtitile_files = os.listdir(current_folder)\n",
    "        # Set the values for the current movie\n",
    "        subtitle_paths[movie] = current_subtitile_files"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Sample results (subtitle paths) for a movie"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['scene-106.ss-0531.es-0536_utf8.webvtt',\n",
       " 'scene-194.ss-0955.es-0961_utf8.webvtt',\n",
       " 'scene-046.ss-0199.es-0206_utf8.webvtt',\n",
       " 'scene-014.ss-0059.es-0065_utf8.webvtt',\n",
       " 'scene-055.ss-0250.es-0251_utf8.webvtt']"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display sample results \n",
    "subtitle_paths[\"tt0988595\"][0:5]"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Process the files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Intialize the object \n",
    "dataset_speech = extract_utils.extract_subtitles_V2(subtitle_paths)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Sample results (speeches) for a movie\n",
    "\n",
    "Movie tt0988595, Scene 20"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'transcript': ['you in ?', 'yeah .'],\n",
       "  'start_time': '00:00:00.000',\n",
       "  'end_time': '00:00:00.048'},\n",
       " {'transcript': ['[ grunt ##s ]', '[ horn hon ##ks ]'],\n",
       "  'start_time': '00:00:01.690',\n",
       "  'end_time': '00:00:04.921'},\n",
       " {'transcript': ['hey ! hey ! you are down to 260 .'],\n",
       "  'start_time': '00:00:06.294',\n",
       "  'end_time': '00:00:10.253'},\n",
       " {'transcript': ['are you sure you wanna keep this up ?'],\n",
       "  'start_time': '00:00:10.365',\n",
       "  'end_time': '00:00:12.856'},\n",
       " {'transcript': ['no !', 'okay , then .'],\n",
       "  'start_time': '00:00:12.968',\n",
       "  'end_time': '00:00:15.562'}]"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset_speech[\"tt0988595\"][20]"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Match interactions and speeches "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "def convert_interaction_time_to_datetime(time:float): \n",
    "    \"\"\"_summary_\n",
    "\n",
    "    Args:\n",
    "        time (float): _description_\n",
    "\n",
    "    Returns:\n",
    "        _type_: _description_\n",
    "    \"\"\"\n",
    "\n",
    "    # Set the reference date\n",
    "    reference_date = datetime.datetime(2023, 1, 1, 0, 0, 0)\n",
    "\n",
    "    # Compute the time delta\n",
    "    time_delta = datetime.timedelta(seconds=time)\n",
    "    \n",
    "    # Add the time delta to the reference date\n",
    "    result_datetime = reference_date + time_delta\n",
    "\n",
    "    # Return as time object\n",
    "    return result_datetime.time()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "def convert_speech_time_to_datetime(time:str):\n",
    "    \"\"\"_summary_\n",
    "\n",
    "    Args:\n",
    "        time (str): _description_\n",
    "\n",
    "    Returns:\n",
    "        _type_: _description_\n",
    "    \"\"\"\n",
    "\n",
    "    # Set the time format\n",
    "    time_format = \"%H:%M:%S.%f\"\n",
    "\n",
    "    # Cast to datetime\n",
    "    time = datetime.datetime.strptime(time, time_format)\n",
    "\n",
    "    return time.time()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_overlap_duration(interaction_start, interaction_end, speech_start, speech_end): \n",
    "    \"\"\"_summary_\n",
    "\n",
    "    Args:\n",
    "        interaction_start (_type_): _description_\n",
    "        interaction_end (_type_): _description_\n",
    "        speech_start (_type_): _description_\n",
    "        speech_end (_type_): _description_\n",
    "    \"\"\"\n",
    "    \n",
    "    # Get the overlap start \n",
    "    overlap_start = max(interaction_start, speech_start)\n",
    "    # Get the overlap \n",
    "    overlap_end = min(interaction_end, speech_end)\n",
    "\n",
    "    # Calculate the total overlapping time\n",
    "    overlap_seconds = (overlap_end.hour - overlap_start.hour) * 3600 + \\\n",
    "                        (overlap_end.minute - overlap_start.minute) * 60 + \\\n",
    "                        (overlap_end.second - overlap_start.second)\n",
    "        \n",
    "    #print(f\"Total overlap time : {overlap_seconds} seconds.\")\n",
    "\n",
    "    return overlap_seconds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_duration(start, end): \n",
    "    \"\"\"_summary_\n",
    "\n",
    "    Args:\n",
    "        start (_type_): _description_\n",
    "        end (_type_): _description_\n",
    "    \"\"\"\n",
    "\n",
    "    start = datetime.datetime.combine(datetime.date.today(), start)\n",
    "    end = datetime.datetime.combine(datetime.date.today(), end)\n",
    "\n",
    "    return end-start"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "def time_ranges_overlap(interaction_start, interaction_end, speech_start, speech_end): \n",
    "    \"\"\"_summary_\n",
    "\n",
    "    Args:\n",
    "        interaction_start (_type_): _description_\n",
    "        interaction_end (_type_): _description_\n",
    "        speech_start (_type_): _description_\n",
    "        speech_end (_type_): _description_\n",
    "\n",
    "    Returns:\n",
    "        _type_: _description_\n",
    "    \"\"\"\n",
    "\n",
    "    \n",
    "    #       Case 1 : \n",
    "    #     Interaction\n",
    "    #   -----------------\n",
    "    #     -------------\n",
    "    #        Speech\n",
    "    #\n",
    "    #          OR \n",
    "    #\n",
    "    #       Case 2 : \n",
    "    #     Interaction\n",
    "    #   ---------------------\n",
    "    #  ------------------------\n",
    "    #        Speech\n",
    "    # \n",
    "    if (interaction_start <= speech_start and interaction_end > speech_start) or \\\n",
    "        (interaction_start >= speech_start and interaction_start <= speech_end)  :\n",
    "        # Get the overlap duration\n",
    "        overlap_seconds = get_overlap_duration(interaction_start, interaction_end, speech_start, speech_end)\n",
    "        # Check if the timestamps overlap for more than 0 second \n",
    "        # or the speech lasts for less than 1 second\n",
    "        condition = abs(overlap_seconds) > 0 or get_duration(speech_start, speech_end).seconds==0\n",
    "        \n",
    "        return condition\n",
    "    \n",
    "    return False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Loop through the speeches\n",
    "for movie_id, clip in dataset_speech.items():\n",
    "    # Loop through the speeches of this clip \n",
    "    for clip_id, speeches in clip.items():\n",
    "        # Loop through the speeches \n",
    "        for speech in speeches:\n",
    "            speech_start = convert_speech_time_to_datetime(speech[\"start_time\"])              \n",
    "            speech_end = convert_speech_time_to_datetime(speech[\"end_time\"])\n",
    "            # Initialize the interaction list\n",
    "            speech[\"interaction\"] = list()\n",
    "            # Check \n",
    "            if clip_id in clip_interactions[movie_id].keys():\n",
    "                # Loop trough this clip interactions\n",
    "                for i,interaction in enumerate(clip_interactions[movie_id][clip_id]): \n",
    "                    # Check the validity of the timestamps\n",
    "                    if interaction[\"start_time\"] not in[None,-1]  and interaction[\"end_time\"] not in [None,-1]:\n",
    "                        # Convert the time to datetime \n",
    "                        interaction_start = convert_interaction_time_to_datetime(interaction[\"start_time\"])\n",
    "                        interaction_end =  convert_interaction_time_to_datetime(interaction[\"end_time\"])\n",
    "                        # Check if the speech belongs to this interaction \n",
    "                        #if speech_interval[\"start\"]<=interaction_start and speech_interval[\"end\"]>=interaction_stop  \\\n",
    "                        #or (speech_interval[\"start\"]>=interaction_start and speech_interval[\"end\"]<=interaction_stop) \\\n",
    "                        #or (speech_interval[\"start\"]>=interaction_start and speech_interval[\"end\"]<=interaction_stop) :\n",
    "                        if time_ranges_overlap(interaction_start, interaction_end, speech_start, speech_end):\n",
    "                            # Append to the list\n",
    "                            speech[\"interaction\"].append(i)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'characters': defaultdict(set,\n",
       "              {'performed_by': {'groomsman #1',\n",
       "                'groomsman #2',\n",
       "                'groomsman #3'},\n",
       "               'towards': set()}),\n",
       "  'summary': 'talks (to/with)',\n",
       "  'start_time': 0,\n",
       "  'end_time': 13.096418,\n",
       "  'frame_start': -1,\n",
       "  'frame_end': -1,\n",
       "  'image_files': []},\n",
       " {'characters': defaultdict(set,\n",
       "              {'performed_by': {'Casey'},\n",
       "               'towards': {'groomsman #1', 'groomsman #2', 'groomsman #3'}}),\n",
       "  'summary': 'watches (something/someone/with)',\n",
       "  'start_time': 0,\n",
       "  'end_time': 4,\n",
       "  'frame_start': -1,\n",
       "  'frame_end': -1,\n",
       "  'image_files': [],\n",
       "  'reason': \"she's interested in them\"},\n",
       " {'characters': defaultdict(set,\n",
       "              {'performed_by': {'Jane'}, 'towards': {'Casey'}}),\n",
       "  'summary': 'yells (at)',\n",
       "  'start_time': 4,\n",
       "  'end_time': 6.5,\n",
       "  'frame_start': -1,\n",
       "  'frame_end': -1,\n",
       "  'image_files': [],\n",
       "  'reason': 'for thinking about sex all the time'}]"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clip_interactions[\"tt0988595\"][20]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'transcript': ['you in ?', 'yeah .'],\n",
       "  'start_time': '00:00:00.000',\n",
       "  'end_time': '00:00:00.048',\n",
       "  'interaction': [0, 1]},\n",
       " {'transcript': ['[ grunt ##s ]', '[ horn hon ##ks ]'],\n",
       "  'start_time': '00:00:01.690',\n",
       "  'end_time': '00:00:04.921',\n",
       "  'interaction': [0, 1]},\n",
       " {'transcript': ['hey ! hey ! you are down to 260 .'],\n",
       "  'start_time': '00:00:06.294',\n",
       "  'end_time': '00:00:10.253',\n",
       "  'interaction': [0]},\n",
       " {'transcript': ['are you sure you wanna keep this up ?'],\n",
       "  'start_time': '00:00:10.365',\n",
       "  'end_time': '00:00:12.856',\n",
       "  'interaction': [0]},\n",
       " {'transcript': ['no !', 'okay , then .'],\n",
       "  'start_time': '00:00:12.968',\n",
       "  'end_time': '00:00:15.562',\n",
       "  'interaction': [0]}]"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset_speech[\"tt0988595\"][20]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "A = convert_interaction_time_to_datetime(0)\n",
    "B = convert_interaction_time_to_datetime(13.096418)\n",
    "\n",
    "C = convert_speech_time_to_datetime(\"00:00:00.000\")\n",
    "D = convert_speech_time_to_datetime(\"00:00:00.048\")\n",
    "\n",
    "time_ranges_overlap(A,B,C,D)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "datetime.timedelta(microseconds=48000)"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "get_duration(C,D)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "get_overlap_duration(A,B,C,D)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "speech_without_interactions = defaultdict(list)\n",
    "count = 0\n",
    "\n",
    "for movie_id, clips in dataset_speech.items(): \n",
    "    speech_without_interactions[movie_id] = defaultdict(int)\n",
    "    for clip_id, speeches in clips.items():\n",
    "        for speech in speeches:\n",
    "            if speech[\"interaction\"]==[]: \n",
    "                speech_without_interactions[movie_id][clip_id]+=1\n",
    "                count+=1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "67823"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "count"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'characters': defaultdict(set,\n",
       "              {'towards': {'Jane'}, 'performed_by': {'Taxi Driver Khaleel'}}),\n",
       "  'summary': 'watches (something/someone/with)',\n",
       "  'start_time': 3.5,\n",
       "  'end_time': 5.5,\n",
       "  'frame_start': -1,\n",
       "  'frame_end': -1,\n",
       "  'image_files': []},\n",
       " {'characters': defaultdict(set,\n",
       "              {'towards': {'Jane'}, 'performed_by': {'Taxi Driver Khaleel'}}),\n",
       "  'summary': 'reassures',\n",
       "  'start_time': 10,\n",
       "  'end_time': 12.5,\n",
       "  'frame_start': -1,\n",
       "  'frame_end': -1,\n",
       "  'image_files': []}]"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clip_interactions[\"tt0988595\"][13]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'transcript': ['to be joined in holy mat ##rim ##ony .'],\n",
       "  'start_time': '00:00:00.000',\n",
       "  'end_time': '00:00:00.064',\n",
       "  'interaction': []},\n",
       " {'transcript': ['oh , wow .', 'sorry .'],\n",
       "  'start_time': '00:00:00.172',\n",
       "  'end_time': '00:00:03.141',\n",
       "  'interaction': []},\n",
       " {'transcript': ['taxi !'],\n",
       "  'start_time': '00:00:03.242',\n",
       "  'end_time': '00:00:06.678',\n",
       "  'interaction': [0]},\n",
       " {'transcript': ['great .'],\n",
       "  'start_time': '00:00:06.779',\n",
       "  'end_time': '00:00:10.112',\n",
       "  'interaction': []},\n",
       " {'transcript': ['thanks . 31 water street . brooklyn .'],\n",
       "  'start_time': '00:00:10.215',\n",
       "  'end_time': '00:00:13.673',\n",
       "  'interaction': [1]},\n",
       " {'transcript': ['okay . i will give you $ 300 flat . . .'],\n",
       "  'start_time': '00:00:13.786',\n",
       "  'end_time': '00:00:16.653',\n",
       "  'interaction': []},\n",
       " {'transcript': ['for the whole night on one condition', 'yeah .'],\n",
       "  'start_time': '00:00:16.755',\n",
       "  'end_time': '00:00:18.723',\n",
       "  'interaction': []},\n",
       " {'transcript': [\"you don ' t look in the rear view mirror or i de ##du ##ct .\"],\n",
       "  'start_time': '00:00:18.824',\n",
       "  'end_time': '00:00:21.019',\n",
       "  'interaction': []},\n",
       " {'transcript': ['deal ? great .', 'yeah .'],\n",
       "  'start_time': '00:00:21.126',\n",
       "  'end_time': '00:00:23.094',\n",
       "  'interaction': []}]"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset_speech[\"tt0988595\"][13]"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Emotions processing"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Extraction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialize the emotions set\n",
    "dataset_emotions = defaultdict(dict)\n",
    "\n",
    "# Loop through the movies\n",
    "for movie in dataset.keys():\n",
    "    # Loop through the clips\n",
    "    for clip in dataset[movie].clip_graphs.items(): \n",
    "        # Extract the emotions for the current clip \n",
    "        dataset_emotions[movie][clip[0]] = extract_utils.extract_characters_and_emotions(clip[1].orig_graph_json)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Sample results \n",
    "\n",
    "Movie tt0988595, Clip 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "defaultdict(dict,\n",
       "            {'Jane': ['happy', 'responsible', 'understanding'],\n",
       "             'wedding guests': [],\n",
       "             'Hal': ['sad'],\n",
       "             'Tess': ['needy']})"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset_emotions[\"tt0988595\"][2]"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Generate the emotions insertion script"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "insert_utils.insert_characters_and_emotions(dataset_emotions)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Characters attributes processing"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Extraction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialize the emotions set\n",
    "dataset_attributes = defaultdict(dict)\n",
    "\n",
    "# Loop through the movies\n",
    "for movie in dataset.keys():\n",
    "    # Loop through the clips\n",
    "    for clip in dataset[movie].clip_graphs.items(): \n",
    "        # Extract the attributes for the current clip \n",
    "        dataset_attributes[movie][clip[0]] = extract_utils.extract_characters_and_attributes(clip[1].orig_graph_json)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Attributes persistance \n",
    "\n",
    "Ethnicity and gender "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "def persist_attributes(dataset_attributes, movie_id, persistence_list): \n",
    "    \"\"\"_summary_\n",
    "\n",
    "    Args:\n",
    "        dataset_attributes (_type_): _description_\n",
    "        movie_id (_type_): _description_\n",
    "        persistence_list (_type_): _description_\n",
    "    \"\"\"\n",
    "\n",
    "    # Loop through the elements \n",
    "    for clip_id, elements in dataset_attributes[movie_id].items(): \n",
    "        # Loop through the character and attributes \n",
    "        for character, attributes in elements.items(): \n",
    "            # Check if there's a persistent attribute \n",
    "            if persistence_list[movie_id][character]!=[]: \n",
    "                # Loop through the persistence list \n",
    "                for element in persistence_list[movie_id][character]: \n",
    "                    # Get the key, value \n",
    "                    key = list(element.keys())[0]\n",
    "                    value = list(element.values())[0]\n",
    "                    # Update the dataset attribute\n",
    "                    dataset_attributes[movie_id][clip_id][character][key] = value\n",
    "    \n",
    "    return dataset_attributes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialize the persistence list \n",
    "persistence_list = defaultdict(dict)\n",
    "\n",
    "# Loop through the movies and clips\n",
    "for movie_id, clips in dataset_attributes.items(): \n",
    "    # Loop through the clip_ids and characters\n",
    "    for clip_id, elements in clips.items(): \n",
    "        # Loop through the character and attributes \n",
    "        for character, attributes in elements.items(): \n",
    "            # Initialize the persistence list\n",
    "            persistence_list[movie_id][character] = list()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Loop through the movies and clips\n",
    "for movie_id, clips in dataset_attributes.items(): \n",
    "    # Loop through the clip_ids and characters\n",
    "    for clip_id, elements in clips.items(): \n",
    "        # Loop through the character and attributes \n",
    "        for character, attributes in elements.items(): \n",
    "            # Loop through the attributes\n",
    "            for key,value in attributes.items(): \n",
    "                # Check if the attribute is persistent\n",
    "                if key in [\"gender\",\"ethnicity\"] and {key:value} not in persistence_list[movie_id][character]: \n",
    "                    # Update the persistence list \n",
    "                    persistence_list[movie_id][character].append({key:value})\n",
    "\n",
    "# Persist the attributes\n",
    "for movie_id in persistence_list:\n",
    "   dataset_attributes = persist_attributes(dataset_attributes,movie_id,persistence_list)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Sample results\n",
    "\n",
    "Movie tt0988595, Clip 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "defaultdict(dict,\n",
       "            {'Jane': {'age': 'kid',\n",
       "              'gender': 'female',\n",
       "              'ethnicity': 'caucasian'},\n",
       "             'wedding guests': {},\n",
       "             'Hal': {'gender': 'male',\n",
       "              'age': 'adult',\n",
       "              'ethnicity': 'caucasian'},\n",
       "             'Tess': {'gender': 'female',\n",
       "              'age': 'kid',\n",
       "              'ethnicity': 'caucasian'}})"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset_attributes[\"tt0988595\"][2]"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Generate the characters attributes insertion scripts "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "insert_utils.insert_characters_and_attributes(dataset_attributes)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Extract the relationships"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialize the emotions set\n",
    "dataset_relationships = defaultdict(dict)\n",
    "\n",
    "# Loop through the movies\n",
    "for movie in dataset.keys():\n",
    "    # Loop through the clips\n",
    "    for clip in dataset[movie].clip_graphs.items(): \n",
    "        # Extract the relationships for the current clip \n",
    "        relationships = extract_utils.extract_relationships(clip[1].orig_graph_json)\n",
    "        # Check if it's not null\n",
    "        if relationships!=[]:\n",
    "            # Append to the relationships dataset\n",
    "            dataset_relationships[movie][clip[0]] = relationships"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Sample results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'type': 'family', 'subject': 'Hal', 'subject_role': 'parent', 'object': 'Tess', 'object_role': 'child'}, {'type': 'family', 'subject': 'Hal', 'subject_role': 'parent', 'object': 'Jane', 'object_role': 'child'}, {'type': 'family', 'subject': 'Jane', 'subject_role': 'other family', 'object': 'Tess', 'object_role': 'other family'}]\n"
     ]
    }
   ],
   "source": [
    "# Loop through the movies and clips\n",
    "for movie_id, clips in dataset_relationships.items():\n",
    "    # Loop through the clips relationships\n",
    "    for clip_id, relationships in clips.items():\n",
    "        # Display the relationships \n",
    "        print(relationships)\n",
    "        break\n",
    "    break"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Generate the relationships insertion script"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "insert_utils.insert_relationships(dataset_relationships)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Convert to PyTorch/Tensorflow format : Interaction prediction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialize the object \n",
    "nx_dataset = defaultdict(dict)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## From dict data to Nx graph (TF-GNN)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Graph generation function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_graph(graph):\n",
    "    \"\"\"_summary_\n",
    "    \"\"\"\n",
    "\n",
    "    # Initialize the scene graph \n",
    "    scene_graph = nx.DiGraph()\n",
    "    \n",
    "    # Get the original json\n",
    "    clip_graph = graph.orig_graph_json\n",
    "    # Get the place\n",
    "    place = re.sub(r'[\"\\n]', '',clip_graph[\"scene\"]).strip() if \"scene\" in clip_graph else None\n",
    "    # Get the context \n",
    "    context = re.sub(r'[\"\\n]', '',clip_graph[\"situation\"]).strip() if \"situation\" in clip_graph else None\n",
    "\n",
    "    # Create the Scene node\n",
    "    scene_graph.add_node(\"Scene\",type=\"Scene\",color=\"#16a5a5\")\n",
    "        \n",
    "    if place is not None: \n",
    "        scene_graph.add_node(place,type=\"Place\",color=\"#4C8EDA\")\n",
    "        scene_graph.add_edge(\"Scene\",place,type=\"location\")\n",
    "\n",
    "    if context is not None:\n",
    "        scene_graph.add_node(context,type=\"Context\",color=\"#4C8EDA\")\n",
    "        scene_graph.add_edge(\"Scene\",context,type=\"circumstance\")\n",
    "    \n",
    "    # Check if there are some characters within this scene \n",
    "    if clip_id in clip_characters[movie_id].keys():\n",
    "        # Loop through the characters\n",
    "        for character in clip_characters[movie_id][clip_id]:\n",
    "            # Insert characters \n",
    "            scene_graph.add_node(character,type=\"character\",color=\"#4C8EDA\")\n",
    "            # Insert the edges \n",
    "            scene_graph.add_edge(\"Scene\",character,type=\"features\")\n",
    "\n",
    "        # Insert the characters attributes \n",
    "        if clip_id in dataset_attributes[movie_id].keys():\n",
    "            # Loop through the characters \n",
    "            for character, attributes in dataset_attributes[movie_id][clip_id].items():\n",
    "                # Check if the attributes are not empty\n",
    "                if attributes is not {}:\n",
    "                    # Loop through the attributes \n",
    "                    for key,value in attributes.items():\n",
    "                        # Check if the attribute doesn't have the name of an existing node \n",
    "                        if value not in list(scene_graph.nodes) or (value in list(scene_graph.nodes) and scene_graph.nodes[value][\"type\"]==\"attribute\"):\n",
    "                            # Insert the attribute \n",
    "                            scene_graph.add_node(value,name=key,type=\"attribute\",color=\"#fb9e00\")\n",
    "                            # Insert the edge between the character and its attribute \n",
    "                            scene_graph.add_edge(character,value,type=\"possesses\")\n",
    "                        else:\n",
    "                            # Insert the attribute \n",
    "                            scene_graph.add_node(value+\":attribute\",name=key,type=\"attribute\",color=\"#fb9e00\")\n",
    "                            # Insert the edge between the character and its attribute \n",
    "                            scene_graph.add_edge(character,value+\":attribute\",type=\"possesses\")\n",
    "                \n",
    "        # Insert the relationships between characters \n",
    "        if clip_id in dataset_relationships[movie_id].keys():\n",
    "            # Initialize the counter \n",
    "            counter=0\n",
    "            # Loop through the relationships\n",
    "            for relationship in dataset_relationships[movie_id][clip_id]: \n",
    "                # Insert the relationship node\n",
    "                scene_graph.add_node(relationship[\"type\"]+\":\"+str(counter),type=\"Relationship\",color=\"#4C8EDA\")\n",
    "                # Insert the roles \n",
    "                # Subject\n",
    "                scene_graph.add_edge(relationship[\"subject\"],relationship[\"type\"]+\":\"+str(counter), type=\"linked_to\", role=f\"{relationship['subject_role']}\")\n",
    "                # Object \n",
    "                scene_graph.add_edge(relationship[\"object\"], relationship[\"type\"]+\":\"+str(counter), type=\"linked_to\", role=f\"{relationship['object_role']}\")\n",
    "                # Increase the counter \n",
    "                counter+=1\n",
    "            \n",
    "        # Insert the characters emotions \n",
    "        if clip_id in dataset_emotions[movie_id].keys():\n",
    "            # Loop through the characters and emotions list\n",
    "            for character, emotions in dataset_emotions[movie_id][clip_id].items():\n",
    "                # Loop through the emotions of a specific character\n",
    "                for emotion in emotions: \n",
    "                    # Check if the emotion doesn't have the name of an attribute \n",
    "                    if emotion not in list(scene_graph.nodes) or (emotion in list(scene_graph.nodes) and scene_graph.nodes[emotion][\"type\"]==\"Emotion\"):\n",
    "                        # Insert the emotion \n",
    "                        scene_graph.add_node(emotion, type=\"Emotion\", color=\"#4C8EDA\")\n",
    "                        # Insert the edge between the character and the emotion \n",
    "                        scene_graph.add_edge(character, emotion, type=\"expresses\")\n",
    "                    else: \n",
    "                        # Insert the emotion \n",
    "                        scene_graph.add_node(emotion+\":emotion\", type=\"Emotion\", color=\"#4C8EDA\")\n",
    "                        # Insert the edge between the character and the emotion \n",
    "                        scene_graph.add_edge(character, emotion+\":emotion\", type=\"expresses\")\n",
    "\n",
    "\n",
    "\n",
    "    return scene_graph"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Interaction class extraction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Search interaction \n",
    "def get_interaction_class(summary, interaction_list:dict): \n",
    "    \"\"\"_summary_\n",
    "\n",
    "    Args:\n",
    "        interaction_list (_type_): _description_\n",
    "    \"\"\"\n",
    "\n",
    "    # Loop through the elements\n",
    "    for i,key in enumerate(interaction_list.keys()): \n",
    "        # Check if the interaction matches \n",
    "        if summary==key:\n",
    "            return i\n",
    "    \n",
    "    raise f\"Could not find the interaction class for {summary}\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "interaction_list = get_kukleva_merged_interactions()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Conversion"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialize the object \n",
    "nx_dataset = defaultdict(dict)\n",
    "\n",
    "# Loop through the movies and clips \n",
    "for movie_id, clips in dataset.items():\n",
    "    # Loop through the scenes_id and graphs\n",
    "    for clip_id, graph in clips.clip_graphs.items():                 \n",
    "        # Check if there are some interactions within this scene \n",
    "        if clip_id in clip_interactions[movie_id].keys(): \n",
    "            # Loop through the interactions \n",
    "            for interaction_id,interaction in enumerate(clip_interactions[movie_id][clip_id]):\n",
    "                # Generate the scene graph\n",
    "                scene_graph = generate_graph(graph)\n",
    "                # Check if the summary is not none\n",
    "                if \"summary\" in interaction.keys():\n",
    "                    # Get the summary class \n",
    "                    summary_class = get_interaction_class(interaction[\"summary\"],interaction_list)\n",
    "                    # Insert the interaction\n",
    "                    scene_graph.add_node(summary_class,\n",
    "                                         type=\"Interaction\",\n",
    "                                         frame_start=interaction[\"frame_start\"],\n",
    "                                         frame_end=interaction[\"frame_end\"],\n",
    "                                         color=\"#4C8EDA\")\n",
    "                    scene_graph.add_edge(\"Scene\",summary_class,type=\"has\")\n",
    "                    # Insert the clip representation \n",
    "                    scene_graph.add_node(\"Frames\", type=\"Frames\", files=interaction[\"image_files\"], color=\"#4C8EDA\")\n",
    "                    # Add the edge between the interaction and the frames \n",
    "                    scene_graph.add_edge(\"Frames\", summary_class, type=\"images\")\n",
    "                    # Insert the roles \n",
    "                    # Towards\n",
    "                    if \"towards\" in interaction[\"characters\"].keys():\n",
    "                        # Loop through the characters \n",
    "                        for character in interaction[\"characters\"][\"towards\"]:\n",
    "                            # Insert the roles\n",
    "                            scene_graph.add_edge(summary_class,character, type=\"involves\", role=\"towards\")\n",
    "\n",
    "                    # Performed by\n",
    "                    if \"performed_by\" in interaction[\"characters\"].keys():\n",
    "                        # Loop through the characters\n",
    "                        for character in interaction[\"characters\"][\"performed_by\"]:\n",
    "                            # Insert the roles\n",
    "                            scene_graph.add_edge(summary_class,character, type=\"involves\", role=\"performed_by\")\n",
    "\n",
    "                # Insert the subtitles\n",
    "                if clip_id in dataset_speech[movie_id].keys(): \n",
    "                    # Loop through the speeches and tokens list \n",
    "                    for speech_number, speech in enumerate(dataset_speech[movie_id][clip_id]):\n",
    "                        # Check \n",
    "                        if interaction_id in speech[\"interaction\"]: \n",
    "                            for line_number, line in enumerate(speech[\"transcript\"]): \n",
    "                                #print(f\"yes {movie_id} - {clip_id} - {i}\", end=\"\\r\")\n",
    "                                # Add the node \n",
    "                                scene_graph.add_node(f\"Speech_{movie_id}_{interaction_id}_{speech_number}_{line_number}\", type=\"Speech\", color=\"#4C8EDA\", transcript=line)\n",
    "                                # Add an edge \n",
    "                                scene_graph.add_edge(summary_class, f\"Speech_{movie_id}_{interaction_id}_{speech_number}_{line_number}\", type=\"has_subs\")\n",
    "\n",
    "                #if \"image_files\" in interaction.keys() and interaction[\"image_files\"]!=[]:\n",
    "                # Set the clip graph for this interaction\n",
    "                nx_dataset[movie_id+\"_\"+str(clip_id)+\"_\"+str(interaction_id)] = scene_graph"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "NodeView(('Scene', 'dining room', 'talk about work', 'Tom Hagen', 'Sonny Corleone', 'Clemenza', 'Michael Corleone', 'Tessio', 'male', 'caucasian', 'worried', 'scheming', 'nervous', 'frustrated', 23, 'Frames', 'Speech_tt0068646_0_0_0', 'Speech_tt0068646_0_1_0', 'Speech_tt0068646_0_2_0', 'Speech_tt0068646_0_3_0', 'Speech_tt0068646_0_4_0', 'Speech_tt0068646_0_4_1', 'Speech_tt0068646_0_4_2', 'Speech_tt0068646_0_4_3', 'Speech_tt0068646_0_5_0', 'Speech_tt0068646_0_6_0', 'Speech_tt0068646_0_6_1', 'Speech_tt0068646_0_7_0', 'Speech_tt0068646_0_8_0', 'Speech_tt0068646_0_9_0', 'Speech_tt0068646_0_10_0', 'Speech_tt0068646_0_11_0', 'Speech_tt0068646_0_12_0', 'Speech_tt0068646_0_13_0', 'Speech_tt0068646_0_14_0', 'Speech_tt0068646_0_14_1', 'Speech_tt0068646_0_15_0', 'Speech_tt0068646_0_16_0', 'Speech_tt0068646_0_17_0', 'Speech_tt0068646_0_18_0', 'Speech_tt0068646_0_19_0', 'Speech_tt0068646_0_20_0'))"
      ]
     },
     "execution_count": 65,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "nx_dataset[\"tt0068646_100_0\"].nodes"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Save the nx dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "17338"
      ]
     },
     "execution_count": 66,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(nx_dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Nx dataset saved successfully.\n"
     ]
    }
   ],
   "source": [
    "# Create the file if it doesn't exist \n",
    "with open(\"nx_dataset_V3.pkl\", \"wb\") as file: \n",
    "    # Save the dictionary \n",
    "    pickle.dump(nx_dataset,file)\n",
    "    # Print a success message £\n",
    "    print(\"Nx dataset saved successfully.\")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Display sample results\n",
    "Movie : tt0988595, Scene : 2, Interaction 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [],
   "source": [
    "A = nx_dataset[\"tt0988595_4_1\"]\n",
    "node_colors = [A.nodes[node][\"color\"] for node in A.nodes]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "NodeView(('Scene', 'church', 'wedding', 'Jane', 'Father', 'UNLISTED CHARACTER', 'Tess', 'Cousin Lisa', 'kid', 'female', 'caucasian', 'male', 'adult', 'family:0', 'happy', 'helpful', 'proud', 'relieved', 44, 'Frames', 'Speech_tt0988595_1_3_0', 'Speech_tt0988595_1_3_1', 'Speech_tt0988595_1_4_0', 'Speech_tt0988595_1_5_0'))"
      ]
     },
     "execution_count": 69,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "A.nodes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [],
   "source": [
    "#plt.figure(figsize=(30,14))\n",
    "\n",
    "#nx.draw_networkx(A, node_size=800, node_color=node_colors, pos=nx.circular_layout(A), font_size=14)\n",
    "#edge_labels = nx.draw_networkx_edge_labels(A, pos=nx.circular_layout(A), label_pos=0.4, font_size=10, clip_on=False)\n",
    "\n",
    "#plt.show()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Convert to PyTorch/Tensorflow format : Relaltionship prediction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialize the object \n",
    "nx_dataset = defaultdict(dict)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## From dict data to Nx graph (TF-GNN)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Graph generation function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 154,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_graph_2(graph):\n",
    "    \"\"\"_summary_\n",
    "    \"\"\"\n",
    "\n",
    "    # Initialize the scene graph \n",
    "    scene_graph = nx.DiGraph()\n",
    "    \n",
    "    # Get the original json\n",
    "    clip_graph = graph.orig_graph_json\n",
    "    # Get the place\n",
    "    place = re.sub(r'[\"\\n]', '',clip_graph[\"scene\"]).strip() if \"scene\" in clip_graph else None\n",
    "    # Get the context \n",
    "    context = re.sub(r'[\"\\n]', '',clip_graph[\"situation\"]).strip() if \"situation\" in clip_graph else None\n",
    "\n",
    "    # Create the Scene node\n",
    "    scene_graph.add_node(\"Scene\",type=\"Scene\",color=\"#16a5a5\")\n",
    "        \n",
    "    if place is not None: \n",
    "        scene_graph.add_node(place,type=\"Place\",color=\"#4C8EDA\")\n",
    "        scene_graph.add_edge(\"Scene\",place,type=\"location\")\n",
    "\n",
    "    if context is not None:\n",
    "        scene_graph.add_node(context,type=\"Context\",color=\"#4C8EDA\")\n",
    "        scene_graph.add_edge(\"Scene\",context,type=\"circumstance\")\n",
    "    \n",
    "    # Check if there are some characters within this scene \n",
    "    if clip_id in clip_characters[movie_id].keys():\n",
    "        # Loop through the characters\n",
    "        for character in clip_characters[movie_id][clip_id]:\n",
    "            # Insert characters \n",
    "            scene_graph.add_node(character,type=\"character\",color=\"#4C8EDA\")\n",
    "            # Insert the edges \n",
    "            scene_graph.add_edge(\"Scene\",character,type=\"features\")\n",
    "\n",
    "        # Insert the characters attributes \n",
    "        if clip_id in dataset_attributes[movie_id].keys():\n",
    "            # Loop through the characters \n",
    "            for character, attributes in dataset_attributes[movie_id][clip_id].items():\n",
    "                # Check if the attributes are not empty\n",
    "                if attributes is not {}:\n",
    "                    # Loop through the attributes \n",
    "                    for key,value in attributes.items():\n",
    "                        # Check if the attribute doesn't have the name of an existing node \n",
    "                        if value not in list(scene_graph.nodes) or (value in list(scene_graph.nodes) and scene_graph.nodes[value][\"type\"]==\"attribute\"):\n",
    "                            # Insert the attribute \n",
    "                            scene_graph.add_node(value,name=key,type=\"attribute\",color=\"#fb9e00\")\n",
    "                            # Insert the edge between the character and its attribute \n",
    "                            scene_graph.add_edge(character,value,type=\"possesses\")\n",
    "                        else:\n",
    "                            # Insert the attribute \n",
    "                            scene_graph.add_node(value+\":attribute\",name=key,type=\"attribute\",color=\"#fb9e00\")\n",
    "                            # Insert the edge between the character and its attribute \n",
    "                            scene_graph.add_edge(character,value+\":attribute\",type=\"possesses\")\n",
    "            \n",
    "        # Insert the characters emotions \n",
    "        if clip_id in dataset_emotions[movie_id].keys():\n",
    "            # Loop through the characters and emotions list\n",
    "            for character, emotions in dataset_emotions[movie_id][clip_id].items():\n",
    "                # Loop through the emotions of a specific character\n",
    "                for emotion in emotions: \n",
    "                    # Check if the emotion doesn't have the name of an attribute \n",
    "                    if emotion not in list(scene_graph.nodes) or (emotion in list(scene_graph.nodes) and scene_graph.nodes[emotion][\"type\"]==\"Emotion\"):\n",
    "                        # Insert the emotion \n",
    "                        scene_graph.add_node(emotion, type=\"Emotion\", color=\"#4C8EDA\")\n",
    "                        # Insert the edge between the character and the emotion \n",
    "                        scene_graph.add_edge(character, emotion, type=\"expresses\")\n",
    "                    else: \n",
    "                        # Insert the emotion \n",
    "                        scene_graph.add_node(emotion+\":emotion\", type=\"Emotion\", color=\"#4C8EDA\")\n",
    "                        # Insert the edge between the character and the emotion \n",
    "                        scene_graph.add_edge(character, emotion+\":emotion\", type=\"expresses\")\n",
    "    \n",
    "    # Check if there's an interaction within this scene\n",
    "    if clip_id in clip_interactions[movie_id].keys(): \n",
    "    # Loop through the interactions \n",
    "        for i,interaction in enumerate(clip_interactions[movie_id][clip_id]):\n",
    "            # Check if the summary is not none\n",
    "            if \"summary\" in interaction.keys():\n",
    "                summary_class = interaction[\"summary\"]\n",
    "                # Insert the interaction\n",
    "                scene_graph.add_node(summary_class,\n",
    "                                     type=\"Interaction\",\n",
    "                                     frame_start=interaction[\"frame_start\"], \n",
    "                                     frame_end = interaction[\"frame_end\"],\n",
    "                                     color=\"#4C8EDA\")\n",
    "                scene_graph.add_edge(\"Scene\",summary_class,type=\"has\")\n",
    "                # Insert the roles \n",
    "                # Towards\n",
    "                if \"towards\" in interaction[\"characters\"].keys():\n",
    "                # Loop through the characters \n",
    "                    for character in interaction[\"characters\"][\"towards\"]:\n",
    "                        # Insert the roles\n",
    "                        scene_graph.add_edge(summary_class,character, type=\"involves\", role=\"towards\")\n",
    "                # Performed by\n",
    "                if \"performed_by\" in interaction[\"characters\"].keys():\n",
    "                    # Loop through the characters\n",
    "                    for character in interaction[\"characters\"][\"performed_by\"]:\n",
    "                        # Insert the roles\n",
    "                        scene_graph.add_edge(summary_class,character, type=\"involves\", role=\"performed_by\")\n",
    "            \n",
    "            if clip_id in dataset_speech[movie_id].keys(): \n",
    "                    # Loop through the speeches and tokens list \n",
    "                    for speech_number, speech in enumerate(dataset_speech[movie_id][clip_id]):\n",
    "                        # Check \n",
    "                        if i in speech[\"interaction\"]: \n",
    "                            for line_number, line in enumerate(speech[\"transcript\"]): \n",
    "                                #print(f\"yes {movie_id} - {clip_id} - {i}\", end=\"\\r\")\n",
    "                                # Add the node \n",
    "                                scene_graph.add_node(f\"Speech_{movie_id}_{i}_{speech_number}_{line_number}\", type=\"Speech\", color=\"#4C8EDA\", transcript=line)\n",
    "                                # Add an edge \n",
    "                                scene_graph.add_edge(summary_class, f\"Speech_{movie_id}_{i}_{speech_number}_{line_number}\", type=\"has_subs\")\n",
    "                \n",
    "                \n",
    "\n",
    "    return scene_graph"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Conversion"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 155,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Search relationship\n",
    "def get_relationship_class(relationship_class, relationship_list:dict): \n",
    "    \"\"\"_summary_\n",
    "\n",
    "    Args:\n",
    "        interaction_list (_type_): _description_\n",
    "    \"\"\"\n",
    "\n",
    "    # Loop through the elements\n",
    "    for i, (_, values) in enumerate(relationship_list.items()): \n",
    "        # Check if the interaction matches \n",
    "        if relationship_class in values:\n",
    "            return i\n",
    "    \n",
    "    raise NameError(f\"Could not find the relationship class for {relationship_class}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 156,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reload imported modules right away (requires the IPython autoreload extension to be loaded)\n",
    "%autoreload now\n",
    "# Merged relationship classes; passed below to get_relationship_class as its lookup dict\n",
    "relationship_list = get_kukleva_merged_relationships()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 168,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Relationships grouped as {movie id: {clip id: relationships}}\n",
    "dataset_relationships = defaultdict(dict)\n",
    "\n",
    "# Visit every movie and each of its (clip id, clip graph) pairs\n",
    "for movie, movie_graphs in dataset.items():\n",
    "    for clip in movie_graphs.clip_graphs.items():\n",
    "        clip_key, clip_graph = clip\n",
    "        # Relationships annotated in this clip's original graph JSON\n",
    "        relationships = extract_utils.extract_kukleva_relationships(clip_graph.orig_graph_json)\n",
    "        # Keep only clips for which something was extracted\n",
    "        if relationships!=[]:\n",
    "            dataset_relationships[movie][clip_key] = relationships"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 169,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scene graphs keyed by <movie_id>_<clip_id>\n",
    "nx_dataset = defaultdict()\n",
    "# Count the types of relationships\n",
    "relationship_counts = defaultdict(int)\n",
    "# Merged class names, built once; indexed by the position that\n",
    "# get_relationship_class returns\n",
    "relationship_names = list(relationship_list.keys())\n",
    "\n",
    "# Loop through the movies and clips \n",
    "for movie_id, clips in dataset.items():\n",
    "    # Loop through the clip ids and graphs\n",
    "    for clip_id, graph in clips.clip_graphs.items():\n",
    "        # Only clips with at least one extracted relationship are converted\n",
    "        if clip_id not in dataset_relationships[movie_id].keys():\n",
    "            continue\n",
    "        # Generate the scene graph\n",
    "        scene_graph = generate_graph_2(graph)\n",
    "        # Guard BEFORE the graph is used: the original check came after\n",
    "        # add_node/add_edge had already been called on it, so it could never help\n",
    "        if scene_graph is None:\n",
    "            continue\n",
    "        # Loop through the relationships of this clip\n",
    "        for i,relationship in enumerate(dataset_relationships[movie_id][clip_id]):\n",
    "            # Position of the merged class containing this raw label\n",
    "            relationship_class = get_relationship_class(relationship[\"class\"], relationship_list)\n",
    "            # Update the count of relationships for this class\n",
    "            relationship_counts[relationship_names[relationship_class]]+=1\n",
    "            # Insert the relationship node.\n",
    "            # NOTE(review): the node key is the class index, so (assuming a networkx graph)\n",
    "            # relationships of the same class in one clip share a node and its id\n",
    "            # attribute keeps the last one - confirm this is intended\n",
    "            scene_graph.add_node(relationship_class,type=\"Relationship\", id=f\"{movie_id}_{clip_id}_{i}\", color=\"#4C8EDA\")\n",
    "            # Subject -> relationship\n",
    "            scene_graph.add_edge(relationship[\"subject\"],relationship_class, type=\"linked_to\")\n",
    "            # Relationship -> object\n",
    "            scene_graph.add_edge(relationship_class, relationship[\"object\"], type=\"linked_to\")\n",
    "        # Store one graph per clip, carrying all of its relationships\n",
    "        nx_dataset[movie_id+\"_\"+str(clip_id)] = scene_graph"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 174,
   "metadata": {},
   "outputs": [],
   "source": [
    "#nx_dataset[\"tt0988595_2\"].nodes(data=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 175,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('tt0988595',\n",
       " {2: [{'class': 'parent', 'subject': 'Hal', 'object': 'Tess'},\n",
       "   {'class': 'parent', 'subject': 'Hal', 'object': 'Jane'},\n",
       "   {'class': 'sibling', 'subject': 'Jane', 'object': 'Tess'}],\n",
       "  3: [{'class': 'cousin', 'subject': 'Cousin Lisa', 'object': 'Jane'},\n",
       "   {'class': 'cousin', 'subject': 'Cousin Lisa', 'object': 'Tess'}],\n",
       "  4: [{'class': 'parent', 'subject': 'Father', 'object': 'Cousin Lisa'}],\n",
       "  5: [{'class': 'customer',\n",
       "    'subject': 'Jane',\n",
       "    'object': 'Bridal Salesgirl #2'},\n",
       "   {'class': 'customer', 'subject': 'Jane', 'object': 'Bridal Salesgirl #1'}],\n",
       "  6: [{'class': 'friend', 'subject': 'Jane', 'object': 'Bride Suzanne'}],\n",
       "  8: [{'class': 'friend', 'subject': 'Jane', 'object': 'Casey'}],\n",
       "  10: [{'class': 'friend', 'subject': 'Bride Suzanne', 'object': 'Casey'},\n",
       "   {'class': 'customer',\n",
       "    'subject': 'Bride Suzanne',\n",
       "    'object': 'photographer'}],\n",
       "  11: [{'class': 'stranger', 'subject': 'Jane', 'object': 'Kevin'}],\n",
       "  12: [{'class': 'customer',\n",
       "    'subject': 'Jane',\n",
       "    'object': 'Taxi Driver Khaleel'}],\n",
       "  15: [{'class': 'friend', 'subject': 'Jane', 'object': 'Hip Bridesmaid'}],\n",
       "  16: [{'class': 'friend', 'subject': 'Shari Rabinowitz', 'object': 'Jane'}],\n",
       "  17: [{'class': 'spouse',\n",
       "    'subject': 'Shari Rabinowitz',\n",
       "    'object': \"Shari's Husband\"}],\n",
       "  20: [{'class': 'stranger', 'subject': 'Casey', 'object': 'groomsman #2'}],\n",
       "  29: [{'class': 'spouse',\n",
       "    'subject': 'Bride Suzanne',\n",
       "    'object': \"Suzanne's husband\"}],\n",
       "  42: [{'class': 'customer',\n",
       "    'subject': 'Kevin',\n",
       "    'object': 'Taxi Driver Khaleel'}],\n",
       "  45: [{'class': 'friend', 'subject': 'Kevin', 'object': 'Trent'},\n",
       "   {'class': 'colleague', 'subject': 'Kevin', 'object': 'Trent'}],\n",
       "  47: [{'class': 'colleague', 'subject': 'Jane', 'object': 'Casey'}],\n",
       "  51: [{'class': 'boss', 'subject': 'Maureen', 'object': 'Kevin'}],\n",
       "  54: [{'class': 'stranger',\n",
       "    'subject': 'Jane',\n",
       "    'object': 'Florist Delivery Guy'}],\n",
       "  57: [{'class': 'boss', 'subject': 'George', 'object': 'Jane'}],\n",
       "  78: [{'class': 'would like to know', 'subject': 'Tess', 'object': 'George'}],\n",
       "  83: [{'class': 'stranger',\n",
       "    'subject': 'Jane',\n",
       "    'object': 'wedding anniversary guests'}],\n",
       "  100: [{'class': 'lover', 'subject': 'Tess', 'object': 'George'}],\n",
       "  105: [{'class': 'friend', 'subject': 'Jane', 'object': 'Pedro'}],\n",
       "  106: [{'class': 'stranger', 'subject': 'Pedro', 'object': 'Tess'},\n",
       "   {'class': 'mentor', 'subject': 'George', 'object': 'Pedro'}],\n",
       "  108: [{'class': 'customer', 'subject': 'George', 'object': 'cashier'},\n",
       "   {'class': 'customer', 'subject': 'Jane', 'object': 'cashier'},\n",
       "   {'class': 'customer', 'subject': 'Tess', 'object': 'cashier'}],\n",
       "  115: [{'class': 'customer', 'subject': 'Jane', 'object': 'delivery man'}],\n",
       "  131: [{'class': 'engaged', 'subject': 'Tess', 'object': 'George'}],\n",
       "  152: [{'class': 'customer', 'subject': 'Jane', 'object': 'Antoine'}],\n",
       "  208: [{'class': 'stranger', 'subject': 'Dive Bartender', 'object': 'Kevin'},\n",
       "   {'class': 'stranger', 'subject': 'Jane', 'object': 'Dive Bartender'}],\n",
       "  215: [{'class': 'stranger', 'subject': 'Kevin', 'object': 'bar patrons'},\n",
       "   {'class': 'stranger', 'subject': 'Jane', 'object': 'bar patrons'}],\n",
       "  217: [{'class': 'stranger', 'subject': 'Jane', 'object': 'Bar Dude'}],\n",
       "  225: [{'class': 'knows by reputation',\n",
       "    'subject': 'Diner Waitress #1',\n",
       "    'object': 'Jane'}],\n",
       "  248: [{'class': 'acquaintance', 'subject': 'Jane', 'object': 'media man'}],\n",
       "  267: [{'class': 'engaged', 'subject': 'George', 'object': 'Tess'}],\n",
       "  277: [{'class': 'customer', 'subject': 'Flo', 'object': 'Hal'}],\n",
       "  293: [{'class': 'knows by reputation',\n",
       "    'subject': 'Trent',\n",
       "    'object': 'Jane'}],\n",
       "  295: [{'class': 'stranger',\n",
       "    'subject': 'Jane',\n",
       "    'object': 'parking attendant'}],\n",
       "  298: [{'class': 'stranger',\n",
       "    'subject': 'Jane',\n",
       "    'object': 'female wedding guest'},\n",
       "   {'class': 'stranger', 'subject': 'Jane', 'object': 'male wedding guest'}],\n",
       "  300: [{'class': 'knows by reputation',\n",
       "    'subject': 'Boat Bride',\n",
       "    'object': 'Jane'}],\n",
       "  314: [{'class': 'spouse', 'subject': 'Jane', 'object': 'Kevin'}]})"
      ]
     },
     "execution_count": 175,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the first (movie id, {clip id: relationships}) entry as a sanity check\n",
    "list(dataset_relationships.items())[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 176,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Nx dataset saved successfully.\n"
     ]
    }
   ],
   "source": [
    "# Open the output file for binary writing: created if missing, overwritten otherwise\n",
    "with open(\"nx_dataset_relationships_multiple.pkl\", \"wb\") as file: \n",
    "    # Save the dictionary \n",
    "    pickle.dump(nx_dataset,file)\n",
    "    # Print a success message \n",
    "    print(\"Nx dataset saved successfully.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 177,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1537"
      ]
     },
     "execution_count": 177,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of scene graphs stored in nx_dataset\n",
    "len(nx_dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "15\n",
      "defaultdict(<class 'int'>, {'parent': 148, 'sibling': 58, 'customer': 89, 'friend': 203, 'stranger': 629, 'lover': 234, 'colleague': 250, 'boss/owner': 130, 'kbr': 121, 'acquaintance': 141, 'enemy': 85, 'worker': 64, 'manager': 76, 'child': 52, 'ex-lover': 18})\n",
      "153.2\n"
     ]
    }
   ],
   "source": [
    "# Number of distinct merged relationship classes that were counted\n",
    "print(len(relationship_counts))\n",
    "# Occurrences per merged class\n",
    "print(relationship_counts)\n",
    "# Mean number of occurrences per class\n",
    "print(np.mean(list(relationship_counts.values())))"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "| Relationship | Count |\n",
    "|---|---|\n",
    "| Parent | 148 |\n",
    "| Sibling | 58 |\n",
    "| Customer | 89 |\n",
    "| Friend | 203 |\n",
    "| Stranger | 629 |\n",
    "| Lover | 234 |\n",
    "| Colleague | 250 |\n",
    "| Boss/Owner | 130 |\n",
    "| KBR | 121 |\n",
    "| Acquaintance | 141 |\n",
    "| Enemy | 85 |\n",
    "| Worker | 64 |\n",
    "| Manager | 76 |\n",
    "| Child | 52 |\n",
    "| Ex-Lover | 18 |\n",
    "\n",
    "\n",
    "Distributions of relationships. \n",
    "\n",
    "Augmenting data for the classes : \n",
    "- Work * 2\n",
    "- Romantic * 2 \n",
    "- Dependency/caretaking * 2 \n",
    "- Hostile * 5"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Generate the oversampled relationships dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[0, 1]"
      ]
     },
     "execution_count": 80,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Scratch check: range(2) yields [0, 1]\n",
    "# NOTE(review): looks like a leftover experiment - candidate for removal\n",
    "[i for i in range(2)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Relationships grouped as {movie id: {clip id: relationships}}\n",
    "dataset_relationships = defaultdict(dict)\n",
    "\n",
    "# Visit every movie and each of its (clip id, clip graph) pairs\n",
    "for movie, movie_graphs in dataset.items():\n",
    "    for clip in movie_graphs.clip_graphs.items():\n",
    "        clip_key, clip_graph = clip\n",
    "        # Relationships annotated in this clip's original graph JSON\n",
    "        relationships = extract_utils.extract_kukleva_relationships(clip_graph.orig_graph_json)\n",
    "        # Keep only clips for which something was extracted\n",
    "        if relationships!=[]:\n",
    "            dataset_relationships[movie][clip_key] = relationships"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scene graphs of the oversampled dataset, one entry per generated copy\n",
    "nx_dataset = defaultdict()\n",
    "# Number of EXTRA copies to generate per merged relationship class:\n",
    "# a class listed with value n yields n + 1 graphs for each of its relationships,\n",
    "# while classes that are not listed yield a single graph.\n",
    "# Author's note: classes with few samples (< mean) that are not among the\n",
    "# classes of interest are duplicated fewer times.\n",
    "relationships_left = defaultdict(int)\n",
    "relationships_left[\"lover\"] = 3\n",
    "relationships_left[\"friend\"] = 2\n",
    "# Fixed: a trailing comma used to turn this value into the tuple (2,)\n",
    "relationships_left[\"boss/owner\"] = 2\n",
    "relationships_left[\"manager\"] = 4\n",
    "relationships_left[\"enemy\"] = 4\n",
    "relationships_left[\"worker\"] = 4\n",
    "relationships_left[\"customer\"] = 4\n",
    "relationships_left[\"colleague\"] = 2\n",
    "# Merged class names, built once; indexed by the position that\n",
    "# get_relationship_class returns\n",
    "relationship_names = list(relationship_list.keys())\n",
    "\n",
    "# Loop through the movies and clips \n",
    "for movie_id, clips in dataset.items():\n",
    "    # Loop through the clip ids and graphs\n",
    "    for clip_id, graph in clips.clip_graphs.items():\n",
    "        # Only clips with at least one extracted relationship are converted\n",
    "        if clip_id not in dataset_relationships[movie_id].keys():\n",
    "            continue\n",
    "        for i,relationship in enumerate(dataset_relationships[movie_id][clip_id]):\n",
    "            # Position and name of the merged class containing this raw label\n",
    "            relationship_class = get_relationship_class(relationship[\"class\"], relationship_list)\n",
    "            class_name = relationship_names[relationship_class]\n",
    "            oversampled = class_name in relationships_left\n",
    "            # Look the repeat count up by the MERGED name, i.e. the same key the\n",
    "            # membership test uses. The raw label was used before, which silently\n",
    "            # gave 0 extra copies whenever the raw label differed from the merged name.\n",
    "            n_copies = relationships_left[class_name] + 1 if oversampled else 1\n",
    "            for j in range(n_copies):\n",
    "                # Each copy gets its own freshly generated scene graph\n",
    "                scene_graph = generate_graph_2(graph)\n",
    "                # Insert the relationship node\n",
    "                scene_graph.add_node(relationship_class,type=\"Relationship\", id=f\"{movie_id}_{clip_id}_{i}\", color=\"#4C8EDA\")\n",
    "                # Subject -> relationship\n",
    "                scene_graph.add_edge(relationship[\"subject\"],relationship_class, type=\"linked_to\")\n",
    "                # Relationship -> object\n",
    "                scene_graph.add_edge(relationship_class, relationship[\"object\"], type=\"linked_to\")\n",
    "\n",
    "                if oversampled:\n",
    "                    # Separate i and j with an underscore: the former str(i)+str(j)\n",
    "                    # made i=1, j=0 collide with the single-copy key of i=10\n",
    "                    graph_key = f\"{movie_id}_{clip_id}_{i}_{j}\"\n",
    "                else:\n",
    "                    graph_key = f\"{movie_id}_{clip_id}_{i}\"\n",
    "                # Set the clip graph for this relationship copy\n",
    "                nx_dataset[graph_key] = scene_graph"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3821"
      ]
     },
     "execution_count": 83,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of scene graphs stored in nx_dataset\n",
    "len(nx_dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Nx dataset saved successfully.\n"
     ]
    }
   ],
   "source": [
    "# Open the output file for binary writing: created if missing, overwritten otherwise\n",
    "with open(\"nx_dataset_relationships_oversampled.pkl\", \"wb\") as file: \n",
    "    # Save the dictionary \n",
    "    pickle.dump(nx_dataset,file)\n",
    "    # Print a success message \n",
    "    print(\"Nx dataset saved successfully.\")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Objectification graphs"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load the files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the annotation table from a semicolon-delimited CSV file\n",
    "annotations = pd.read_csv(\"ObyGaze12_thresh_02.csv\", delimiter=\";\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>idx</th>\n",
       "      <th>util</th>\n",
       "      <th>clip</th>\n",
       "      <th>label</th>\n",
       "      <th>overlap_ratio</th>\n",
       "      <th>concepts</th>\n",
       "      <th>id</th>\n",
       "      <th>movie</th>\n",
       "      <th>srt_name</th>\n",
       "      <th>video_name</th>\n",
       "      <th>graph_number</th>\n",
       "      <th>split</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>tt0108160scene-001.ss-0001.es-0001</td>\n",
       "      <td>Easy Neg</td>\n",
       "      <td>1.00</td>\n",
       "      <td>['']</td>\n",
       "      <td>tt0108160-001</td>\n",
       "      <td>tt0108160</td>\n",
       "      <td>scene-001.ss-0001.es-0001.srt</td>\n",
       "      <td>tt0108160_scene_1.avi</td>\n",
       "      <td>0.0</td>\n",
       "      <td>val</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>tt0108160scene-002.ss-0002.es-0002</td>\n",
       "      <td>Easy Neg</td>\n",
       "      <td>1.00</td>\n",
       "      <td>['']</td>\n",
       "      <td>tt0108160-002</td>\n",
       "      <td>tt0108160</td>\n",
       "      <td>scene-002.ss-0002.es-0002.srt</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>val</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>tt0108160scene-003.ss-0003.es-0006</td>\n",
       "      <td>Not Sure</td>\n",
       "      <td>1.00</td>\n",
       "      <td>['Activities']</td>\n",
       "      <td>tt0108160-003</td>\n",
       "      <td>tt0108160</td>\n",
       "      <td>scene-003.ss-0003.es-0006.srt</td>\n",
       "      <td>tt0108160_scene_3.avi</td>\n",
       "      <td>2.0</td>\n",
       "      <td>val</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>3.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>tt0108160scene-004.ss-0007.es-0017</td>\n",
       "      <td>Easy Neg</td>\n",
       "      <td>0.85</td>\n",
       "      <td>['']</td>\n",
       "      <td>tt0108160-004</td>\n",
       "      <td>tt0108160</td>\n",
       "      <td>scene-004.ss-0007.es-0017.srt</td>\n",
       "      <td>tt0108160_scene_4.avi</td>\n",
       "      <td>3.0</td>\n",
       "      <td>val</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>4.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>tt0108160scene-005.ss-0018.es-0018</td>\n",
       "      <td>Easy Neg</td>\n",
       "      <td>0.85</td>\n",
       "      <td>['']</td>\n",
       "      <td>tt0108160-005</td>\n",
       "      <td>tt0108160</td>\n",
       "      <td>scene-005.ss-0018.es-0018.srt</td>\n",
       "      <td>tt0108160_scene_5.avi</td>\n",
       "      <td>4.0</td>\n",
       "      <td>val</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   idx  util                                clip     label  overlap_ratio  \\\n",
       "1  0.0   1.0  tt0108160scene-001.ss-0001.es-0001  Easy Neg           1.00   \n",
       "2  1.0   0.0  tt0108160scene-002.ss-0002.es-0002  Easy Neg           1.00   \n",
       "3  2.0   1.0  tt0108160scene-003.ss-0003.es-0006  Not Sure           1.00   \n",
       "4  3.0   1.0  tt0108160scene-004.ss-0007.es-0017  Easy Neg           0.85   \n",
       "5  4.0   1.0  tt0108160scene-005.ss-0018.es-0018  Easy Neg           0.85   \n",
       "\n",
       "         concepts             id      movie                       srt_name  \\\n",
       "1            ['']  tt0108160-001  tt0108160  scene-001.ss-0001.es-0001.srt   \n",
       "2            ['']  tt0108160-002  tt0108160  scene-002.ss-0002.es-0002.srt   \n",
       "3  ['Activities']  tt0108160-003  tt0108160  scene-003.ss-0003.es-0006.srt   \n",
       "4            ['']  tt0108160-004  tt0108160  scene-004.ss-0007.es-0017.srt   \n",
       "5            ['']  tt0108160-005  tt0108160  scene-005.ss-0018.es-0018.srt   \n",
       "\n",
       "              video_name  graph_number split  \n",
       "1  tt0108160_scene_1.avi           0.0   val  \n",
       "2                    NaN          -1.0   val  \n",
       "3  tt0108160_scene_3.avi           2.0   val  \n",
       "4  tt0108160_scene_4.avi           3.0   val  \n",
       "5  tt0108160_scene_5.avi           4.0   val  "
      ]
     },
     "execution_count": 86,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Drop rows in which every column is missing\n",
    "annotations.dropna(how=\"all\", inplace=True)\n",
    "# Drop rows whose id column is missing\n",
    "annotations.dropna(subset=[\"id\"], inplace=True)\n",
    "# Tag every remaining row with the val split\n",
    "annotations[\"split\"] = [\"val\"] * len(annotations)\n",
    "# Display the first five rows\n",
    "annotations.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Hard Neg    711\n",
       "Easy Neg    453\n",
       "Not Sure    397\n",
       "Sure        353\n",
       "Name: label, dtype: int64"
      ]
     },
     "execution_count": 87,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Count how many annotation rows carry each label\n",
    "annotations.label.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Hard Neg': 427, 'Not Sure': 238, 'Easy Neg': 272, 'Sure': 212}"
      ]
     },
     "execution_count": 88,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Fraction of each label's rows assigned to training\n",
    "train_ratio = 0.6\n",
    "# Rows per label, computed once and reused for every label below\n",
    "label_counts = annotations.label.value_counts()\n",
    "hard_neg = round(label_counts[\"Hard Neg\"] * train_ratio)\n",
    "not_sure = round(label_counts[\"Not Sure\"] * train_ratio)\n",
    "easy_neg = round(label_counts[\"Easy Neg\"] * train_ratio)\n",
    "sure = round(label_counts[\"Sure\"] * train_ratio)\n",
    "\n",
    "# Rounded number of training rows per label\n",
    "samples_left = {\n",
    "    \"Hard Neg\": hard_neg,\n",
    "    \"Not Sure\": not_sure,\n",
    "    \"Easy Neg\": easy_neg,\n",
    "    \"Sure\": sure,\n",
    "}\n",
    "\n",
    "samples_left"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>idx</th>\n",
       "      <th>util</th>\n",
       "      <th>clip</th>\n",
       "      <th>label</th>\n",
       "      <th>overlap_ratio</th>\n",
       "      <th>concepts</th>\n",
       "      <th>id</th>\n",
       "      <th>movie</th>\n",
       "      <th>srt_name</th>\n",
       "      <th>video_name</th>\n",
       "      <th>graph_number</th>\n",
       "      <th>split</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>817</th>\n",
       "      <td>816.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>tt0822832scene-066.ss-0983.es-0987</td>\n",
       "      <td>Sure</td>\n",
       "      <td>0.88</td>\n",
       "      <td>['Speech']</td>\n",
       "      <td>tt0822832-066</td>\n",
       "      <td>tt0822832</td>\n",
       "      <td>scene-066.ss-0983.es-0987.srt</td>\n",
       "      <td>tt0822832_scene_66.mp4</td>\n",
       "      <td>65.0</td>\n",
       "      <td>val</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1519</th>\n",
       "      <td>1518.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>tt1454029scene-121.ss-1022.es-1025</td>\n",
       "      <td>Easy Neg</td>\n",
       "      <td>0.84</td>\n",
       "      <td>['']</td>\n",
       "      <td>tt1454029-121</td>\n",
       "      <td>tt1454029</td>\n",
       "      <td>scene-121.ss-1022.es-1025.srt</td>\n",
       "      <td>tt1454029_scene_121.mp4</td>\n",
       "      <td>120.0</td>\n",
       "      <td>val</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>143</th>\n",
       "      <td>142.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>tt0110912scene-033.ss-0268.es-0273</td>\n",
       "      <td>Sure</td>\n",
       "      <td>0.22</td>\n",
       "      <td>['Voice', ' Appearance', ' Clothes', ' Look']</td>\n",
       "      <td>tt0110912-033</td>\n",
       "      <td>tt0110912</td>\n",
       "      <td>scene-033.ss-0268.es-0273.srt</td>\n",
       "      <td>tt0110912_scene_33.mkv</td>\n",
       "      <td>32.0</td>\n",
       "      <td>val</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1130</th>\n",
       "      <td>1129.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>tt1142988scene-090.ss-0763.es-0770</td>\n",
       "      <td>Sure</td>\n",
       "      <td>1.00</td>\n",
       "      <td>['Body', ' Clothes', ' Look']</td>\n",
       "      <td>tt1142988-090</td>\n",
       "      <td>tt1142988</td>\n",
       "      <td>scene-090.ss-0763.es-0770.srt</td>\n",
       "      <td>tt1142988_scene_90.mkv</td>\n",
       "      <td>89.0</td>\n",
       "      <td>val</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>835</th>\n",
       "      <td>834.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>tt0822832scene-084.ss-1213.es-1215</td>\n",
       "      <td>Hard Neg</td>\n",
       "      <td>1.00</td>\n",
       "      <td>['Activities']</td>\n",
       "      <td>tt0822832-084</td>\n",
       "      <td>tt0822832</td>\n",
       "      <td>scene-084.ss-1213.es-1215.srt</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>val</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         idx  util                                clip     label  \\\n",
       "817    816.0   1.0  tt0822832scene-066.ss-0983.es-0987      Sure   \n",
       "1519  1518.0   1.0  tt1454029scene-121.ss-1022.es-1025  Easy Neg   \n",
       "143    142.0   1.0  tt0110912scene-033.ss-0268.es-0273      Sure   \n",
       "1130  1129.0   1.0  tt1142988scene-090.ss-0763.es-0770      Sure   \n",
       "835    834.0   0.0  tt0822832scene-084.ss-1213.es-1215  Hard Neg   \n",
       "\n",
       "      overlap_ratio                                       concepts  \\\n",
       "817            0.88                                     ['Speech']   \n",
       "1519           0.84                                           ['']   \n",
       "143            0.22  ['Voice', ' Appearance', ' Clothes', ' Look']   \n",
       "1130           1.00                  ['Body', ' Clothes', ' Look']   \n",
       "835            1.00                                 ['Activities']   \n",
       "\n",
       "                 id      movie                       srt_name  \\\n",
       "817   tt0822832-066  tt0822832  scene-066.ss-0983.es-0987.srt   \n",
       "1519  tt1454029-121  tt1454029  scene-121.ss-1022.es-1025.srt   \n",
       "143   tt0110912-033  tt0110912  scene-033.ss-0268.es-0273.srt   \n",
       "1130  tt1142988-090  tt1142988  scene-090.ss-0763.es-0770.srt   \n",
       "835   tt0822832-084  tt0822832  scene-084.ss-1213.es-1215.srt   \n",
       "\n",
       "                   video_name  graph_number split  \n",
       "817    tt0822832_scene_66.mp4          65.0   val  \n",
       "1519  tt1454029_scene_121.mp4         120.0   val  \n",
       "143    tt0110912_scene_33.mkv          32.0   val  \n",
       "1130   tt1142988_scene_90.mkv          89.0   val  \n",
       "835                       NaN          -1.0   val  "
      ]
     },
     "execution_count": 89,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# shuffle the dataset \n",
    "annotations = annotations.sample(random_state=123, frac=1)\n",
    "annotations.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Easy Neg'"
      ]
     },
     "execution_count": 90,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "annotations.loc[2,\"label\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Loop through the annotations\n",
    "for i in range(1,len(annotations)): \n",
    "    # Get the current annotation \n",
    "    annotation = annotations.loc[i]\n",
    "    # Get the label \n",
    "    label = annotation[\"label\"]\n",
    "    # Check if there's still some train data left\n",
    "    if label in samples_left and samples_left[label]>0:\n",
    "        # Assign to the training samples\n",
    "        annotations.at[i,\"split\"] = \"train\"\n",
    "        # Decrement the samples left for this label\n",
    "        samples_left[label] = samples_left[label] - 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Hard Neg': 0, 'Not Sure': 0, 'Easy Neg': 0, 'Sure': 0}"
      ]
     },
     "execution_count": 92,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "samples_left"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "train    1149\n",
       "val       765\n",
       "Name: split, dtype: int64"
      ]
     },
     "execution_count": 93,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "annotations[\"split\"].value_counts()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Perform data processing on the annotation files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_label(label: str):\n",
    "    if label==\"Hard Neg\":\n",
    "        return 0\n",
    "    elif label==\"Easy Neg\":\n",
    "        return 1\n",
    "    elif label==\"Not Sure\":\n",
    "        return 2\n",
    "    else: \n",
    "        return 3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>idx</th>\n",
       "      <th>util</th>\n",
       "      <th>clip</th>\n",
       "      <th>label</th>\n",
       "      <th>overlap_ratio</th>\n",
       "      <th>concepts</th>\n",
       "      <th>id</th>\n",
       "      <th>movie</th>\n",
       "      <th>srt_name</th>\n",
       "      <th>video_name</th>\n",
       "      <th>graph_number</th>\n",
       "      <th>split</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>817</th>\n",
       "      <td>816.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>tt0822832scene-066.ss-0983.es-0987</td>\n",
       "      <td>3</td>\n",
       "      <td>0.88</td>\n",
       "      <td>['Speech']</td>\n",
       "      <td>tt0822832-066</td>\n",
       "      <td>tt0822832</td>\n",
       "      <td>scene-066.ss-0983.es-0987.srt</td>\n",
       "      <td>tt0822832_scene_66.mp4</td>\n",
       "      <td>65.0</td>\n",
       "      <td>train</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1519</th>\n",
       "      <td>1518.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>tt1454029scene-121.ss-1022.es-1025</td>\n",
       "      <td>1</td>\n",
       "      <td>0.84</td>\n",
       "      <td>['']</td>\n",
       "      <td>tt1454029-121</td>\n",
       "      <td>tt1454029</td>\n",
       "      <td>scene-121.ss-1022.es-1025.srt</td>\n",
       "      <td>tt1454029_scene_121.mp4</td>\n",
       "      <td>120.0</td>\n",
       "      <td>val</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>143</th>\n",
       "      <td>142.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>tt0110912scene-033.ss-0268.es-0273</td>\n",
       "      <td>3</td>\n",
       "      <td>0.22</td>\n",
       "      <td>['Voice', ' Appearance', ' Clothes', ' Look']</td>\n",
       "      <td>tt0110912-033</td>\n",
       "      <td>tt0110912</td>\n",
       "      <td>scene-033.ss-0268.es-0273.srt</td>\n",
       "      <td>tt0110912_scene_33.mkv</td>\n",
       "      <td>32.0</td>\n",
       "      <td>train</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         idx  util                                clip  label  overlap_ratio  \\\n",
       "817    816.0   1.0  tt0822832scene-066.ss-0983.es-0987      3           0.88   \n",
       "1519  1518.0   1.0  tt1454029scene-121.ss-1022.es-1025      1           0.84   \n",
       "143    142.0   1.0  tt0110912scene-033.ss-0268.es-0273      3           0.22   \n",
       "\n",
       "                                           concepts             id      movie  \\\n",
       "817                                      ['Speech']  tt0822832-066  tt0822832   \n",
       "1519                                           ['']  tt1454029-121  tt1454029   \n",
       "143   ['Voice', ' Appearance', ' Clothes', ' Look']  tt0110912-033  tt0110912   \n",
       "\n",
       "                           srt_name               video_name  graph_number  \\\n",
       "817   scene-066.ss-0983.es-0987.srt   tt0822832_scene_66.mp4          65.0   \n",
       "1519  scene-121.ss-1022.es-1025.srt  tt1454029_scene_121.mp4         120.0   \n",
       "143   scene-033.ss-0268.es-0273.srt   tt0110912_scene_33.mkv          32.0   \n",
       "\n",
       "      split  \n",
       "817   train  \n",
       "1519    val  \n",
       "143   train  "
      ]
     },
     "execution_count": 95,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "annotations[\"label\"] = annotations.label.apply(lambda label: process_label(label))\n",
    "annotations.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>idx</th>\n",
       "      <th>util</th>\n",
       "      <th>clip</th>\n",
       "      <th>label</th>\n",
       "      <th>overlap_ratio</th>\n",
       "      <th>concepts</th>\n",
       "      <th>id</th>\n",
       "      <th>movie</th>\n",
       "      <th>srt_name</th>\n",
       "      <th>video_name</th>\n",
       "      <th>graph_number</th>\n",
       "      <th>split</th>\n",
       "      <th>graph_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>817</th>\n",
       "      <td>816.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>tt0822832scene-066.ss-0983.es-0987</td>\n",
       "      <td>3</td>\n",
       "      <td>0.88</td>\n",
       "      <td>['Speech']</td>\n",
       "      <td>tt0822832-066</td>\n",
       "      <td>tt0822832</td>\n",
       "      <td>scene-066.ss-0983.es-0987.srt</td>\n",
       "      <td>tt0822832_scene_66.mp4</td>\n",
       "      <td>65.0</td>\n",
       "      <td>train</td>\n",
       "      <td>tt0822832_66</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1519</th>\n",
       "      <td>1518.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>tt1454029scene-121.ss-1022.es-1025</td>\n",
       "      <td>1</td>\n",
       "      <td>0.84</td>\n",
       "      <td>['']</td>\n",
       "      <td>tt1454029-121</td>\n",
       "      <td>tt1454029</td>\n",
       "      <td>scene-121.ss-1022.es-1025.srt</td>\n",
       "      <td>tt1454029_scene_121.mp4</td>\n",
       "      <td>120.0</td>\n",
       "      <td>val</td>\n",
       "      <td>tt1454029_121</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>143</th>\n",
       "      <td>142.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>tt0110912scene-033.ss-0268.es-0273</td>\n",
       "      <td>3</td>\n",
       "      <td>0.22</td>\n",
       "      <td>['Voice', ' Appearance', ' Clothes', ' Look']</td>\n",
       "      <td>tt0110912-033</td>\n",
       "      <td>tt0110912</td>\n",
       "      <td>scene-033.ss-0268.es-0273.srt</td>\n",
       "      <td>tt0110912_scene_33.mkv</td>\n",
       "      <td>32.0</td>\n",
       "      <td>train</td>\n",
       "      <td>tt0110912_33</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1130</th>\n",
       "      <td>1129.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>tt1142988scene-090.ss-0763.es-0770</td>\n",
       "      <td>3</td>\n",
       "      <td>1.00</td>\n",
       "      <td>['Body', ' Clothes', ' Look']</td>\n",
       "      <td>tt1142988-090</td>\n",
       "      <td>tt1142988</td>\n",
       "      <td>scene-090.ss-0763.es-0770.srt</td>\n",
       "      <td>tt1142988_scene_90.mkv</td>\n",
       "      <td>89.0</td>\n",
       "      <td>train</td>\n",
       "      <td>tt1142988_90</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>835</th>\n",
       "      <td>834.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>tt0822832scene-084.ss-1213.es-1215</td>\n",
       "      <td>0</td>\n",
       "      <td>1.00</td>\n",
       "      <td>['Activities']</td>\n",
       "      <td>tt0822832-084</td>\n",
       "      <td>tt0822832</td>\n",
       "      <td>scene-084.ss-1213.es-1215.srt</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>train</td>\n",
       "      <td>tt0822832_84</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>305</th>\n",
       "      <td>304.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>tt0119822scene-063.ss-0375.es-0394</td>\n",
       "      <td>1</td>\n",
       "      <td>1.00</td>\n",
       "      <td>['']</td>\n",
       "      <td>tt0119822-063</td>\n",
       "      <td>tt0119822</td>\n",
       "      <td>scene-063.ss-0375.es-0394.srt</td>\n",
       "      <td>tt0119822_scene_63.avi</td>\n",
       "      <td>62.0</td>\n",
       "      <td>train</td>\n",
       "      <td>tt0119822_63</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1845</th>\n",
       "      <td>1844.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>tt2267998scene-153.ss-1852.es-1853</td>\n",
       "      <td>1</td>\n",
       "      <td>1.00</td>\n",
       "      <td>['']</td>\n",
       "      <td>tt2267998-153</td>\n",
       "      <td>tt2267998</td>\n",
       "      <td>scene-153.ss-1852.es-1853.srt</td>\n",
       "      <td>tt2267998_scene_153.mp4</td>\n",
       "      <td>152.0</td>\n",
       "      <td>val</td>\n",
       "      <td>tt2267998_153</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1001</th>\n",
       "      <td>1000.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>tt1045658scene-130.ss-1272.es-1291</td>\n",
       "      <td>0</td>\n",
       "      <td>1.00</td>\n",
       "      <td>['Speech', ' Look']</td>\n",
       "      <td>tt1045658-130</td>\n",
       "      <td>tt1045658</td>\n",
       "      <td>scene-130.ss-1272.es-1291.srt</td>\n",
       "      <td>tt1045658_scene_130.mkv</td>\n",
       "      <td>129.0</td>\n",
       "      <td>train</td>\n",
       "      <td>tt1045658_130</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1252</th>\n",
       "      <td>1251.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>tt1193138scene-077.ss-0602.es-0607</td>\n",
       "      <td>0</td>\n",
       "      <td>0.88</td>\n",
       "      <td>['Speech']</td>\n",
       "      <td>tt1193138-077</td>\n",
       "      <td>tt1193138</td>\n",
       "      <td>scene-077.ss-0602.es-0607.srt</td>\n",
       "      <td>tt1193138_scene_77.mkv</td>\n",
       "      <td>76.0</td>\n",
       "      <td>val</td>\n",
       "      <td>tt1193138_77</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>801</th>\n",
       "      <td>800.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>tt0822832scene-050.ss-0840.es-0840</td>\n",
       "      <td>1</td>\n",
       "      <td>0.99</td>\n",
       "      <td>['']</td>\n",
       "      <td>tt0822832-050</td>\n",
       "      <td>tt0822832</td>\n",
       "      <td>scene-050.ss-0840.es-0840.srt</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>train</td>\n",
       "      <td>tt0822832_50</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>100 rows × 13 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         idx  util                                clip  label  overlap_ratio  \\\n",
       "817    816.0   1.0  tt0822832scene-066.ss-0983.es-0987      3           0.88   \n",
       "1519  1518.0   1.0  tt1454029scene-121.ss-1022.es-1025      1           0.84   \n",
       "143    142.0   1.0  tt0110912scene-033.ss-0268.es-0273      3           0.22   \n",
       "1130  1129.0   1.0  tt1142988scene-090.ss-0763.es-0770      3           1.00   \n",
       "835    834.0   0.0  tt0822832scene-084.ss-1213.es-1215      0           1.00   \n",
       "...      ...   ...                                 ...    ...            ...   \n",
       "305    304.0   1.0  tt0119822scene-063.ss-0375.es-0394      1           1.00   \n",
       "1845  1844.0   1.0  tt2267998scene-153.ss-1852.es-1853      1           1.00   \n",
       "1001  1000.0   1.0  tt1045658scene-130.ss-1272.es-1291      0           1.00   \n",
       "1252  1251.0   1.0  tt1193138scene-077.ss-0602.es-0607      0           0.88   \n",
       "801    800.0   0.0  tt0822832scene-050.ss-0840.es-0840      1           0.99   \n",
       "\n",
       "                                           concepts             id      movie  \\\n",
       "817                                      ['Speech']  tt0822832-066  tt0822832   \n",
       "1519                                           ['']  tt1454029-121  tt1454029   \n",
       "143   ['Voice', ' Appearance', ' Clothes', ' Look']  tt0110912-033  tt0110912   \n",
       "1130                  ['Body', ' Clothes', ' Look']  tt1142988-090  tt1142988   \n",
       "835                                  ['Activities']  tt0822832-084  tt0822832   \n",
       "...                                             ...            ...        ...   \n",
       "305                                            ['']  tt0119822-063  tt0119822   \n",
       "1845                                           ['']  tt2267998-153  tt2267998   \n",
       "1001                            ['Speech', ' Look']  tt1045658-130  tt1045658   \n",
       "1252                                     ['Speech']  tt1193138-077  tt1193138   \n",
       "801                                            ['']  tt0822832-050  tt0822832   \n",
       "\n",
       "                           srt_name               video_name  graph_number  \\\n",
       "817   scene-066.ss-0983.es-0987.srt   tt0822832_scene_66.mp4          65.0   \n",
       "1519  scene-121.ss-1022.es-1025.srt  tt1454029_scene_121.mp4         120.0   \n",
       "143   scene-033.ss-0268.es-0273.srt   tt0110912_scene_33.mkv          32.0   \n",
       "1130  scene-090.ss-0763.es-0770.srt   tt1142988_scene_90.mkv          89.0   \n",
       "835   scene-084.ss-1213.es-1215.srt                      NaN          -1.0   \n",
       "...                             ...                      ...           ...   \n",
       "305   scene-063.ss-0375.es-0394.srt   tt0119822_scene_63.avi          62.0   \n",
       "1845  scene-153.ss-1852.es-1853.srt  tt2267998_scene_153.mp4         152.0   \n",
       "1001  scene-130.ss-1272.es-1291.srt  tt1045658_scene_130.mkv         129.0   \n",
       "1252  scene-077.ss-0602.es-0607.srt   tt1193138_scene_77.mkv          76.0   \n",
       "801   scene-050.ss-0840.es-0840.srt                      NaN          -1.0   \n",
       "\n",
       "      split       graph_id  \n",
       "817   train   tt0822832_66  \n",
       "1519    val  tt1454029_121  \n",
       "143   train   tt0110912_33  \n",
       "1130  train   tt1142988_90  \n",
       "835   train   tt0822832_84  \n",
       "...     ...            ...  \n",
       "305   train   tt0119822_63  \n",
       "1845    val  tt2267998_153  \n",
       "1001  train  tt1045658_130  \n",
       "1252    val   tt1193138_77  \n",
       "801   train   tt0822832_50  \n",
       "\n",
       "[100 rows x 13 columns]"
      ]
     },
     "execution_count": 96,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#annotations[\"graph_number\"] = annotations.graph_number.apply(lambda x: str(x))\n",
    "annotations[\"graph_id\"] = annotations[\"id\"].apply(lambda x: x.split(\"-\")[0]+\"_\"+str(int(x.split(\"-\")[1])))\n",
    "annotations.head(100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    711\n",
       "1    453\n",
       "2    397\n",
       "3    353\n",
       "Name: label, dtype: int64"
      ]
     },
     "execution_count": 97,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "annotations.label.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_graph_3(graph, label: str):\n",
    "    \"\"\"_summary_\n",
    "    \"\"\"\n",
    "\n",
    "    # Initialize the scene graph \n",
    "    scene_graph = nx.DiGraph()\n",
    "    # Set the objectification tag for this graph \n",
    "    #scene_graph.__setattr__\n",
    "    \n",
    "    # Get the original json\n",
    "    clip_graph = graph.orig_graph_json\n",
    "    # Get the place\n",
    "    place = re.sub(r'[\"\\n]', '',clip_graph[\"scene\"]).strip() if \"scene\" in clip_graph else None\n",
    "    # Get the context \n",
    "    context = re.sub(r'[\"\\n]', '',clip_graph[\"situation\"]).strip() if \"situation\" in clip_graph else None\n",
    "\n",
    "    # Create the Scene node\n",
    "    scene_graph.add_node(\"Scene\",type=\"Scene\",color=\"#16a5a5\", objectification=label)\n",
    "        \n",
    "    if place is not None: \n",
    "        scene_graph.add_node(place, type=\"Place\",color=\"#4C8EDA\")\n",
    "        scene_graph.add_edge(\"Scene\", place,type=\"location\")\n",
    "\n",
    "    if context is not None:\n",
    "        scene_graph.add_node(context,type=\"Context\",color=\"#4C8EDA\")\n",
    "        scene_graph.add_edge(\"Scene\",context,type=\"circumstance\")\n",
    "    \n",
    "    # Check if there are some characters within this scene \n",
    "    if clip_id in clip_characters[movie_id].keys():\n",
    "        # Loop through the characters\n",
    "        for character in clip_characters[movie_id][clip_id]:\n",
    "            # Insert characters \n",
    "            scene_graph.add_node(character,type=\"character\",color=\"#4C8EDA\")\n",
    "            # Insert the edges \n",
    "            scene_graph.add_edge(\"Scene\",character,type=\"features\")\n",
    "\n",
    "        # Insert the characters attributes \n",
    "        if clip_id in dataset_attributes[movie_id].keys():\n",
    "            # Loop through the characters \n",
    "            for character, attributes in dataset_attributes[movie_id][clip_id].items():\n",
    "                # Check if the attributes are not empty\n",
    "                if attributes is not {}:\n",
    "                    # Loop through the attributes \n",
    "                    for key,value in attributes.items():\n",
    "                        # Check if the attribute doesn't have the name of an existing node \n",
    "                        if value not in list(scene_graph.nodes) or (value in list(scene_graph.nodes) and scene_graph.nodes[value][\"type\"]==\"attribute\"):\n",
    "                            # Insert the attribute \n",
    "                            scene_graph.add_node(value,name=key,type=\"attribute\",color=\"#fb9e00\")\n",
    "                            # Insert the edge between the character and its attribute \n",
    "                            scene_graph.add_edge(character,value,type=\"possesses\")\n",
    "                        else:\n",
    "                            # Insert the attribute \n",
    "                            scene_graph.add_node(value+\":attribute\",name=key,type=\"attribute\",color=\"#fb9e00\")\n",
    "                            # Insert the edge between the character and its attribute \n",
    "                            scene_graph.add_edge(character,value+\":attribute\",type=\"possesses\")\n",
    "            \n",
    "        # Insert the characters emotions \n",
    "        if clip_id in dataset_emotions[movie_id].keys():\n",
    "            # Loop through the characters and emotions list\n",
    "            for character, emotions in dataset_emotions[movie_id][clip_id].items():\n",
    "                # Loop through the emotions of a specific character\n",
    "                for emotion in emotions: \n",
    "                    # Check if the emotion doesn't have the name of an attribute \n",
    "                    if emotion not in list(scene_graph.nodes) or (emotion in list(scene_graph.nodes) and scene_graph.nodes[emotion][\"type\"]==\"Emotion\"):\n",
    "                        # Insert the emotion \n",
    "                        scene_graph.add_node(emotion, type=\"Emotion\", color=\"#4C8EDA\")\n",
    "                        # Insert the edge between the character and the emotion \n",
    "                        scene_graph.add_edge(character, emotion, type=\"expresses\")\n",
    "                    else: \n",
    "                        # Insert the emotion \n",
    "                        scene_graph.add_node(emotion+\":emotion\", type=\"Emotion\", color=\"#4C8EDA\")\n",
    "                        # Insert the edge between the character and the emotion \n",
    "                        scene_graph.add_edge(character, emotion+\":emotion\", type=\"expresses\")\n",
    "                        \n",
    "        # Insert the relationships between characters \n",
    "        if clip_id in dataset_relationships[movie_id].keys():\n",
    "            # Initialize the counter \n",
    "            counter=0\n",
    "            # Loop through the relationships\n",
    "            for relationship in dataset_relationships[movie_id][clip_id]: \n",
    "                # Insert the relationship node\n",
    "                scene_graph.add_node(relationship[\"type\"]+\":\"+str(counter),type=\"Relationship\",color=\"#4C8EDA\")\n",
    "                # Insert the roles \n",
    "                # Subject\n",
    "                scene_graph.add_edge(relationship[\"subject\"],relationship[\"type\"]+\":\"+str(counter), type=\"linked_to\", role=f\"{relationship['subject_role']}\")\n",
    "                # Object \n",
    "                scene_graph.add_edge(relationship[\"object\"], relationship[\"type\"]+\":\"+str(counter), type=\"linked_to\", role=f\"{relationship['object_role']}\")\n",
    "                # Increase the counter \n",
    "                counter+=1\n",
    "    \n",
    "    # Check if there's an interaction within this scene\n",
    "    if clip_id in clip_interactions[movie_id].keys(): \n",
    "    # Loop through the interactions \n",
    "        for i,interaction in enumerate(clip_interactions[movie_id][clip_id]):\n",
    "            # Check if the summary is not none\n",
    "            if \"summary\" in interaction.keys():\n",
    "                summary_class = interaction[\"summary\"]\n",
    "                # Insert the interaction\n",
    "                scene_graph.add_node(summary_class,\n",
    "                                     type=\"Interaction\",\n",
    "                                     frame_start=interaction[\"frame_start\"], \n",
    "                                     frame_end = interaction[\"frame_end\"],\n",
    "                                     color=\"#4C8EDA\")\n",
    "                scene_graph.add_edge(\"Scene\",summary_class,type=\"has\")\n",
    "                # Insert the roles \n",
    "                # Towards\n",
    "                if \"towards\" in interaction[\"characters\"].keys():\n",
    "                # Loop through the characters \n",
    "                    for character in interaction[\"characters\"][\"towards\"]:\n",
    "                        # Insert the roles\n",
    "                        scene_graph.add_edge(summary_class,character, type=\"involves\", role=\"towards\")\n",
    "                # Performed by\n",
    "                if \"performed_by\" in interaction[\"characters\"].keys():\n",
    "                    # Loop through the characters\n",
    "                    for character in interaction[\"characters\"][\"performed_by\"]:\n",
    "                        # Insert the roles\n",
    "                        scene_graph.add_edge(summary_class,character, type=\"involves\", role=\"performed_by\")\n",
    "            \n",
    "            if clip_id in dataset_speech[movie_id].keys(): \n",
    "                    # Loop through the speeches and tokens list \n",
    "                    for speech_number, speech in enumerate(dataset_speech[movie_id][clip_id]):\n",
    "                        # Check \n",
    "                        if i in speech[\"interaction\"]: \n",
    "                            for line_number, line in enumerate(speech[\"transcript\"]): \n",
    "                                #print(f\"yes {movie_id} - {clip_id} - {i}\", end=\"\\r\")\n",
    "                                # Add the node \n",
    "                                scene_graph.add_node(f\"Speech_{movie_id}_{i}_{speech_number}_{line_number}\", type=\"Speech\", color=\"#4C8EDA\", transcript=line)\n",
    "                                # Add an edge \n",
    "                                scene_graph.add_edge(summary_class, f\"Speech_{movie_id}_{i}_{speech_number}_{line_number}\", type=\"has_subs\")\n",
    "                          \n",
    "    return scene_graph"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_dataset(set, name):\n",
    "    \"\"\"_summary_\n",
    "    \"\"\"\n",
    "\n",
    "    # Initialize the object \n",
    "    nx_dataset = defaultdict(dict)\n",
    "\n",
    "    \n",
    "    \n",
    "        # Loop through the movies and clips \n",
    "    for movie_id, clips in dataset.items():\n",
    "        # Loop through the scenes_id and graphs\n",
    "        for clip_id, graph in clips.clip_graphs.items():\n",
    "            if f\"{movie_id}_{clip_id}\" in set[\"graph_id\"].values:\n",
    "                # Get the objectification label for this scene\n",
    "                label = set[set[\"graph_id\"]==f\"{movie_id}_{clip_id}\"][\"label\"].values[0]\n",
    "                # Generate the scene graph\n",
    "                scene_graph = generate_graph_3(graph, label)\n",
    "                # Set the clip graph for this relationship\n",
    "                nx_dataset[movie_id+\"_\"+str(clip_id)] = scene_graph\n",
    "\n",
    "    # Create the file if it doesn't exist \n",
    "    with open(f\"nx_dataset_objectification_{name}_V2.pkl\", \"wb\") as file: \n",
    "        # Save the dictionary \n",
    "        pickle.dump(nx_dataset,file)\n",
    "        # Print a success message \n",
    "        print(\"Nx dataset saved successfully.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_set = annotations[annotations[\"split\"]==\"train\"]\n",
    "val_set = annotations[annotations[\"split\"]==\"val\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [],
   "source": [
    "annotations.to_csv(\"all_objectification.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [],
   "source": [
    "val_set.to_csv(\"raw_objectification.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['tt0822832_66', 'tt0110912_33', 'tt1142988_90', ..., 'tt1142988_2',\n",
       "       'tt1142988_83', 'tt1193138_215'], dtype=object)"
      ]
     },
     "execution_count": 103,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_set[\"graph_id\"].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 104,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\"tt0822832_0000\" in train_set[\"graph_id\"].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Nx dataset saved successfully.\n",
      "Nx dataset saved successfully.\n"
     ]
    }
   ],
   "source": [
    "# Initialize the object \n",
    "nx_dataset = defaultdict(dict)\n",
    "# Get the original relationships dataset\n",
    "# Initialize the emotions set\n",
    "dataset_relationships = defaultdict(dict)\n",
    "\n",
    "# Loop through the movies\n",
    "for movie in dataset.keys():\n",
    "    # Loop through the clips\n",
    "    for clip in dataset[movie].clip_graphs.items(): \n",
    "        # Extract the relationships for the current clip \n",
    "        relationships = extract_utils.extract_relationships(clip[1].orig_graph_json)\n",
    "        # Check if it's not null\n",
    "        if relationships!=[]:\n",
    "            # Append to the relationships dataset\n",
    "            dataset_relationships[movie][clip[0]] = relationships\n",
    "\n",
    "# Generate the training set\n",
    "create_dataset(train_set, \"train\")\n",
    "# Generate the validation set\n",
    "create_dataset(val_set, \"val\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "765"
      ]
     },
     "execution_count": 106,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(val_set)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Generate the pos/neg dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [],
   "source": [
    "annotations2 = pd.read_csv(\"ObyGaze12_thresh_02.csv\", delimiter=\";\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>idx</th>\n",
       "      <th>util</th>\n",
       "      <th>clip</th>\n",
       "      <th>label</th>\n",
       "      <th>overlap_ratio</th>\n",
       "      <th>concepts</th>\n",
       "      <th>id</th>\n",
       "      <th>movie</th>\n",
       "      <th>srt_name</th>\n",
       "      <th>video_name</th>\n",
       "      <th>graph_number</th>\n",
       "      <th>split</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>tt0108160scene-001.ss-0001.es-0001</td>\n",
       "      <td>Easy Neg</td>\n",
       "      <td>1.00</td>\n",
       "      <td>['']</td>\n",
       "      <td>tt0108160-001</td>\n",
       "      <td>tt0108160</td>\n",
       "      <td>scene-001.ss-0001.es-0001.srt</td>\n",
       "      <td>tt0108160_scene_1.avi</td>\n",
       "      <td>0.0</td>\n",
       "      <td>val</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>tt0108160scene-002.ss-0002.es-0002</td>\n",
       "      <td>Easy Neg</td>\n",
       "      <td>1.00</td>\n",
       "      <td>['']</td>\n",
       "      <td>tt0108160-002</td>\n",
       "      <td>tt0108160</td>\n",
       "      <td>scene-002.ss-0002.es-0002.srt</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>val</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>tt0108160scene-003.ss-0003.es-0006</td>\n",
       "      <td>Not Sure</td>\n",
       "      <td>1.00</td>\n",
       "      <td>['Activities']</td>\n",
       "      <td>tt0108160-003</td>\n",
       "      <td>tt0108160</td>\n",
       "      <td>scene-003.ss-0003.es-0006.srt</td>\n",
       "      <td>tt0108160_scene_3.avi</td>\n",
       "      <td>2.0</td>\n",
       "      <td>val</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>3.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>tt0108160scene-004.ss-0007.es-0017</td>\n",
       "      <td>Easy Neg</td>\n",
       "      <td>0.85</td>\n",
       "      <td>['']</td>\n",
       "      <td>tt0108160-004</td>\n",
       "      <td>tt0108160</td>\n",
       "      <td>scene-004.ss-0007.es-0017.srt</td>\n",
       "      <td>tt0108160_scene_4.avi</td>\n",
       "      <td>3.0</td>\n",
       "      <td>val</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>4.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>tt0108160scene-005.ss-0018.es-0018</td>\n",
       "      <td>Easy Neg</td>\n",
       "      <td>0.85</td>\n",
       "      <td>['']</td>\n",
       "      <td>tt0108160-005</td>\n",
       "      <td>tt0108160</td>\n",
       "      <td>scene-005.ss-0018.es-0018.srt</td>\n",
       "      <td>tt0108160_scene_5.avi</td>\n",
       "      <td>4.0</td>\n",
       "      <td>val</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   idx  util                                clip     label  overlap_ratio  \\\n",
       "1  0.0   1.0  tt0108160scene-001.ss-0001.es-0001  Easy Neg           1.00   \n",
       "2  1.0   0.0  tt0108160scene-002.ss-0002.es-0002  Easy Neg           1.00   \n",
       "3  2.0   1.0  tt0108160scene-003.ss-0003.es-0006  Not Sure           1.00   \n",
       "4  3.0   1.0  tt0108160scene-004.ss-0007.es-0017  Easy Neg           0.85   \n",
       "5  4.0   1.0  tt0108160scene-005.ss-0018.es-0018  Easy Neg           0.85   \n",
       "\n",
       "         concepts             id      movie                       srt_name  \\\n",
       "1            ['']  tt0108160-001  tt0108160  scene-001.ss-0001.es-0001.srt   \n",
       "2            ['']  tt0108160-002  tt0108160  scene-002.ss-0002.es-0002.srt   \n",
       "3  ['Activities']  tt0108160-003  tt0108160  scene-003.ss-0003.es-0006.srt   \n",
       "4            ['']  tt0108160-004  tt0108160  scene-004.ss-0007.es-0017.srt   \n",
       "5            ['']  tt0108160-005  tt0108160  scene-005.ss-0018.es-0018.srt   \n",
       "\n",
       "              video_name  graph_number split  \n",
       "1  tt0108160_scene_1.avi           0.0   val  \n",
       "2                    NaN          -1.0   val  \n",
       "3  tt0108160_scene_3.avi           2.0   val  \n",
       "4  tt0108160_scene_4.avi           3.0   val  \n",
       "5  tt0108160_scene_5.avi           4.0   val  "
      ]
     },
     "execution_count": 108,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Remove NaN rows \n",
    "annotations2.dropna(how=\"all\", inplace=True)\n",
    "# Remove the nan graph IDs \n",
    "annotations2.dropna(subset=[\"id\"], inplace=True)\n",
    "# Add the splits \n",
    "annotations2[\"split\"] = [\"val\" for i in range(len(annotations2))]\n",
    "annotations2.head(5)\n",
    "# Display results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{\"['Speech']\": 519,\n",
       " \"['']\": 453,\n",
       " \"['Activities']\": 86,\n",
       " \"['Speech', ' Activities']\": 69,\n",
       " \"['Speech', ' Exp of  emotion']\": 42,\n",
       " \"['Look']\": 27,\n",
       " \"['Clothes']\": 24,\n",
       " \"['Clothes', ' Speech', ' Activities']\": 22,\n",
       " \"['Posture', ' Speech']\": 22,\n",
       " \"['Clothes', ' Speech']\": 21,\n",
       " \"['Speech', ' Look']\": 21,\n",
       " \"['Speech', ' Activities', ' Exp of  emotion']\": 20,\n",
       " \"['Voice', ' Speech']\": 20,\n",
       " \"['Exp of  emotion']\": 18,\n",
       " \"['Type of plan']\": 16,\n",
       " \"['Posture', ' Speech', ' Activities']\": 16,\n",
       " \"['Body', ' Clothes']\": 15,\n",
       " \"['Voice', ' Exp of  emotion']\": 13,\n",
       " \"['Body', ' Clothes', ' Speech']\": 13,\n",
       " \"['Clothes', ' Activities']\": 12,\n",
       " \"['Type of plan', ' Look']\": 12,\n",
       " \"['Posture']\": 11,\n",
       " \"['Type of plan', ' Body', ' Clothes']\": 11,\n",
       " \"['Voice', ' Appearance', ' Clothes', ' Speech', ' Activities']\": 10,\n",
       " \"['Type of plan', ' Body']\": 10,\n",
       " \"['Type of plan', ' Body', ' Look']\": 10,\n",
       " \"['Body', ' Clothes', ' Activities']\": 10,\n",
       " \"['Type of plan', ' Body', ' Clothes', ' Speech']\": 9,\n",
       " \"['Body', ' Speech']\": 9,\n",
       " \"['Body', ' Appearance', ' Clothes']\": 9,\n",
       " \"['Voice', ' Posture', ' Speech']\": 8,\n",
       " \"['Voice', ' Speech', ' Exp of  emotion']\": 7,\n",
       " \"['Speech', ' Look', ' Exp of  emotion']\": 7,\n",
       " \"['Voice']\": 6,\n",
       " \"['Appearance', ' Speech', ' Activities']\": 6,\n",
       " \"['Type of plan', ' Speech']\": 6,\n",
       " \"['Voice', ' Speech', ' Activities']\": 6,\n",
       " \"['Body', ' Voice', ' Speech']\": 5,\n",
       " \"['Voice', ' Posture', ' Exp of  emotion']\": 5,\n",
       " \"['Body', ' Clothes', ' Activities', ' Soundtrack']\": 5,\n",
       " \"['Activities', ' Soundtrack']\": 5,\n",
       " \"['Body', ' Clothes', ' Posture']\": 5,\n",
       " \"['Body']\": 5,\n",
       " \"['Appearance']\": 5,\n",
       " \"['Type of plan', ' Activities']\": 5,\n",
       " \"['Clothes', ' Exp of  emotion']\": 5,\n",
       " \"['Type of plan', ' Body', ' Activities']\": 5,\n",
       " \"['Activities', ' Exp of  emotion']\": 5,\n",
       " \"['Voice', ' Activities']\": 5,\n",
       " \"['Appearance', ' Speech']\": 4,\n",
       " \"['Body', ' Voice', ' Posture', ' Speech']\": 4,\n",
       " \"['Voice', ' Clothes', ' Speech']\": 4,\n",
       " \"['Posture', ' Exp of  emotion']\": 4,\n",
       " \"['Voice', ' Speech', ' Look', ' Exp of  emotion']\": 4,\n",
       " \"['Body', ' Speech', ' Activities']\": 4,\n",
       " \"['Body', ' Voice', ' Clothes', ' Posture', ' Speech', ' Exp of  emotion']\": 4,\n",
       " \"['Clothes', ' Activities', ' Look']\": 4,\n",
       " \"['Clothes', ' Posture', ' Activities']\": 4,\n",
       " \"['Look', ' Exp of  emotion']\": 4,\n",
       " \"['Type of plan', ' Body', ' Appearance', ' Clothes']\": 4,\n",
       " \"['Appearance', ' Activities']\": 4,\n",
       " \"['Type of plan', ' Clothes']\": 3,\n",
       " \"['Soundtrack']\": 3,\n",
       " \"['Appearance', ' Clothes']\": 3,\n",
       " \"['Posture', ' Look']\": 3,\n",
       " \"['Type of plan', ' Posture', ' Speech', ' Exp of  emotion']\": 3,\n",
       " \"['Body', ' Appearance', ' Activities']\": 3,\n",
       " \"['Voice', ' Clothes', ' Speech', ' Activities']\": 3,\n",
       " \"['Appearance', ' Clothes', ' Speech', ' Activities']\": 3,\n",
       " \"['Posture', ' Speech', ' Activities', ' Exp of  emotion']\": 3,\n",
       " \"['Type of plan', ' Body', ' Voice', ' Activities']\": 3,\n",
       " \"['Voice', ' Speech', ' Look']\": 3,\n",
       " \"['Type of plan', ' Posture', ' Exp of  emotion']\": 3,\n",
       " \"['Clothes', ' Look', ' Exp of  emotion']\": 3,\n",
       " \"['Appearance', ' Clothes', ' Activities']\": 3,\n",
       " \"['Type of plan', ' Speech', ' Look']\": 3,\n",
       " \"['Type of plan', ' Posture', ' Speech', ' Activities']\": 3,\n",
       " \"['Voice', ' Posture', ' Speech', ' Exp of  emotion', ' Soundtrack']\": 3,\n",
       " \"['Type of plan', ' Body', ' Voice', ' Posture', ' Activities', ' Look', ' Exp of  emotion', ' Soundtrack']\": 3,\n",
       " \"['Clothes', ' Posture']\": 3,\n",
       " \"['Type of plan', ' Voice']\": 3,\n",
       " \"['Body', ' Voice', ' Clothes', ' Activities', ' Exp of  emotion']\": 3,\n",
       " \"['Type of plan', ' Posture', ' Look']\": 3,\n",
       " \"['Type of plan', ' Body', ' Clothes', ' Speech', ' Look']\": 2,\n",
       " \"['Body', ' Appearance', ' Clothes', ' Posture', ' Speech']\": 2,\n",
       " \"['Appearance', ' Clothes', ' Speech']\": 2,\n",
       " \"['Type of plan', ' Speech', ' Activities']\": 2,\n",
       " \"['Body', ' Clothes', ' Speech', ' Exp of  emotion']\": 2,\n",
       " \"['Type of plan', ' Look', ' Exp of  emotion']\": 2,\n",
       " \"['Voice', ' Appearance', ' Posture', ' Speech']\": 2,\n",
       " \"['Body', ' Appearance', ' Speech']\": 2,\n",
       " \"['Type of plan', ' Body', ' Clothes', ' Activities']\": 2,\n",
       " \"['Appearance', ' Activities', ' Exp of  emotion']\": 2,\n",
       " \"['Voice', ' Appearance', ' Activities', ' Exp of  emotion']\": 2,\n",
       " \"['Type of plan', ' Posture', ' Activities']\": 2,\n",
       " \"['Type of plan', ' Body', ' Voice', ' Posture', ' Activities']\": 2,\n",
       " \"['Body', ' Posture', ' Speech']\": 2,\n",
       " \"['Voice', ' Clothes']\": 2,\n",
       " \"['Type of plan', ' Body', ' Appearance', ' Clothes', ' Posture']\": 2,\n",
       " \"['Body', ' Clothes', ' Look']\": 2,\n",
       " \"['Body', ' Voice', ' Appearance', ' Clothes', ' Posture', ' Look', ' Exp of  emotion']\": 2,\n",
       " \"['Speech', ' Narratology']\": 2,\n",
       " \"['Type of plan', ' Exp of  emotion']\": 2,\n",
       " \"['Type of plan', ' Body', ' Speech', ' Look']\": 2,\n",
       " \"['Body', ' Voice', ' Appearance', ' Clothes', ' Speech']\": 2,\n",
       " \"['Speech', ' Activities', ' Look']\": 2,\n",
       " \"['Body', ' Voice', ' Clothes', ' Speech']\": 2,\n",
       " \"['Type of plan', ' Posture']\": 2,\n",
       " \"['Voice', ' Clothes', ' Speech', ' Exp of  emotion']\": 2,\n",
       " \"['Type of plan', ' Body', ' Voice', ' Appearance', ' Speech', ' Exp of  emotion']\": 2,\n",
       " \"['Body', ' Posture', ' Look']\": 2,\n",
       " \"['Body', ' Activities', ' Look']\": 2,\n",
       " \"['Body', ' Posture', ' Activities']\": 2,\n",
       " \"['Type of plan', ' Body', ' Posture']\": 2,\n",
       " \"['Body', ' Posture', ' Activities', ' Look']\": 2,\n",
       " \"['Voice', ' Appearance', ' Clothes', ' Look']\": 2,\n",
       " \"['Type of plan', ' Voice', ' Speech', ' Look', ' Exp of  emotion']\": 2,\n",
       " \"['Type of plan', ' Body', ' Posture', ' Look']\": 2,\n",
       " \"['Posture', ' Activities']\": 1,\n",
       " \"['Voice', ' Posture', ' Activities', ' Exp of  emotion']\": 1,\n",
       " \"['Appearance', ' Activities', ' Look']\": 1,\n",
       " \"['Posture', ' Speech', ' Exp of  emotion']\": 1,\n",
       " \"['Body', ' Voice', ' Clothes', ' Posture']\": 1,\n",
       " \"['Type of plan', ' Body', ' Appearance', ' Clothes', ' Posture', ' Activities', ' Look']\": 1,\n",
       " \"['Body', ' Speech', ' Exp of  emotion']\": 1,\n",
       " \"['Body', ' Appearance']\": 1,\n",
       " \"['Body', ' Posture']\": 1,\n",
       " \"['Type of plan', ' Clothes', ' Look', ' Exp of  emotion']\": 1,\n",
       " \"['Type of plan', ' Posture', ' Activities', ' Look']\": 1,\n",
       " \"['Type of plan', ' Body', ' Clothes', ' Posture', ' Activities']\": 1,\n",
       " \"['Body', ' Clothes', ' Posture', ' Speech']\": 1,\n",
       " \"['Voice', ' Speech', ' Activities', ' Look']\": 1,\n",
       " \"['Appearance', ' Clothes', ' Posture', ' Activities']\": 1,\n",
       " \"['Clothes', ' Speech', ' Activities', ' Look', ' Soundtrack']\": 1,\n",
       " \"['Body', ' Clothes', ' Speech', ' Activities']\": 1,\n",
       " \"['Body', ' Speech', ' Look']\": 1,\n",
       " \"['Type of plan', ' Clothes', ' Posture', ' Speech']\": 1,\n",
       " \"['Type of plan', ' Voice', ' Clothes', ' Speech', ' Activities']\": 1,\n",
       " \"['Voice', ' Clothes', ' Posture', ' Activities']\": 1,\n",
       " \"['Type of plan', ' Body', ' Clothes', ' Posture']\": 1,\n",
       " \"['Type of plan', ' Voice', ' Clothes', ' Speech', ' Look', ' Exp of  emotion']\": 1,\n",
       " \"['Type of plan', ' Body', ' Voice', ' Clothes', ' Posture', ' Speech', ' Activities', ' Look', ' Exp of  emotion']\": 1,\n",
       " \"['Body', ' Posture', ' Activities', ' Soundtrack']\": 1,\n",
       " \"['Type of plan', ' Body', ' Clothes', ' Activities', ' Exp of  emotion']\": 1,\n",
       " \"['Body', ' Appearance', ' Clothes', ' Posture']\": 1,\n",
       " \"['Type of plan', ' Body', ' Speech', ' Look', ' Narratology']\": 1,\n",
       " \"['Clothes', ' Posture', ' Speech', ' Exp of  emotion']\": 1,\n",
       " \"['Clothes', ' Posture', ' Speech', ' Narratology']\": 1,\n",
       " \"['Voice', ' Appearance', ' Look', ' Exp of  emotion']\": 1,\n",
       " \"['Body', ' Voice', ' Appearance', ' Clothes', ' Posture', ' Speech', ' Look', ' Exp of  emotion']\": 1,\n",
       " \"['Voice', ' Speech', ' Look', ' Narratology']\": 1,\n",
       " \"['Type of plan', ' Body', ' Voice', ' Clothes', ' Posture', ' Look']\": 1,\n",
       " \"['Type of plan', ' Posture', ' Look', ' Soundtrack']\": 1,\n",
       " \"['Type of plan', ' Appearance', ' Exp of  emotion']\": 1,\n",
       " \"['Appearance', ' Clothes', ' Speech', ' Look']\": 1,\n",
       " \"['Type of plan', ' Body', ' Voice', ' Clothes', ' Speech', ' Look']\": 1,\n",
       " \"['Type of plan', ' Body', ' Appearance', ' Clothes', ' Activities']\": 1,\n",
       " \"['Activities', ' Look', ' Exp of  emotion']\": 1,\n",
       " \"['Body', ' Look']\": 1,\n",
       " \"['Body', ' Clothes', ' Posture', ' Activities', ' Look']\": 1,\n",
       " \"['Type of plan', ' Body', ' Appearance', ' Look']\": 1,\n",
       " \"['Type of plan', ' Body', ' Voice', ' Posture', ' Speech', ' Activities', ' Look']\": 1,\n",
       " \"['Appearance', ' Look']\": 1,\n",
       " \"['Type of plan', ' Appearance', ' Posture']\": 1,\n",
       " \"['Type of plan', ' Body', ' Clothes', ' Posture', ' Look']\": 1,\n",
       " \"['Voice', ' Posture', ' Speech', ' Look', ' Exp of  emotion']\": 1,\n",
       " \"['Body', ' Appearance', ' Clothes', ' Posture', ' Activities', ' Exp of  emotion']\": 1,\n",
       " \"['Body', ' Appearance', ' Clothes', ' Posture', ' Activities']\": 1,\n",
       " \"['Type of plan', ' Body', ' Voice', ' Speech', ' Look', ' Exp of  emotion']\": 1,\n",
       " \"['Voice', ' Activities', ' Look', ' Exp of  emotion']\": 1,\n",
       " \"['Body', ' Voice', ' Appearance', ' Clothes', ' Posture', ' Activities']\": 1,\n",
       " \"['Type of plan', ' Body', ' Posture', ' Speech', ' Look']\": 1,\n",
       " \"['Body', ' Voice', ' Posture', ' Soundtrack']\": 1,\n",
       " \"['Body', ' Activities']\": 1,\n",
       " \"['Type of plan', ' Appearance', ' Clothes', ' Look']\": 1,\n",
       " \"['Type of plan', ' Body', ' Appearance']\": 1,\n",
       " \"['Speech', ' Exp of  emotion', ' Soundtrack']\": 1,\n",
       " \"['Body', ' Voice', ' Look']\": 1,\n",
       " \"['Type of plan', ' Clothes', ' Activities']\": 1,\n",
       " \"['Type of plan', ' Appearance', ' Clothes', ' Posture']\": 1,\n",
       " \"['Type of plan', ' Voice', ' Clothes']\": 1,\n",
       " \"['Type of plan', ' Body', ' Appearance', ' Clothes', ' Look']\": 1,\n",
       " \"['Voice', ' Posture', ' Speech', ' Activities', ' Exp of  emotion']\": 1,\n",
       " \"['Type of plan', ' Activities', ' Exp of  emotion']\": 1,\n",
       " \"['Type of plan', ' Posture', ' Speech', ' Look']\": 1,\n",
       " \"['Body', ' Appearance', ' Posture']\": 1,\n",
       " \"['Voice', ' Clothes', ' Look']\": 1,\n",
       " \"['Type of plan', ' Body', ' Posture', ' Speech']\": 1,\n",
       " \"['Body', ' Clothes', ' Posture', ' Activities']\": 1,\n",
       " \"['Type of plan', ' Body', ' Speech', ' Activities']\": 1,\n",
       " \"['Type of plan', ' Body', ' Appearance', ' Posture', ' Look']\": 1,\n",
       " \"['Clothes', ' Posture', ' Speech']\": 1}"
      ]
     },
     "execution_count": 109,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dict(annotations[\"concepts\"].value_counts())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Easy Neg    453\n",
       "Sure        353\n",
       "Name: label, dtype: int64"
      ]
     },
     "execution_count": 110,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "M = annotations2[annotations2[\"label\"]==\"Easy Neg\"]\n",
    "N = annotations2[annotations2[\"label\"]==\"Sure\"]\n",
    "annotations3 = pd.concat([M,N], ignore_index=True)\n",
    "\n",
    "annotations3[\"label\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    711\n",
       "1    453\n",
       "2    397\n",
       "3    353\n",
       "Name: label, dtype: int64"
      ]
     },
     "execution_count": 111,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "annotations[\"label\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Weight for class 0: 0.67\n",
      "Weight for class 1: 1.06\n",
      "Weight for class 0: 1.21\n",
      "Weight for class 1: 1.36\n"
     ]
    }
   ],
   "source": [
    "weight_for_0 = (1/711) * ((453+353+711+397)/4.0)\n",
    "weight_for_1 = (1 / 453) * ((453+353+711+397)/ 4.0)\n",
    "weight_for_2 = (1 / 397) * ((453+353+711+397)/ 4.0)\n",
    "weight_for_3 = (1 / 353) * ((453+353+711+397)/ 4.0)\n",
    "\n",
    "class_weight = {0: weight_for_0, 1: weight_for_1, 2: weight_for_2, 3:weight_for_3}\n",
    "\n",
    "print('Weight for class 0: {:.2f}'.format(weight_for_0))\n",
    "print('Weight for class 1: {:.2f}'.format(weight_for_1))\n",
    "print('Weight for class 0: {:.2f}'.format(weight_for_2))\n",
    "print('Weight for class 1: {:.2f}'.format(weight_for_3))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Weight for class 0: 0.89\n",
      "Weight for class 1: 1.14\n"
     ]
    }
   ],
   "source": [
    "weight_for_0 = (1/453) * ((453+353)/2.0)\n",
    "weight_for_1 = (1 / 353) * ((453+353)/ 2.0)\n",
    "\n",
    "class_weight = {0: weight_for_0, 1: weight_for_1}\n",
    "\n",
    "print('Weight for class 0: {:.2f}'.format(weight_for_0))\n",
    "print('Weight for class 1: {:.2f}'.format(weight_for_1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "272"
      ]
     },
     "execution_count": 114,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Initialize the training ratio\n",
    "train_ratio = 0.6\n",
    "# Get the training ratio for each label\n",
    "easy_neg = round(annotations3.label.value_counts()[\"Easy Neg\"] * train_ratio)\n",
    "sure = round(annotations3.label.value_counts()[\"Sure\"] * train_ratio)\n",
    "\n",
    "# Assign the values\n",
    "samples_left = {\"Easy Neg\": easy_neg, \"Sure\": sure}\n",
    "\n",
    "easy_neg"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Loop through the annotations\n",
    "for i in range(len(annotations3)): \n",
    "    # Get the current annotation \n",
    "    annotation = annotations3.loc[i]\n",
    "    # Get the label \n",
    "    label = annotation[\"label\"]\n",
    "    # Check if there's still some train data left\n",
    "    if samples_left[label]>0:\n",
    "        # Assign to the training samples\n",
    "        annotations3.at[i,\"split\"] = \"train\"\n",
    "        # Decrement the samples left for this label\n",
    "        samples_left[label] = samples_left[label] - 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_label_2(label: str):\n",
    "    if label==\"Easy Neg\":\n",
    "        return 0\n",
    "    elif label==\"Sure\":\n",
    "        return 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>idx</th>\n",
       "      <th>util</th>\n",
       "      <th>clip</th>\n",
       "      <th>label</th>\n",
       "      <th>overlap_ratio</th>\n",
       "      <th>concepts</th>\n",
       "      <th>id</th>\n",
       "      <th>movie</th>\n",
       "      <th>srt_name</th>\n",
       "      <th>video_name</th>\n",
       "      <th>graph_number</th>\n",
       "      <th>split</th>\n",
       "      <th>graph_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>tt0108160scene-001.ss-0001.es-0001</td>\n",
       "      <td>0</td>\n",
       "      <td>1.00</td>\n",
       "      <td>['']</td>\n",
       "      <td>tt0108160-001</td>\n",
       "      <td>tt0108160</td>\n",
       "      <td>scene-001.ss-0001.es-0001.srt</td>\n",
       "      <td>tt0108160_scene_1.avi</td>\n",
       "      <td>0.0</td>\n",
       "      <td>train</td>\n",
       "      <td>tt0108160_1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>tt0108160scene-002.ss-0002.es-0002</td>\n",
       "      <td>0</td>\n",
       "      <td>1.00</td>\n",
       "      <td>['']</td>\n",
       "      <td>tt0108160-002</td>\n",
       "      <td>tt0108160</td>\n",
       "      <td>scene-002.ss-0002.es-0002.srt</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>train</td>\n",
       "      <td>tt0108160_2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>tt0108160scene-004.ss-0007.es-0017</td>\n",
       "      <td>0</td>\n",
       "      <td>0.85</td>\n",
       "      <td>['']</td>\n",
       "      <td>tt0108160-004</td>\n",
       "      <td>tt0108160</td>\n",
       "      <td>scene-004.ss-0007.es-0017.srt</td>\n",
       "      <td>tt0108160_scene_4.avi</td>\n",
       "      <td>3.0</td>\n",
       "      <td>train</td>\n",
       "      <td>tt0108160_4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>tt0108160scene-005.ss-0018.es-0018</td>\n",
       "      <td>0</td>\n",
       "      <td>0.85</td>\n",
       "      <td>['']</td>\n",
       "      <td>tt0108160-005</td>\n",
       "      <td>tt0108160</td>\n",
       "      <td>scene-005.ss-0018.es-0018.srt</td>\n",
       "      <td>tt0108160_scene_5.avi</td>\n",
       "      <td>4.0</td>\n",
       "      <td>train</td>\n",
       "      <td>tt0108160_5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>12.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>tt0108160scene-013.ss-0058.es-0061</td>\n",
       "      <td>0</td>\n",
       "      <td>1.00</td>\n",
       "      <td>['']</td>\n",
       "      <td>tt0108160-013</td>\n",
       "      <td>tt0108160</td>\n",
       "      <td>scene-013.ss-0058.es-0061.srt</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>train</td>\n",
       "      <td>tt0108160_13</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    idx  util                                clip  label  overlap_ratio  \\\n",
       "0   0.0   1.0  tt0108160scene-001.ss-0001.es-0001      0           1.00   \n",
       "1   1.0   0.0  tt0108160scene-002.ss-0002.es-0002      0           1.00   \n",
       "2   3.0   1.0  tt0108160scene-004.ss-0007.es-0017      0           0.85   \n",
       "3   4.0   1.0  tt0108160scene-005.ss-0018.es-0018      0           0.85   \n",
       "4  12.0   0.0  tt0108160scene-013.ss-0058.es-0061      0           1.00   \n",
       "\n",
       "  concepts             id      movie                       srt_name  \\\n",
       "0     ['']  tt0108160-001  tt0108160  scene-001.ss-0001.es-0001.srt   \n",
       "1     ['']  tt0108160-002  tt0108160  scene-002.ss-0002.es-0002.srt   \n",
       "2     ['']  tt0108160-004  tt0108160  scene-004.ss-0007.es-0017.srt   \n",
       "3     ['']  tt0108160-005  tt0108160  scene-005.ss-0018.es-0018.srt   \n",
       "4     ['']  tt0108160-013  tt0108160  scene-013.ss-0058.es-0061.srt   \n",
       "\n",
       "              video_name  graph_number  split      graph_id  \n",
       "0  tt0108160_scene_1.avi           0.0  train   tt0108160_1  \n",
       "1                    NaN          -1.0  train   tt0108160_2  \n",
       "2  tt0108160_scene_4.avi           3.0  train   tt0108160_4  \n",
       "3  tt0108160_scene_5.avi           4.0  train   tt0108160_5  \n",
       "4                    NaN          -1.0  train  tt0108160_13  "
      ]
     },
     "execution_count": 117,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "annotations3[\"label\"] = annotations3.label.apply(lambda label: process_label_2(label))\n",
    "annotations3[\"graph_id\"] = annotations3[\"id\"].apply(lambda x: x.split(\"-\")[0]+\"_\"+str(int(x.split(\"-\")[1])))\n",
    "annotations3.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_dataset_2(set, name):\n",
    "    \"\"\"_summary_\n",
    "    \"\"\"\n",
    "\n",
    "    # Initialize the object \n",
    "    nx_dataset = defaultdict(dict)\n",
    "    \n",
    "        # Loop through the movies and clips \n",
    "    for movie_id, clips in dataset.items():\n",
    "        # Loop through the scenes_id and graphs\n",
    "        for clip_id, graph in clips.clip_graphs.items():\n",
    "            if f\"{movie_id}_{clip_id}\" in set[\"graph_id\"].values:\n",
    "                # Get the objectification label for this scene\n",
    "                label = set[set[\"graph_id\"]==f\"{movie_id}_{clip_id}\"][\"label\"].values[0]\n",
    "                # Generate the scene graph\n",
    "                scene_graph = generate_graph_3(graph, label)\n",
    "                # Set the clip graph for this relationship\n",
    "                nx_dataset[movie_id+\"_\"+str(clip_id)] = scene_graph\n",
    "\n",
    "    # Create the file if it doesn't exist \n",
    "    with open(f\"nx_dataset_objectification_{name}_2_V2.pkl\", \"wb\") as file: \n",
    "        # Save the dictionary \n",
    "        pickle.dump(nx_dataset,file)\n",
    "        # Print a success message \n",
    "        print(f\"Nx dataset nx_dataset_objectification_{name}_2_V2.pkl saved successfully.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_set_2 = annotations3[annotations3[\"split\"]==\"train\"]\n",
    "val_set_2 = annotations3[annotations3[\"split\"]==\"val\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "484"
      ]
     },
     "execution_count": 120,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(train_set_2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Nx dataset nx_dataset_objectification_train_2_V2.pkl saved successfully.\n",
      "Nx dataset nx_dataset_objectification_val_2_V2.pkl saved successfully.\n"
     ]
    }
   ],
   "source": [
    "# Initialize the object \n",
    "nx_dataset = defaultdict(dict)\n",
    "# Get the original relationships dataset\n",
    "# Initialize the emotions set\n",
    "dataset_relationships = defaultdict(dict)\n",
    "\n",
    "# Loop through the movies\n",
    "for movie in dataset.keys():\n",
    "    # Loop through the clips\n",
    "    for clip in dataset[movie].clip_graphs.items(): \n",
    "        # Extract the relationships for the current clip \n",
    "        relationships = extract_utils.extract_relationships(clip[1].orig_graph_json)\n",
    "        # Check if it's not null\n",
    "        if relationships!=[]:\n",
    "            # Append to the relationships dataset\n",
    "            dataset_relationships[movie][clip[0]] = relationships\n",
    "\n",
    "# Generate the training set\n",
    "create_dataset_2(train_set_2, \"train\")\n",
    "# Generate the validation set\n",
    "create_dataset_2(val_set_2, \"val\")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Generate the oversampled dataset "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    453\n",
       "1    353\n",
       "Name: label, dtype: int64"
      ]
     },
     "execution_count": 122,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "annotations3[\"label\"].value_counts()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Duplicate samples of the class '1'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Filter the dataset\n",
    "annotations_binary_1 = annotations3[annotations3[\"label\"]==1]\n",
    "# Shuffle the dataset \n",
    "annotations_binary_1 = annotations_binary_1.sample(random_state=123, frac=1).reset_index(drop=True)\n",
    "# Take the 100 first elements \n",
    "annotations_binary_1 = annotations_binary_1[0:100]\n",
    "# Concatenate the datasets\n",
    "annotations2_oversampled = pd.concat([annotations3,annotations_binary_1])"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Check the distribution of the labels "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    0.5\n",
       "1    0.5\n",
       "Name: label, dtype: float64"
      ]
     },
     "execution_count": 124,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "annotations2_oversampled[\"label\"].value_counts(normalize=True)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Check the distribution of the splits"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 125,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "train    0.598234\n",
       "val      0.401766\n",
       "Name: split, dtype: float64"
      ]
     },
     "execution_count": 125,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "annotations2_oversampled[\"split\"].value_counts(normalize=True)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Save the oversampled dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_binary_oversampled = annotations2_oversampled[annotations2_oversampled[\"split\"]==\"train\"]\n",
    "validation_binary_oversampled = annotations2_oversampled[annotations2_oversampled[\"split\"]==\"val\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "484\n",
      "322\n"
     ]
    }
   ],
   "source": [
    "print(len(train_set_2))\n",
    "print(len(val_set_2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 128,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "542\n",
      "364\n"
     ]
    }
   ],
   "source": [
    "print(len(train_binary_oversampled))\n",
    "print(len(validation_binary_oversampled))"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Generate the oversampled dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 129,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_dataset_2_oversampled(set, name):\n",
    "    \"\"\"_summary_\n",
    "    \"\"\"\n",
    "\n",
    "    # Initialize the object \n",
    "    nx_dataset = defaultdict(dict)\n",
    "    \n",
    "        # Loop through the movies and clips \n",
    "    for movie_id, clips in dataset.items():\n",
    "        # Loop through the scenes_id and graphs\n",
    "        for clip_id, graph in clips.clip_graphs.items():\n",
    "            if f\"{movie_id}_{clip_id}\" in set[\"graph_id\"].values:\n",
    "                # Get the objectification label for this scene\n",
    "                labels = set[set[\"graph_id\"]==f\"{movie_id}_{clip_id}\"][\"label\"].values\n",
    "                for i,label in enumerate(labels):\n",
    "                    # Generate the scene graph\n",
    "                    scene_graph = generate_graph_3(graph, label)\n",
    "                    # Set the clip graph for this relationship\n",
    "                    nx_dataset[movie_id+\"_\"+str(clip_id)+f\"_{i}\"] = scene_graph\n",
    "\n",
    "    # Create the file if it doesn't exist \n",
    "    with open(f\"nx_dataset_objectification_{name}_2_V2.pkl\", \"wb\") as file: \n",
    "        # Save the dictionary \n",
    "        pickle.dump(nx_dataset,file)\n",
    "        # Print a success message \n",
    "        print(f\"Nx dataset nx_dataset_objectification_{name}_2_V2.pkl saved successfully.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 130,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1, 1])"
      ]
     },
     "execution_count": 130,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_binary_oversampled[train_binary_oversampled[\"graph_id\"]==\"tt1142988_14\"][\"label\"].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Nx dataset nx_dataset_objectification_oversampled_train_2_V2.pkl saved successfully.\n",
      "Nx dataset nx_dataset_objectification_oversampled_val_2_V2.pkl saved successfully.\n"
     ]
    }
   ],
   "source": [
    "# Initialize the object \n",
    "nx_dataset = defaultdict(dict)\n",
    "# Get the original relationships dataset\n",
    "# Initialize the emotions set\n",
    "dataset_relationships = defaultdict(dict)\n",
    "\n",
    "# Loop through the movies\n",
    "for movie in dataset.keys():\n",
    "    # Loop through the clips\n",
    "    for clip in dataset[movie].clip_graphs.items(): \n",
    "        # Extract the relationships for the current clip \n",
    "        relationships = extract_utils.extract_relationships(clip[1].orig_graph_json)\n",
    "        # Check if it's not null\n",
    "        if relationships!=[]:\n",
    "            # Append to the relationships dataset\n",
    "            dataset_relationships[movie][clip[0]] = relationships\n",
    "\n",
    "# Generate the training set\n",
    "create_dataset_2_oversampled(train_binary_oversampled, \"oversampled_train\")\n",
    "# Generate the validation set\n",
    "create_dataset_2_oversampled(validation_binary_oversampled, \"oversampled_val\")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Dataset without 'Not Sure'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "metadata": {},
   "outputs": [],
   "source": [
    "annotations3 = pd.read_csv(\"ObyGaze12_thresh_02.csv\", delimiter=\";\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>idx</th>\n",
       "      <th>util</th>\n",
       "      <th>clip</th>\n",
       "      <th>label</th>\n",
       "      <th>overlap_ratio</th>\n",
       "      <th>concepts</th>\n",
       "      <th>id</th>\n",
       "      <th>movie</th>\n",
       "      <th>srt_name</th>\n",
       "      <th>video_name</th>\n",
       "      <th>graph_number</th>\n",
       "      <th>split</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>tt0108160scene-001.ss-0001.es-0001</td>\n",
       "      <td>Easy Neg</td>\n",
       "      <td>1.00</td>\n",
       "      <td>['']</td>\n",
       "      <td>tt0108160-001</td>\n",
       "      <td>tt0108160</td>\n",
       "      <td>scene-001.ss-0001.es-0001.srt</td>\n",
       "      <td>tt0108160_scene_1.avi</td>\n",
       "      <td>0.0</td>\n",
       "      <td>val</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>tt0108160scene-002.ss-0002.es-0002</td>\n",
       "      <td>Easy Neg</td>\n",
       "      <td>1.00</td>\n",
       "      <td>['']</td>\n",
       "      <td>tt0108160-002</td>\n",
       "      <td>tt0108160</td>\n",
       "      <td>scene-002.ss-0002.es-0002.srt</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>val</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>tt0108160scene-003.ss-0003.es-0006</td>\n",
       "      <td>Not Sure</td>\n",
       "      <td>1.00</td>\n",
       "      <td>['Activities']</td>\n",
       "      <td>tt0108160-003</td>\n",
       "      <td>tt0108160</td>\n",
       "      <td>scene-003.ss-0003.es-0006.srt</td>\n",
       "      <td>tt0108160_scene_3.avi</td>\n",
       "      <td>2.0</td>\n",
       "      <td>val</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>3.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>tt0108160scene-004.ss-0007.es-0017</td>\n",
       "      <td>Easy Neg</td>\n",
       "      <td>0.85</td>\n",
       "      <td>['']</td>\n",
       "      <td>tt0108160-004</td>\n",
       "      <td>tt0108160</td>\n",
       "      <td>scene-004.ss-0007.es-0017.srt</td>\n",
       "      <td>tt0108160_scene_4.avi</td>\n",
       "      <td>3.0</td>\n",
       "      <td>val</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>4.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>tt0108160scene-005.ss-0018.es-0018</td>\n",
       "      <td>Easy Neg</td>\n",
       "      <td>0.85</td>\n",
       "      <td>['']</td>\n",
       "      <td>tt0108160-005</td>\n",
       "      <td>tt0108160</td>\n",
       "      <td>scene-005.ss-0018.es-0018.srt</td>\n",
       "      <td>tt0108160_scene_5.avi</td>\n",
       "      <td>4.0</td>\n",
       "      <td>val</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   idx  util                                clip     label  overlap_ratio  \\\n",
       "1  0.0   1.0  tt0108160scene-001.ss-0001.es-0001  Easy Neg           1.00   \n",
       "2  1.0   0.0  tt0108160scene-002.ss-0002.es-0002  Easy Neg           1.00   \n",
       "3  2.0   1.0  tt0108160scene-003.ss-0003.es-0006  Not Sure           1.00   \n",
       "4  3.0   1.0  tt0108160scene-004.ss-0007.es-0017  Easy Neg           0.85   \n",
       "5  4.0   1.0  tt0108160scene-005.ss-0018.es-0018  Easy Neg           0.85   \n",
       "\n",
       "         concepts             id      movie                       srt_name  \\\n",
       "1            ['']  tt0108160-001  tt0108160  scene-001.ss-0001.es-0001.srt   \n",
       "2            ['']  tt0108160-002  tt0108160  scene-002.ss-0002.es-0002.srt   \n",
       "3  ['Activities']  tt0108160-003  tt0108160  scene-003.ss-0003.es-0006.srt   \n",
       "4            ['']  tt0108160-004  tt0108160  scene-004.ss-0007.es-0017.srt   \n",
       "5            ['']  tt0108160-005  tt0108160  scene-005.ss-0018.es-0018.srt   \n",
       "\n",
       "              video_name  graph_number split  \n",
       "1  tt0108160_scene_1.avi           0.0   val  \n",
       "2                    NaN          -1.0   val  \n",
       "3  tt0108160_scene_3.avi           2.0   val  \n",
       "4  tt0108160_scene_4.avi           3.0   val  \n",
       "5  tt0108160_scene_5.avi           4.0   val  "
      ]
     },
     "execution_count": 133,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Remove NaN rows \n",
    "annotations3.dropna(how=\"all\", inplace=True)\n",
    "# Remove the nan graph IDs \n",
    "annotations3.dropna(subset=[\"id\"], inplace=True)\n",
    "# Add the splits \n",
    "annotations3[\"split\"] = [\"val\" for i in range(len(annotations3))]\n",
    "annotations3.head(5)\n",
    "# Display results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 134,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Hard Neg    711\n",
       "Easy Neg    453\n",
       "Sure        353\n",
       "Name: label, dtype: int64"
      ]
     },
     "execution_count": 134,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "M = annotations3[annotations3[\"label\"]==\"Easy Neg\"]\n",
    "N = annotations3[annotations3[\"label\"]==\"Sure\"]\n",
    "O = annotations3[annotations3[\"label\"]==\"Hard Neg\"]\n",
    "annotations3 = pd.concat([M,N,O], ignore_index=True)\n",
    "\n",
    "annotations3[\"label\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 135,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Weight for class 0: 0.71\n",
      "Weight for class 1: 1.12\n",
      "Weight for class 0: 1.43\n"
     ]
    }
   ],
   "source": [
    "weight_for_0 = (1/711) * ((453+353+711)/3.0)\n",
    "weight_for_1 = (1 / 453) * ((453+353+711)/ 3.0)\n",
    "weight_for_2 = (1 / 353) * ((453+353+711)/ 3.0)\n",
    "\n",
    "class_weight = {0: weight_for_0, 1: weight_for_1, 2: weight_for_2}\n",
    "\n",
    "print('Weight for class 0: {:.2f}'.format(weight_for_0))\n",
    "print('Weight for class 1: {:.2f}'.format(weight_for_1))\n",
    "print('Weight for class 0: {:.2f}'.format(weight_for_2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 136,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "272"
      ]
     },
     "execution_count": 136,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Initialize the training ratio\n",
    "train_ratio = 0.6\n",
    "# Get the training ratio for each label\n",
    "easy_neg = round(annotations3.label.value_counts()[\"Easy Neg\"] * train_ratio)\n",
    "hard_neg = round(annotations3.label.value_counts()[\"Hard Neg\"] * train_ratio)\n",
    "sure = round(annotations3.label.value_counts()[\"Sure\"] * train_ratio)\n",
    "\n",
    "# Assign the values\n",
    "samples_left = {\"Easy Neg\": easy_neg, \"Hard Neg\": hard_neg, \"Sure\": sure}\n",
    "\n",
    "easy_neg"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 137,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Loop through the annotations\n",
    "for i in range(len(annotations3)): \n",
    "    # Get the current annotation \n",
    "    annotation = annotations3.loc[i]\n",
    "    # Get the label \n",
    "    label = annotation[\"label\"]\n",
    "    # Check if there's still some train data left\n",
    "    if samples_left[label]>0:\n",
    "        # Assign to the training samples\n",
    "        annotations3.at[i,\"split\"] = \"train\"\n",
    "        # Decrement the samples left for this label\n",
    "        samples_left[label] = samples_left[label] - 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 138,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_label_3(label: str):\n",
    "    if label==\"Easy Neg\":\n",
    "        return 0\n",
    "    elif label==\"Hard Neg\":\n",
    "        return 1\n",
    "    elif label==\"Sure\":\n",
    "        return 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 139,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>idx</th>\n",
       "      <th>util</th>\n",
       "      <th>clip</th>\n",
       "      <th>label</th>\n",
       "      <th>overlap_ratio</th>\n",
       "      <th>concepts</th>\n",
       "      <th>id</th>\n",
       "      <th>movie</th>\n",
       "      <th>srt_name</th>\n",
       "      <th>video_name</th>\n",
       "      <th>graph_number</th>\n",
       "      <th>split</th>\n",
       "      <th>graph_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>tt0108160scene-001.ss-0001.es-0001</td>\n",
       "      <td>0</td>\n",
       "      <td>1.00</td>\n",
       "      <td>['']</td>\n",
       "      <td>tt0108160-001</td>\n",
       "      <td>tt0108160</td>\n",
       "      <td>scene-001.ss-0001.es-0001.srt</td>\n",
       "      <td>tt0108160_scene_1.avi</td>\n",
       "      <td>0.0</td>\n",
       "      <td>train</td>\n",
       "      <td>tt0108160_1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>tt0108160scene-002.ss-0002.es-0002</td>\n",
       "      <td>0</td>\n",
       "      <td>1.00</td>\n",
       "      <td>['']</td>\n",
       "      <td>tt0108160-002</td>\n",
       "      <td>tt0108160</td>\n",
       "      <td>scene-002.ss-0002.es-0002.srt</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>train</td>\n",
       "      <td>tt0108160_2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>tt0108160scene-004.ss-0007.es-0017</td>\n",
       "      <td>0</td>\n",
       "      <td>0.85</td>\n",
       "      <td>['']</td>\n",
       "      <td>tt0108160-004</td>\n",
       "      <td>tt0108160</td>\n",
       "      <td>scene-004.ss-0007.es-0017.srt</td>\n",
       "      <td>tt0108160_scene_4.avi</td>\n",
       "      <td>3.0</td>\n",
       "      <td>train</td>\n",
       "      <td>tt0108160_4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>tt0108160scene-005.ss-0018.es-0018</td>\n",
       "      <td>0</td>\n",
       "      <td>0.85</td>\n",
       "      <td>['']</td>\n",
       "      <td>tt0108160-005</td>\n",
       "      <td>tt0108160</td>\n",
       "      <td>scene-005.ss-0018.es-0018.srt</td>\n",
       "      <td>tt0108160_scene_5.avi</td>\n",
       "      <td>4.0</td>\n",
       "      <td>train</td>\n",
       "      <td>tt0108160_5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>12.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>tt0108160scene-013.ss-0058.es-0061</td>\n",
       "      <td>0</td>\n",
       "      <td>1.00</td>\n",
       "      <td>['']</td>\n",
       "      <td>tt0108160-013</td>\n",
       "      <td>tt0108160</td>\n",
       "      <td>scene-013.ss-0058.es-0061.srt</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>train</td>\n",
       "      <td>tt0108160_13</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    idx  util                                clip  label  overlap_ratio  \\\n",
       "0   0.0   1.0  tt0108160scene-001.ss-0001.es-0001      0           1.00   \n",
       "1   1.0   0.0  tt0108160scene-002.ss-0002.es-0002      0           1.00   \n",
       "2   3.0   1.0  tt0108160scene-004.ss-0007.es-0017      0           0.85   \n",
       "3   4.0   1.0  tt0108160scene-005.ss-0018.es-0018      0           0.85   \n",
       "4  12.0   0.0  tt0108160scene-013.ss-0058.es-0061      0           1.00   \n",
       "\n",
       "  concepts             id      movie                       srt_name  \\\n",
       "0     ['']  tt0108160-001  tt0108160  scene-001.ss-0001.es-0001.srt   \n",
       "1     ['']  tt0108160-002  tt0108160  scene-002.ss-0002.es-0002.srt   \n",
       "2     ['']  tt0108160-004  tt0108160  scene-004.ss-0007.es-0017.srt   \n",
       "3     ['']  tt0108160-005  tt0108160  scene-005.ss-0018.es-0018.srt   \n",
       "4     ['']  tt0108160-013  tt0108160  scene-013.ss-0058.es-0061.srt   \n",
       "\n",
       "              video_name  graph_number  split      graph_id  \n",
       "0  tt0108160_scene_1.avi           0.0  train   tt0108160_1  \n",
       "1                    NaN          -1.0  train   tt0108160_2  \n",
       "2  tt0108160_scene_4.avi           3.0  train   tt0108160_4  \n",
       "3  tt0108160_scene_5.avi           4.0  train   tt0108160_5  \n",
       "4                    NaN          -1.0  train  tt0108160_13  "
      ]
     },
     "execution_count": 139,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "annotations3[\"label\"] = annotations3.label.apply(lambda label: process_label_3(label))\n",
    "annotations3[\"graph_id\"] = annotations3[\"id\"].apply(lambda x: x.split(\"-\")[0]+\"_\"+str(int(x.split(\"-\")[1])))\n",
    "annotations3.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 140,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    711\n",
       "0    453\n",
       "2    353\n",
       "Name: label, dtype: int64"
      ]
     },
     "execution_count": 140,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "annotations3[\"label\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 141,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_dataset_3(set, name):\n",
    "    \"\"\"_summary_\n",
    "    \"\"\"\n",
    "\n",
    "    # Initialize the object \n",
    "    nx_dataset = defaultdict(dict)\n",
    "    \n",
    "        # Loop through the movies and clips \n",
    "    for movie_id, clips in dataset.items():\n",
    "        # Loop through the scenes_id and graphs\n",
    "        for clip_id, graph in clips.clip_graphs.items():\n",
    "            if f\"{movie_id}_{clip_id}\" in set[\"graph_id\"].values:\n",
    "                # Get the objectification label for this scene\n",
    "                label = set[set[\"graph_id\"]==f\"{movie_id}_{clip_id}\"][\"label\"].values[0]\n",
    "                # Generate the scene graph\n",
    "                scene_graph = generate_graph_3(graph, label)\n",
    "                # Set the clip graph for this relationship\n",
    "                nx_dataset[movie_id+\"_\"+str(clip_id)] = scene_graph\n",
    "\n",
    "    # Create the file if it doesn't exist \n",
    "    with open(f\"nx_dataset_objectification_{name}_2_V3.pkl\", \"wb\") as file: \n",
    "        # Save the dictionary \n",
    "        pickle.dump(nx_dataset,file)\n",
    "        # Print a success message \n",
    "        print(\"Nx dataset saved successfully.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 142,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_set_2 = annotations3[annotations3[\"split\"]==\"train\"]\n",
    "val_set_2 = annotations3[annotations3[\"split\"]==\"val\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 143,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "911"
      ]
     },
     "execution_count": 143,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(train_set_2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 144,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Nx dataset saved successfully.\n",
      "Nx dataset saved successfully.\n"
     ]
    }
   ],
   "source": [
    "# Initialize the object \n",
    "nx_dataset = defaultdict(dict)\n",
    "# Get the original relationships dataset\n",
    "# Initialize the emotions set\n",
    "dataset_relationships = defaultdict(dict)\n",
    "\n",
    "# Loop through the movies\n",
    "for movie in dataset.keys():\n",
    "    # Loop through the clips\n",
    "    for clip in dataset[movie].clip_graphs.items(): \n",
    "        # Extract the relationships for the current clip \n",
    "        relationships = extract_utils.extract_relationships(clip[1].orig_graph_json)\n",
    "        # Check if it's not null\n",
    "        if relationships!=[]:\n",
    "            # Append to the relationships dataset\n",
    "            dataset_relationships[movie][clip[0]] = relationships\n",
    "\n",
    "# Generate the training set\n",
    "create_dataset_3(train_set_2, \"train\")\n",
    "# Generate the validation set\n",
    "create_dataset_3(val_set_2, \"val\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 145,
   "metadata": {},
   "outputs": [],
   "source": [
    "class_names_3 = [0,1,2]\n",
    "\n",
    "with open(\"class_names_3.pkl\", \"wb\") as file: \n",
    "    pickle.dump(class_names_3, file)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Generate the oversampled Without Not Sure dataset"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Check the distribution of data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 146,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    711\n",
       "0    453\n",
       "2    353\n",
       "Name: label, dtype: int64"
      ]
     },
     "execution_count": 146,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "annotations3[\"label\"].value_counts()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Duplicate samples of class 0 and 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 147,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Filter the dataset\n",
    "annotations_wns_0 = annotations3[annotations3[\"label\"]==0]\n",
    "annotations_wns_2 = annotations3[annotations3[\"label\"]==2]\n",
    "# Shuffle the dataset \n",
    "annotations_wns_0 = annotations_wns_0.sample(random_state=123, frac=1).reset_index(drop=True)\n",
    "annotations_wns_2 = annotations_wns_2.sample(random_state=123, frac=1).reset_index(drop=True)\n",
    "# Take the elements \n",
    "annotations_wns_0 = annotations_wns_0[0:711-453]\n",
    "annotations_wns_2 = annotations_wns_2[0:711-353]\n",
    "# Concatenate the datasets\n",
    "annotations3_oversampled = pd.concat([annotations3,annotations_wns_0,annotations_wns_2])"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Check the distributions of values "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 148,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    0.334117\n",
       "1    0.334117\n",
       "2    0.331767\n",
       "Name: label, dtype: float64"
      ]
     },
     "execution_count": 148,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "annotations3_oversampled[\"label\"].value_counts(normalize=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 149,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "train    0.599154\n",
       "val      0.400846\n",
       "Name: split, dtype: float64"
      ]
     },
     "execution_count": 149,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "annotations3_oversampled[\"split\"].value_counts(normalize=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 150,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_dataset_3_oversampled(set, name):\n",
    "    \"\"\"_summary_\n",
    "    \"\"\"\n",
    "\n",
    "    # Initialize the object \n",
    "    nx_dataset = defaultdict(dict)\n",
    "    \n",
    "        # Loop through the movies and clips \n",
    "    for movie_id, clips in dataset.items():\n",
    "        # Loop through the scenes_id and graphs\n",
    "        for clip_id, graph in clips.clip_graphs.items():\n",
    "            if f\"{movie_id}_{clip_id}\" in set[\"graph_id\"].values:\n",
    "                # Get the objectification label for this scene\n",
    "                labels = set[set[\"graph_id\"]==f\"{movie_id}_{clip_id}\"][\"label\"].values\n",
    "                for i,label in enumerate(labels):\n",
    "                    # Generate the scene graph\n",
    "                    scene_graph = generate_graph_3(graph, label)\n",
    "                    # Set the clip graph for this relationship\n",
    "                    nx_dataset[movie_id+\"_\"+str(clip_id)+f\"_{i}\"] = scene_graph\n",
    "\n",
    "    # Create the file if it doesn't exist \n",
    "    with open(f\"nx_dataset_objectification_{name}_2_V4.pkl\", \"wb\") as file: \n",
    "        # Save the dictionary \n",
    "        pickle.dump(nx_dataset,file)\n",
    "        # Print a success message \n",
    "        print(\"Nx dataset saved successfully.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 151,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1275\n",
      "853\n"
     ]
    }
   ],
   "source": [
    "train_wns_oversampled = annotations3_oversampled[annotations3_oversampled[\"split\"]==\"train\"]\n",
    "val_wns_oversampled = annotations3_oversampled[annotations3_oversampled[\"split\"]==\"val\"]\n",
    "\n",
    "print(len(train_wns_oversampled))\n",
    "print(len(val_wns_oversampled))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 152,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Nx dataset saved successfully.\n",
      "Nx dataset saved successfully.\n"
     ]
    }
   ],
   "source": [
    "# Initialize the object \n",
    "nx_dataset = defaultdict(dict)\n",
    "# Get the original relationships dataset\n",
    "# Initialize the emotions set\n",
    "dataset_relationships = defaultdict(dict)\n",
    "\n",
    "# Loop through the movies\n",
    "for movie in dataset.keys():\n",
    "    # Loop through the clips\n",
    "    for clip in dataset[movie].clip_graphs.items(): \n",
    "        # Extract the relationships for the current clip \n",
    "        relationships = extract_utils.extract_relationships(clip[1].orig_graph_json)\n",
    "        # Check if it's not null\n",
    "        if relationships!=[]:\n",
    "            # Append to the relationships dataset\n",
    "            dataset_relationships[movie][clip[0]] = relationships\n",
    "\n",
    "# Generate the training set\n",
    "create_dataset_3_oversampled(train_wns_oversampled, \"train\")\n",
    "# Generate the validation set\n",
    "create_dataset_3_oversampled(val_wns_oversampled, \"val\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "dataset_processing",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "5aa4fe1c6b87645cc659c3e6352c28b6ca736c6dc41619a767f7c30ac445b470"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
