{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os, gzip, json, requests, pandas as pd\n",
    "from tqdm.auto import tqdm\n",
    "import random\n",
    "from pathlib import Path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "512b214e79514528acc633bed42d8cc9",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "scan principals: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "• writing 1000 lines → movies.jsonl\n",
      "✅ done\n"
     ]
    }
   ],
   "source": [
    "# Cell: make 1 000 post-1980 popular films with two unique leads\n",
    "# -------------------------------------------------------------\n",
    "\n",
    "# -------- parameters you may tweak --------\n",
    "N_MOVIES     = 1_000\n",
    "YEAR_MIN     = 1980\n",
    "STREAM_ROWS  = 1_000_000      # principals chunk size\n",
    "CACHE_DIR    = \"imdb_cache\"\n",
    "OUT_PATH     = \"movies.jsonl\"\n",
    "# ------------------------------------------\n",
    "\n",
    "IMDB_BASE   = \"https://datasets.imdbws.com/\"\n",
    "FILES       = {\n",
    "    \"basics\":   \"title.basics.tsv.gz\",\n",
    "    \"ratings\":  \"title.ratings.tsv.gz\",\n",
    "    \"princip\":  \"title.principals.tsv.gz\",\n",
    "    \"names\":    \"name.basics.tsv.gz\",\n",
    "}\n",
    "\n",
    "def cached(path_key):\n",
    "    fname = FILES[path_key]\n",
    "    os.makedirs(CACHE_DIR, exist_ok=True)\n",
    "    path = os.path.join(CACHE_DIR, fname)\n",
    "    if not os.path.exists(path):\n",
    "        url = IMDB_BASE + fname\n",
    "        print(f\"↳ downloading {fname}\")\n",
    "        with requests.get(url, stream=True, timeout=60) as r:\n",
    "            r.raise_for_status()\n",
    "            with open(path, \"wb\") as f, tqdm(\n",
    "                total=int(r.headers.get(\"Content-Length\", 0)),\n",
    "                unit=\"B\", unit_scale=True, desc=fname\n",
    "            ) as bar:\n",
    "                for chunk in r.iter_content(1 << 20):\n",
    "                    f.write(chunk); bar.update(len(chunk))\n",
    "    return path\n",
    "\n",
    "# 1️⃣  modern feature films with votes\n",
    "cols = [\"tconst\", \"primaryTitle\", \"titleType\", \"isAdult\", \"startYear\"]\n",
    "basics = pd.read_csv(gzip.open(cached(\"basics\")), sep=\"\\t\",\n",
    "                     usecols=cols, na_values=\"\\\\N\", low_memory=False)\n",
    "basics[\"startYear\"] = pd.to_numeric(basics[\"startYear\"], errors=\"coerce\")\n",
    "basics = basics[(basics[\"titleType\"] == \"movie\") &\n",
    "                (basics[\"isAdult\"] == 0) &\n",
    "                (basics[\"startYear\"] >= YEAR_MIN)]\n",
    "\n",
    "ratings = pd.read_csv(gzip.open(cached(\"ratings\")), sep=\"\\t\",\n",
    "                      usecols=[\"tconst\", \"numVotes\"], na_values=\"\\\\N\")\n",
    "ratings[\"numVotes\"] = pd.to_numeric(ratings[\"numVotes\"], errors=\"coerce\")\n",
    "films = (basics.merge(ratings, on=\"tconst\", how=\"inner\")\n",
    "               .dropna(subset=[\"numVotes\"])\n",
    "               .sort_values(\"numVotes\", ascending=False))\n",
    "\n",
    "# 2️⃣  stream principals to keep memory low\n",
    "tset = set(films[\"tconst\"])\n",
    "keep_cols = [\"tconst\", \"ordering\", \"category\", \"nconst\"]\n",
    "chunks = []\n",
    "with gzip.open(cached(\"princip\")) as gz:\n",
    "    reader = pd.read_csv(gz, sep=\"\\t\", usecols=keep_cols,\n",
    "                         na_values=\"\\\\N\", chunksize=STREAM_ROWS)\n",
    "    for chunk in tqdm(reader, desc=\"scan principals\"):\n",
    "        chunk = chunk[(chunk[\"category\"].isin([\"actor\", \"actress\"])) &\n",
    "                      (chunk[\"tconst\"].isin(tset))]\n",
    "        chunks.append(chunk)\n",
    "principals = pd.concat(chunks, ignore_index=True)\n",
    "\n",
    "# 3️⃣  attach performer names\n",
    "names = pd.read_csv(gzip.open(cached(\"names\")), sep=\"\\t\",\n",
    "                    usecols=[\"nconst\", \"primaryName\"],\n",
    "                    na_values=\"\\\\N\", low_memory=False)\n",
    "cast = principals.merge(names, on=\"nconst\", how=\"left\").dropna(subset=[\"primaryName\"])\n",
    "\n",
    "cast[\"ordering\"] = pd.to_numeric(cast[\"ordering\"], errors=\"coerce\")\n",
    "cast = cast.dropna(subset=[\"ordering\"]).sort_values([\"tconst\", \"ordering\"])\n",
    "\n",
    "# 4️⃣  first two distinct actor names per film\n",
    "def first_two_unique(series):\n",
    "    seen = set()\n",
    "    out = []\n",
    "    for name in series:\n",
    "        if name not in seen:\n",
    "            out.append(name); seen.add(name)\n",
    "        if len(out) == 2:\n",
    "            break\n",
    "    return out\n",
    "\n",
    "pairs = (cast.groupby(\"tconst\")[\"primaryName\"]\n",
    "              .apply(first_two_unique))\n",
    "pairs = pairs[pairs.str.len() == 2]             # ensure two uniques\n",
    "\n",
    "# 5️⃣  popularity cut (top-voted)\n",
    "eligible = films.set_index(\"tconst\").loc[pairs.index]\n",
    "eligible = eligible.head(N_MOVIES)\n",
    "\n",
    "# 6️⃣  write JSONL\n",
    "print(f\"• writing {len(eligible)} lines → {OUT_PATH}\")\n",
    "with open(OUT_PATH, \"w\", encoding=\"utf-8\") as f:\n",
    "    for idx, (tc, row) in enumerate(eligible.iterrows(), 1):\n",
    "        first, second = pairs[tc]\n",
    "        json.dump({\"first_actor\": first,\n",
    "                   \"second_actor\": second,\n",
    "                   \"movie_title\": row[\"primaryTitle\"],\n",
    "                   \"id\": idx}, f, ensure_ascii=False)\n",
    "        f.write(\"\\n\")\n",
    "print(\"✅ done\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'first_actor': 'Michael Emil', 'second_actor': 'Eloy Herrera', 'movie_title': 'Dama de noche', 'id': 1}\n",
      "{'first_actor': 'Sasha Montenegro', 'second_actor': 'Ray Milland', 'movie_title': 'Kate & Leopold', 'id': 2}\n",
      "{'first_actor': 'George C. Scott', 'second_actor': 'Emilio Álvarez', 'movie_title': 'Another Time, Another Place', 'id': 3}\n",
      "{'first_actor': 'Mark Hamill', 'second_actor': 'Vittorio Mezzogiorno', 'movie_title': 'Shiva und die Galgenblume', 'id': 4}\n",
      "{'first_actor': 'Daisy Granados', 'second_actor': 'Ferenc Begányi', 'movie_title': 'La rosa de los vientos', 'id': 5}\n"
     ]
    }
   ],
   "source": [
    "rmra_path = DATA_DIR / \"real_movies_real_actors\" / \"2025-05-26_11-58-04\" / \"metadata\" / \"metadata.jsonl\"\n",
    "\n",
    "# Load the data\n",
    "with open(rmra_path, \"r\", encoding=\"utf-8\") as f:\n",
    "    data = [json.loads(line) for line in f]\n",
    "\n",
    "# Extract actor lists\n",
    "first_actors = [item[\"first_actor\"] for item in data]\n",
    "second_actors = [item[\"second_actor\"] for item in data]\n",
    "\n",
    "# Shuffle independently\n",
    "random.shuffle(first_actors)\n",
    "random.shuffle(second_actors)\n",
    "\n",
    "# Reassign shuffled actors\n",
    "for i, item in enumerate(data):\n",
    "    item[\"first_actor\"] = first_actors[i]\n",
    "    item[\"second_actor\"] = second_actors[i]\n",
    "\n",
    "# Optional: print a few to verify\n",
    "for item in data[:5]:\n",
    "    print(item)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "output_path = DATA_DIR / \"real_movies_real_actors_shuffled\" / \"2025-06-04_11-58-04\" / \"metadata\" / \"metadata.jsonl\"\n",
    "\n",
    "output_path.parent.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "with open(output_path, \"w\", encoding=\"utf-8\") as f:\n",
    "    for item in data:\n",
    "        f.write(json.dumps(item, ensure_ascii=False) + \"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import json\n",
    "import os\n",
    "import ast\n",
    "import random\n",
    "\n",
    "INPUT_CSV = ARTIFACTS_DIR / \"Top_10000_Movies_IMDb.csv\"\n",
    "OUTPUT_JSONL = DATA_DIR / \"real_movies_real_actors\" / \"2025-05-26_11-58-04\" / \"metadata\" / \"metadata.jsonl\"\n",
    "N_MOVIES = 1000\n",
    "SHUFFLE_ACTORS = False  # Set to False to keep original actor pairings\n",
    "\n",
    "df = pd.read_csv(INPUT_CSV)\n",
    "\n",
    "# Ensure required columns are present\n",
    "required_columns = {'Stars', 'Movie Name'}\n",
    "if not required_columns.issubset(df.columns):\n",
    "    raise ValueError(f\"Dataset must contain the columns: {required_columns}\")\n",
    "\n",
    "# Extract exactly two unique actor names\n",
    "def get_first_two_stars(stars_str):\n",
    "    try:\n",
    "        stars = ast.literal_eval(stars_str)\n",
    "        stars = [s.strip() for s in stars if s.strip()]\n",
    "        return stars[:2] if len(stars) >= 2 else None\n",
    "    except Exception:\n",
    "        return None\n",
    "\n",
    "# Filter for rows with two valid actors\n",
    "df[\"LeadActors\"] = df[\"Stars\"].apply(get_first_two_stars)\n",
    "df = df[df[\"LeadActors\"].notnull()]\n",
    "df[\"first_actor\"] = df[\"LeadActors\"].str[0]\n",
    "df[\"second_actor\"] = df[\"LeadActors\"].str[1]\n",
    "\n",
    "# Take top N after filtering\n",
    "df_top = df.head(N_MOVIES).copy()\n",
    "\n",
    "# Check for edge case: fewer than N_MOVIES after filtering\n",
    "if len(df_top) < N_MOVIES:\n",
    "    print(f\"⚠️ Only {len(df_top)} valid movies available after filtering.\")\n",
    "\n",
    "# Shuffle actor pairs if specified\n",
    "if SHUFFLE_ACTORS:\n",
    "    actor_pairs = list(zip(df_top[\"first_actor\"], df_top[\"second_actor\"]))\n",
    "    random.shuffle(actor_pairs)\n",
    "\n",
    "# Write to JSONL\n",
    "os.makedirs(os.path.dirname(OUTPUT_JSONL), exist_ok=True)\n",
    "with open(OUTPUT_JSONL, 'w', encoding='utf-8') as f:\n",
    "    for idx, (i, row) in enumerate(df_top.iterrows()):\n",
    "        if SHUFFLE_ACTORS:\n",
    "            first_actor, second_actor = actor_pairs[idx]\n",
    "        else:\n",
    "            first_actor, second_actor = row[\"first_actor\"], row[\"second_actor\"]\n",
    "\n",
    "        json.dump({\n",
    "            \"id\": idx + 1,\n",
    "            \"movie_title\": row[\"Movie Name\"],\n",
    "            \"first_actor\": first_actor,\n",
    "            \"second_actor\": second_actor,\n",
    "            \"rating\": row.get(\"Rating\"),\n",
    "            \"runtime\": row.get(\"Runtime\"),\n",
    "            \"genre\": row.get(\"Genre\"),\n",
    "            \"metascore\": row.get(\"Metascore\"),\n",
    "            \"plot\": row.get(\"Plot\"),\n",
    "            \"directors\": row.get(\"Directors\"),\n",
    "            \"votes\": row.get(\"Votes\"),\n",
    "            \"gross\": row.get(\"Gross\")\n",
    "        }, f, ensure_ascii=False)\n",
    "        f.write('\\n')\n",
    "\n",
    "status = \"shuffled\" if SHUFFLE_ACTORS else \"original\"\n",
    "print(f\"✅ Successfully wrote {len(df_top)} {status} entries\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[\"['Tim Robbins', 'Morgan Freeman', 'Bob Gunton', 'William Sadler']\", \"['Marlon Brando', 'Al Pacino', 'James Caan', 'Diane Keaton']\", \"['Yûgô Sakô', 'Koichi Saski', 'Arun Govil', 'Nikhil Kapoor', 'Edie Mirman', 'Rael Padamsee']\", \"['Kemal Sunal', 'Münir Özkul', 'Halit Akçatepe', 'Tarik Akan']\", \"['Vishal Mourya', 'Karan Kandhapan', 'Babushan Mohanty', 'Dipanwit Dashmohapatra', 'Manaswani Takri']\"]\n"
     ]
    }
   ],
   "source": [
    "print(df[\"Stars\"].head(5).tolist())"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "kg",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
