{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "import numpy as np\n",
    "import gzip\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "gz_file = '../dat/cs_papers.gz'\n",
    "papers = []\n",
    "with gzip.open(gz_file, 'rb') as f:\n",
    "    for line in f:\n",
    "        paper = json.loads(line)\n",
    "        papers.append(paper)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "48760"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "papers = papers[0]\n",
    "len(papers)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "min_year = 2000\n",
    "max_year = 2019\n",
    "restricted_papers = []\n",
    "for paper in papers:\n",
    "    if 'year' in paper and paper['year']:\n",
    "        if paper['year'] >= min_year or paper['year'] <= max_year:\n",
    "            restricted_papers.append(paper)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "48760"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(restricted_papers)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "venues = {'ArXiV', 'ICML', 'ICLR', 'Journal of Machine Learning Research', \n",
    "          'NIPS', 'CVPR', 'ACL', 'HLT-NAACL', 'KDD', 'NAACL-HLT', 'SIGIR', 'CIKM',\n",
    "         'ICDM', 'AAAI', 'IJCAI'} #, 'PloS one', 'Proceedings of the National Academy of Sciences of the United States of America'}\n",
    "venue_papers = []\n",
    "for paper in restricted_papers:\n",
    "    if 'venue' in paper and paper['venue'] != '':\n",
    "        if paper['venue'] in venues:\n",
    "            venue_papers.append(paper)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "({'AAAI',\n",
       "  'ACL',\n",
       "  'CIKM',\n",
       "  'CVPR',\n",
       "  'HLT-NAACL',\n",
       "  'ICDM',\n",
       "  'ICML',\n",
       "  'IJCAI',\n",
       "  'Journal of Machine Learning Research',\n",
       "  'KDD',\n",
       "  'NIPS',\n",
       "  'SIGIR'},\n",
       " 48760)"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "{p['venue'] for p in venue_papers}, len(venue_papers)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "paper_abstracts = {}\n",
    "for paper in venue_papers:\n",
    "    pid = paper['id']\n",
    "    if 'paperAbstract' in paper:\n",
    "        if len(paper['paperAbstract']) > 0:\n",
    "            paper_abstracts[pid] = paper['paperAbstract']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(45003, 23630)"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vectorizer = CountVectorizer(min_df=0.001, max_df=0.7, ngram_range=(1,2))\n",
    "docs = [paper_abstracts[pid] for pid in paper_abstracts]\n",
    "counts = vectorizer.fit_transform(docs)\n",
    "counts = counts.toarray()\n",
    "counts.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['of the', 'are', 'as', 'with', 'by', 'an', 'be', 'data', 'can',\n",
       "       'which', 'from', 'our', 'model', 'in the', 'based', 'learning',\n",
       "       'it', 'paper', 'problem', 'this paper'], dtype='<U27')"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vocab = np.array(vectorizer.get_feature_names())\n",
    "term_freq = counts.sum(axis=0)\n",
    "top_terms = np.argsort(-term_freq)[:20]\n",
    "vocab[top_terms]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "coauthor_network = {}\n",
    "author_map = {}\n",
    "paper_authors = {}\n",
    "citation_network = {}\n",
    "paper_map ={}\n",
    "paper_year = {}\n",
    "for paper in venue_papers:\n",
    "    pid = paper['id']\n",
    "    paper_year[pid] = paper['year']\n",
    "    paper_map[pid] = paper\n",
    "    authors = paper['authors']\n",
    "    author_map.update({a['ids'][0]:a['name'] for a in authors if len(a['ids']) > 0 and 'name' in a})\n",
    "    aids = [author['ids'][0] for author in authors if len(author['ids']) > 0]\n",
    "    if len(aids) > 0:\n",
    "        paper_authors[paper['id']] = aids\n",
    "    for aid in aids:\n",
    "        if aid in coauthor_network:\n",
    "            coauthor_network[aid].extend([(a, pid) for a in aids if a != aid])\n",
    "        else:\n",
    "            coauthor_network[aid] = [(a, pid) for a in aids if a != aid]\n",
    "    \n",
    "    citations = paper['outCitations']\n",
    "    if len(citations) > 0:\n",
    "        citation_network[paper['id']] = citations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(8.856831455128733, 569, 46903)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "neighborhoods = [len(n) for a, n in coauthor_network.items()]\n",
    "np.mean(neighborhoods), np.max(neighborhoods), np.nonzero(neighborhoods)[0].shape[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Mark Dredze 290\n",
      "Ido Dagan 288\n",
      "David W. Aha 287\n",
      "Vincent Conitzer 277\n",
      "Jiawei Han 272\n",
      "Wolfram Burgard 271\n",
      "Bistra N. Dilkina 271\n",
      "Christian Bessiere 269\n",
      "Yixin Chen 268\n",
      "David Chiang 268\n",
      "Jesse Davis 267\n",
      "Eneko Agirre 267\n",
      "Chitta Baral 265\n",
      "Johan de Kleer 265\n",
      "Diego Calvanese 263\n",
      "Carmel Domshlak 263\n",
      "Fabio Ciravegna 263\n",
      "Giuseppe De Giacomo 259\n",
      "Xiaoping Chen 258\n",
      "Bernardo Cuenca Grau 257\n",
      "Nello Cristianini 257\n",
      "Vincent Cicirello 257\n",
      "Gerhard Brewka 256\n",
      "Adnan Darwiche 255\n",
      "Sanjoy Dasgupta 255\n",
      "Colin Cherry 255\n",
      "Fahiem Bacchus 255\n",
      "Salem Benferhat 254\n",
      "Adi Botea 252\n",
      "Douglas Appelt 252\n",
      "James P. Delgrande 251\n",
      "Vijil Chenthamarakshan 251\n",
      "Bikramjit Banerjee 250\n",
      "Aron Culotta 250\n",
      "Vadim Bulitko 250\n",
      "Li Cheng 249\n",
      "Olivier Buffet 249\n",
      "Prashant Doshi 248\n",
      "Mikhail Bilenko 248\n",
      "Carlos Ansótegui 246\n",
      "Simon de Givry 246\n",
      "Shane Bergsma 246\n",
      "Mark Baillie 246\n",
      "Felix Brandt 245\n",
      "Silvia Coradeschi 245\n",
      "Daniel Bryce 244\n",
      "Franz Baader 244\n",
      "Roy J. Byrd 244\n",
      "Padraig Cunningham 244\n",
      "Piergiorgio Bertoli 244\n",
      "Frank Dellaert 244\n",
      "Blai Bonet 244\n",
      "Robin D. Burke 244\n",
      "Shotaro Akaho 244\n",
      "Philippe Cudré-Mauroux 244\n",
      "Walter Daelemans 243\n",
      "Hideki Asoh 243\n",
      "Philippe Dague 242\n",
      "Michael Buro 242\n",
      "Daniel Borrajo 242\n",
      "Xavier Carreras 242\n",
      "Iadine Chades 242\n",
      "Minh Do 242\n",
      "Philipp Cimiano 242\n",
      "Carlos Diuk 242\n",
      "Fábio Gagliardi Cozman 241\n",
      "Gloria Bordogna 240\n",
      "Arthur Choi 240\n",
      "Ramón Béjar 240\n",
      "Gautam Biswas 240\n",
      "Douglas Aberdeen 240\n",
      "Yves Deville 239\n",
      "Sylvie Coste-Marquis 239\n",
      "Susan Craw 239\n",
      "Guido Boella 239\n",
      "Stefano Bistarelli 239\n",
      "Nate Blaylock 239\n",
      "Emilios Cambouropoulos 239\n",
      "Brett Browning 239\n",
      "Subhashis Banerjee 239\n",
      "Gianni Amati 238\n",
      "Alvaro del Val 238\n",
      "Marco Botta 238\n",
      "Yin Chen 238\n",
      "Lucas Bordeaux 238\n",
      "Philippe Besnard 238\n",
      "Fabrice Clérot 238\n",
      "Kai Arras 238\n",
      "Marco Cadoli 238\n",
      "Alexandre Albore 238\n",
      "Marie-Odile Cordier 237\n",
      "Chen Avin 237\n",
      "Gilles Audemard 237\n",
      "Mark Chavira 237\n",
      "Raju S. Bapi 237\n",
      "Iván Dotú 237\n",
      "Nicolas Beldiceanu 237\n",
      "Jean-François Condotta 237\n",
      "Aurélie Beynier 237\n",
      "Marc Boullé 236\n",
      "Christophe Dousson 236\n",
      "Luca Console 236\n",
      "Oskar Dressler 236\n",
      "Hung Bui 236\n",
      "Younès Bennani 236\n",
      "Yllias Chali 236\n",
      "Andrew Davenport 236\n",
      "Philippe Chatalic 236\n",
      "Yacine Boufkhad 236\n",
      "Boris Chidlovskii 235\n",
      "Yiannis Demiris 235\n",
      "Tristan Cazenave 235\n",
      "Bulusu Deekshatulu 235\n",
      "Amedeo Cesta 235\n",
      "Stuart Bain 235\n",
      "Claudette Cayrol 235\n",
      "Fiorella de Rosis 235\n",
      "Anne Auger 235\n",
      "Klaus-Dieter Althoff 234\n",
      "Antoine Cornuéjols 234\n",
      "Andrea Bonarini 234\n",
      "Carlos Ivan Chesñevar 234\n",
      "Ernest Davis 234\n",
      "Frank Broz 234\n",
      "Karl Branting 234\n",
      "Simon Colton 234\n",
      "M. Bernardine Dias 234\n",
      "Ion Constantinescu 234\n",
      "Victor Dalmau 234\n",
      "Roman Bartak 234\n",
      "AnHai Doan 234\n",
      "Alicia Ageno 234\n",
      "Philippe Bretier 234\n",
      "Hubie Chen 233\n",
      "Matteo Cristani 233\n",
      "Fabrizio Angiulli 233\n",
      "Simon Dixon 233\n",
      "Anuradha Bhamidipaty 233\n",
      "Alexandre Delteil 233\n",
      "Analia Amandi 233\n",
      "Roger Dannenberg 233\n",
      "Anthony Barrett 233\n",
      "Yuan Ding 233\n",
      "Mehran Asadi 233\n",
      "Will Briggs 233\n",
      "Sabine Buchholz 233\n",
      "Maren Bennewitz 233\n",
      "Sam Chapman 233\n",
      "Bram Bakker 232\n",
      "Hei Chan 232\n",
      "Akinori Abe 232\n",
      "Neeharika Adabala 232\n",
      "Stuart Aitken 232\n",
      "Osamu Akashi 232\n",
      "Rama Akkiraju 232\n",
      "Jose Ramon Alvarez Sanchez 232\n",
      "Rema Ananthanarayanan 232\n",
      "Rie Ando 232\n",
      "Henrik Andreasson 232\n",
      "Raghav Aras 232\n",
      "Aluizio Araujo 232\n",
      "Josep-Lluis Arcos 232\n",
      "Antonio Artés 232\n",
      "Naveen Ashish 232\n",
      "Jorge Baier 232\n",
      "Olivier Bailleux 232\n",
      "Sreeram Balakrishnan 232\n",
      "Christian Balkenius 232\n",
      "Antonio Bandera Rubio 232\n",
      "Guilherme Barreto 232\n",
      "Leliane Barros 232\n",
      "Peter Baumgartner 232\n",
      "Paolo Bellutta 232\n",
      "Rachel Ben-Eliyahu 232\n",
      "Hachemi Bennaceur 232\n",
      "Andraz Bezek 232\n",
      "Peter Biber 232\n",
      "M. Brian Blake 232\n",
      "Bastian Blankenburg 232\n",
      "Olivier Boissier 232\n",
      "Alex Borgida 232\n",
      "Cecile Bothorel 232\n",
      "Vicente J. Botti 232\n",
      "Paolo Bouquet 232\n",
      "Michael Bowling 232\n",
      "Antonio Braga 232\n",
      "Arthur Braga 232\n",
      "Sebastian Brand 232\n",
      "Jonathan Bredin 232\n",
      "Christopher Brewster 232\n",
      "Darin Brezeale 232\n",
      "Derek Bridge 232\n",
      "Ismel Brito 232\n",
      "Ken Brown 232\n",
      "Ernesto Burattini 232\n",
      "Stefan Byttner 232\n",
      "Philippe Caillou 232\n",
      "Christopher Callison 232\n",
      "Joe Carsten 232\n",
      "Andre Carvalho 232\n"
     ]
    }
   ],
   "source": [
    "neighborhoods = [(a, len(set([t[0] for t in n]))) for a, n in coauthor_network.items()]\n",
    "sorted_by_coauthors = sorted(neighborhoods, key=lambda x:x[1], reverse=True)\n",
    "for (aid, size) in sorted_by_coauthors[:200]:\n",
    "    print(author_map[aid], size)\n",
    "\n",
    "# (aid, size) = sorted_by_coauthors[75]\n",
    "# unique_coauthors = set([t[0] for t in coauthor_network[aid]])\n",
    "# print(author_map[aid], len(unique_coauthors))\n",
    "# for (coauth) in unique_coauthors:\n",
    "#     print(author_map[coauth], coauth)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(6.875133667845685, 1.9816977872830468)"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "distinct_coauthors = []\n",
    "duplicates = []\n",
    "for a, n in coauthor_network.items():\n",
    "    authors_only = [t[0] for t in n]\n",
    "    distinct_coauthors.append(len(set(authors_only)))\n",
    "    duplicates.append(len(n) - len(set(authors_only)))\n",
    "np.mean(distinct_coauthors), np.mean(duplicates)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(15.074233559560028, 207, 42730)"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "neighborhoods = [len(n) for a, n in citation_network.items()]\n",
    "np.mean(neighborhoods), np.max(neighborhoods), len(neighborhoods)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "author_citation_map = {}\n",
    "for pid, authors in paper_authors.items():\n",
    "    if pid not in citation_network:\n",
    "        continue\n",
    "    \n",
    "    cited_papers = citation_network[pid]\n",
    "    for a in authors:\n",
    "        if a in author_citation_map:\n",
    "            author_citation_map[a].extend([(pid, p) for p in cited_papers])\n",
    "        else:\n",
    "            author_citation_map[a] = [(pid, p) for p in cited_papers]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "author_pair_papers = {}\n",
    "for author, coauthors in coauthor_network.items():\n",
    "    for (coauth_id, pid) in coauthors:\n",
    "        if (author, coauth_id) in author_pair_papers:\n",
    "            author_pair_papers[(author, coauth_id)].add(pid)\n",
    "        else:\n",
    "            author_pair_papers[(author, coauth_id)] = {pid}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "citation_overlap = []\n",
    "for aid, coauth in coauthor_network.items():\n",
    "    if aid not in author_citation_map:\n",
    "        continue\n",
    "    cited = [pair[1] for pair in author_citation_map[aid]]\n",
    "    for (ca, p) in coauth:\n",
    "        if ca not in author_citation_map:\n",
    "            continue\n",
    "        coauthored_paper_set = author_pair_papers[(aid, ca)]\n",
    "        coauth_cited = [pair[1] for pair in author_citation_map[ca] if pair[0] not in coauthored_paper_set]\n",
    "        citation_overlap.append(len(set(cited) & set(coauth_cited)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "7.431196516490803"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.mean(citation_overlap)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "year_cutoff = 2010\n",
    "past_auth_citations = {}\n",
    "for pid, cited in citation_network.items():\n",
    "    if paper_year[pid] > year_cutoff:\n",
    "        continue\n",
    "    for author in paper_authors[pid]:\n",
    "        if author in past_auth_citations:\n",
    "            past_auth_citations[author] |= set(cited)\n",
    "        else:\n",
    "            past_auth_citations[author] = set(cited)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(85864, 24115)"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "past_cited_papers = set.union(*past_auth_citations.values())\n",
    "M = len(past_cited_papers) \n",
    "N = len(past_auth_citations.keys())\n",
    "M,N"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "present_outcomes = {}\n",
    "for pid, cited in citation_network.items():\n",
    "    if paper_year[pid] <= year_cutoff:\n",
    "        continue\n",
    "    for author in paper_authors[pid]:\n",
    "        if author in present_outcomes:\n",
    "            present_outcomes[author] |= set(cited) & past_cited_papers\n",
    "        else:\n",
    "            present_outcomes[author] = set(cited) & past_cited_papers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(27877, 26414)"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "present_cited_papers = set.union(*present_outcomes.values())\n",
    "M = len(present_cited_papers)\n",
    "N = len(present_outcomes.keys())\n",
    "M,N"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
