{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# imports\n",
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "# import tweepy\n",
    "import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load one hourly tweet-ID file from the repository to inspect its layout\n",
    "data_url = \"https://raw.githubusercontent.com/echen102/COVID-19-TweetIDs/master/2021-10/coronavirus-tweet-id-2021-10-01-00.txt\"\n",
    "# Raw string for the regex separator: '\\s' inside a plain literal is an invalid escape sequence\n",
    "tweet_data = pd.read_csv(data_url, header=None, sep=r'\\s+')\n",
    "# header=None yields integer column labels, so the values live in column 0; preview rows 0 and 3\n",
    "tweet_data.iloc[[0, 3]][0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scrape tweet_ids from github randomly and uniformly across time.\n",
    "# For every sampled hour of 2020-2021 the matching hourly file is downloaded, a fixed\n",
    "# number of rows is drawn at random, and the drawn IDs are appended to tweet_ids.txt.\n",
    "\n",
    "# --- sampling budget ---\n",
    "number_scraped = 100000/0.6359 # number to be scraped taking into account number of expected english tweets we'll find\n",
    "number_per_month = number_scraped/23 # 23 months of covid\n",
    "# NOTE(review): the loops below visit all 24 calendar months of 2020-2021 - confirm that 23 is intended.\n",
    "\n",
    "def number_per_day(month):\n",
    "    \"\"\"Return the daily sampling budget for `month` (int, 1-12); 31-day months get a smaller share.\"\"\"\n",
    "    return number_per_month/31 if month in [1, 3, 5, 7, 8, 10, 12] else number_per_month/30\n",
    "\n",
    "def number_per_hour(month):\n",
    "    \"\"\"Return the whole number of IDs to draw from each hourly file of `month` (int, 1-12).\"\"\"\n",
    "    # 16 = number of sampled hours per day (08-23); least activity is midnight to 8am:\n",
    "    # https://smallbusiness.chron.com/peak-times-twitter-activity-62864.html\n",
    "    return int(number_per_day(month)/16)\n",
    "\n",
    "# --- time grid ---\n",
    "years = ['2020', '2021']\n",
    "months = list(range(1, 13))\n",
    "hours = list(range(8, 24))\n",
    "\n",
    "def days(month, year=2021):\n",
    "    \"\"\"Return the day numbers (starting at 1) of `month` in `year`, following the real calendar.\"\"\"\n",
    "    # int() lets callers pass either numbers or digit strings such as '02' / '2020'\n",
    "    last_day = pd.Timestamp(year=int(year), month=int(month), day=1).days_in_month\n",
    "    return list(range(1, last_day + 1))\n",
    "\n",
    "# (month, day, first_hour, last_hour) ranges of 2020 that must be skipped\n",
    "# presumably hours for which the repository has no file - TODO confirm\n",
    "banned = [('03', '02', 0, 23), ('02', '23', 0, 23), ('02', '22', 21, 23), ('02', '01', 4, 9)]\n",
    "\n",
    "def is_banned(year, month, day, hour):\n",
    "    \"\"\"Return True when `hour` (int) of zero-padded `month`/`day` in `year` lies inside a banned range.\"\"\"\n",
    "    # Pure lookup: `banned` is never mutated, so an exhausted list can no longer be indexed\n",
    "    return year == '2020' and any(\n",
    "        month == banned_month and day == banned_day and first_hour <= hour <= last_hour\n",
    "        for banned_month, banned_day, first_hour, last_hour in banned\n",
    "    )\n",
    "\n",
    "base_url = 'https://raw.githubusercontent.com/echen102/COVID-19-TweetIDs/master/' # e.g. 2021-10/coronavirus-tweet-id-2021-10-01-00.txt\n",
    "rng = random.Random(42) # dedicated, seeded generator so a re-run draws the same sample\n",
    "\n",
    "with open('tweet_ids.txt', 'w') as f:\n",
    "    for year in years:\n",
    "        for month_num in months:\n",
    "            month = f'{month_num:02d}'\n",
    "            print('Writing Month: ' + month)\n",
    "            # pass the numeric month: the zero-padded string is only for building URLs\n",
    "            for day_num in days(month_num, year):\n",
    "                day = f'{day_num:02d}'\n",
    "                # nothing before 2020-01-22 is requested (zero-padded strings compare like numbers)\n",
    "                if year == '2020' and month == '01' and day < '22':\n",
    "                    continue\n",
    "                for hour_num in hours:\n",
    "                    if is_banned(year, month, day, hour_num):\n",
    "                        continue\n",
    "                    hour = f'{hour_num:02d}'\n",
    "                    url = f'{base_url}{year}-{month}/coronavirus-tweet-id-{year}-{month}-{day}-{hour}.txt'\n",
    "                    # read this hour's own file (not the fixed sample file from the previous cell)\n",
    "                    hourly_ids = pd.read_csv(url, header=None, sep=r'\\s+')\n",
    "                    # random.sample raises ValueError if asked for more items than exist\n",
    "                    sample_size = min(number_per_hour(month_num), len(hourly_ids))\n",
    "                    indexes = rng.sample(range(len(hourly_ids)), sample_size)\n",
    "                    # one ID per output line; the with-block closes the file on exit\n",
    "                    f.writelines(f'{tweet_id}\\n' for tweet_id in hourly_ids.iloc[indexes][0])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
