{
 "cells": [
  {
   "cell_type": "code",
   "id": "initial_id",
   "metadata": {
    "collapsed": true,
    "ExecuteTime": {
     "end_time": "2024-09-28T13:59:52.748126Z",
     "start_time": "2024-09-28T13:59:52.512659Z"
    }
   },
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "import networkx as nx\n",
    "import numpy as np\n",
    "data_dir = 'data'\n",
    "data_fn = 'COVIDW1_W6.csv'\n",
    "data_path = os.path.join(data_dir, data_fn)"
   ],
   "outputs": [],
   "execution_count": 1
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-28T13:59:54.460511Z",
     "start_time": "2024-09-28T13:59:52.871821Z"
    }
   },
   "cell_type": "code",
   "source": "data = pd.read_csv(data_path)",
   "id": "d4a72c454ac49654",
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_66932/1675668717.py:1: DtypeWarning: Columns (0,5,1462,1463,1464,1465,1466,1467,1469,1470,1471,1472,1474,1475,1476,1477,1478,1479,1481,1490,1491,1505,1506,1507,1508,1509,1510,1512,1513,1514,1515,1516,1517,1518,1519,1520,1521,1522,1523,1524,1525,1526,1527,1528,1529,1530,1536,1537,1538,1539,1540,1541,1542,1543,1544,1545,1546,1547,1548,1549,1550,1551,1552,1553,1554,1555,1556,1557,1558,1559,1560,1561,1562,1564,1565,1566,1567,1568,1569,1570,1571,1572,1573,1574,1575,1576,1577,1578,1579,1580,1581,1582,1583,1584,1585,1586,1587,1588,1589,1590,1591,1592,1593,1594,1595,1596,1597,1598,1599,1600,1601,1602,1603,1616,1617,1618,1619,1620,1621,1622,1623,1624,1625,1626,1627,1628,1629,1630,1631,1632,1633,1634,1635,1636,1637,1638,1639,1640,1641,1642,1643,1644,1645,1646,1647,1648,1649,1650,1651,1652,1653,1654,1655,1656,1657,1658,1659,1660,1661,1662,1663,1664,1665,1666,1667,1668,1669,1670,1671,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682,1683,1684,1685,1686,1687,1688,1689,1690,1691,1692,1693,1694,1695,1696,1697,1698,1699,1700,1701,1702,1703,1704,1705,1706,1707,1709,1710,1711,1712,1713,1714,1715,1716,1717,1718,1719,1720,1721,1722,1723,1724,1725,1726,1727,1728,1729,1730,1731,1732,1733,1734,1735,1736,1737,1738,1739,1740,1741,1742,1743,1744,1745,1748,1749,1750,1751,1752,1753,1754,1755,1756,1757,1758,1759,1760,1761,1762,1763,1764,1765,1766,1767,1768,1769,1770,1771,1772,1773,1774,1775,1776,1777,1778,1779,1780,1781,1782,1783,1785,1786,1787,1788,1789,1790,1791,1792,1793,1794,1795,1796,1797,1798,1799,1800,1801,1802,1803,1804,1805,1807,1808,1809,1810,1811,1812,1813,1814,1815,1816,1817,1818,1819,1820,1821,1822,1823,1824,1825,1826,1827,1828,1829,1830,1831,1832,1833,1834,1835,1836,1837,1838,1839,1840,1841,1842,1843,1844,1845,1846,1847,1848,1849,1850,1851,1852,1854,1855,1856,1857,1858,1859,1860,1861,1862,1863,1864,1865,1866,1867,1868,1869,1870,1871,1872,1873,1874,1875,1876,1877,1878,1879,1880,1881,1882,1883,1884,1885,1886,1887,1888,1889,1890,1891,1892,1893,1894,1895,1896,1897,1898,1899,1900,1901,1902,1903,1904,1907,1908,1909,1910,1911,1912,1914,1915,1916,1917,1918,1919,1921,1922,1923,1925,1926,1927,1929,1930,1931,1932,1933,1934,1935,1936,1937,1938,1939,1940,1941,1942,1943,1944,1945,1946,1947,1948,1949,1950,1951,1952,1953,1954,1955,1956,1957,1958,1959,1960,1961) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  data = pd.read_csv(data_path)\n"
     ]
    }
   ],
   "execution_count": 2
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-28T13:59:55.437114Z",
     "start_time": "2024-09-28T13:59:55.433142Z"
    }
   },
   "cell_type": "code",
   "source": [
    "w2_cols = []\n",
    "for col in data.columns:\n",
    "    if 'W2' in col:\n",
    "        w2_cols.append(col)\n",
    "    if 'W2_Relationships' in col:\n",
    "        print(col)"
   ],
   "id": "e855129fde7cf685",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "W2_Relationships1\n",
      "W2_Relationships2\n",
      "W2_Relationships3\n",
      "W2_Relationships4\n"
     ]
    }
   ],
   "execution_count": 3
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-28T13:59:56.306896Z",
     "start_time": "2024-09-28T13:59:56.302181Z"
    }
   },
   "cell_type": "code",
   "source": [
    "\n",
    "    \n",
    "    \n",
    "w2_age = ['W2_Age_year']\n",
    "w2_keyworker = ['W2_Keyworker']\n",
    "w2_adults = ['W2_Adults_household']\n",
    "w2_children = ['W2_Children_household'] \n",
    "w2_income = ['W2_Change_Income'] \n",
    "w2_covid_anx = ['W2_COVID19_anxiety']\n",
    "w2_risk1month = ['W2_RISK_1month'] \n",
    "\n",
    "w2_gender = ['W2_Gender']\n",
    "w2_attachment = ['W2_Attach_Style']\n",
    "w2_loneliness = ['W2_Loneliness1',\n",
    "                'W2_Loneliness2',\n",
    "                'W2_Loneliness3']\n",
    "w2_chronicill = ['W2_Chronic_illness_self']\n",
    "w2_rel_stat = ['W2_Relationship']\n",
    "w2_social_dist = ['W2_SocialDistance1',\n",
    "                'W2_SocialDistance2',\n",
    "                'W2_SocialDistance3',\n",
    "                'W2_SocialDistance4',\n",
    "                'W2_SocialDistance5',\n",
    "                'W2_SocialDistance6',\n",
    "                'W2_SocialDistance7',\n",
    "                'W2_SocialDistance8',\n",
    "                'W2_SocialDistance9',\n",
    "                'W2_SocialDistance10',\n",
    "                'W2_SocialDistance11',\n",
    "                'W2_SocialDistance12',\n",
    "                'W2_SocialDistance13',\n",
    "                'W2_SocialDistance14',\n",
    "                'W2_SocialDistance15',\n",
    "                'W2_SocialDistance16']\n",
    "\n",
    "w2_depression = ['W2_Dep_1',\n",
    "            'W2_Dep_2',\n",
    "            'W2_Dep_3',\n",
    "            'W2_Dep_4',\n",
    "            'W2_Dep_5',\n",
    "            'W2_Dep_6',\n",
    "            'W2_Dep_7',\n",
    "            'W2_Dep_8',\n",
    "            'W2_Dep_9']\n",
    "w2_anx = ['W2_GAD_1',\n",
    "        'W2_GAD_2',\n",
    "        'W2_GAD_3',\n",
    "        'W2_GAD_4',\n",
    "        'W2_GAD_5',\n",
    "        'W2_GAD_6',\n",
    "        'W2_GAD_7']\n",
    "\n",
    "# wave 3 variables\n",
    "w3_age = ['W2_Age_year']  \n",
    "w3_gender = ['W3_Age_year']\n",
    "w3_attachment = ['W3_Attach_style']\n",
    "# wave 1 variables\n",
    "w1_age = ['W1_Age_year']  \n",
    "w1_gender = ['W1_Age_year']\n",
    "w1_attachment = ['W3_Attach_style']  # no W1 value exists, so just try W3 again\n",
    "\n",
    "all_vars = [w2_anx, w2_depression, w2_social_dist, w2_age, w2_keyworker, w2_adults, w2_children, w2_income, w2_covid_anx, w2_risk1month,\n",
    "            w2_gender, w2_attachment, w2_loneliness, w2_chronicill, w2_rel_stat]"
   ],
   "id": "59e30c10ba6c7bd4",
   "outputs": [],
   "execution_count": 4
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-28T13:59:59.081814Z",
     "start_time": "2024-09-28T13:59:56.959131Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# convert to float:\n",
    "def convert_columns_to_float(df, columns):\n",
    "    df[columns] = df[columns].apply(pd.to_numeric, errors='coerce')\n",
    "    \n",
    "    \n",
    "for construct in [item for sublist in all_vars for item in sublist]:\n",
    "    convert_columns_to_float(data, construct)\n"
   ],
   "id": "8080940688d7884a",
   "outputs": [],
   "execution_count": 5
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-28T13:59:59.092168Z",
     "start_time": "2024-09-28T13:59:59.082890Z"
    }
   },
   "cell_type": "code",
   "source": [
    "\n",
    "# forward backward filling for sets to fill:\n",
    "sets_to_fill = [w2_age, w2_gender, w2_attachment]\n",
    "for col_w2, col_w1, col_w3 in zip(sets_to_fill, [w1_age, w1_gender, w1_attachment], [w3_age, w3_gender, w3_attachment]):\n",
    "    \n",
    "    data[col_w2[0]] = data[col_w2[0]].combine_first(data[col_w1[0]])  # Fill missing with wave 1 values\n",
    "    data[col_w2[0]] = data[col_w2[0]].combine_first(data[col_w3[0]]) \n"
   ],
   "id": "5fcc37a89b0bd2c",
   "outputs": [],
   "execution_count": 6
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-28T13:59:59.127975Z",
     "start_time": "2024-09-28T13:59:59.093025Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# list wise deletion for relevant variables\n",
    "data = data[[item for sublist in all_vars for item in sublist]]\n",
    "print(data.shape)\n",
    "data = data.dropna()\n",
    "print(data.shape)"
   ],
   "id": "dfc8f568b91708cd",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(5364, 46)\n",
      "(1406, 46)\n"
     ]
    }
   ],
   "execution_count": 7
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-28T13:59:59.135295Z",
     "start_time": "2024-09-28T13:59:59.129055Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# deal with -99 and other anomolous values:\n",
    "\n",
    "# remove -99 in W2_Keyworker\n",
    "# change -9 to 0 in keyworker\n",
    "# change -9 to 0 in Adults household\n",
    "# remove -99 in children household\n",
    "# change -9 to -1 in children household\n",
    "# remove -99 from attachment style\n",
    "\n",
    "\n",
    "# Step 1: Listwise delete rows where -99 is the value of W2 Keyworker\n",
    "data = data[data[w2_keyworker[0]] != -99]\n",
    "\n",
    "# Step 2: Change -9 to 0.0 in W2 Keyworker\n",
    "data[w2_keyworker[0]] = data[w2_keyworker[0]].replace(-9, 0.0)\n",
    "\n",
    "# Step 3: Change -9 to 0.0 in Adults household\n",
    "data[w2_adults[0]] = data[w2_adults[0]].replace(-9, 0.0)\n",
    "\n",
    "# Step 4: Listwise delete rows where -99 is the value of Children household\n",
    "data = data[data[w2_children[0]] != -99]\n",
    "\n",
    "# Step 5: Change -9 to -1.0 in Children household\n",
    "data[w2_children[0]] = data[w2_children[0]].replace(-9, -1.0)\n",
    "\n",
    "# Step 6: Listwise delete rows where -99 is the value of Attachment style\n",
    "data = data[data[w2_attachment[0]] != -99]\n",
    "\n",
    "print(data.shape)"
   ],
   "id": "6ec4e19318d6fb8c",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(895, 46)\n"
     ]
    }
   ],
   "execution_count": 8
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-28T14:00:06.034915Z",
     "start_time": "2024-09-28T14:00:06.016435Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# export\n",
    "data.to_csv(os.path.join(data_dir, 'real_world_dataset.csv'), index=False)\n",
    "\n",
    "\n"
   ],
   "id": "19af4ce491bbb754",
   "outputs": [],
   "execution_count": 9
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-27T11:50:12.842434Z",
     "start_time": "2024-09-27T11:50:12.840595Z"
    }
   },
   "cell_type": "code",
   "source": "",
   "id": "23f76aff30cc1674",
   "outputs": [],
   "execution_count": 12
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-27T11:50:13.165272Z",
     "start_time": "2024-09-27T11:50:13.163398Z"
    }
   },
   "cell_type": "code",
   "source": "",
   "id": "ce0b878ec386f3bf",
   "outputs": [],
   "execution_count": 12
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-20T15:52:52.754062Z",
     "start_time": "2024-09-20T15:52:52.752615Z"
    }
   },
   "cell_type": "code",
   "source": "",
   "id": "ea333475fd167963",
   "outputs": [],
   "execution_count": 9
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-20T15:52:52.756215Z",
     "start_time": "2024-09-20T15:52:52.754728Z"
    }
   },
   "cell_type": "code",
   "source": "",
   "id": "d537728e62b0d974",
   "outputs": [],
   "execution_count": 9
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
