{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f265100f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "module_path = os.path.abspath(os.path.join('../..'))\n",
    "if module_path not in sys.path:\n",
    "    sys.path.append(module_path)\n",
    "# Use pygeos in geopandas\n",
    "os.environ['USE_PYGEOS'] = '0'\n",
    "\n",
    "import json\n",
    "import warnings\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import networkx as nx\n",
    "import geopandas as gpd\n",
    "import matplotlib as mpl\n",
    "# import modin.pandas as mpd\n",
    "import matplotlib.cm as cm\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "from glob import glob\n",
    "from math import comb\n",
    "from pyproj import Proj\n",
    "from datetime import date\n",
    "from copy import deepcopy\n",
    "from functools import partial\n",
    "from argparse import Namespace\n",
    "from scipy.spatial import KDTree\n",
    "from sklearn.cluster import KMeans\n",
    "from itertools import islice,product\n",
    "from tqdm.auto import tqdm\n",
    "# from modin.config import ProgressBar\n",
    "# from distributed import Client, LocalCluster\n",
    "from scipy.spatial.distance import squareform,pdist,cdist\n",
    "from mpl_toolkits.axes_grid1 import make_axes_locatable\n",
    "from shapely.geometry import Point,LineString, Polygon, mapping, MultiPoint, box\n",
    "\n",
    "\n",
    "# from gensit.utils import *\n",
    "# from gensit.notebook_functions import *\n",
    "\n",
    "# ProgressBar.enable()\n",
    "\n",
    "# mpl.rcParams['agg.path.chunksize'] = 10000\n",
    "\n",
    "%matplotlib inline\n",
    "\n",
    "# AUTO RELOAD EXTERNAL MODULES\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "80c30bfa",
   "metadata": {},
   "source": [
    "# Cambridge commuter\n",
    "## Import table and geometries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bcf844b4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Expertiment id\n",
    "geometry_name = 'lsoas_to_msoas'\n",
    "origin_geometry_name = 'lsoa'\n",
    "destination_geometry_name = 'msoa'\n",
    "# 'msoa'\n",
    "# 'lsoa'\n",
    "# 'oa'\n",
    "dataset = f'cambridge_work_commuter_{geometry_name}'\n",
    "table_filename = 'lower_super_output_areas_to_medium_super_output_areas_work_flows_cambridge_2011'\n",
    "# 'lower_super_output_areas_work_flows_cambridge_2011'\n",
    "# 'middle_super_output_areas_work_flows_cambridge_2011'\n",
    "# 'lower_super_output_areas_work_flows_cambridge_2011'\n",
    "# 'output_areas_work_flows_cambridge_2011'\n",
    "# 'lower_super_output_areas_to_medium_super_output_areas_work_flows_cambridge_2011'\n",
    "\n",
    "# Define directory\n",
    "table_path = f'../data/raw/cambridge_commuter/{table_filename}.csv'\n",
    "geometries_path = f'../data/raw/cambridge_commuter/cambridge_{geometry_name}.geojson'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "09d703ea",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read in destination attraction\n",
    "destination_employment = pd.read_csv(f'../data/raw/cambridge_commuter/employment_survey_{destination_geometry_name}.csv',header=None)\n",
    "destination_employment.columns = [f'{destination_geometry_name}_id','number_of_jobs']\n",
    "destination_employment[f'{destination_geometry_name}_id'] = destination_employment[f'{destination_geometry_name}_id'].apply(lambda x: x.split(' : ')[0])\n",
    "destination_employment = destination_employment.sort_values(f'{destination_geometry_name}_id')\n",
    "destination_number_of_jobs = destination_employment['number_of_jobs'].values\n",
    "\n",
    "origin_employment = pd.read_csv(f'../data/raw/cambridge_commuter/employment_survey_{origin_geometry_name}.csv',header=None)\n",
    "origin_employment.columns = [f'{origin_geometry_name}_id','number_of_jobs']\n",
    "origin_employment[f'{origin_geometry_name}_id'] = origin_employment[f'{origin_geometry_name}_id'].apply(lambda x: x.split(' : ')[0])\n",
    "origin_employment = origin_employment.sort_values(f'{origin_geometry_name}_id')\n",
    "origin_number_of_jobs = origin_employment['number_of_jobs'].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a1cf97e2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read table\n",
    "table = pd.read_csv(table_path,index_col=0)\n",
    "# Store first column\n",
    "origin_geometry_ids = sorted(table.index.values)\n",
    "destination_geometry_ids = sorted(table.columns.values)\n",
    "geometry_ids = np.append(origin_geometry_ids,destination_geometry_ids)\n",
    "# Sort columns and rows alphabetically\n",
    "table = table[destination_geometry_ids]\n",
    "table = table.sort_index()\n",
    "# Convert to array\n",
    "table = table.values\n",
    "\n",
    "# Get dimensions\n",
    "I,J = np.shape(table)\n",
    "\n",
    "# Read geometries\n",
    "geometries = gpd.read_file(geometries_path)\n",
    "# Reproject\n",
    "geometries = geometries.set_crs('epsg:27700',allow_override=True)\n",
    "\n",
    "if geometry_name == 'lsoas':\n",
    "    geometries = geometries.rename(columns={\"LSOA11CD\":\"geometry_id\"})\n",
    "else:\n",
    "    geometries = geometries.rename(columns={\"code\":\"geometry_id\"})\n",
    "\n",
    "geometries['geometry_id'] = geometries['geometry_id'].astype(str)\n",
    "geometries = geometries.set_index(\"geometry_id\")\n",
    "# Reindex by table geometry ids\n",
    "geometries = geometries.reindex(origin_geometry_ids) if origin_geometry_name == destination_geometry_name else geometries.reindex(origin_geometry_ids+destination_geometry_ids)\n",
    "geometries = geometries.reset_index()\n",
    "# Extract centroids\n",
    "geometries[\"centroid\"] = geometries.centroid\n",
    "geometries[\"LONG\"] = geometries.centroid.x\n",
    "geometries[\"LAT\"] = geometries.centroid.y\n",
    "# Get all relevant columns\n",
    "geometries = geometries[[\"geometry_id\",\"LONG\",\"LAT\",\"geometry\"]]\n",
    "geometries.loc[geometries.geometry_id.isin(origin_geometry_ids),'geometry_type'] = origin_geometry_name\n",
    "geometries.loc[geometries.geometry_id.isin(destination_geometry_ids),'geometry_type'] = destination_geometry_name\n",
    "\n",
    "# Write geometries to file\n",
    "# geometries.set_index('geometry_id').to_file(f\"../data/inputs/{dataset}/{geometry_name}.geojson\", driver='GeoJSON')\n",
    "\n",
    "# Add rowsums and column sums to geometries\n",
    "geometries.loc[geometries.geometry_type == origin_geometry_name,'origin_demand'] = table.sum(axis=1)\n",
    "geometries.loc[geometries.geometry_type == destination_geometry_name,'destination_demand'] = table.sum(axis=0)\n",
    "geometries.loc[geometries.geometry_type == destination_geometry_name,'number_of_jobs'] = destination_number_of_jobs\n",
    "geometries.loc[geometries.geometry_type == origin_geometry_name,'number_of_jobs'] = origin_number_of_jobs\n",
    "\n",
    "# Find all geometries around boundary \n",
    "origin_boundary_polygon_ids = geometries[(geometries.geometry_type==origin_geometry_name)&(geometries.intersects(geometries.unary_union.boundary))].geometry_id.values\n",
    "destination_boundary_polygon_ids = geometries[(geometries.geometry_type==destination_geometry_name)&(geometries.intersects(geometries.unary_union.boundary))].geometry_id.values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2a09570a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read in facilities\n",
    "# facilities = gpd.read_file('../data/raw/cambridge_commuter/facilities.geojson')\n",
    "facilities = gpd.read_file('../data/raw/cambridge_commuter/facilities_v2.geojson')\n",
    "facilities = facilities.set_crs('epsg:27700',allow_override=True)\n",
    "cambridge_facilities = gpd.sjoin(facilities, geometries, how='inner', predicate='within')\n",
    "\n",
    "# Get only facilities for specific activities\n",
    "cambridge_facilities = cambridge_facilities[cambridge_facilities['activities'].str.contains('home') | cambridge_facilities['activities'].str.contains('work')]\n",
    "# Filter out retired and student homes\n",
    "cambridge_facilities = cambridge_facilities[~(cambridge_facilities['activities'].str.contains('retired_home')) &\n",
    "                                            ~(cambridge_facilities['activities'].str.contains('student_home')) &\n",
    "                                            ~(cambridge_facilities['activities'].str.contains('transit'))]\n",
    "\n",
    "# Discern between home-only and work activities\n",
    "cambridge_facilities.loc[cambridge_facilities['activities'].str.contains('home',na=False),'main_activity'] = 'home'\n",
    "cambridge_facilities.loc[~cambridge_facilities['activities'].str.contains('home',na=False),'main_activity'] = 'work'\n",
    "# cambridge_facilities.loc[~cambridge_facilities['activities'].isin(['home']),'main_activity'] = 'work'\n",
    "\n",
    "# Create facility_id\n",
    "cambridge_facilities.loc[:,'facility_id'] = cambridge_facilities['main_activity'] + '_' + cambridge_facilities['id']\n",
    "\n",
    "# Get all home locations in origin geometry\n",
    "home_locs = cambridge_facilities[(cambridge_facilities.main_activity == 'home') & (cambridge_facilities.geometry_type == origin_geometry_name)]\n",
    "# Get all work locations in destination geometry\n",
    "work_locs = cambridge_facilities[(cambridge_facilities.main_activity == 'work') & (cambridge_facilities.geometry_type == destination_geometry_name)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ff2420b1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count number of facilities per geography\n",
    "work_facility_count = cambridge_facilities[cambridge_facilities['main_activity']=='work'].groupby(['geometry_id']).size().reset_index(name='work_facility_count')\n",
    "home_facility_count = cambridge_facilities[cambridge_facilities['main_activity']=='home'].groupby(['geometry_id']).size().reset_index(name='home_facility_count')\n",
    "geometries = pd.merge(geometries,work_facility_count,on='geometry_id',how='left')\n",
    "geometries = pd.merge(geometries,home_facility_count,on='geometry_id',how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f5d7ae88",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert table to geopandas\n",
    "table_df = pd.DataFrame(table,index=origin_geometry_ids,columns=destination_geometry_ids)\n",
    "# Create pairs of flows instead of 2d flows\n",
    "table_df = table_df.stack().reset_index()\n",
    "# Rename columns\n",
    "table_df.rename(columns={\"level_0\":\"origin\",\"level_1\":\"destination\",0:\"flow\"},inplace=True)\n",
    "# Attach origin geometry\n",
    "table_df = table_df.merge(\n",
    "                geometries[['geometry_id','LONG','LAT','geometry','origin_demand']].set_index('geometry_id'),\n",
    "                left_on='origin',\n",
    "                right_index=True,\n",
    "                how='left'\n",
    ")\n",
    "# Rename geometries\n",
    "table_df.rename(columns={\"LONG\":\"origin_long\",\"LAT\":\"origin_lat\",\"geometry\":\"origin_geometry\"},inplace=True)\n",
    "# Attach destination geometry\n",
    "table_df = table_df.merge(\n",
    "                geometries[['geometry_id','LONG','LAT','geometry','destination_demand']].set_index('geometry_id'),\n",
    "                left_on='destination',\n",
    "                right_index=True,\n",
    "                how='left'\n",
    ")\n",
    "# Rename geometries\n",
    "table_df.rename(columns={\"LONG\":\"destination_long\",\"LAT\":\"destination_lat\",\"geometry\":\"destination_geometry\"},inplace=True)\n",
    "\n",
    "# Convert to geopandas\n",
    "table_gdf = gpd.GeoDataFrame(table_df,geometry='origin_geometry')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "efb43940",
   "metadata": {},
   "outputs": [],
   "source": [
    "adjacency_matrix = []\n",
    "for i,orig_geom in geometries[geometries.geometry_type=='lsoa'].iterrows():\n",
    "    for j,dest_geom in geometries[geometries.geometry_type=='msoa'].iterrows():\n",
    "        if orig_geom.geometry.intersects(dest_geom.geometry):\n",
    "            adjacency_matrix.append([orig_geom.geometry_id,dest_geom.geometry_id,1])\n",
    "            # Add ones to the augmented matrix (origin+destination x origin+destination)\n",
    "            adjacency_matrix.append([dest_geom.geometry_id,orig_geom.geometry_id,1])\n",
    "        else:\n",
    "            adjacency_matrix.append([orig_geom.geometry_id,dest_geom.geometry_id,0])\n",
    "            # Add zeros to the augmented matrix (origin+destination x origin+destination)\n",
    "            adjacency_matrix.append([dest_geom.geometry_id,orig_geom.geometry_id,0])\n",
    "\n",
    "# Add ones to the augmented matrix (origin+destination x origin+destination)\n",
    "# for i,orig_geom in geometries[geometries.geometry_type=='lsoa'].iterrows():\n",
    "#     adjacency_matrix.append([orig_geom.geometry_id,orig_geom.geometry_id,1])\n",
    "# for j,dest_geom in geometries[geometries.geometry_type=='msoa'].iterrows():\n",
    "#     adjacency_matrix.append([dest_geom.geometry_id,dest_geom.geometry_id,1])\n",
    "\n",
    "adjacency_matrix = pd.DataFrame(adjacency_matrix,columns=['origin','destination','adjacency'])\n",
    "adjacency_matrix = adjacency_matrix.pivot(index='origin', columns='destination', values='adjacency')\n",
    "# Replace nulls with zeros - do this only for the augemented matrix\n",
    "adjacency_matrix = adjacency_matrix.fillna(0).astype('int32')\n",
    "# identity_adjacency_matrix = np.ones((I,J),dtype='int32')\n",
    "# Do this for the augmented matrix\n",
    "identity_adjacency_matrix = np.ones((I+J,I+J),dtype='int32')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c0a165ed",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Region features\n",
    "region_features = deepcopy(geometries)\n",
    "region_features['demand'] = region_features['origin_demand'].combine_first(region_features['destination_demand'])\n",
    "region_features = region_features[['geometry_id','LONG','LAT','demand','number_of_jobs','geometry_type']]\n",
    "region_features['geometry_id'] = region_features['geometry_id'].astype('str')\n",
    "region_features = region_features.set_index('geometry_id')\n",
    "\n",
    "origin_region_features = region_features[region_features.geometry_type=='lsoa'].drop(columns=['geometry_type'])\n",
    "destination_region_features = region_features[region_features.geometry_type=='msoa'].drop(columns=['geometry_type'])\n",
    "region_features = region_features.drop(columns=['geometry_type',\"LONG\",\"LAT\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f6a265db",
   "metadata": {},
   "outputs": [],
   "source": [
    "np.save(os.path.join(f'../data/inputs/{dataset}/region_features.npy'),region_features.to_numpy())\n",
    "# np.save(os.path.join(f'../data/inputs/{dataset}/origin_region_features.npy'),origin_region_features.to_numpy())\n",
    "# np.save(os.path.join(f'../data/inputs/{dataset}/destination_region_features.npy'),destination_region_features.to_numpy())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "56422c3a",
   "metadata": {},
   "outputs": [],
   "source": [
    "cm_filename = \"\"\"../data/inputs/cambridge_work_commuter_lsoas_to_msoas/cost_matrices/clustered_facilities_sample_20x20_20_01_2023_sample_20x20_clustered_facilities_ripleys_k_500_euclidean_points%_prob_origin_destination_adjusted_normalised_boundary_only_edge_corrected_cost_matrix_max_normalised.txt\"\"\"\n",
    "augmented_cm_filename = cm_filename.replace(\"cost_matrix\",\"augmented_cost_matrix\")\n",
    "cm = np.loadtxt(cm_filename)\n",
    "augmented_cm = np.zeros((I+J,I+J),dtype='float32')\n",
    "for i in range(I):\n",
    "    for j in range(J):\n",
    "        augmented_cm[i,I+j] = cm[i,j]\n",
    "        augmented_cm[I+j,i] = cm[i,j]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "881710ce",
   "metadata": {},
   "outputs": [],
   "source": [
    "augmented_table = np.zeros((I+J,I+J),dtype='int32')\n",
    "for i in range(I):\n",
    "    for j in range(J):\n",
    "        augmented_table[i,I+j] = table[i,j]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6a31673b",
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots(figsize=(15,10))\n",
    "plt.imshow(augmented_table, cmap=plt.cm.coolwarm, interpolation='nearest')\n",
    "# ax.set_title('Destination attraction',fontsize=16)\n",
    "ax.set_ylabel('Nodes',fontsize=16)\n",
    "ax.set_xlabel('Nodes',fontsize=16)\n",
    "ax.set_xticks(range(I+J),origin_geometry_ids+destination_geometry_ids)\n",
    "ax.set_yticks(range(I+J),origin_geometry_ids+destination_geometry_ids)\n",
    "plt.colorbar(fraction=0.046, pad=0.04)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fe472146",
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots(figsize=(15,10))\n",
    "plt.imshow(adjacency_matrix, cmap=plt.cm.coolwarm, interpolation='nearest')\n",
    "# ax.set_title('Destination attraction',fontsize=16)\n",
    "ax.set_ylabel('Nodes',fontsize=16)\n",
    "ax.set_xlabel('Nodes',fontsize=16)\n",
    "ax.set_xticks(range(I+J),origin_geometry_ids+destination_geometry_ids)\n",
    "ax.set_yticks(range(I+J),origin_geometry_ids+destination_geometry_ids)\n",
    "plt.colorbar(fraction=0.046, pad=0.04)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d2cae434",
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots(figsize=(15,10))\n",
    "plt.imshow(augmented_cm, cmap=plt.cm.coolwarm, interpolation='nearest')\n",
    "# ax.set_title('Destination attraction',fontsize=16)\n",
    "ax.set_ylabel('Nodes',fontsize=16)\n",
    "ax.set_xlabel('Nodes',fontsize=16)\n",
    "ax.set_xticks(range(I+J),origin_geometry_ids+destination_geometry_ids)\n",
    "ax.set_yticks(range(I+J),origin_geometry_ids+destination_geometry_ids)\n",
    "plt.colorbar(fraction=0.046, pad=0.04)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6981edd5",
   "metadata": {},
   "outputs": [],
   "source": [
    "np.savetxt(os.path.join(f'../data/inputs/{dataset}/adjacency_matrix.txt'),adjacency_matrix.values,fmt='%i')\n",
    "np.savetxt(os.path.join(f'../data/inputs/{dataset}/augmented_table_lsoas_to_msoas.txt'),augmented_table)\n",
    "np.savetxt(augmented_cm_filename,augmented_cm)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "46807365",
   "metadata": {},
   "outputs": [],
   "source": [
    "ten_percent_train_cells = np.loadtxt('../data/inputs/cambridge_work_commuter_lsoas_to_msoas/constraints/cell_constraints_permuted_size_90_cell_percentage_10_constrained_axes_0_1_seed_1234.txt')\n",
    "twenty_percent_train_cells = np.loadtxt('../data/inputs/cambridge_work_commuter_lsoas_to_msoas/constraints/cell_constraints_permuted_size_179_cell_percentage_20_constrained_axes_0_1_seed_1234.txt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bd641d41",
   "metadata": {},
   "outputs": [],
   "source": [
    "augmented_test_cells = []\n",
    "augmented_ten_percent_train_cells = []\n",
    "augmented_twenty_percent_train_cells = []\n",
    "for i in range(I):\n",
    "    for j in range(J):\n",
    "        augmented_test_cells.append([i,I+j])\n",
    "        if ((i,j) in set(list(map(tuple,ten_percent_train_cells.astype('int32').tolist())))):\n",
    "            augmented_ten_percent_train_cells.append([i,I+j])\n",
    "        if ((i,j) in set(list(map(tuple,twenty_percent_train_cells.astype('int32').tolist())))):\n",
    "            augmented_twenty_percent_train_cells.append([i,I+j])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7ca511ff",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of origins, destinations used in training\n",
    "print(\"10% cell constrained \\n\",\n",
    "      f\"origins: {len(set(np.array(augmented_ten_percent_train_cells)[:,0]))} \\n\",\n",
    "      f\"destinations: {len(set(np.array(augmented_ten_percent_train_cells)[:,1]))}\")\n",
    "print(\"20% cell constrained \\n\",\n",
    "      f\"origins: {len(set(np.array(augmented_twenty_percent_train_cells)[:,0]))} \\n\",\n",
    "      f\"destinations: {len(set(np.array(augmented_twenty_percent_train_cells)[:,1]))}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "be0ab53d",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_cell_matrix = np.zeros((I+J,I+J),dtype='int32')\n",
    "augmented_ten_percent_train_cell_matrix = np.zeros((I+J,I+J),dtype='int32')\n",
    "augmented_twenty_percent_train_cell_matrix = np.zeros((I+J,I+J),dtype='int32')\n",
    "for c in augmented_test_cells:\n",
    "    test_cell_matrix[tuple(c)] = 1\n",
    "for c in augmented_ten_percent_train_cells:\n",
    "    augmented_ten_percent_train_cell_matrix[tuple(c)] = 1\n",
    "for c in augmented_twenty_percent_train_cells:\n",
    "    augmented_twenty_percent_train_cell_matrix[tuple(c)] = 1\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3e42dfdd",
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots(figsize=(15,10))\n",
    "plt.imshow(test_cell_matrix, cmap=plt.cm.coolwarm, interpolation='nearest')\n",
    "ax.set_ylabel('Nodes',fontsize=16)\n",
    "ax.set_xlabel('Nodes',fontsize=16)\n",
    "ax.set_xticks(range(I+J),origin_geometry_ids+destination_geometry_ids)\n",
    "ax.set_yticks(range(I+J),origin_geometry_ids+destination_geometry_ids)\n",
    "plt.colorbar(fraction=0.046, pad=0.04)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f786b86f",
   "metadata": {},
   "outputs": [],
   "source": [
    "np.savetxt(\n",
    "    os.path.join(\n",
    "        f'../data/inputs/{dataset}/constraints/augmented_test_cells.txt'\n",
    "    ),np.array(augmented_test_cells)\n",
    ")\n",
    "np.savetxt(\n",
    "    os.path.join(\n",
    "        f'../data/inputs/{dataset}/constraints/augmented_train_cells_permuted_size_90_cell_percentage_10_constrained_axes_0_1_seed_1234.txt'\n",
    "    ),np.array(augmented_ten_percent_train_cells)\n",
    ")\n",
    "np.savetxt(\n",
    "    os.path.join(\n",
    "        f'../data/inputs/{dataset}/constraints/augmented_train_cells_permuted_size_179_cell_percentage_20_constrained_axes_0_1_seed_1234.txt'\n",
    "    ),np.array(augmented_twenty_percent_train_cells)\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d6daa54e",
   "metadata": {},
   "source": [
    "# Visualise table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4e3cb7b9",
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots(figsize=(5,10))\n",
    "plt.imshow(destination_number_of_jobs[:,np.newaxis], cmap=plt.cm.coolwarm, interpolation='nearest')\n",
    "# ax.set_title('Destination attraction',fontsize=16)\n",
    "ax.set_ylabel('Destinations',fontsize=16)\n",
    "ax.set_yticks(range(J),destination_geometry_ids)\n",
    "ax.set_xticks([])\n",
    "for j in range(J):\n",
    "    text = ax.text(0, j, j, ha=\"center\", va=\"center\", color=\"black\")\n",
    "plt.colorbar(fraction=0.046, pad=0.04)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "90a58062",
   "metadata": {},
   "source": [
    "### Normalise data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "486658c9",
   "metadata": {},
   "outputs": [],
   "source": [
    "rowsums = table.sum(axis=1)\n",
    "colsums = table.sum(axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "36087005",
   "metadata": {},
   "outputs": [],
   "source": [
    "destination_attraction_normalisation_factor = 10000\n",
    "destination_attraction_normalisation = f'_sum_normalised'\n",
    "#f'_factor_{destination_attraction_normalisation_factor}_normalised'\n",
    "#'_sum_normalised'\n",
    "#'_max_normalised'\n",
    "#'_factor_{normalisation_factor}_normalised'\n",
    "\n",
    "origin_demand_normalisation_factor = 1\n",
    "origin_demand_normalisation = f'_sum_normalised'\n",
    "#'_sum_normalised'\n",
    "#'_max_normalised'\n",
    "#'_factor_{origin_demand_normalisation_factor}_normalised'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dfd4214b",
   "metadata": {},
   "outputs": [],
   "source": [
    "origin_sizes = deepcopy(geometries.loc[(geometries.geometry_type==origin_geometry_name),['geometry_id','origin_demand']])\n",
    "origin_sizes['origin_demand'] = normalise_data(\n",
    "    origin_sizes['origin_demand'],\n",
    "    origin_demand_normalisation,\n",
    "    origin_demand_normalisation_factor\n",
    ")\n",
    "\n",
    "destination_attraction = deepcopy(geometries.loc[(geometries.geometry_type==destination_geometry_name),['geometry_id','number_of_jobs']])\n",
    "destination_attraction['number_of_jobs'] = normalise_data(\n",
    "    destination_attraction['number_of_jobs'],\n",
    "    destination_attraction_normalisation,\n",
    "    destination_attraction_normalisation_factor\n",
    ")\n",
    "destination_attraction['number_of_jobs'] = destination_attraction['number_of_jobs'].astype('float32')\n",
    "# destination_attraction['number_of_jobs'] = np.log(destination_attraction['number_of_jobs'])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a0febcd8",
   "metadata": {},
   "source": [
    "### True kappa and delta\n",
    "\n",
    "$$\\kappa = \\frac{\\sum_i O_i+\\delta M}{\\sum_j W_j}$$\n",
    "$$\\delta = \\kappa W_{min}$$"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1fdb098b",
   "metadata": {},
   "outputs": [],
   "source": [
    "total_w = destination_attraction['number_of_jobs'].values.sum()\n",
    "min_w = 0.0#np.min(np.exp(destination_attraction['number_of_jobs'].values))\n",
    "total_o = origin_sizes['origin_demand'].values.sum()\n",
    "M = destination_attraction.shape[1]\n",
    "# Compute kappa, delta\n",
    "kappa = total_o / (total_w - min_w*M)\n",
    "delta = kappa * min_w"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "45aeef59",
   "metadata": {},
   "outputs": [],
   "source": [
    "def kappa_from_delta(d):\n",
    "    return d / min_w, (total_o+d*M)/total_w\n",
    "def delta_from_kappa(k):\n",
    "    return kappa*min_w,(total_w*k-total_o)/M"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "83c0dca2",
   "metadata": {},
   "outputs": [],
   "source": [
    "total_w,total_o"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d0f50e9e",
   "metadata": {},
   "outputs": [],
   "source": [
    "kappa,delta"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e1fc26d4",
   "metadata": {},
   "outputs": [],
   "source": [
    "kappa_from_delta(0.001642710997442455)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c90ff3da",
   "metadata": {},
   "outputs": [],
   "source": [
    "delta_from_kappa(1.1009130103981741)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2a00316d",
   "metadata": {},
   "source": [
    "### Export data to file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e6a5751e",
   "metadata": {},
   "outputs": [],
   "source": [
    "np.savetxt(f'../data/inputs/{dataset}/table_{geometry_name}.txt',table)\n",
    "np.savetxt(f'../data/inputs/{dataset}/rowsums_{geometry_name}.txt',rowsums)\n",
    "np.savetxt(f'../data/inputs/{dataset}/colsums_{geometry_name}.txt',colsums)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ad35b8ca",
   "metadata": {},
   "outputs": [],
   "source": [
    "np.savetxt(os.path.join(f'../data/inputs/{dataset}/origin_demand{origin_demand_normalisation}.txt'),origin_sizes['origin_demand'].values)\n",
    "np.savetxt(os.path.join(f'../data/inputs/{dataset}/log_destination_attraction{destination_attraction_normalisation}.txt'),np.log(destination_attraction['number_of_jobs'].values))\n",
    "np.savetxt(os.path.join(f'../data/inputs/{dataset}/destination_attraction_time_series{destination_attraction_normalisation}.txt'),destination_attraction['number_of_jobs'].values[:,np.newaxis])\n",
    "\n",
    "np.save(os.path.join(f'../data/inputs/{dataset}/region_features.npy'),region_features.values)\n",
    "np.save(os.path.join(f'../data/inputs/{dataset}/adjacency_matrix.npy'),adjacency_matrix.values)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "fa7de216",
   "metadata": {},
   "source": [
    "### Sample facilities for which shortest path will be computed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c7f72793",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of facilities per geographical unit\n",
    "n_facilities = None #20\n",
    "random_seed = 1234 #None\n",
    "facility_sample_name = f'sample_{n_facilities}' if (n_facilities is not None) else 'all'\n",
    "facility_name = 'facilities'\n",
    "import_sample = True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1dc80b53",
   "metadata": {},
   "outputs": [],
   "source": [
    "if not import_sample:\n",
    "    # Fix seed \n",
    "    np.random.seed(random_seed)\n",
    "\n",
    "    # Sample facilities in each geometry\n",
    "    home_ids = np.array([])\n",
    "    work_ids = np.array([])\n",
    "    for i in tqdm(range(I)):\n",
    "        # If no home facilities exist compute shortest path to geometry centroid\n",
    "        if home_locs[home_locs.geometry_id==origin_geometry_ids[i]].size == 0:\n",
    "            print('No home facilities found in',origin_geometry_ids[i])\n",
    "            home_ids = np.append(home_ids,origin_geometry_ids[i])\n",
    "            new_row = pd.DataFrame.from_dict({'facility_id':[origin_geometry_ids[i]],'geometry_id':[origin_geometry_ids[i]],'geometry':geometries[geometries.geometry_id == origin_geometry_ids[i]].centroid})\n",
    "            home_locs = pd.concat([home_locs, new_row], axis=0, ignore_index=True)\n",
    "        # If there are less home facilities than sample size take all faciities as origins\n",
    "        elif (n_facilities is None) or (home_locs[home_locs.geometry_id==origin_geometry_ids[i]].shape[0] < n_facilities):\n",
    "    #         print('Getting all home facilities found in',geometry_ids[i])\n",
    "            home_ids = np.append(home_ids,sorted(home_locs[home_locs.geometry_id==origin_geometry_ids[i]].facility_id.values))\n",
    "        # Sample n_facilities \n",
    "        else:\n",
    "            home_ids = np.append(home_ids,sorted(home_locs[home_locs.geometry_id==origin_geometry_ids[i]].sample(n_facilities).facility_id.values))\n",
    "    for j in tqdm(range(J)):\n",
    "        # If no work facilities exist compute shortest path to centroid\n",
    "        if work_locs[work_locs.geometry_id==destination_geometry_ids[j]].size == 0:\n",
    "            print('No work facilities found in',destination_geometry_ids[j])\n",
    "            work_ids = np.append(work_ids,destination_geometry_ids[j])\n",
    "            new_row = pd.DataFrame.from_dict({'facility_id':[destination_geometry_ids[j]],'geometry_id':[destination_geometry_ids[j]],'geometry':geometries[geometries.geometry_id == destination_geometry_ids[j]].centroid})\n",
    "            work_locs = pd.concat([work_locs, new_row], axis=0, ignore_index=True)\n",
    "        # If there are less work facilities than sample size take all faciities as destinations\n",
    "        elif (n_facilities is None) or (work_locs[work_locs.geometry_id==destination_geometry_ids[j]].shape[0] < n_facilities):\n",
    "    #         print('Getting all work facilities found in',geometry_ids[j])\n",
    "            work_ids = np.append(work_ids,sorted(work_locs[work_locs.geometry_id==destination_geometry_ids[j]].facility_id.values))\n",
    "        # Sample n_facilities\n",
    "        else:\n",
    "            work_ids = np.append(work_ids,sorted(work_locs[work_locs.geometry_id==destination_geometry_ids[j]].sample(n_facilities).facility_id.values))\n",
    "\n",
    "    # Export to file\n",
    "    home_locs[home_locs.facility_id.isin(home_ids)].drop(columns=['centroid'], errors='ignore').to_file(\n",
    "            f'../data/inputs/{dataset}/{facility_sample_name}_home_facilities_seed_{random_seed}.geojson',\n",
    "            driver=\"GeoJSON\")\n",
    "    work_locs[work_locs.facility_id.isin(work_ids)].drop(columns=['centroid'], errors='ignore').to_file(\n",
    "            f'../data/inputs/{dataset}/{facility_sample_name}_work_facilities_seed_{random_seed}.geojson',\n",
    "            driver=\"GeoJSON\")\n",
    "\n",
    "    print(len(home_ids)*len(work_ids), 'facility pairs sampled')\n",
    "else:\n",
    "    home_locs_sampled = gpd.read_file(f'../data/inputs/{dataset}/{facility_sample_name}_home_facilities_seed_{random_seed}.geojson')\n",
    "    home_ids = np.unique(home_locs_sampled.facility_id.values)\n",
    "    work_locs_sampled = gpd.read_file(f'../data/inputs/{dataset}/{facility_sample_name}_work_facilities_seed_{random_seed}.geojson')\n",
    "    work_ids = np.unique(work_locs_sampled.facility_id.values)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1ffb97d1",
   "metadata": {},
   "source": [
    "## K-means clustering of facilities"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fc60fcd6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of k-means clusters fitted per origin / destination geometry\n",
    "n_origin_clusters = 20#22\n",
    "n_destination_clusters = 20#21\n",
    "# Clustered sample must be taken from entire population of facilities\n",
    "assert n_facilities is None\n",
    "# Labels used to name sample-specific outputs (e.g. edge-correction columns)\n",
    "facility_sample_name = f'sample_{n_origin_clusters}x{n_destination_clusters}'\n",
    "facility_name = 'clustered_facilities'\n",
    "\n",
    "# If True, read a previously exported clustered sample instead of re-sampling\n",
    "import_sample = True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bf313019",
   "metadata": {},
   "outputs": [],
   "source": [
    "def _sample_clustered_facilities(locs, geometry_ids, n_clusters, label):\n",
    "    \"\"\"Sample facilities in every geometry, one per k-means cluster.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    locs : GeoDataFrame of facilities with `facility_id` and `geometry_id` columns.\n",
    "    geometry_ids : ids of the geometries to sample in (processed in order).\n",
    "    n_clusters : number of k-means clusters fitted inside each geometry.\n",
    "    label : 'home' or 'work'; only used in the log message.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    (locs, sampled_ids) : `locs` with one extra centroid row per geometry that\n",
    "    had no facilities (and a `cluster_id` column for clustered geometries),\n",
    "    and the sampled facility ids.\n",
    "    \"\"\"\n",
    "    sampled_ids = []\n",
    "    for geometry_id in tqdm(geometry_ids):\n",
    "        in_geometry = locs.geometry_id == geometry_id\n",
    "        n_found = int(in_geometry.sum())\n",
    "        if n_found == 0:\n",
    "            # No facilities exist: use the geometry centroid as a pseudo-facility\n",
    "            print(f'No {label} facilities found in', geometry_id)\n",
    "            sampled_ids = np.append(sampled_ids, geometry_id)\n",
    "            new_row = pd.DataFrame.from_dict({'facility_id':[geometry_id],\n",
    "                                              'geometry_id':[geometry_id],\n",
    "                                              'geometry':geometries[geometries.geometry_id == geometry_id].centroid})\n",
    "            locs = pd.concat([locs, new_row], axis=0, ignore_index=True)\n",
    "        elif n_found < n_clusters:\n",
    "            # Fewer facilities than clusters: take all facilities\n",
    "            sampled_ids = np.append(sampled_ids, sorted(locs[in_geometry].facility_id.values))\n",
    "        else:\n",
    "            # Fit k-means on the facility coordinates\n",
    "            facility_centroids = locs[in_geometry].centroid\n",
    "            kmeans = KMeans(n_clusters=n_clusters,\n",
    "                            random_state=random_seed).fit(np.column_stack([facility_centroids.x.values,\n",
    "                                                                           facility_centroids.y.values]))\n",
    "            # Add cluster labels to data\n",
    "            locs.loc[in_geometry, 'cluster_id'] = kmeans.labels_\n",
    "            # Sample one facility in each cluster (uses the global numpy random state)\n",
    "            sampled_ids = np.append(sampled_ids,\n",
    "                                    locs[in_geometry].groupby('cluster_id').sample(1).facility_id.values)\n",
    "    return locs, sampled_ids\n",
    "\n",
    "\n",
    "# Export and import must point to the same files. The import branch used to read from\n",
    "# '../../data/inputs' while the export wrote to '../data/inputs'; both now use '../data/inputs',\n",
    "# in line with the other cells of this notebook.\n",
    "# NOTE(review): confirm this is the correct data root for where the notebook lives.\n",
    "clustered_sample_dir = f'../data/inputs/{dataset}'\n",
    "home_sample_filename = f'{clustered_sample_dir}/clustered_sample_{n_origin_clusters}_home_facilities_seed_{random_seed}.geojson'\n",
    "work_sample_filename = f'{clustered_sample_dir}/clustered_sample_{n_destination_clusters}_work_facilities_seed_{random_seed}.geojson'\n",
    "\n",
    "if not import_sample:\n",
    "    # Fix seed\n",
    "    np.random.seed(random_seed)\n",
    "\n",
    "    # Sample origin facilities in each geometry\n",
    "    home_locs, home_ids = _sample_clustered_facilities(home_locs,\n",
    "                                                       [origin_geometry_ids[i] for i in range(I)],\n",
    "                                                       n_origin_clusters,\n",
    "                                                       'home')\n",
    "    # Sample destination facilities in each geometry\n",
    "    work_locs, work_ids = _sample_clustered_facilities(work_locs,\n",
    "                                                       [destination_geometry_ids[j] for j in range(J)],\n",
    "                                                       n_destination_clusters,\n",
    "                                                       'work')\n",
    "\n",
    "    # Export to file\n",
    "    home_locs[home_locs.facility_id.isin(home_ids)].drop(columns=['centroid'],errors='ignore').to_file(\n",
    "            home_sample_filename,\n",
    "            driver='GeoJSON')\n",
    "    work_locs[work_locs.facility_id.isin(work_ids)].drop(columns=['centroid'],errors='ignore').to_file(\n",
    "            work_sample_filename,\n",
    "            driver='GeoJSON')\n",
    "else:\n",
    "    # Read a previously exported sample\n",
    "    home_locs_sampled = gpd.read_file(home_sample_filename)\n",
    "    home_ids = np.unique(home_locs_sampled.facility_id.values)\n",
    "    work_locs_sampled = gpd.read_file(work_sample_filename)\n",
    "    work_ids = np.unique(work_locs_sampled.facility_id.values)\n",
    "\n",
    "print('origins',len(home_ids),'destinations',len(work_ids))\n",
    "print(len(home_ids)*len(work_ids), 'facility pairs sampled')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4be47e4f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load a precomputed cost matrix from a text file as float32\n",
    "# NOTE(review): the path hard-codes the dataset directory and the 20x20 sample name instead of\n",
    "# using `dataset` / `facility_sample_name` - confirm it matches the configuration above\n",
    "cost_matrix = np.loadtxt(\"../data/inputs/cambridge_work_commuter_lsoas_to_msoas/cost_matrices/clustered_facilities_sample_20x20_20_01_2023_sample_20x20_clustered_facilities_ripleys_k_500_euclidean_points%_prob_origin_destination_adjusted_normalised_boundary_only_edge_corrected_cost_matrix_sum_normalised.txt\",dtype='float32')\n",
    "\n",
    "# Disabled: export the matrix as a CSV labelled by lsoa (rows) and msoa (columns) ids\n",
    "# pd.DataFrame(\n",
    "#     cost_matrix,\n",
    "#     index=geometries.loc[(geometries.geometry_type=='lsoa'),'geometry_id'].values,\n",
    "#     columns=geometries.loc[(geometries.geometry_type=='msoa'),'geometry_id'].values\n",
    "# ).to_csv('../../NeuralABM/data/HarrisWilson/Cambridge_data/clustered_facilities_sample_20x20_euclidean_20_01_2023_clustered_facilities_sample_20x20_ripleys_k_1000_euclidean_destination_adjusted_normalised_boundary_only_edge_corrected_cost_matrix_sum_normalised.csv',index=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b19655d0",
   "metadata": {},
   "source": [
    "### View clustered sample"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "de36c84d",
   "metadata": {},
   "outputs": [],
   "source": [
    "fig,ax = plt.subplots(1,1,figsize=(15,15))\n",
    "# home_locs.plot(ax=ax)\n",
    "# home_locs[home_locs.facility_id.isin(home_ids)].plot(ax=ax,color='black',markersize=100)\n",
    "# geometries[geometries.geometry_type==origin_geometry_name].plot(ax=ax,facecolor='none',edgecolor='blue')\n",
    "# geometries[geometries.geometry_type==destination_geometry_name].plot(ax=ax,facecolor='none',edgecolor='red')\n",
    "\n",
    "# Outer boundary of all geometries; computed once because the unary union is expensive\n",
    "study_area_boundary = geometries.unary_union.boundary\n",
    "gpd.GeoDataFrame({'id':[0],'geometry':[study_area_boundary]},crs='EPSG:27700').plot(ax=ax,facecolor='none',edgecolor='black')\n",
    "# Destination geometries that touch the outer boundary, coloured by destination demand\n",
    "touches_study_area_boundary = geometries.intersects(study_area_boundary)\n",
    "geometries[(geometries.geometry_type==destination_geometry_name)&touches_study_area_boundary].plot(ax=ax,column='destination_demand')#facecolor='none',edgecolor='red')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9461b0bf",
   "metadata": {},
   "source": [
    "### Visualise facilities"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "69451bcd",
   "metadata": {},
   "outputs": [],
   "source": [
    "fig,ax = plt.subplots(1,1,figsize=(10,10))\n",
    "# Geometry outlines and their centroids\n",
    "geometries.plot(facecolor='none',ax=ax)\n",
    "geometries.centroid.plot(color='black',ax=ax)\n",
    "# Home facilities in blue, then work facilities in red\n",
    "for facility_activity,facility_colour in (('home','blue'),('work','red')):\n",
    "    cambridge_facilities[cambridge_facilities['main_activity'] == facility_activity].plot(color=facility_colour,ax=ax,aspect=1,markersize=1)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "edd80f20",
   "metadata": {},
   "source": [
    "# \n",
    "# \n",
    "# \n",
    "# Construct transportation network graph "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "72cff0c1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# pickle replaces nx.read_gpickle / nx.write_gpickle, which were removed in networkx 3.0\n",
    "# (they were thin wrappers around pickle, so existing .gpickle files remain readable)\n",
    "import pickle\n",
    "\n",
    "import_graph = True\n",
    "\n",
    "graph_filename = f'../data/raw/cambridge_commuter/{geometry_name}_graph.gpickle'\n",
    "graph_nodes_filename = f'../data/raw/cambridge_commuter/{geometry_name}_graph_nodes_network_and_origin_destination.geojson'\n",
    "\n",
    "\n",
    "def _link_to_nearest_network_node(graph, tree, network_node_ids, nodes):\n",
    "    \"\"\"Connect every node in `nodes` to its nearest network vertex.\n",
    "\n",
    "    `nodes` maps node id -> [(x, y), node_type]. `tree` is a KDTree built on the\n",
    "    network vertex coordinates, in the same order as `network_node_ids`.\n",
    "    A pair of opposite 'walk' edges is added, with weight, capacity and length\n",
    "    all set to the distance to the nearest network vertex.\n",
    "    \"\"\"\n",
    "    for node,(coordinates,_) in tqdm(nodes.items(),total=len(nodes)):\n",
    "        # Get nearest network vertex and its distance\n",
    "        nearest_dist, nearest_ind = tree.query(coordinates, k=1)\n",
    "        network_node = network_node_ids[nearest_ind]\n",
    "        link_attributes = dict(mode='walk',\n",
    "                               weight=float(nearest_dist),\n",
    "                               capacity=float(nearest_dist),\n",
    "                               length=float(nearest_dist))\n",
    "        # Add artificial links in both directions\n",
    "        graph.add_edge(node, network_node, **link_attributes)\n",
    "        graph.add_edge(network_node, node, **link_attributes)\n",
    "\n",
    "\n",
    "if import_graph:\n",
    "\n",
    "    # Read graph pickle from file\n",
    "    # NOTE: unpickling can execute arbitrary code - only load graph files written by this notebook\n",
    "    with open(graph_filename,'rb') as graph_file:\n",
    "        graph = pickle.load(graph_file)\n",
    "\n",
    "    network_and_origin_destination_vertices = gpd.read_file(graph_nodes_filename,index_col=0)\n",
    "    nodes_by_id = network_and_origin_destination_vertices.set_index('id')\n",
    "    # Split nodes by type into {node id: point geometry} dictionaries\n",
    "    # NOTE(review): here values are geometries, whereas the construction branch below stores\n",
    "    # [(x, y), node_type] lists - confirm downstream cells handle both\n",
    "    home_nodes, work_nodes, network_nodes = (\n",
    "        nodes_by_id.loc[nodes_by_id.node_type==node_type,['geometry']].to_dict()['geometry']\n",
    "        for node_type in ('home','work','network')\n",
    "    )\n",
    "\n",
    "    network_and_origin_destination_nodes = {**network_nodes,**home_nodes,**work_nodes}\n",
    "    facility_nodes = {**home_nodes,**work_nodes}\n",
    "else:\n",
    "\n",
    "    # Read network file\n",
    "    network_filename = f'../data/raw/cambridge_commuter/network.geojson'\n",
    "    network = gpd.read_file(network_filename)\n",
    "    network = network.set_crs('epsg:27700',allow_override=True)\n",
    "\n",
    "    # Extract edges and nodes (geom_type works on shapely 1.x and 2.x; `.type` is deprecated)\n",
    "    is_vertex = network.geometry.geom_type == 'Point'\n",
    "    vertices = network[is_vertex]\n",
    "    edges = network[~is_vertex]\n",
    "\n",
    "    # Node dictionaries: node id -> [(x, y), node_type]\n",
    "    network_nodes = {}\n",
    "    geography_nodes = {}\n",
    "    work_nodes = {}\n",
    "    home_nodes = {}\n",
    "\n",
    "    # Create graph from network\n",
    "    graph = nx.MultiDiGraph()\n",
    "\n",
    "    # Classify facility points into homes and workplaces and add them as graph nodes\n",
    "    for _,fac in tqdm(cambridge_facilities.iterrows(),total=cambridge_facilities.shape[0]):\n",
    "        if fac.main_activity == 'home':\n",
    "            home_nodes[fac.facility_id] = [(fac.geometry.x,fac.geometry.y),'home']\n",
    "        elif fac.main_activity == 'work':\n",
    "            work_nodes[fac.facility_id] = [(fac.geometry.x,fac.geometry.y),'work']\n",
    "        else:\n",
    "            # A bare `raise` has no active exception here; fail with an explicit message instead\n",
    "            raise ValueError(f'Unexpected main_activity {fac.main_activity!r} for facility {fac.facility_id}')\n",
    "        graph.add_node(fac.facility_id,pos=(fac.geometry.x,fac.geometry.y),activity=fac.main_activity)\n",
    "\n",
    "    # Store all network vertices\n",
    "    for _,v in tqdm(vertices.iterrows(),total=vertices.shape[0]):\n",
    "        network_nodes[v.id] = [(v.geometry.x,v.geometry.y),'network']\n",
    "        graph.add_node(v.id,pos=(v.geometry.x,v.geometry.y),activity='other')\n",
    "\n",
    "    # Add geography centroids as nodes in graph\n",
    "    for _,g in tqdm(geometries.iterrows(),total=geometries.shape[0]):\n",
    "        geography_nodes[g.geometry_id] = [(g.geometry.centroid.x,g.geometry.centroid.y),'geography']\n",
    "        graph.add_node(g.geometry_id,pos=(g.geometry.centroid.x,g.geometry.centroid.y),activity='other')\n",
    "\n",
    "    # Merge nodes into groups\n",
    "    network_and_origin_destination_nodes = {**network_nodes,**home_nodes,**work_nodes}\n",
    "    facility_nodes = {**home_nodes,**work_nodes}\n",
    "\n",
    "    # Export network and origin/destination vertices\n",
    "    network_and_origin_destination_vertices = pd.DataFrame.from_dict(network_and_origin_destination_nodes,\n",
    "                                                                    orient='index',\n",
    "                                                                    columns=['geometry','node_type']).reset_index().rename(columns={'index': 'id'})\n",
    "    network_and_origin_destination_vertices.geometry = network_and_origin_destination_vertices.geometry.apply(Point)\n",
    "    network_and_origin_destination_vertices = gpd.GeoDataFrame(network_and_origin_destination_vertices,crs='epsg:27700')\n",
    "    network_and_origin_destination_vertices.set_index('id').to_file(graph_nodes_filename,index=True)\n",
    "\n",
    "    # Construct a spatial tree to find nearest point in graph;\n",
    "    # network_node_ids[k] is the id of the k-th point in the tree\n",
    "    network_node_ids = list(network_nodes)\n",
    "    tree = KDTree([coordinates for coordinates,_ in network_nodes.values()])\n",
    "\n",
    "    # Store all network edges in networkx object (one edge per transport mode)\n",
    "    for _,e in tqdm(edges.iterrows(),total=edges.shape[0]):\n",
    "        for mode in e['modes'].split(','):\n",
    "            graph.add_edge(e['fromNode'],\n",
    "                           e['toNode'],\n",
    "                           mode=mode,\n",
    "                           weight=float(e['length']),\n",
    "                           capacity=float(e['capacity']),\n",
    "                           length=float(e['length']))\n",
    "\n",
    "    # For every geography centroid add an edge to the closest vertex on graph\n",
    "    _link_to_nearest_network_node(graph, tree, network_node_ids, geography_nodes)\n",
    "    # For every facility add an edge to the closest vertex on graph\n",
    "    _link_to_nearest_network_node(graph, tree, network_node_ids, facility_nodes)\n",
    "\n",
    "    # Remove self loops\n",
    "    graph.remove_edges_from(list(nx.selfloop_edges(graph, keys=True)))\n",
    "\n",
    "    # Write to file\n",
    "    with open(graph_filename,'wb') as graph_file:\n",
    "        pickle.dump(graph, graph_file, protocol=pickle.HIGHEST_PROTOCOL)\n",
    "\n",
    "# Get all edge data\n",
    "# edges_gdf = [[(e[0]+'_'+e[1]),LineString([Point(graph.nodes[e[0]]['pos']),Point(graph.nodes[e[1]]['pos'])]),e[2]['mode'],e[2]['weight'],e[2]['length']] for e in graph.edges(data=True)]\n",
    "# edges_gdf = gpd.GeoDataFrame(pd.DataFrame(edges_gdf,columns=['id','geometry','mode','weight','length']))"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "92907150",
   "metadata": {},
   "source": [
    "### Visualise transportation network graph"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "62fe3ba3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# # Get modal subgraph(s)\n",
    "# selected_edges = [(u,v,e['mode']) for u,v,e in graph.edges(data=True) if e['mode'] in ['bus','car','walk'] ]\n",
    "# H = graph.edge_subgraph(selected_edges)\n",
    "# pos = nx.get_node_attributes(H, 'pos')\n",
    "\n",
    "# origin_node,destination_node = 'home_72000256','work_315112358'\n",
    "\n",
    "# # Find shortest path between two nodes\n",
    "# spath = nx.shortest_path(graph,origin_node,destination_node)\n",
    "# H2 = graph.subgraph(spath)\n",
    "# spath_edges = list(zip(spath,spath[1:]))\n",
    "# pos2 = nx.get_node_attributes(H2, 'pos')\n",
    "\n",
    "# # Get bounding box of selected nodes\n",
    "# bbox = box(*MultiPoint(list([x[1] for x in H2.nodes(data=\"pos\")])).bounds, ccw=True)\n",
    "# xmin,ymin,xmax,ymax = bbox.bounds\n",
    "# # Find all nodes within that box\n",
    "# relevant_nodes = [n[0] for n in H2.nodes(data=\"pos\") if bbox.contains(Point(n[1]))]\n",
    "# new_nodes = list(set(relevant_nodes)-set(spath))\n",
    "# # Get all relevant edges\n",
    "# H3 = graph.subgraph(relevant_nodes)\n",
    "# # Get position\n",
    "# pos3 = nx.get_node_attributes(H3, 'pos')\n",
    "\n",
    "# # Remove all home/work nodes except 2 above\n",
    "# removed_nodes = [node for node in H.nodes if (node.startswith('work') or node.startswith('home'))]\n",
    "# removed_nodes = list(set(removed_nodes)-set([origin_node,destination_node]))\n",
    "# # Remove edges\n",
    "# kept_edges = []\n",
    "# for u,v,e in tqdm(selected_edges):\n",
    "#     if not (u in removed_nodes or v in removed_nodes):\n",
    "#         kept_edges.append((u,v))\n",
    "# # Remove all such nodes\n",
    "# H = H.copy()\n",
    "# H.remove_nodes_from(removed_nodes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eff50b79",
   "metadata": {},
   "outputs": [],
   "source": [
    "# fig,ax = plt.subplots(1,1,figsize=(20,20))\n",
    "\n",
    "# # convert lat and lon to map projection\n",
    "# mx,my=m([xminr,xmaxr],[yminr,ymaxr])\n",
    "\n",
    "# nx.draw_networkx_nodes(H,\n",
    "#                        nx.get_node_attributes(H, 'pos'),\n",
    "#                        nodelist=H.nodes,\n",
    "#                        node_color='blue',\n",
    "#                        node_size=1,\n",
    "#                        ax=ax)\n",
    "\n",
    "# nx.draw_networkx_edges(H,\n",
    "#                        nx.get_node_attributes(H, 'pos'),\n",
    "#                        edgelist=kept_edges,\n",
    "#                        edge_color='black',\n",
    "#                        width=1,\n",
    "#                        arrows=False,\n",
    "#                        arrowsize=14,\n",
    "#                        arrowstyle='-|>', \n",
    "#                        alpha=0.4,\n",
    "#                        ax=ax)\n",
    "\n",
    "# # Draw shortest path\n",
    "# nx.draw_networkx_nodes(H2,pos2,nodelist=spath,node_color='r',node_size=2)\n",
    "# nx.draw_networkx_edges(H2,pos2,edgelist=spath_edges,edge_color='r',width=3)\n",
    "\n",
    "# # Plot bounding box\n",
    "# ax.plot([xmin-pad,xmax+pad,xmax+pad,xmin-pad,xmin-pad],[ymin-pad,ymin-pad,ymax+pad,ymax+pad,ymin-pad],color='black')\n",
    "\n",
    "# # Limit x axis\n",
    "# plt.xlim(xmin-pad,xmax+pad)\n",
    "# plt.ylim(ymin-pad,ymax+pad)\n",
    "\n",
    "# # Add basemap\n",
    "# cx.add_basemap(ax, crs=edges.crs, source=cx.providersStamen.Watercolor)\n",
    "\n",
    "\n",
    "# plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "16ebbd31",
   "metadata": {},
   "source": [
    "# \n",
    "# \n",
    "# \n",
    "# Compute/import edge effect corrections"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c40f5b0f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import_edge_corrections = True\n",
    "\n",
    "if not import_edge_corrections:\n",
    "    # Start from a copy of the geometry table; correction columns are added by the cells below\n",
    "    edge_corrections = deepcopy(geometries[['geometry_id','geometry','geometry_type']])\n",
    "else:\n",
    "    # Read previously exported corrections and give them the CRS of the geometries\n",
    "    edge_corrections = gpd.read_file(f'../data/inputs/{dataset}/edge_corrections.geojson')\n",
    "    edge_corrections.crs = geometries.crs"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "81526fb9",
   "metadata": {},
   "source": [
    "### Method 1: Boundary geography centroid enclosed angles "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "81176a09",
   "metadata": {},
   "outputs": [],
   "source": [
    "if not import_edge_corrections:\n",
    "    # Compute percentage of angle enclosed in boundary\n",
    "    # NOTE(review): compute_centroid_boundary_enclosed_angle is not defined or imported in the visible cells\n",
    "    # (the gensit imports in the first cell are commented out) - confirm where it comes from\n",
    "    angle_covered = compute_centroid_boundary_enclosed_angle(geometries[geometries.geometry_type==destination_geometry_name])\n",
    "    # Pass it to geometries df\n",
    "    # (each geometry_id is mapped through angle_covered and stored in the 'angle_covered' column)\n",
    "    edge_corrections.loc[:,'angle_covered'] = edge_corrections.loc[:,'geometry_id'].map(angle_covered)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1c2e1bcb",
   "metadata": {},
   "source": [
    "### Visualise method 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e917e9cb",
   "metadata": {},
   "outputs": [],
   "source": [
    "fig,ax = plt.subplots(1,1,figsize=(10,10))\n",
    "# Vertices of the convex hull spanned by the geometry centroids\n",
    "ps = list(MultiPoint(geometries.centroid).convex_hull.boundary.coords)\n",
    "xx = [p[0] for p in ps]\n",
    "yy = [p[1] for p in ps]\n",
    "# Geometry outlines, then geometries coloured by number of jobs, then centroids\n",
    "geometries.plot(facecolor='none',ax=ax)\n",
    "geometries.plot(column='number_of_jobs',\n",
    "                cmap='RdYlGn',\n",
    "                legend=True,\n",
    "                legend_kwds={'fraction':0.046,'pad':0.04},\n",
    "                ax=ax)\n",
    "geometries.centroid.plot(color='black',ax=ax)\n",
    "# Convex hull outline\n",
    "ax.plot(xx,yy)\n",
    "# cambridge_facilities[cambridge_facilities['main_activity'] == 'home'].plot(color='blue',ax=ax,aspect=1,markersize=1)\n",
    "# cambridge_facilities[cambridge_facilities['main_activity'] == 'work'].plot(color='red',ax=ax,aspect=1,markersize=1)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7d8a2103",
   "metadata": {},
   "source": [
    "### Method 2: Compute angle covered by outermost destination zones located on the convex hull"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6a6ba4f7",
   "metadata": {},
   "outputs": [],
   "source": [
    "if not import_edge_corrections:\n",
    "    # Get all facility locations from sampling pool\n",
    "    facility_location_sample_pool = gpd.GeoDataFrame( pd.concat( [home_locs,work_locs], ignore_index=True) )\n",
    "    # Get all sampled facilities\n",
    "    sampled_locs = facility_location_sample_pool[facility_location_sample_pool.facility_id.isin(np.append(home_ids,work_ids))]\n",
    "    # Get all points in convex hull of sampled facilities\n",
    "    convex_hull_boundary_points = MultiPoint(sampled_locs.geometry.values).convex_hull.boundary.coords\n",
    "\n",
    "    # Collect all boundary facility ids.\n",
    "    # geom_equals_exact with tolerance 0.5e-6 is equivalent to the deprecated\n",
    "    # geom_almost_equals with its default of 6 decimal places.\n",
    "    boundary_facility_ids = [\n",
    "        sampled_locs[sampled_locs.geometry.geom_equals_exact(Point(chp), tolerance=0.5e-6)].facility_id.values[0]\n",
    "        for chp in convex_hull_boundary_points\n",
    "    ]\n",
    "    # Collect all boundary work facilities (facility ids prefixed with 'work')\n",
    "    boundary_work_facilities = facility_location_sample_pool[facility_location_sample_pool.facility_id.isin([wid for wid in boundary_facility_ids if wid.startswith('work')])]\n",
    "    # Compute convex hull of collected work facilities\n",
    "    boundary_work_facilities_boundary = MultiPoint(boundary_work_facilities.geometry.values).convex_hull.boundary.coords\n",
    "\n",
    "\n",
    "    # Compute destination facility boundary adjustment\n",
    "    sampled_facility_angles_covered = compute_sampled_facility_boundary_enclosed_angle(\n",
    "        home_locs[home_locs.facility_id.isin(home_ids)],\n",
    "        boundary_work_facilities,\n",
    "        statistic='max'\n",
    "    )\n",
    "    # Merge them into non-adjusted destinations (default angle covered is 1)\n",
    "    # NOTE(review): this uses `geometry_ids` whereas Method 3 uses `destination_geometry_ids` - confirm which is intended\n",
    "    all_facilities_angles_covered = dict(zip(geometry_ids,np.ones(len(geometry_ids))))\n",
    "    all_facilities_angles_covered.update(sampled_facility_angles_covered)\n",
    "    sampled_facility_angles_covered = all_facilities_angles_covered\n",
    "\n",
    "    # Pass it to geometries df\n",
    "    edge_corrections.loc[:,f'{facility_sample_name}_{facility_name}_max_angle_covered'] = edge_corrections.loc[:,'geometry_id'].map(sampled_facility_angles_covered)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "17b63845",
   "metadata": {},
   "source": [
    "### Visualise boundary work facilities"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4676419a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# fig,ax = plt.subplots(1,1,figsize=(30,20))\n",
    "# ax.set_title('Sampled facilities')\n",
    "# # boundary_work_facilities.plot(ax=ax,color='black',markersize=100)\n",
    "# # gpd.GeoDataFrame(index=[0], crs='epsg:27700', geometry=[MultiPoint(boundary_work_facilities.geometry.values).convex_hull]).plot(ax=ax,facecolor='none',edgecolor='green')\n",
    "\n",
    "# # edge_corrections.plot(column=f\"ripleys_k_{neighbourhood_method_name}_normalised\",cmap='RdYlGn',legend=True,legend_kwds={'fraction':0.046,'pad':0.04},ax=ax)\n",
    "# # home_locs[home_locs.facility_id.isin(home_ids)].geometry.plot(ax=ax,color='blue')\n",
    "# home_locs.plot(ax=ax,color='blue',markersize=5)\n",
    "# work_locs.plot(ax=ax,color='red',markersize=5)\n",
    "# geometries.plot(facecolor=\"none\",edgecolor='black',ax=ax)\n",
    "# plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fc74d965",
   "metadata": {},
   "source": [
    "### Method 3: Compute average angle covered based on facilities of nearby zones"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a28060c8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of cores to parallelise across\n",
    "# (passed as n_workers to the Method 3 neighbourhood angle computation)\n",
    "num_cores = 4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "45412c1b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Only recompute when a facility sample is configured and corrections are not imported from file\n",
    "if ((n_facilities is not None) or ((n_origin_clusters > 0) and (n_destination_clusters > 0))) and (not import_edge_corrections):\n",
    "\n",
    "    # Geometries touching the outer boundary of the study area;\n",
    "    # the unary union is expensive, so it is computed once for both geometry types\n",
    "    touches_study_area_boundary = geometries.intersects(geometries.unary_union.boundary)\n",
    "\n",
    "    # Angle computed from sampled facilities located in boundary polygons\n",
    "    facility_average_neighbour_angle = compute_sampled_facility_neighbourhood_enclosed_angle(\n",
    "        home_locs[(home_locs.geometry_id.isin(origin_boundary_polygon_ids)) & (home_locs.facility_id.isin(home_ids))],\n",
    "        work_locs[(work_locs.geometry_id.isin(destination_boundary_polygon_ids)) & (work_locs.facility_id.isin(work_ids))],\n",
    "        geometries[(geometries.geometry_type==origin_geometry_name)&touches_study_area_boundary],\n",
    "        geometries[(geometries.geometry_type==destination_geometry_name)&touches_study_area_boundary],\n",
    "        statistic = 'mean',\n",
    "        n_workers = num_cores\n",
    "    )\n",
    "\n",
    "    # Merge them into non-adjusted destinations (default angle covered is 1)\n",
    "    all_facilities_angles_covered = dict(zip(destination_geometry_ids,np.ones(len(destination_geometry_ids))))\n",
    "    all_facilities_angles_covered.update(facility_average_neighbour_angle)\n",
    "    facility_average_neighbour_angle = all_facilities_angles_covered\n",
    "\n",
    "    # Pass it to geometries df\n",
    "    edge_corrections.loc[:,f'{facility_sample_name}_{facility_name}_average_neighbourhood_angle_covered'] = edge_corrections.loc[:,'geometry_id'].map(facility_average_neighbour_angle)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "96ad539a",
   "metadata": {},
   "source": [
    "### Visualise boundary geometries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0db3f1d3",
   "metadata": {},
   "outputs": [],
   "source": [
    "fig,ax = plt.subplots(1,1,figsize=(15,15))\n",
    "ax.set_title('Sampled facilities')\n",
    "edge_corrections[edge_corrections.geometry_id.isin(origin_boundary_polygon_ids)].plot(column=f'{facility_sample_name}_{facility_name}_average_neighbourhood_angle_covered',\n",
    "                                                                          cmap='RdYlGn',\n",
    "                                                                          legend=True,\n",
    "                                                                          legend_kwds={'fraction':0.046,'pad':0.04},\n",
    "                                                                          ax=ax)\n",
    "home_locs[(home_locs.geometry_id.isin(origin_boundary_polygon_ids)) & (home_locs.facility_id.isin(home_ids))].plot(ax=ax,color='blue',markersize=5)\n",
    "work_locs[(work_locs.geometry_id.isin(destination_boundary_polygon_ids)) & (work_locs.facility_id.isin(work_ids))].plot(ax=ax,color='red',markersize=5)\n",
    "for gid in origin_boundary_polygon_ids:\n",
    "    plt.annotate(text=gid,xy=(geometries[geometries.geometry_id==gid].centroid.values[0].x,geometries[geometries.geometry_id==gid].centroid.values[0].y))\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dcc394e9",
   "metadata": {},
   "source": [
    "### Method 4: Ripley's k function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1250beff",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parallelisation parameters\n",
    "num_cores = 4\n",
    "buffer_radius = 500\n",
    "neighbourhood_method_name = 'euclidean'\n",
    "conditional_probabilty_method_name = 'points%'\n",
    "location_type = 'origin_destination'\n",
    "#'area%' 'points%'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0cd22bef",
   "metadata": {},
   "outputs": [],
   "source": [
    "if not import_edge_corrections:\n",
    "    # Add region geometry to origins\n",
    "    orig_locs = pd.merge(home_locs[home_locs.facility_id.isin(home_ids)],\n",
    "                        geometries[['geometry_id','geometry']].rename(columns={\"geometry\":\"region_geometry\"}),\n",
    "                        on='geometry_id',\n",
    "                        how='left')[['geometry','geometry_id','region_geometry','facility_id','main_activity']]\n",
    "    # Add region geometry to destinations\n",
    "    dest_locs = pd.merge(work_locs[work_locs.facility_id.isin(work_ids)],\n",
    "                        geometries[['geometry_id','geometry']].rename(columns={\"geometry\":\"region_geometry\"}),\n",
    "                        on='geometry_id',\n",
    "                        how='left')[['geometry','geometry_id','region_geometry','facility_id','main_activity']]\n",
    "    dest_locs.loc[dest_locs.facility_id.str.startswith('E'),'main_activity'] = 'work'\n",
    "\n",
    "\n",
    "    ripleys_k_edge_correction, ripleys_k_origin_responsibility,ripleys_k_destination_responsibility = apply_ripleys_k_edge_correction(\n",
    "        index=0,\n",
    "        aoi_locations=orig_locs,\n",
    "        geographies=geometries,\n",
    "        G=graph,\n",
    "        location_type=location_type,\n",
    "        radius=buffer_radius,\n",
    "        neighbourhood_method=neighbourhood_method_name,\n",
    "        conditional_probabilty_method=conditional_probabilty_method_name,\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e4144cdb",
   "metadata": {},
   "outputs": [],
   "source": [
    "if not import_edge_corrections:\n",
    "    # Pass it to geometries df\n",
    "    edge_corrections.loc[:,f'{facility_sample_name}_{facility_name}_ripleys_k_{buffer_radius}_{neighbourhood_method_name}_{conditional_probabilty_method_name}_prob_origin_adjusted'] = edge_corrections.loc[:,'geometry_id'].map({k:v['origin'] for k,v in ripleys_k_edge_correction.items()})\n",
    "    edge_corrections.loc[:,f'{facility_sample_name}_{facility_name}_ripleys_k_{buffer_radius}_{neighbourhood_method_name}_{conditional_probabilty_method_name}_prob_destination_adjusted'] = edge_corrections.loc[:,'geometry_id'].map({k:v['destination'] for k,v in ripleys_k_edge_correction.items()})\n",
    "\n",
    "    # Normalise it\n",
    "    ripleys_l_edge_correction_normalised = {k:{\"origin\":(np.pi*buffer_radius**2)/v['origin'],\n",
    "                                            \"destination\":(np.pi*buffer_radius**2)/v['destination']} \\\n",
    "                                            for k,v in ripleys_k_edge_correction.items()}\n",
    "\n",
    "    # Pass it to geometries df\n",
    "    edge_corrections.loc[:,f'{facility_sample_name}_{facility_name}_ripleys_k_{buffer_radius}_{neighbourhood_method_name}_{conditional_probabilty_method_name}_prob_origin_adjusted_normalised'] = edge_corrections.loc[:,'geometry_id'].map({k:v['origin'] for k,v in ripleys_l_edge_correction_normalised.items()})\n",
    "    edge_corrections.loc[:,f'{facility_sample_name}_{facility_name}_ripleys_k_{buffer_radius}_{neighbourhood_method_name}_{conditional_probabilty_method_name}_prob_destination_adjusted_normalised'] = edge_corrections.loc[:,'geometry_id'].map({k:v['destination'] for k,v in ripleys_l_edge_correction_normalised.items()})\n",
    "\n",
    "    # Merge home locations with their ripley responsibility\n",
    "    home_locs = pd.merge(home_locs,ripleys_k_origin_responsibility[['facility_id',f'ripleys_k']],on='facility_id',how='left')\n",
    "    work_locs = pd.merge(work_locs,ripleys_k_destination_responsibility[['facility_id',f'ripleys_k']],on='facility_id',how='left')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2d9418f2",
   "metadata": {},
   "source": [
    "### Visualise Ripley's k function adjustment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f8f2d695",
   "metadata": {},
   "outputs": [],
   "source": [
    "facility_name = 'clustered_facilities' #'facilities' # clustered_facilities\n",
    "facility_sample_name = 'sample_20x20' #'all' # sample_20x20\n",
    "conditional_probabilty_method_name = 'points%'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "62638e5c",
   "metadata": {},
   "outputs": [],
   "source": [
    "fig,ax = plt.subplots(1,1,figsize=(20,10))\n",
    "boundary_only = True\n",
    "\n",
    "if conditional_probabilty_method_name != \"\":\n",
    "    conditional_probabilty_method_name_suffix = \"_prob_\"\n",
    "else:\n",
    "    conditional_probabilty_method_name_suffix = \"\"\n",
    "col_name = f\"{facility_sample_name}_{facility_name}_ripleys_k_{buffer_radius}_{neighbourhood_method_name}_{conditional_probabilty_method_name}{conditional_probabilty_method_name_suffix}{location_type}_adjusted_normalised\"\n",
    "if boundary_only:\n",
    "    boundary_only = edge_corrections.geometry.intersects(edge_corrections.unary_union.exterior)\n",
    "else:\n",
    "    boundary_only = True\n",
    "\n",
    "ax.set_title(col_name.replace(\"_\",\" \").capitalize())\n",
    "edge_corrections[(edge_corrections.geometry_type==geom_name) & boundary_only ].plot(\n",
    "    column=col_name,\n",
    "    cmap='RdYlGn',\n",
    "    legend=True,\n",
    "    legend_kwds={'fraction':0.046,'pad':0.04},\n",
    "    ax=ax\n",
    ")\n",
    "if plot_origin:\n",
    "    home_locs[home_locs.facility_id.isin(home_ids) & home_locs.geometry_id.isin(origin_boundary_polygon_ids)].geometry.plot(ax=ax,color='blue')\n",
    "else:\n",
    "    work_locs[work_locs.facility_id.isin(work_ids) & work_locs.geometry_id.isin(destination_boundary_polygon_ids)].geometry.plot(ax=ax,color='blue')\n",
    "# home_locs[home_locs.facility_id == 'home_255522216'].geometry.plot(ax=ax,color='black',markersize=100)\n",
    "# home_locs[home_locs.facility_id == 'home_255522216'].buffer(buffer_radius).plot(ax=ax,edgecolor='black',linewidth=5,facecolor='None')\n",
    "geometries[geometries.geometry_type==geom_name].plot(facecolor=\"none\",edgecolor='black',ax=ax)\n",
    "# geometries[geometries.geometry_type==destination_geometry_name].plot(facecolor=\"none\",edgecolor='black',ax=ax)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2402f1ee",
   "metadata": {},
   "source": [
    "## Export edge corrections"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c14522f0",
   "metadata": {},
   "outputs": [],
   "source": [
    "printcols(edge_corrections)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6d175ce5",
   "metadata": {},
   "outputs": [],
   "source": [
    "if not import_edge_corrections:\n",
    "    edge_corrections.to_file(\n",
    "        f'../data/inputs/{dataset}/edge_corrections.geojson',\n",
    "        driver='GeoJSON',\n",
    "        index=False\n",
    "    )"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "db4ca6a1",
   "metadata": {},
   "source": [
    "# \n",
    "# \n",
    "# \n",
    "# \n",
    "# Compute cost matrices\n",
    "## Euclidean cost matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "59f96a36",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Choose cost_matrix calculation method\n",
    "cost_matrix_method = 'euclidean_centroids'\n",
    "# 'euclidean_centroids'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f9a5668f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Euclidean distance between centroids\n",
    "centroid_euclidean_cost_matrix = pd.DataFrame(\n",
    "        cdist(\n",
    "                geometries.loc[geometries.geometry_type==origin_geometry_name, [\"LAT\",\"LONG\"]],\n",
    "                geometries.loc[geometries.geometry_type==destination_geometry_name, [\"LAT\",\"LONG\"]]\n",
    "        ), \n",
    "        columns=destination_geometry_ids,\n",
    "        index=origin_geometry_ids\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "42f55419",
   "metadata": {},
   "source": [
    "## Centroid shortest path cost computation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f8b7e992",
   "metadata": {},
   "outputs": [],
   "source": [
    "import_ct = True\n",
    "\n",
    "if not import_ct:\n",
    "    # Find shortest path between geography centroids in graph\n",
    "    centroid_shortest_path_cost_matrix = np.zeros((I,J))\n",
    "    for i in tqdm(range(len(origin_geometry_ids))):\n",
    "        for j in range(i+1,len(destination_geometry_ids)):\n",
    "            centroid_shortest_path_cost_matrix[i,j] = nx.shortest_path_length(\n",
    "                                                            graph,\n",
    "                                                            origin_geometry_ids[i],\n",
    "                                                            destination_geometry_ids[j],\n",
    "                                                            weight='length',\n",
    "                                                            method='dijkstra'\n",
    "                                                    )\n",
    "            centroid_shortest_path_cost_matrix[j,i] = nx.shortest_path_length(\n",
    "                                                            graph,\n",
    "                                                            destination_geometry_ids[j],\n",
    "                                                            origin_geometry_ids[i],\n",
    "                                                            weight='length',\n",
    "                                                            method='dijkstra'\n",
    "                                                    )\n",
    "\n",
    "    # Convert to df\n",
    "    centroid_shortest_path_cost_matrix = pd.DataFrame(\n",
    "                                                centroid_shortest_path_cost_matrix,\n",
    "                                                columns=destination_geometry_ids,\n",
    "                                                index=origin_geometry_ids\n",
    "                                        )\n",
    "    # Save to file\n",
    "    centroid_shortest_path_cost_matrix.to_csv(\n",
    "        f'../data/inputs/{dataset}/cost_matrices/geometry_centroid_shortest_path_cost_matrix.csv',\n",
    "        index=True\n",
    "    )\n",
    "else:\n",
    "    # Read from file\n",
    "    centroid_shortest_path_cost_matrix = pd.read_csv(\n",
    "            f'../data/inputs/{dataset}/cost_matrices/geometry_centroid_shortest_path_cost_matrix.csv',\n",
    "            index_col=0\n",
    "    )"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "611e04bf",
   "metadata": {},
   "source": [
    "## Facility centroid shortest path cost computation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b1cddcbf",
   "metadata": {},
   "outputs": [],
   "source": [
    "import_ct = True\n",
    "try:\n",
    "    assert facility_sample_name == 'all'\n",
    "except:\n",
    "    raise ValueError(f'Facility centroid computed across all available facilities and not for {facility_sample_name}')\n",
    "\n",
    "if not import_ct:\n",
    "\n",
    "    # Find mean shortest paths between facility within geography in graph\n",
    "    facility_centroid_shortest_path_cost_matrix = np.zeros((I,J))\n",
    "    for i,j in tqdm(list(itertools.product(range(I),range(J))),total=(I*J)):\n",
    "        # Get all home locations in origin geometry\n",
    "        geography_home_locs = cambridge_facilities[(cambridge_facilities.geometry_id == origin_geometry_ids[i]) & \\\n",
    "                                                   (cambridge_facilities.main_activity == 'home')]\n",
    "        # Get all work locations in destination geometry\n",
    "        geography_work_locs = cambridge_facilities[(cambridge_facilities.geometry_id == destination_geometry_ids[j]) & \\\n",
    "                                                    (cambridge_facilities.main_activity == 'work')]\n",
    "\n",
    "        # Find vertices closest to home locs centroid and work locs centroid\n",
    "        closest_to_home_centroid = home_locs.sindex.nearest(home_locs.dissolve().centroid)\n",
    "        closest_to_work_centroid = work_locs.sindex.nearest(geography_work_locs.dissolve().centroid)\n",
    "\n",
    "        # If no home facilities exist compute shortest path to geometry centroid\n",
    "        if home_locs.size == 0:\n",
    "            home = geometry_ids[i]\n",
    "        # Else compute compute shortest path to home closest to centroid of all homes\n",
    "        else:\n",
    "            home = home_locs.iloc[closest_to_home_centroid[1]].facility_id.values[0]\n",
    "\n",
    "        # If no work facilities exist compute shortest path to centroid\n",
    "        if work_locs.size == 0:\n",
    "            work = geometry_ids[j]\n",
    "        # Else compute compute shortest path to workplace closest to centroid of all workplaces\n",
    "        else:\n",
    "            work = work_locs.iloc[closest_to_work_centroid[1]].facility_id.values[0]\n",
    "\n",
    "        # Calculate shortest paths from centroid of home facilities to centroid of work facilities and vice versa\n",
    "        home_to_work_sp = nx.shortest_path_length(graph,home,work,weight='length',method='dijkstra')\n",
    "\n",
    "        # Compute mean shortest path length from home to work and vice versa    \n",
    "        facility_centroid_shortest_path_cost_matrix[i,j] = home_to_work_sp\n",
    "        \n",
    "    # Convert to df\n",
    "    facility_centroid_shortest_path_cost_matrix = pd.DataFrame(facility_centroid_shortest_path_cost_matrix,\n",
    "                                                               columns=destination_geometry_ids,\n",
    "                                                               index=origin_geometry_ids)\n",
    "    # Save to file\n",
    "    facility_centroid_shortest_path_cost_matrix.to_csv(\n",
    "        '../data/inputs/{dataset}/cost_matrices/{facility_name}_centroid_{facility_sample_name}_shortest_path_cost_matrix.csv',\n",
    "        index=True\n",
    "    )\n",
    "\n",
    "else:\n",
    "    # Read from file\n",
    "    facility_centroid_shortest_path_cost_matrix = pd.read_csv(\n",
    "            f'../data/inputs/{dataset}/cost_matrices/{facility_name}_centroid_{facility_sample_name}_shortest_path_cost_matrix.csv',                                    \n",
    "            index_col=0\n",
    "    )"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a445fe03",
   "metadata": {},
   "source": [
    "## Individual facility cost computation\n",
    "### Shortest path or euclidean"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f328fa91",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parallelisation parameters\n",
    "num_cores = 1\n",
    "# Number batches/chunks to split origin locations into\n",
    "n_batches = 100\n",
    "# Distance Method\n",
    "distance_method = 'euclidean'\n",
    "# 'euclidean'\n",
    "# 'shortest_path'\n",
    "print_flag = False\n",
    "store = False\n",
    "\n",
    "date_computed = '25_01_2023'\n",
    "# '13_12_2022'\n",
    "# '08_12_2022'\n",
    "# '25_01_2023'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f71815c7",
   "metadata": {},
   "outputs": [],
   "source": [
    "import_facility_cm = True\n",
    "\n",
    "if import_facility_cm:\n",
    "    # Get filename\n",
    "    facility_cost_filename = f'facilities_all_{distance_method}_{date_computed}'\n",
    "    \n",
    "    # Import pickle\n",
    "    individual_facility_distance_matrix_gdf = pd.read_pickle(f'../data/raw/cambridge_commuter/{facility_cost_filename}.pickle')\n",
    "    \n",
    "else:\n",
    "    # Find mean distance between all facilities within geography in graph\n",
    "    \n",
    "    # Split inputs into batches/chunks\n",
    "    home_ids_batches = np.array_split(home_ids,n_batches)\n",
    "\n",
    "    # Compute shortest paths in parallel\n",
    "    if distance_method == 'shortest_path':\n",
    "        individual_facility_distance_matrices = np.asarray(Parallel(n_jobs=num_cores,\n",
    "                                                      prefer=\"threads\")(\n",
    "                                                delayed(compute_individual_facility_shortest_path)(i,\n",
    "                                                        graph,\n",
    "                                                       home_ids_batches[i],\n",
    "                                                       work_ids,\n",
    "                                                       n_batches,\n",
    "                                                       store) for i in tqdm(range(n_batches),leave=True)),dtype=object)\n",
    "    elif distance_method == 'euclidean':\n",
    "        selected_work_locs = work_locs.loc[work_locs.facility_id.isin(work_ids),['facility_id','geometry']]\n",
    "        individual_facility_distance_matrices = np.asarray(Parallel(n_jobs=num_cores,backend='multiprocessing')(\n",
    "                                            delayed(compute_individual_facility_euclidean_distance)(i,\n",
    "                                                dict(zip(home_locs.loc[home_locs.facility_id.isin(home_ids_batches[i]),'facility_id'].values,\n",
    "                                                         home_locs.loc[home_locs.facility_id.isin(home_ids_batches[i]),'geometry'].values)),\n",
    "                                                dict(zip(selected_work_locs['facility_id'],\n",
    "                                                         selected_work_locs['geometry'])),\n",
    "                                               print_flag) for i in tqdm(range(n_batches),leave=True)),dtype=object)\n",
    "    else:\n",
    "        raise ValueError(f'No distance method {distance_method} found')\n",
    "    # Convert to df\n",
    "    print('Convert to df')\n",
    "    individual_facility_distance_matrix_gdf = pd.DataFrame(np.concatenate(individual_facility_distance_matrices,axis=0), \n",
    "                                                           columns = ['origin','destination',distance_method],\n",
    "                                                           index=None)\n",
    "    print('Merge origin geometry')\n",
    "    # Merge origin geometry\n",
    "    individual_facility_distance_matrix_gdf = pd.merge(individual_facility_distance_matrix_gdf,\n",
    "                                                       home_locs[['facility_id','geometry_id','geometry']],\n",
    "                                                       left_on='origin',\n",
    "                                                       right_on='facility_id',\n",
    "                                                       how='left')\n",
    "    # Rename columns\n",
    "    individual_facility_distance_matrix_gdf = individual_facility_distance_matrix_gdf.rename(\n",
    "                                                        columns={'geometry_id':'origin_geometry_id',\n",
    "                                                                'geometry':'origin_geometry'}\n",
    "                                            )\n",
    "    # Drop columns\n",
    "    individual_facility_distance_matrix_gdf.drop(columns=['facility_id'],inplace=True)\n",
    "    print('Merge destination geometry')\n",
    "    # Merge destination geometry\n",
    "    individual_facility_distance_matrix_gdf = pd.merge(individual_facility_distance_matrix_gdf,\n",
    "                                                       work_locs[['facility_id','geometry_id','geometry']],\n",
    "                                                       left_on='destination',\n",
    "                                                       right_on='facility_id',\n",
    "                                                       how='left')\n",
    "    # Rename columns\n",
    "    individual_facility_distance_matrix_gdf = individual_facility_distance_matrix_gdf.rename(\n",
    "                                                        columns={'geometry_id':'destination_geometry_id',\n",
    "                                                          'geometry':'destination_geometry'}\n",
    "                                                )\n",
    "    # Drop columns\n",
    "    individual_facility_distance_matrix_gdf.drop(columns=['facility_id'],inplace=True)\n",
    "\n",
    "    print('Convert to geopandas df')\n",
    "    # Convert data types\n",
    "    individual_facility_distance_matrix_gdf[distance_method] = individual_facility_distance_matrix_gdf[distance_method].astype('float32')\n",
    "    # Convert to geopandas\n",
    "    individual_facility_distance_matrix_gdf = gpd.GeoDataFrame(individual_facility_distance_matrix_gdf,\n",
    "                                                               geometry='origin_geometry')\n",
    "    \n",
    "    # Get output filename\n",
    "    facility_cost_filename =  f'facilities_all_{distance_method}_{date.today().strftime(\"%d_%m_%Y\")}'\n",
    "    print('Save to file')\n",
    "    # Save pickle to file\n",
    "    individual_facility_distance_matrix_gdf.to_pickle(f\"../data/raw/cambridge_commuter/{facility_cost_filename}.pickle\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "666776fd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get only facilities matching home and work ids\n",
    "individual_facility_distance_matrix_gdf = deepcopy(individual_facility_distance_matrix_gdf[\n",
    "                            individual_facility_distance_matrix_gdf.origin.isin(home_ids) &\n",
    "                           individual_facility_distance_matrix_gdf.destination.isin(work_ids)\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "923cef43",
   "metadata": {},
   "outputs": [],
   "source": [
    "date_computed = '20_01_2023'\n",
    "import_aggregate_facility_cm = True\n",
    "\n",
    "if not import_aggregate_facility_cm:\n",
    "    # Compute mean shortest path length from home to work and vice versa    \n",
    "    sample_facility_cost_matrix = pd.DataFrame(\n",
    "                                            np.zeros((I,J),dtype='float32'),\n",
    "                                            columns=destination_geometry_ids,\n",
    "                                            index=origin_geometry_ids\n",
    "                                )\n",
    "    \n",
    "    # Aggregate facility-level cost_matrix\n",
    "    for gid, group in tqdm(individual_facility_distance_matrix_gdf.groupby(['origin_geometry_id','destination_geometry_id']),total=I):\n",
    "            # If there are no facilities matched set cost to zero\n",
    "            i,j = gid[0],gid[1]\n",
    "            if group.size == 0:\n",
    "                sample_facility_cost_matrix.loc[sample_facility_cost_matrix.index==i,j] = 0.0\n",
    "            else:\n",
    "                # Compute average\n",
    "                sample_facility_cost_matrix.loc[sample_facility_cost_matrix.index==i,j] = np.mean(group[distance_method].values)\n",
    "    \n",
    "    # Export to file\n",
    "    save_cost_matrices(\n",
    "        cost_matrices = [sample_facility_cost_matrix],\n",
    "        cost_matrix_names = [f\"{facility_name}_{facility_sample_name}_{distance_method}_{date_computed}\"],\n",
    "        norm = '',\n",
    "        mode = '',\n",
    "        correction_method = '',\n",
    "        gensit_format=True,\n",
    "        dataset = dataset,\n",
    "        geometry_name = geometry_name\n",
    "    )\n",
    "else:\n",
    "    \n",
    "    cms = load_cost_matrices(\n",
    "        cost_matrix_names = [f\"{facility_name}_{facility_sample_name}_{distance_method}_{date_computed}\"],\n",
    "        norm = '',\n",
    "        mode = '',\n",
    "        correction_method = '',\n",
    "        dataset = dataset\n",
    "    )\n",
    "    # Store input cost matrix\n",
    "    sample_facility_cost_matrix = cms[f\"{facility_name}_{facility_sample_name}_{distance_method}_{date_computed}\"]\n",
    "    # Delete rest of results\n",
    "    del cms\n",
    "    # Convert to df\n",
    "    sample_facility_cost_matrix = pd.DataFrame(sample_facility_cost_matrix,\n",
    "                                               columns=destination_geometry_ids,\n",
    "                                               index=origin_geometry_ids)\n",
    "    "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f1c10585",
   "metadata": {},
   "source": [
    "## Apply edge correction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ba3caf96",
   "metadata": {},
   "outputs": [],
   "source": [
    "cost_matrix = np.loadtxt(\"../data/inputs/cambridge_work_commuter_lsoas_to_msoas/cost_matrices/clustered_facilities_sample_20x20_20_01_2023_sample_20x20_clustered_facilities_ripleys_k_500_euclidean_points%_prob_origin_destination_adjusted_normalised_boundary_only_edge_corrected_cost_matrix_sum_normalised.txt\",dtype='float32')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bbe553e5",
   "metadata": {},
   "outputs": [],
   "source": [
    "neighbourhood_method_name = 'euclidean'\n",
    "edge_correction_methods = [\n",
    "    f'{facility_sample_name}_{facility_name}_ripleys_k_{buffer_radius}_euclidean_points%_prob_origin_adjusted_normalised_boundary_only',\n",
    "    f'{facility_sample_name}_{facility_name}_ripleys_k_{buffer_radius}_euclidean_points%_prob_destination_adjusted_normalised_boundary_only'\n",
    "]\n",
    "edge_correction_axes = [0,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c3394989",
   "metadata": {},
   "outputs": [],
   "source": [
    "sample_facility_cost_matrix_edge_corrected = apply_edge_corrections(\n",
    "                                    geodata=sample_facility_cost_matrix,\n",
    "                                    edge_correction_df=edge_corrections,\n",
    "                                    origin_id_name=origin_geometry_name,\n",
    "                                    destination_id_name=destination_geometry_name,\n",
    "                                    correction_methods=edge_correction_methods,\n",
    "                                    axes=edge_correction_axes\n",
    ")\n",
    "sample_facility_cost_matrix_edge_corrected_normalised = normalise_data(\n",
    "                sample_facility_cost_matrix_edge_corrected,\n",
    "                '_sum_normalised',\n",
    "                1\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "36e36cac",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Transform data for neural network model\n",
    "neural_net_cost_matrix = deepcopy(sample_facility_cost_matrix_edge_corrected)\n",
    "neural_net_cost_matrix /= np.sum(neural_net_cost_matrix)\n",
    "neural_net_cost_matrix = np.exp(-neural_net_cost_matrix*250)\n",
    "\n",
    "# neural_net_cost_matrix.to_csv(f'../../NeuralABM/data/HarrisWilson/Cambridge_data/exp_{facility_name}_{facility_sample_name}_{date_computed}_{shorten_filename(edge_correction_methods)}_edge_corrected_cost_matrix_max_normalised.csv',index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4d6b3c1d",
   "metadata": {},
   "outputs": [],
   "source": [
    "np.min(neural_net_cost_matrix.values),np.max(neural_net_cost_matrix.values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "99e0f996",
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.hist(neural_net_cost_matrix.values.ravel())\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d729133f",
   "metadata": {},
   "source": [
    "## Visualise cost matrices"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "65f2f7cd",
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(20,5))\n",
    "plt.imshow(sample_facility_cost_matrix.T, cmap=plt.cm.coolwarm, interpolation='nearest')\n",
    "plt.title('Cost matrix',fontsize=16)\n",
    "plt.ylabel('Destinations',fontsize=16)\n",
    "plt.yticks(range(J),destination_geometry_ids)\n",
    "plt.xlabel('Origins',fontsize=16)\n",
    "plt.xticks(range(I),origin_geometry_ids)\n",
    "plt.colorbar(fraction=0.046, pad=0.04)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "63c194d6",
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(20,5))\n",
    "plt.imshow(table.T, cmap=plt.cm.coolwarm, interpolation='nearest')\n",
    "plt.title('Cost matrix',fontsize=16)\n",
    "plt.ylabel('Destinations',fontsize=16)\n",
    "plt.yticks(range(J),destination_geometry_ids)\n",
    "plt.xlabel('Origins',fontsize=16)\n",
    "plt.xticks(range(I),origin_geometry_ids)\n",
    "plt.colorbar(fraction=0.046, pad=0.04)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e6cf0a8a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Heat map of the edge-corrected sampled-facility cost matrix: origins along x, destinations along y\n",
    "fig, ax = plt.subplots(figsize=(20,5))\n",
    "heatmap = ax.imshow(sample_facility_cost_matrix_edge_corrected.T, cmap=plt.cm.coolwarm, interpolation='nearest')\n",
    "ax.set_title('Cost matrix', fontsize=16)\n",
    "ax.set_xlabel('Origins', fontsize=16)\n",
    "ax.set_xticks(range(I))\n",
    "ax.set_xticklabels(origin_geometry_ids)\n",
    "ax.set_ylabel('Destinations', fontsize=16)\n",
    "ax.set_yticks(range(J))\n",
    "ax.set_yticklabels(destination_geometry_ids)\n",
    "fig.colorbar(heatmap, ax=ax, fraction=0.046, pad=0.04)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f5baf0a4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Heat map of the edge-corrected sampled-facility cost matrix: origins along x, destinations along y\n",
    "# NOTE(review): this cell draws the same matrix as the cell before it; presumably a different\n",
    "# matrix (e.g. the normalised edge-corrected one) was intended - confirm or remove the cell\n",
    "fig, ax = plt.subplots(figsize=(20,5))\n",
    "heatmap = ax.imshow(sample_facility_cost_matrix_edge_corrected.T, cmap=plt.cm.coolwarm, interpolation='nearest')\n",
    "ax.set_title('Cost matrix', fontsize=16)\n",
    "ax.set_xlabel('Origins', fontsize=16)\n",
    "ax.set_xticks(range(I))\n",
    "ax.set_xticklabels(origin_geometry_ids)\n",
    "ax.set_ylabel('Destinations', fontsize=16)\n",
    "ax.set_yticks(range(J))\n",
    "ax.set_yticklabels(destination_geometry_ids)\n",
    "fig.colorbar(heatmap, ax=ax, fraction=0.046, pad=0.04)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c59293cc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Geometry ids of interest; the destination ids are the ones highlighted on the map\n",
    "orig_ids_of_interest = ['']\n",
    "dest_ids_of_interest = [\n",
    "    'E02003722',\n",
    "    'E02003719',\n",
    "    'E02003726',\n",
    "    'E02003727',\n",
    "    'E02003728',\n",
    "    'E02003729',\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "89ddb130",
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots(1, 1, figsize=(15,15))\n",
    "ax.set_title('Sampled facilities')\n",
    "\n",
    "# Destination geometries, filtered once and reused for the choropleth, the outlines and the labels\n",
    "destination_zones = geometries[geometries.geometry_type == destination_geometry_name]\n",
    "\n",
    "# Choropleth of destination demand with black zone outlines on top\n",
    "destination_zones.plot(\n",
    "    column='destination_demand',\n",
    "    cmap='RdYlGn',\n",
    "    legend=True,\n",
    "    legend_kwds={'fraction':0.046,'pad':0.04},\n",
    "    alpha=0.5,\n",
    "    ax=ax\n",
    ")\n",
    "destination_zones.plot(ax=ax, facecolor='none', edgecolor='black')\n",
    "\n",
    "# Label every destination zone at its centroid; zones listed in\n",
    "# `dest_ids_of_interest` get a thick yellow outline and a larger label\n",
    "for gid in destination_zones.geometry_id.values:\n",
    "    zone = geometries[geometries.geometry_id == gid]\n",
    "    text_size = None\n",
    "    if gid in dest_ids_of_interest:\n",
    "        text_size = 20\n",
    "        zone.plot(ax=ax, facecolor='none', edgecolor='yellow', linewidth=3.0)\n",
    "    # The centroid is computed once and used for both coordinates\n",
    "    label_point = zone.centroid.values[0]\n",
    "    ax.annotate(text=gid, xy=(label_point.x, label_point.y), fontsize=text_size)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "52e60cc5",
   "metadata": {},
   "source": [
    "## Plot destination attraction ordered by increasing cost matrix margins "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "749ad002",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compare the sampled-facility cost matrix with its edge-corrected, normalised counterpart\n",
    "sample_cm_label = f'{facility_name}_{facility_sample_name}_{distance_method}_{date_computed}'\n",
    "\n",
    "plot_destination_attraction_by_cost_margins(\n",
    "    number_of_jobs,\n",
    "    cms=[sample_facility_cost_matrix, sample_facility_cost_matrix_edge_corrected_normalised],\n",
    "    cm_names=[sample_cm_label, sample_cm_label + '_edge_corrected_sum_normalised'],\n",
    "    fig_size=(15,7),\n",
    "    fig_title='Destination attraction in order of increasing travel cost'\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4ba667ab",
   "metadata": {},
   "source": [
    "## Prepare and export together"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5f06bf4d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Edge-correction specifications.\n",
    "# `correction_types[i]` lists the edge-correction names of specification i and\n",
    "# `correction_axes[i]` lists the matching axes (0 for origin, 1 for destination names).\n",
    "# Both lists are generated by the same loop so they stay aligned entry for entry.\n",
    "#\n",
    "# For each Ripley's K variant and each probability weighting there are six specifications:\n",
    "# origin only, destination only and origin + destination, each with and without the\n",
    "# '_boundary_only' suffix.\n",
    "# To add another correction (e.g. ['angle_covered'] with axes [1]) append it to both lists.\n",
    "correction_types = []\n",
    "correction_axes = []\n",
    "\n",
    "for k_variant in ['ripleys_k', 'inverse_ripleys_k']:\n",
    "    for weighting in ['area%', 'points%']:\n",
    "        name_stem = f'{facility_sample_name}_{facility_name}_{k_variant}_{buffer_radius}_euclidean_{weighting}_prob'\n",
    "        for sides, side_axes in [(['origin'], [0]), (['destination'], [1]), (['origin', 'destination'], [0, 1])]:\n",
    "            for suffix in ['', '_boundary_only']:\n",
    "                correction_types.append([f'{name_stem}_{side}_adjusted_normalised{suffix}' for side in sides])\n",
    "                correction_axes.append(list(side_axes))\n",
    "\n",
    "# 2 variants x 2 weightings x 3 side combinations x 2 suffixes\n",
    "assert len(correction_types) == len(correction_axes) == 24"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "19209ce8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cost matrices passed to prepare_cost_matrices, keyed by the name given to each of them\n",
    "named_cost_matrices = {\n",
    "    'geometry_centroid_euclidean': centroid_euclidean_cost_matrix,\n",
    "    'geometry_centroid_shortest_path': centroid_shortest_path_cost_matrix,\n",
    "    f'{facility_sample_name}_{facility_name}_{date_computed}': sample_facility_cost_matrix,\n",
    "}\n",
    "\n",
    "prepare_cost_matrices(\n",
    "    list(named_cost_matrices.values()),\n",
    "    list(named_cost_matrices.keys()),\n",
    "    edge_corrections,\n",
    "    normalisation_types=['_sum_normalised'],  # optionally also '_max_normalised'\n",
    "    edge_correction_types=correction_types,\n",
    "    edge_correction_axes=correction_axes,\n",
    "    origin_id_name=origin_geometry_name,\n",
    "    destination_id_name=destination_geometry_name,\n",
    "    transport_mode='',\n",
    "    data=dataset,\n",
    "    geo_name=geometry_name\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "81d119cc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# prepare_cost_matrices(\n",
    "#     [centroid_euclidean_cost_matrix,\n",
    "#     centroid_shortest_path_cost_matrix,\n",
    "#     sample_facility_cost_matrix,\n",
    "#     facility_centroid_shortest_path_cost_matrix],\n",
    "#     ['geometry_centroid_euclidean',\n",
    "#     'geometry_centroid_shortest_path',\n",
    "#     f\"{facility_sample_name}_{facility_name}_{date_computed}\",\n",
    "#     f\"{facility_sample_name}_{facility_name}_centroid_shortest_path_cost_matrix\"],\n",
    "#     edge_corrections,\n",
    "#     normalisation_types=['_sum_normalised'],#,'_max_normalised'],\n",
    "#     edge_correction_types=correction_types,\n",
    "#     edge_correction_axes=correction_axes,\n",
    "#     origin_id_name=origin_geometry_name,\n",
    "#     destination_id_name=destination_geometry_name,\n",
    "#     transport_mode='',\n",
    "#     data=dataset,\n",
    "#     geo_name=geometry_name\n",
    "# )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "41bf4c61",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keyword options for this prepare_cost_matrices run\n",
    "prepare_options = dict(\n",
    "    normalisation_types=['_sum_normalised'],  # optionally also '_max_normalised'\n",
    "    edge_correction_types=correction_types,\n",
    "    edge_correction_axes=correction_axes,\n",
    "    origin_id_name=origin_geometry_name,\n",
    "    destination_id_name=destination_geometry_name,\n",
    "    transport_mode='',\n",
    "    data=dataset,\n",
    "    geo_name=geometry_name\n",
    ")\n",
    "\n",
    "# Only the sampled-facility cost matrix is prepared here\n",
    "prepare_cost_matrices(\n",
    "    [sample_facility_cost_matrix],\n",
    "    [f'{facility_name}_{facility_sample_name}_{distance_method}_{date_computed}'],\n",
    "    edge_corrections,\n",
    "    **prepare_options\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dcdd6ba9",
   "metadata": {},
   "source": [
    "# Distance matrix API cost matrix\n",
    "## Read data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "12228ff5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Resolution of the data to read; alternatives:\n",
    "#   'geometry_centroid'\n",
    "#   f'clustered_facility_sample_origs_{n_origin_clusters}_dests_{n_destination_clusters}'\n",
    "resolution = 'geometry_centroid'\n",
    "\n",
    "# Ingestion date; alternatives: '' or '18_01_2023'\n",
    "date_ingested = ''\n",
    "\n",
    "# facility id -> geometry id, home locations first, then work locations\n",
    "all_facility_ids = np.concatenate([home_locs.facility_id, work_locs.facility_id])\n",
    "all_facility_geometry_ids = np.concatenate([home_locs.geometry_id, work_locs.geometry_id])\n",
    "facility_to_geometry_map = {\n",
    "    facility: geometry for facility, geometry in zip(all_facility_ids, all_facility_geometry_ids)\n",
    "}\n",
    "\n",
    "# LSOA11CD -> MSOA11CD lookup built from the geography conversion table\n",
    "geometry_conversions = pd.read_csv('../data/raw/cambridge_commuter/OA11_LSOA11_MSOA11_LAD11_EW_LUv2.csv')\n",
    "mapper = {\n",
    "    lsoa: msoa for lsoa, msoa in zip(geometry_conversions.LSOA11CD, geometry_conversions.MSOA11CD)\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5042e9f4",
   "metadata": {},
   "outputs": [],
   "source": [
    "def _read_mode_matrices(mode):\n",
    "    \"\"\"Read the (times, distances) pair of Distance Matrix API matrices for one travel mode.\n",
    "\n",
    "    All modes share the same settings; only `mode` differs between calls.\n",
    "    Origin geometry ids are passed for both origins and destinations\n",
    "    (`destination_geometry_ids` is the alternative for the second argument).\n",
    "    \"\"\"\n",
    "    return read_distance_matrix_api_data(\n",
    "        origin_geometry_ids,\n",
    "        origin_geometry_ids,\n",
    "        mode,\n",
    "        resolution,\n",
    "        date_ingested,\n",
    "        origin_geo_name=origin_geometry_name,\n",
    "        destination_geo_name=origin_geometry_name,\n",
    "        origin_agg_statistic='mean',\n",
    "        destination_agg_statistic='mean',\n",
    "        origin_geo_map=None,\n",
    "        destination_geo_map=mapper\n",
    "    )\n",
    "\n",
    "\n",
    "driving_times, driving_distances = _read_mode_matrices('driving')\n",
    "transit_times, transit_distances = _read_mode_matrices('transit')\n",
    "bicycling_times, bicycling_distances = _read_mode_matrices('bicycling')\n",
    "# Walking previously re-read the 'bicycling' data (copy-paste slip), which made the\n",
    "# walking matrices silent duplicates of the bicycling ones.\n",
    "# NOTE(review): requires walking data to have been ingested for this resolution - confirm\n",
    "walking_times, walking_distances = _read_mode_matrices('walking')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2da9197c",
   "metadata": {},
   "source": [
    "### Multi-modal cost matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "af049555",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read and clean data\n",
    "modal_mix = pd.read_csv(f'../data/inputs/{dataset}/{geometry_name}_modal_mix.csv',index_col=0)\n",
    "\n",
    "# Create mapping from detailed modes to the main modes of transport\n",
    "# NOTE(review): 'bike' is grouped with 'car' (presumably motorbike) - confirm\n",
    "mode_mapping = {\n",
    "    'metro':'rail',\n",
    "    'train':'rail',\n",
    "    'bus':'transit',\n",
    "    'taxi':'car',\n",
    "    'bike':'car',\n",
    "    'car':'car',\n",
    "    'car_passenger':'car',\n",
    "    'bicycle':'bicycle',\n",
    "    'walk':'walk',\n",
    "    'other':'other'\n",
    "}\n",
    "\n",
    "modal_mix.columns = modal_mix.columns.to_series().map(mode_mapping)\n",
    "# Sum the columns that share a main mode. A transposed groupby is used because\n",
    "# DataFrame.groupby(axis=1) is deprecated in recent pandas versions.\n",
    "modal_mix = modal_mix.T.groupby(level=0).sum().T\n",
    "\n",
    "# Remove irrelevant modes ('walk' is currently left out)\n",
    "main_modes = ['car','bicycle','transit']\n",
    "modal_mix = modal_mix.loc[:, main_modes].copy()\n",
    "\n",
    "# Compute proportions.\n",
    "# The row totals are taken once, before any '*_proportion' column is added: computing\n",
    "# them inside the loop would include the proportion columns created by earlier\n",
    "# iterations, so the shares of later modes would be divided by an inflated total.\n",
    "mode_totals = modal_mix.loc[:, main_modes].values.sum(axis=1)\n",
    "for c in main_modes:\n",
    "    modal_mix.loc[:, (c + '_proportion')] = modal_mix.loc[:, c] / mode_totals"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2b8c5e2e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compute multimodal cost matrices: every mode's matrix is multiplied along the index\n",
    "# by that mode's proportion in `modal_mix` and the results are added up.\n",
    "# Walking is not part of the mix; rows containing missing values are dropped.\n",
    "car_share = modal_mix.loc[:, 'car_proportion']\n",
    "bicycle_share = modal_mix.loc[:, 'bicycle_proportion']\n",
    "transit_share = modal_mix.loc[:, 'transit_proportion']\n",
    "\n",
    "multimodal_times = (\n",
    "    driving_times.multiply(car_share, axis='index')\n",
    "    + bicycling_times.multiply(bicycle_share, axis='index')\n",
    "    + transit_times.multiply(transit_share, axis='index')\n",
    ").dropna(axis=0)\n",
    "\n",
    "multimodal_distances = (\n",
    "    driving_distances.multiply(car_share, axis='index')\n",
    "    + bicycling_distances.multiply(bicycle_share, axis='index')\n",
    "    + transit_distances.multiply(transit_share, axis='index')\n",
    ").dropna(axis=0)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4c9c7006",
   "metadata": {},
   "source": [
    "## Apply edge/boundary correction "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1ad9e0d9",
   "metadata": {},
   "outputs": [],
   "source": [
    "neighbourhood_method_name = 'euclidean'\n",
    "\n",
    "# Edge correction to apply; alternatives:\n",
    "#   'angle_covered'\n",
    "#   'max_angle_covered'\n",
    "#   'average_neighbourhood_angle_covered'\n",
    "#   'ripleys_k_{neighbourhood_method_name}_normalised'\n",
    "#   'ripleys_k_{neighbourhood_method_name}_boundary_only_normalised'\n",
    "edge_correction_method = 'average_neighbourhood_angle_covered'\n",
    "\n",
    "# Axis passed to apply_edge_corrections together with the method above\n",
    "edge_correction_axis = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "345e39cd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rebuild the LSOA11CD -> MSOA11CD lookup from the geography conversion table\n",
    "geometry_conversions = pd.read_csv('../data/raw/cambridge_commuter/OA11_LSOA11_MSOA11_LAD11_EW_LUv2.csv')\n",
    "mapper = {\n",
    "    lsoa: msoa for lsoa, msoa in zip(geometry_conversions.LSOA11CD, geometry_conversions.MSOA11CD)\n",
    "}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d0f321af",
   "metadata": {},
   "outputs": [],
   "source": [
    "# driving_times,driving_distances = fix(driving_times,mapper), fix(driving_distances,mapper)\n",
    "# transit_times,transit_distances = fix(transit_times,mapper), fix(transit_distances,mapper)\n",
    "# bicycling_times, bicycling_distances = fix(bicycling_times,mapper), fix(bicycling_distances,mapper)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e5fd402d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Apply the selected edge correction to every mode/quantity matrix.\n",
    "# The matrices live in variables named '<mode>_<quantity>' (e.g. `driving_times`);\n",
    "# the resolution only appears in the dictionary key, so looking up\n",
    "# '<mode>_<quantity>_<resolution>' would raise a KeyError.\n",
    "edge_corrected_cost_matrices = {}\n",
    "\n",
    "for m in ['driving','transit','bicycling','walking','multimodal']:\n",
    "    for t in ['distances','times']:\n",
    "        variable_name = f'{m}_{t}'\n",
    "        edge_corrected_cost_matrices[f'{variable_name}_{resolution}_{edge_correction_method}'] = apply_edge_corrections(\n",
    "            locals()[variable_name],\n",
    "            edge_corrections,\n",
    "            [edge_correction_method],\n",
    "            [edge_correction_axis],\n",
    "        )"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "163a1fe6",
   "metadata": {},
   "source": [
    "## Plot destination attraction ordered by increasing cost matrix margins "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "997c44ac",
   "metadata": {},
   "outputs": [],
   "source": [
    "# One entry per edge-corrected cost matrix, labelled with its dictionary key\n",
    "corrected_cm_names = [f'{name}_edge_corrected' for name in edge_corrected_cost_matrices]\n",
    "\n",
    "plot_destination_attraction_by_cost_margins(\n",
    "    number_of_jobs,\n",
    "    cms=list(edge_corrected_cost_matrices.values()),\n",
    "    cm_names=corrected_cm_names,\n",
    "    fig_size=(20,15)\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b732dafe",
   "metadata": {},
   "source": [
    "## Prepare and export matrices"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c5016475",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect the per-mode matrices from the notebook namespace, keyed by\n",
    "# '<mode>_<quantity>_<resolution>' ('walking' is left out)\n",
    "cost_matrices = {}\n",
    "for mode in ['driving','transit','bicycling','multimodal']:\n",
    "    for quantity in ['distances','times']:\n",
    "        matrix_variable = f'{mode}_{quantity}'\n",
    "        cost_matrices[f'{matrix_variable}_{resolution}'] = locals()[matrix_variable]\n",
    "\n",
    "prepare_cost_matrices(\n",
    "    list(cost_matrices.values()),\n",
    "    [f'google/{name}' for name in cost_matrices],\n",
    "    edge_corrections,\n",
    "    normalisation_types=['_sum_normalised'],  # optionally '_max_normalised'\n",
    "    edge_correction_types=correction_types,\n",
    "    edge_correction_axes=correction_axes,\n",
    "    origin_id_name=origin_geometry_name,\n",
    "    destination_id_name=destination_geometry_name,\n",
    "    transport_mode='',\n",
    "    data=dataset,\n",
    "    geo_name=geometry_name\n",
    ")\n",
    "\n",
    "# Drop the temporary dictionary once the matrices have been handed over\n",
    "del cost_matrices"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "gensit",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  },
  "widgets": {
   "application/vnd.jupyter.widget-state+json": {
    "state": {
     "03bfdf7ea1404c12a11ac3af73765b0e": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "HTMLModel",
      "state": {
       "layout": "IPY_MODEL_f116b8a4b181455eacaa86db808019b3",
       "style": "IPY_MODEL_f3ef6cfcaa0942248113c5736238fb35",
       "value": "100%"
      }
     },
     "0569875e007041ed8c7f647a1994ea0e": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {}
     },
     "1e364e5e585e4287bd6d96bab6037658": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "HTMLModel",
      "state": {
       "layout": "IPY_MODEL_24fbaf1097864d4baa4fd011f5953983",
       "style": "IPY_MODEL_34a1dea4896442e28e59267b284a69c4",
       "value": " 32137/32137 [00:03&lt;00:00, 18487.14it/s]"
      }
     },
     "24fbaf1097864d4baa4fd011f5953983": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {}
     },
     "2d1ed75d2af1438792d812e8478110e0": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "HTMLModel",
      "state": {
       "layout": "IPY_MODEL_0569875e007041ed8c7f647a1994ea0e",
       "style": "IPY_MODEL_db8f591ce2044854800b0f84b2d4e2d1",
       "value": "100%"
      }
     },
     "34a1dea4896442e28e59267b284a69c4": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "DescriptionStyleModel",
      "state": {
       "description_width": ""
      }
     },
     "446736361cda4be48ad2aec20f4d5226": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {}
     },
     "489fb5aad5274e8ca06476331854ad58": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "ProgressStyleModel",
      "state": {
       "description_width": ""
      }
     },
     "52c162973006497382739242c2b668ca": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "HTMLModel",
      "state": {
       "layout": "IPY_MODEL_daceca53614f4a90b8190aca42095077",
       "style": "IPY_MODEL_933d16f28d284c5f8e2af71c52347de3",
       "value": " 108728/108728 [00:08&lt;00:00, 20831.11it/s]"
      }
     },
     "667dfae5d98240768600f89343a70612": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {}
     },
     "794c358311b74f1fbf956bdca231d0d7": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "FloatProgressModel",
      "state": {
       "bar_style": "success",
       "layout": "IPY_MODEL_84c6991b0a9d4ee4a27d563e8216cc2f",
       "max": 108728,
       "style": "IPY_MODEL_9403086e98534e348581edf326cef844",
       "value": 108728
      }
     },
     "7dfa0863de97429e9f5e51f9334ab0f7": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {}
     },
     "84c6991b0a9d4ee4a27d563e8216cc2f": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {}
     },
     "8aeb11a646324f819639c7d835a5245a": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "HTMLModel",
      "state": {
       "layout": "IPY_MODEL_7dfa0863de97429e9f5e51f9334ab0f7",
       "style": "IPY_MODEL_af7c5b435c1144cdad8ad723520d670e",
       "value": "100%"
      }
     },
     "912dd3d4c577451ba0d21fdb54f32889": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "HTMLModel",
      "state": {
       "layout": "IPY_MODEL_d7194f3a083845729d389b25b6ac7086",
       "style": "IPY_MODEL_d7f97e3546eb46d592e396218bb06626",
       "value": " 69/69 [00:00&lt;00:00, 4383.49it/s]"
      }
     },
     "933d16f28d284c5f8e2af71c52347de3": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "DescriptionStyleModel",
      "state": {
       "description_width": ""
      }
     },
     "9403086e98534e348581edf326cef844": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "ProgressStyleModel",
      "state": {
       "description_width": ""
      }
     },
     "942cb852b4874bd9a1f9153a27c7a587": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {}
     },
     "9f753e5826374bb9b4312c809eceb79d": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "ProgressStyleModel",
      "state": {
       "description_width": ""
      }
     },
     "a4f818b0d5394e5bb0f7615f4772ce29": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {}
     },
     "aa57f6eab8054858a5823aa2c2ad2425": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "HBoxModel",
      "state": {
       "children": [
        "IPY_MODEL_8aeb11a646324f819639c7d835a5245a",
        "IPY_MODEL_d8e5aa5d4242445d81dfde3b521b3235",
        "IPY_MODEL_1e364e5e585e4287bd6d96bab6037658"
       ],
       "layout": "IPY_MODEL_c940390dfcfb429a815523f98bdefde8"
      }
     },
     "aeabee65a48a478b86721de5eeedbc7c": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "HBoxModel",
      "state": {
       "children": [
        "IPY_MODEL_03bfdf7ea1404c12a11ac3af73765b0e",
        "IPY_MODEL_794c358311b74f1fbf956bdca231d0d7",
        "IPY_MODEL_52c162973006497382739242c2b668ca"
       ],
       "layout": "IPY_MODEL_667dfae5d98240768600f89343a70612"
      }
     },
     "af7c5b435c1144cdad8ad723520d670e": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "DescriptionStyleModel",
      "state": {
       "description_width": ""
      }
     },
     "bfc5a56ec4e24221af7ede39237f5d17": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "FloatProgressModel",
      "state": {
       "bar_style": "success",
       "layout": "IPY_MODEL_942cb852b4874bd9a1f9153a27c7a587",
       "max": 69,
       "style": "IPY_MODEL_9f753e5826374bb9b4312c809eceb79d",
       "value": 69
      }
     },
     "c940390dfcfb429a815523f98bdefde8": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {}
     },
     "d7194f3a083845729d389b25b6ac7086": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {}
     },
     "d7f97e3546eb46d592e396218bb06626": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "DescriptionStyleModel",
      "state": {
       "description_width": ""
      }
     },
     "d898eebc6f2d4cbaaefcaa0d8e467886": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "HBoxModel",
      "state": {
       "children": [
        "IPY_MODEL_2d1ed75d2af1438792d812e8478110e0",
        "IPY_MODEL_bfc5a56ec4e24221af7ede39237f5d17",
        "IPY_MODEL_912dd3d4c577451ba0d21fdb54f32889"
       ],
       "layout": "IPY_MODEL_446736361cda4be48ad2aec20f4d5226"
      }
     },
     "d8e5aa5d4242445d81dfde3b521b3235": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "FloatProgressModel",
      "state": {
       "bar_style": "success",
       "layout": "IPY_MODEL_a4f818b0d5394e5bb0f7615f4772ce29",
       "max": 32137,
       "style": "IPY_MODEL_489fb5aad5274e8ca06476331854ad58",
       "value": 32137
      }
     },
     "daceca53614f4a90b8190aca42095077": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {}
     },
     "db8f591ce2044854800b0f84b2d4e2d1": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "DescriptionStyleModel",
      "state": {
       "description_width": ""
      }
     },
     "f116b8a4b181455eacaa86db808019b3": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {}
     },
     "f3ef6cfcaa0942248113c5736238fb35": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "DescriptionStyleModel",
      "state": {
       "description_width": ""
      }
     }
    },
    "version_major": 2,
    "version_minor": 0
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
