{
 "cells": [
  {
   "cell_type": "raw",
   "metadata": {
    "vscode": {
     "languageId": "raw"
    }
   },
   "source": [
    "make sure the extracted reviews are placed in the preprocess/model_name/ folders\n",
    "This file doesn't need any model or api"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import csv\n",
    "import json\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.cluster import KMeans\n",
    "\n",
    "def loadbase():\n",
    "    with open ('Dataset/Base/Restaurants.jsonl', 'r') as file:\n",
    "        restaurants = [json.loads(line.strip()) for line in file]\n",
    "\n",
    "    with open ('Dataset/Base/Hotels.jsonl', 'r') as file:\n",
    "        hotels = [json.loads(line.strip()) for line in file]\n",
    "\n",
    "    with open ('Dataset/Base/Attractions.jsonl', 'r') as file:\n",
    "        attractions = [json.loads(line.strip()) for line in file]\n",
    "\n",
    "        return restaurants, hotels, attractions\n",
    "\n",
    "def clean_attraction(attraction):\n",
    "    level_map = {\n",
    "        0: \"no\",\n",
    "        1: \"low\",\n",
    "        2: \"medium\",\n",
    "        3: \"high\"\n",
    "    }\n",
    "    keys = []\n",
    "    values = []\n",
    "    for key, value in attraction.items():\n",
    "        keys.append(key)\n",
    "        values.append(value['level'])\n",
    "\n",
    "    parsed = {}\n",
    "\n",
    "    for i in range(len(keys)):\n",
    "        new = {keys[i].replace('_',' '): level_map[values[i]] + ' ' + keys[i].replace('_',' ')}\n",
    "        parsed = {**parsed, **new}\n",
    "\n",
    "    return parsed\n",
    "\n",
    "def parseAttraction(model,attractions_base):\n",
    "    folder_path = f'preprocess/{model}/attractions'\n",
    "    attractions_final = []\n",
    "    for filename in os.listdir(folder_path):\n",
    "        if filename.endswith('.json'):\n",
    "            business_id_fromReviews = filename[:-5][-22:]\n",
    "            \n",
    "            for attraction in attractions_base:\n",
    "                if attraction['business_id'] == business_id_fromReviews:\n",
    "                    with open(folder_path + '/' + filename, 'r') as file:\n",
    "                        attraction_fromReviews = json.load(file)\n",
    "                    attraction_fromReviews_short = clean_attraction(attraction_fromReviews)\n",
    "                    attraction_final = {**attraction, **attraction_fromReviews_short}\n",
    "                    attractions_final.append(attraction_final)\n",
    "    \n",
    "    with open (f'Dataset/{model}/attractions.jsonl','w') as file:\n",
    "        for item in attractions_final:\n",
    "            json.dump(item, file)\n",
    "            file.write('\\n') \n",
    "\n",
    "    return attractions_final\n",
    "\n",
    "def clean_hotel(hotel):\n",
    "    level_map = {\n",
    "        1 : \"bad\",\n",
    "        2 : \"below average\",\n",
    "        3 : \"average\",\n",
    "        4 : \"good\",\n",
    "        5 : \"excellent\"\n",
    "    }\n",
    "    keys = []\n",
    "    values = []\n",
    "    for key, value in hotel.items():\n",
    "        keys.append(key)\n",
    "        values.append(value['rating'])\n",
    "\n",
    "    parsed = {}\n",
    "\n",
    "    for i in range(len(keys)):\n",
    "        new = {keys[i]: level_map[values[i]] + ' ' + keys[i]}\n",
    "        parsed = {**parsed, **new}\n",
    "\n",
    "    return parsed\n",
    "\n",
    "def parseHotel(model,hotels_base):\n",
    "    folder_path = f'preprocess/{model}/hotels'\n",
    "    hotels_final = []\n",
    "    for filename in os.listdir(folder_path):\n",
    "        if filename.endswith('.json'):\n",
    "            business_id_fromReviews = filename[:-5][-22:]\n",
    "            \n",
    "            for hotel in hotels_base:\n",
    "                if hotel['business_id'] == business_id_fromReviews:\n",
    "                    with open(folder_path + '/' + filename, 'r') as file:\n",
    "                        hotel_fromReviews = json.load(file)\n",
    "                    hotel_fromReviews_short = clean_hotel(hotel_fromReviews)\n",
    "                    hotel_final = {**hotel, **hotel_fromReviews_short}\n",
    "                    hotels_final.append(hotel_final)\n",
    "    \n",
    "    with open (f'Dataset/{model}/hotels.jsonl','w') as file:\n",
    "        for item in hotels_final:\n",
    "            json.dump(item, file)\n",
    "            file.write('\\n') \n",
    "\n",
    "    return hotels_final\n",
    "\n",
    "def clean_restaurant(restaurant):\n",
    "    level_map = {\n",
    "        1 : \"bad\",\n",
    "        2 : \"below average\",\n",
    "        3 : \"average\",\n",
    "        4 : \"good\",\n",
    "        5 : \"excellent\"\n",
    "    }\n",
    "    keys = []\n",
    "    values = []\n",
    "    for key, value in restaurant.items():\n",
    "        keys.append(key)\n",
    "        values.append(value['rating'])\n",
    "\n",
    "    parsed = {}\n",
    "\n",
    "    for i in range(len(keys)):\n",
    "        new = {keys[i]: level_map[values[i]] + ' ' + keys[i]}\n",
    "        parsed = {**parsed, **new}\n",
    "\n",
    "    return parsed\n",
    "\n",
    "def parseRestaurant(model,restaurants_base):\n",
    "    folder_path = f'preprocess/{model}/restaurants'\n",
    "    restaurants_final = []\n",
    "    for filename in os.listdir(folder_path):\n",
    "        if filename.endswith('.json'):\n",
    "            business_id_fromReviews = filename[:-5][-22:]\n",
    "            \n",
    "            for restaurant in restaurants_base:\n",
    "                if restaurant['business_id'] == business_id_fromReviews:\n",
    "                    with open(folder_path + '/' + filename, 'r') as file:\n",
    "                        restaurant_fromReviews = json.load(file)\n",
    "                    restaurant_fromReviews_short = clean_restaurant(restaurant_fromReviews)\n",
    "                    restaurant_final = {**restaurant, **restaurant_fromReviews_short}\n",
    "                    restaurants_final.append(restaurant_final)\n",
    "    \n",
    "    with open (f'Dataset/{model}/restaurants.jsonl','w') as file:\n",
    "        for item in restaurants_final:\n",
    "            json.dump(item, file)\n",
    "            file.write('\\n') \n",
    "\n",
    "    return restaurants_final\n",
    "\n",
    "def createAllData(model):\n",
    "    #load jsonl\n",
    "    with open (f'Dataset/{model}/attractions.jsonl', 'r') as file:\n",
    "        attractions = [json.loads(line.strip()) for line in file]\n",
    "    with open (f'Dataset/{model}/hotels.jsonl', 'r') as file:\n",
    "        hotels = [json.loads(line.strip()) for line in file]\n",
    "    with open (f'Dataset/{model}/restaurants.jsonl', 'r') as file:\n",
    "        restaurants = [json.loads(line.strip()) for line in file]\n",
    "\n",
    "    #make into csv\n",
    "    att_df = pd.DataFrame(attractions)\n",
    "    att_df = att_df.loc[:, ['name', 'address', 'latitude','longitude','stars', 'price','family oriented','history oriented','activity oriented','nature oriented','food oriented','shopping oriented']]\n",
    "    att_df.to_csv(f'Dataset/{model}/Attractions.csv', index=False)\n",
    "\n",
    "    hot_df = pd.DataFrame(hotels)\n",
    "    hot_df = hot_df.loc[:, ['name', 'address', 'latitude','longitude', 'stars', 'price','quality','location','service','safety']]\n",
    "    hot_df.to_csv(f'Dataset/{model}/Hotels.csv', index=False)\n",
    "\n",
    "    rest_df = pd.DataFrame(restaurants)\n",
    "    rest_df = rest_df.loc[:, ['name', 'address', 'latitude','longitude', 'stars', 'good_for_meal', 'price','cuisine_1','cuisine_2','flavor','freshness','service','environment','value']]\n",
    "    rest_df.to_csv(f'Dataset/{model}/Restaurants.csv', index=False)\n",
    "\n",
    "    #make into txt\n",
    "    with open (f'Dataset/{model}/Attractions.csv', 'r') as file:\n",
    "        reader = csv.reader(file)\n",
    "        text = ''\n",
    "        for row in reader:\n",
    "            #put each row into a string\n",
    "            text += ','.join(row)+ '\\n'\n",
    "    with open (f'Dataset/{model}/Attractions.txt', 'w') as file:\n",
    "        file.write(text)\n",
    "\n",
    "    with open (f'Dataset/{model}/Hotels.csv', 'r') as file:\n",
    "        reader = csv.reader(file)\n",
    "        text = ''\n",
    "        for row in reader:\n",
    "            #put each row into a string\n",
    "            text += ','.join(row)+ '\\n'\n",
    "    with open (f'Dataset/{model}/Hotels.txt', 'w') as file:\n",
    "        file.write(text)\n",
    "\n",
    "    with open (f'Dataset/{model}/Restaurants.csv', 'r') as file:\n",
    "        reader = csv.reader(file)\n",
    "        text = ''\n",
    "        for row in reader:\n",
    "            #put each row into a string\n",
    "            text += ','.join(row)+ '\\n'\n",
    "    with open (f'Dataset/{model}/Restaurants.txt', 'w') as file:\n",
    "        file.write(text)\n",
    "    \n",
    "    #make into one txt file\n",
    "    with open (f'Dataset/{model}/Attractions.txt', 'r') as file:\n",
    "        attraction_text = file.read().replace('\\n', '. ')\n",
    "\n",
    "    with open (f'Dataset/{model}/Hotels.txt', 'r') as file:\n",
    "        hotel_text = file.read().replace('\\n', '. ')\n",
    "\n",
    "    with open (f'Dataset/{model}/Restaurants.txt', 'r') as file:\n",
    "        restaurant_text = file.read().replace('\\n', '. ')\n",
    "    data = [\n",
    "        {\n",
    "            \"Description\": \"Accommodations in Philadelphia\",\n",
    "            \"Content\": hotel_text\n",
    "        },\n",
    "        {\n",
    "            \"Description\": \"Attractions in Philadelphia\",\n",
    "            \"Content\": attraction_text\n",
    "        },\n",
    "        {\n",
    "            \"Description\": \"Restaurants in Philadelphia\",\n",
    "            \"Content\": restaurant_text\n",
    "        }\n",
    "    ]\n",
    "\n",
    "    with open(f'Dataset/{model}/all_data.json', 'w') as f:\n",
    "        json.dump(data, f, indent=4)\n",
    "\n",
    "def createFilteredData(model):\n",
    "    restaurants = pd.read_csv(f'Dataset/{model}/Restaurants.csv')\n",
    "    hotels = pd.read_csv(f'Dataset/{model}/Hotels.csv')\n",
    "    attractions = pd.read_csv(f'Dataset/{model}/Attractions.csv')\n",
    "\n",
    "    with open ('Prompts/evals.jsonl', 'r') as file:\n",
    "        evals = [json.loads(line.strip()) for line in file]\n",
    "\n",
    "    filteredData = []\n",
    "    for i in range(len(evals)):\n",
    "        prompt = evals[i]['eval_info']\n",
    "        #attractionSearch\n",
    "        #budget\n",
    "        budget = prompt['price'][0]\n",
    "        price_map = {'cheap budget':['$','$$'],'moderate budget':['$','$$','$$$'],'expensive budget':['$$','$$$','$$$$']}\n",
    "        price_limit = price_map[budget.lower()]\n",
    "        attractions_filtered = attractions[attractions['price'].isin(price_limit)]\n",
    "\n",
    "        #preference\n",
    "        preference = prompt['attraction'][0].lower()\n",
    "        pref_list = ['medium ' + preference, 'high ' + preference]\n",
    "        attractions_filtered = attractions_filtered[attractions_filtered[preference].isin(pref_list)]\n",
    "        attractions_filtered = attractions_filtered[['name','address','latitude','longitude']]\n",
    "        #restaurants\n",
    "        #budget\n",
    "        restaurants_filtered = restaurants[restaurants['price'].isin(price_limit)]\n",
    "\n",
    "        #cuisine\n",
    "        cuisine = prompt['cuisine'][0]\n",
    "        if cuisine == 'US':\n",
    "            cuisine = ['American','American (New)','American (Traditional)']\n",
    "        else:\n",
    "            cuisine = [cuisine]\n",
    "\n",
    "        restaurants_filtered = restaurants_filtered[(restaurants_filtered['cuisine_1'].isin(cuisine)) | (restaurants_filtered['cuisine_2'].isin(cuisine))]\n",
    "        #preference\n",
    "        preference = prompt['restaurant']\n",
    "        if preference != []:\n",
    "            for pref in preference:\n",
    "                prefs = pref.split(' ')\n",
    "                col = prefs[1].lower()\n",
    "                pref_list = ['good ' + col, 'excellent ' + col]\n",
    "                restaurants_filtered = restaurants_filtered[restaurants_filtered[col].isin(pref_list)]\n",
    "        restaurants_filtered = restaurants_filtered[['name','address','latitude','longitude','good_for_meal']]\n",
    "        #hotel search\n",
    "        #budget\n",
    "        hotels_filtered = hotels[hotels['price'].isin(price_limit)]\n",
    "\n",
    "        #preference\n",
    "        preference = prompt['hotel']\n",
    "        if preference != []:\n",
    "            for pref in preference:\n",
    "                prefs = pref.split(' ')\n",
    "                col = prefs[1].lower()\n",
    "                pref_list = ['good ' + col, 'excellent ' + col]\n",
    "                hotels_filtered = hotels_filtered[hotels_filtered[col].isin(pref_list)]\n",
    "        hotels_filtered = hotels_filtered[['name','address','latitude','longitude']]\n",
    "        #spatial clustering of all the filtered data\n",
    "        #attractions_filtered; restaurants_filtered; hotels_filtered\n",
    "        allBusiness_names = []\n",
    "        allBusiness_cordinates = []\n",
    "        for index, row in attractions_filtered.iterrows():\n",
    "            allBusiness_names.append(row['name'])\n",
    "            allBusiness_cordinates.append([row['latitude'], row['longitude']])\n",
    "        #for index, row in restaurants_filtered.iterrows():\n",
    "        #    allBusiness.append([row['latitude'], row['longitude']])\n",
    "        for index, row in hotels_filtered.iterrows():\n",
    "            allBusiness_names.append(row['name'])\n",
    "            allBusiness_cordinates.append([row['latitude'], row['longitude']])\n",
    "        \n",
    "        coordinates = np.array(allBusiness_cordinates)\n",
    "        #k = int(prompt['day'][0].strip()[0]) 0.23 1.24\n",
    "        # k=10 0.27 1.3\n",
    "        k = int(len(coordinates)/5)\n",
    "        kmeans = KMeans(n_clusters=k, random_state=42)\n",
    "        kmeans.fit(coordinates)\n",
    "        # Cluster labels\n",
    "        allBusiness_clusterNumbers = kmeans.labels_\n",
    "\n",
    "        clusterInfo = pd.DataFrame({'business': allBusiness_names, 'cluster number': allBusiness_clusterNumbers})\n",
    "        clusterInfo['cluster number'] = clusterInfo['cluster number'].apply(lambda x: 'Cluster_' + str(x))\n",
    "        clusterInfo = clusterInfo.sort_values('cluster number')\n",
    "\n",
    "        cluster_dict = clusterInfo.groupby('cluster number')['business'].apply(list).to_dict()\n",
    "\n",
    "        #prepare the input data\n",
    "        attractions_filtered.to_csv(f'preprocess/{model}/filteredAttractions.csv', index=False)\n",
    "        with open (f'preprocess/{model}/filteredAttractions.csv', 'r') as f:\n",
    "            reader = csv.reader(f)\n",
    "            attractions_txt = ''\n",
    "            for row in reader:\n",
    "                attractions_txt += ','.join(row)+ '\\n'\n",
    "\n",
    "        restaurants_filtered.to_csv(f'preprocess/{model}/filteredRestaurants.csv', index=False)\n",
    "        with open (f'preprocess/{model}/filteredRestaurants.csv', 'r') as f:\n",
    "            reader = csv.reader(f)\n",
    "            restaurants_txt = ''\n",
    "            for row in reader:\n",
    "                restaurants_txt += ','.join(row)+ '\\n'\n",
    "\n",
    "        hotels_filtered.to_csv(f'preprocess/{model}/filteredHotels.csv', index=False)\n",
    "        with open (f'preprocess/{model}/filteredHotels.csv', 'r') as f:\n",
    "            reader = csv.reader(f)\n",
    "            hotels_txt = ''\n",
    "            for row in reader:\n",
    "                hotels_txt += ','.join(row)+ '\\n'\n",
    "        \n",
    "        data = [\n",
    "            {\n",
    "                \"Description\": \"Filtered Accommodations in Philadelphia\",\n",
    "                \"Content\": hotels_txt\n",
    "            },\n",
    "            {\n",
    "                \"Description\": \"Filtered Attractions in Philadelphia\",\n",
    "                \"Content\": attractions_txt\n",
    "            },{\n",
    "                \"Description\": \"Filtered Restaurants in Philadelphia\",\n",
    "                \"Content\": restaurants_txt\n",
    "            },\n",
    "            {\n",
    "                \"Description\": \"Near by businesses in clusters about attractions and hotels\",\n",
    "                \"Content\": cluster_dict\n",
    "            }\n",
    "        ]\n",
    "\n",
    "        filteredData.append({\"index\": i+1, \"filtered_data\": data})\n",
    "\n",
    "    with open(f'Dataset/{model}/filtered_data.jsonl', 'w') as f:\n",
    "        for item in filteredData:\n",
    "            json.dump(item, f)\n",
    "            f.write('\\n')\n",
    "\n",
    "if __name__ == '__main__':\n",
    "\n",
    "    restaurants_base, hotels_base, attractions_base = loadbase()\n",
    "    \n",
    "    model = 'gpt4o'\n",
    "    #gpt4o\n",
    "    #attractions_final = parseAttraction(model,attractions_base)\n",
    "    #hotels_final = parseHotel(model,hotels_base)\n",
    "    #restaurants_final = parseRestaurant(model,restaurants_base)\n",
    "    #createAllData(model)\n",
    "    createFilteredData(model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "torchgpu",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
