File size: 30,235 Bytes
cda78c3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import seaborn as sns\n",
    "import json\n",
    "import pandas as pd\n",
    "from numpy import mean, percentile, array\n",
    "from numpy.random import permutation as perm\n",
    "from pathlib import Path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "base_dir = Path()\n",
    "data_dir = base_dir / \"data\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_jsonl(file_path):\n",
    "    data = []\n",
    "    with open(file_path, \"r\") as file:\n",
    "        for i,line in enumerate(file):\n",
    "            data.append(json.loads(line))\n",
    "    return data\n",
    "\n",
    "reviews = read_jsonl(data_dir / \"cmu\" / \"raw\" / \"review.txt\")\n",
    "offering = read_jsonl(data_dir / \"cmu\" / \"raw\" / \"offering.txt\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(data_dir / \"cmu\" / \"processed\" / \"cities.json\", \"w\") as f:\n",
    "    json.dump(list(cities), f)\n",
    "    \n",
    "with open(data_dir / \"cmu\" / \"processed\" / \"score_threshold_per_city.json\", \"w\") as f:\n",
    "    json.dump(score_threshold_per_city, f)\n",
    "\n",
    "with open(data_dir / \"cmu\" / \"processed\" / \"city_to_hotel_id_map.json\", \"w\") as f:\n",
    "    json.dump(city_to_hotel_id_map, f)\n",
    "\n",
    "with open(data_dir / \"cmu\" / \"processed\" / \"hotel_id_to_name_map.json\", \"w\") as f:\n",
    "    json.dump(hotel_id_to_name_map, f)\n",
    "\n",
    "with open(data_dir / \"cmu\" / \"processed\" / \"hotel_id_to_review_map.json\", \"w\") as f:\n",
    "    json.dump(hotel_id_to_review_map, f)\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "cities = set([hotel['address']['locality'] for hotel in offering])\n",
    "city_to_hotel_id_map = {city: [hotel['id'] for hotel in offering \n",
    "                               if hotel['address']['locality'] == city] for city in cities}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "hotel_id_to_review_map = {}\n",
    "for review in reviews:\n",
    "    review_info = {'text': review['text'], 'score': review['ratings']['overall'], 'num_helpful': review['num_helpful_votes']}\n",
    "    hotel_id_to_review_map.setdefault(review['offering_id'], []).append(review_info)\n",
    "\n",
    "for hotel_id, review_info in hotel_id_to_review_map.items():\n",
    "    average_score = mean([rev['score'] for rev in review_info])\n",
    "    hotel_id_to_review_map[hotel_id] = {'average_score': average_score, 'reviews': review_info}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "hotel_id_to_name_map = {hotel[\"id\"]: hotel[\"name\"] for hotel in offering}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "",
      "text/plain": [
       "<Figure size 640x480 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "sns.histplot([len(hotel['reviews']) for hotel in hotel_id_to_review_map.values()]);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def calc_score_threshold_per_city(p):\n",
    "    res = {}\n",
    "    for city, idxs in city_to_hotel_id_map.items():\n",
    "        res[city] = percentile([hotel_id_to_review_map.get(idx, {'average_score': 0})['average_score'] for idx in idxs], p)  \n",
    "    return res"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "score_threshold_per_city=calc_score_threshold_per_city(80)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "reviews_per_hotel_per_city = {}\n",
    "for city in city_to_hotel_id_map:\n",
    "    for hotel_id in city_to_hotel_id_map[city]:\n",
    "        n_reviews = len(hotel_id_to_review_map.get(hotel_id, {'reviews': []})['reviews'])\n",
    "        reviews_per_hotel_per_city.setdefault(city, []).append(n_reviews)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Boston: 73 hotels with more than 100 reviews\n",
      "Seattle: 86 hotels with more than 100 reviews\n",
      "San Jose: 24 hotels with more than 100 reviews\n",
      "Charlotte: 49 hotels with more than 100 reviews\n",
      "Chicago: 106 hotels with more than 100 reviews\n",
      "Washington DC: 106 hotels with more than 100 reviews\n",
      "Fort Worth: 15 hotels with more than 100 reviews\n",
      "Jacksonville: 39 hotels with more than 100 reviews\n",
      "Denver: 70 hotels with more than 100 reviews\n",
      "Los Angeles: 142 hotels with more than 100 reviews\n",
      "New York City: 327 hotels with more than 100 reviews\n",
      "Dallas: 61 hotels with more than 100 reviews\n",
      "Memphis: 42 hotels with more than 100 reviews\n",
      "Phoenix: 62 hotels with more than 100 reviews\n",
      "San Diego: 148 hotels with more than 100 reviews\n",
      "Austin: 63 hotels with more than 100 reviews\n",
      "Baltimore: 41 hotels with more than 100 reviews\n",
      "San Antonio: 89 hotels with more than 100 reviews\n",
      "Detroit: 17 hotels with more than 100 reviews\n",
      "Indianapolis: 44 hotels with more than 100 reviews\n",
      "San Francisco: 177 hotels with more than 100 reviews\n",
      "Houston: 70 hotels with more than 100 reviews\n",
      "Columbus: 38 hotels with more than 100 reviews\n",
      "Philadelphia: 61 hotels with more than 100 reviews\n",
      "El Paso: 11 hotels with more than 100 reviews\n"
     ]
    }
   ],
   "source": [
    "for city, num_reviews in reviews_per_hotel_per_city.items():\n",
    "    score_threshold = score_threshold_per_city[city]\n",
    "    for idx in city_to_hotel_id_map[city]:\n",
    "        hotel_id_to_review_map.get(idx, {'average_score': 0})['average_score']\n",
    "    print(f\"{city}: {(array(num_reviews) > 70).sum()} hotels with more than 100 reviews\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "offering"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Memory in Gradio space\n",
    "\n",
    "Steps:\n",
    "* Embed space in site\n",
    "* Save preprocessed files\n",
    "* Load from interface\n",
    "* Return recommendation for Boston irrespective of text input\n",
    "    * Return context for LLM as output\n",
    "    * Set up OpenAI, return raw output with basic prompt\n",
    "* Scroll menu for city\n",
    "* Check box for kid-friendly\n",
    "* At the end - understand free text input\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "hotel_id_to_review_map = {\n",
    "    hotel_id: {\n",
    "        \"reviews\": [\n",
    "            review for review in hotel_data[\"reviews\"] if review[\"num_helpful\"] > 10\n",
    "        ]\n",
    "    }\n",
    "    for hotel_id, hotel_data in hotel_id_to_review_map.items()\n",
    "    if len(hotel_id_to_review_map[hotel_id]['reviews']) > 100\n",
    "    and hotel_id_to_review_map[hotel_id]['average_score'] >= score_threshold_per_city[hotel_id_to_city_map[hotel_id]]\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "hotel_id_to_city_map = {vi:k for k,v in city_to_hotel_id_map.items() for vi in v}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'hotel_name': 'Four Seasons Hotel Boston', 'positive': [{'text': \"FSHB is one of the best hotels in the country. Its employees are pleasant, professional, and well trained. They always have the guests' best interests in mind. The hotel is beautiful, the rooms spacious, clean, andextremely comfortable. The restaurant, Aujourd'hui, is among the best in the city. The food is exquisite. Room service and brunch are also wonderful. This is the best Boston has to offer!\", 'score': 5.0, 'num_helpful': 11}, {'text': 'I had the opportunity to stay at the 4 seasons for the Boston Marathon. It was my first time to Boston (where I had wanted to go since my early twenties, (48 now)!. I have to say the 4 seasons made this the best experience of my life. From the pickup at the airport to the drop off I was simply amazed! Everyone was so nice, Daniel the concierge helped me get a tour of the city prior to other relatives arriving and referred me to 5th Avenue Limo Service. Use them please, they are great! The driver was so knowledgeable and darn I cant remember his name! The service, the room, the ammenities were all above par. I have to admit the only thing i did not like was the darn curtain in the shower. seriously, all hotels that r 4 or 5 star even 3, need to have shower enclosures! the bedding was excellent, the executive suite overlooking boston gardens and boston common - oh my - the view and listening to the people out on the street was great. you can open a window from these rooms and feel the ambiance of the city. I will stay at the 4 seasons probably every time i return to boston!!! kudos to the hotel management!', 'score': 5.0, 'num_helpful': 11}, {'text': 'Just returned from a visit to Four Seasons Boston and the service was excellent. We arrived early in the AM from a red-eye flight and the terrific woman at the front desk (Eliva?)not only let us check in early but upgraded us to a newly renovated room. 
The doormen,bellman,concierge and staff at the health club all were first rate. Health club facilities were great, enjoyed the newly renovated steam room and sauna. Of course, as is typical of the Four Seasons the bed was sublime. We have stayed at this property in the past and never been disappointed. Last year due to some of the negative reviews about Four Seasons Boston we stayed at Beacon XV- what a mistake! The service at Beacon XV is truly awful, does not even compare.', 'score': 5.0, 'num_helpful': 11}], 'negative': [{'text': \"After staying at the hotel for a wedding, my room was broken into and a significant amount of Jewelry stolen. The Boston Police Department has since found the thief and has a video of the gentlemen leaving my hotel room. Through this terrible ordeal the 4 Seasons has been unbelievable. They have claimed that it is not their responsibility and refuse to insure my belongings. On top of everything, I was shocked when they didn't even bother to comp my room after I had to deal with a police report all day. Do not stay here unless you want to deal with a rude and terrible staff at a 2-3 star hotel. Try the Taj\", 'score': 1.0, 'num_helpful': 14}]}\n"
     ]
    }
   ],
   "source": [
    "city = \"Boston\"\n",
    "score_threshold = score_threshold_per_city[city]\n",
    "for hotel_id in perm(city_to_hotel_id_map[city]):\n",
    "    try:\n",
    "        hotel_reviews = hotel_id_to_review_map[hotel_id]['reviews']\n",
    "    except KeyError:\n",
    "        continue\n",
    "    res = {\"hotel_name\": hotel_id_to_name_map[hotel_id], 'positive': [], 'negative': []}    \n",
    "    hotel_reviews = hotel_id_to_review_map[hotel_id]['reviews']\n",
    "    for review in perm(hotel_reviews):\n",
    "        if review['num_helpful'] > 10:\n",
    "            if (review['score'] == 5) & (len(res['positive']) < 3):\n",
    "                res['positive'].append(review)\n",
    "            if (review['score'] <= 2) & (len(res['negative']) < 1):\n",
    "                res['negative'].append(review)\n",
    "    if (len(res['positive']) >= 3) & (len(res['negative']) >= 1):\n",
    "        break\n",
    "\n",
    "print(res)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "adults",
   "language": "python",
   "name": "adults"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}