hermanda committed on
Commit
051a9e7
·
verified ·
1 Parent(s): 28a77c6

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +496 -0
app.py ADDED
@@ -0,0 +1,496 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import concurrent.futures
import os
import re
import threading
import time
import traceback
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timedelta
from datetime import time as dt_time
from functools import partial

import geopy.distance
import gradio as gr
import pandas as pd
import polars as pl
import requests
from bs4 import BeautifulSoup
from cachetools import cached, TTLCache
from tqdm import tqdm
18
+
19
+
20
+
21
def get_geo_optimal_stop(method, selected_stops, show_top=20):
    """
    Rank candidate meeting stops by distance from the selected starting stops.

    For every selected stop, the rows of the module-level DISTANCE_TABLE that
    start at that stop are extracted. The per-stop tables are then joined on
    the target stop, and the largest and the summed distance per target are
    added as extra columns.

    Args:
        method (str): "minimize-worst-case" sorts by the largest distance,
            "minimize-total" sorts by the summed distance. Any other value
            leaves the joined rows unsorted.
        selected_stops (list[str]): Names of the starting stops.
        show_top (int): Number of leading rows to return.

    Returns:
        polars.DataFrame: Columns "target_stop", one "distance_in_km_<i>" per
        selected stop, "worst_case_km" and "total_km"; at most show_top rows.

    Raises:
        ValueError: If selected_stops is empty (there is nothing to rank, and
            the join below would otherwise fail with an opaque IndexError).
    """
    if len(selected_stops) == 0:
        raise ValueError("At least one starting stop must be selected.")

    distance_columns = [f"distance_in_km_{si}" for si in range(len(selected_stops))]

    per_stop_tables = []
    for column, stop in tqdm(
        zip(distance_columns, selected_stops),
        desc="Calculating optimal stops",
        total=len(selected_stops),
    ):
        # Keep only the target name and the distance, the latter under a
        # column name unique to this starting stop so the joins do not clash.
        per_stop_tables.append(
            DISTANCE_TABLE
            .filter(pl.col("from") == stop)
            .select(
                pl.col("to").alias("target_stop"),
                pl.col("distance_in_km").alias(column),
            )
        )

    print("Joining dataframes ...")
    df = per_stop_tables[0]
    for other in per_stop_tables[1:]:
        df = df.join(other, on="target_stop")

    print("Finidng optimal stops ...")
    df = df.with_columns(
        pl.max_horizontal(*distance_columns).alias("worst_case_km"),
        pl.sum_horizontal(*distance_columns).alias("total_km"),
    )

    if method == "minimize-worst-case":
        df = df.sort("worst_case_km")
    elif method == "minimize-total":
        df = df.sort("total_km")

    return df.head(show_top)
55
+
56
def validate_date(date_str):
    """
    Tell whether date_str parses as a real calendar date in DD/MM/YYYY form.

    Returns:
        bool: True when datetime.strptime accepts the string, False when it
        raises ValueError.
    """
    try:
        datetime.strptime(date_str, "%d/%m/%Y")
    except ValueError:
        return False
    else:
        return True
65
+
66
def validate_time(time_str):
    """
    Tell whether time_str parses as a valid clock time in HH:MM form.

    Returns:
        bool: True when datetime.strptime accepts the string, False when it
        raises ValueError.
    """
    try:
        datetime.strptime(time_str, "%H:%M")
    except ValueError:
        return False
    else:
        return True
75
+
76
def validate_date_time(date_str, time_str):
    """
    Check a DD/MM/YYYY date plus an HH:MM time against the allowed window.

    The combined moment must parse, lie strictly after the current local time
    and be no further than 90 days ahead (used as an approximation of three
    months).

    Returns:
        Tuple[bool, str]: (True, "") when everything is fine, otherwise
        (False, <message describing the first failed check>).
    """
    try:
        event_datetime = datetime.strptime(f"{date_str} {time_str}", "%d/%m/%Y %H:%M")
    except ValueError:
        return False, "Invalid date or time format. Please ensure date is DD/MM/YYYY and time is HH:MM."

    # Work with the distance from "now" instead of two absolute bounds.
    time_until_event = event_datetime - datetime.now()

    if time_until_event <= timedelta(0):
        return False, "The selected date and time must be in the future."
    if time_until_event > timedelta(days=90):
        return False, "The selected date and time must not be more than 3 months in the future."

    return True, ""
100
+
101
+
102
def get_next_meetup_time(target_weekday: int, target_hour: int) -> datetime:
    """
    Compute the next occurrence of a weekday/hour slot after the current time.

    If today is the target weekday and the target hour has not been reached
    yet, today's slot is returned; if it has been reached, the slot one week
    later is returned.

    :param target_weekday: The target weekday (0 = Monday, ..., 6 = Sunday).
    :param target_hour: The target hour (0-23).
    :return: A datetime on the target weekday at target_hour:00:00.
    """
    now = datetime.now()
    offset_days = target_weekday - now.weekday()

    if offset_days < 0:
        # Target weekday already passed this week; wrap to next week.
        offset_days += 7
    elif offset_days == 0 and now.time() >= dt_time(target_hour, 0):
        # Same weekday, but the slot has started or passed: take next week.
        offset_days = 7

    slot_day = now + timedelta(days=offset_days)
    return slot_day.replace(hour=target_hour, minute=0, second=0, microsecond=0)
125
+
126
+
127
def parse_time_to_minutes(time_str: str) -> int:
    """
    Parse a duration string and return the total number of minutes.

    Supported formats (case-insensitive, surrounding whitespace allowed):
    - "X hod Y min" (e.g., "1 hod 1 min")
    - "X hod" (e.g., "2 hod")
    - "Y min" (e.g., "20 min")

    Rejected inputs:
    - Empty or whitespace-only strings (neither component present)
    - Signed values such as "-1 min" (the pattern only accepts digits)
    - Minutes equal to or exceeding 60 (e.g., "61 min")
    - Any other text

    Args:
        time_str (str): The time string to parse.

    Returns:
        int: Total number of minutes.

    Raises:
        ValueError: If the input format is invalid or contains forbidden values.
    """
    pattern = r'^\s*(?:(\d+)\s*hod)?(?:\s*(\d+)\s*min)?\s*$'
    match = re.match(pattern, time_str, re.IGNORECASE)

    if not match:
        raise ValueError(f"Invalid time format: '{time_str}'")

    hours_str, minutes_str = match.groups()

    # Both groups are optional in the pattern, so a blank string matches too.
    # Treat that as invalid rather than silently reporting zero minutes.
    if hours_str is None and minutes_str is None:
        raise ValueError(f"Invalid time format: '{time_str}'")

    hours = int(hours_str) if hours_str else 0
    minutes = int(minutes_str) if minutes_str else 0

    if minutes >= 60:
        raise ValueError("Minutes must be less than 60.")

    return hours * 60 + minutes
170
+
171
def get_total_minutes(from_stop: str, to_stop: str, dt: datetime) -> int:
    """
    Look up the travel time between two stops by querying idos.cz.

    A form POST is sent to https://idos.cz/pid/spojeni/ and the text of the
    first <strong> tag inside the first element with class "connection-head"
    is parsed with parse_time_to_minutes. When the PROXY_DOMAIN environment
    variable is set, the request goes through the proxy described by
    PROXY_DOMAIN, PROXY_PORT, PROXY_USERNAME and PROXY_PASSWORD.

    Args:
        from_stop (str): The departure stop.
        to_stop (str): The arrival stop.
        dt (datetime.datetime): The date and time for the query. The form is
            submitted with IsArr=True, so this is presumably treated as the
            desired arrival time - confirm against the IDOS form.

    Returns:
        int: Travel time in minutes; 0 without any request when both stops
        are equal.

    Raises:
        requests.HTTPError: If the request fails or returns an error status.
        ValueError: If the expected HTML elements are missing or the duration
            text cannot be parsed.
    """
    # Identical stops need no lookup.
    if from_stop == to_stop:
        return 0

    # Czech weekday abbreviations appended to the date in the form field.
    czech_weekdays = {
        0: 'po',  # Monday
        1: 'út',  # Tuesday
        2: 'st',  # Wednesday
        3: 'čt',  # Thursday
        4: 'pá',  # Friday
        5: 'so',  # Saturday
        6: 'ne',  # Sunday
    }
    date_str = f"{dt.day}.{dt.month}.{dt.year} {czech_weekdays.get(dt.weekday(), '')}"
    time_str = dt.strftime('%H:%M')

    url = 'https://idos.cz/pid/spojeni/'
    request_kwargs = {
        'headers': {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'accept-language': 'en-US,en;q=0.9',
            'cache-control': 'max-age=0',
            'content-type': 'application/x-www-form-urlencoded',
            'dnt': '1',
            'origin': 'https://idos.cz',
            'priority': 'u=0, i',
            'referer': 'https://idos.cz/pid/spojeni/',
        },
        # A list of pairs because 'positionACPosition' occurs twice.
        'data': [
            ('From', from_stop),
            ('positionACPosition', ''),
            ('To', to_stop),
            ('positionACPosition', ''),
            ('AdvancedForm.Via[0]', ''),
            ('AdvancedForm.ViaHidden[0]', ''),
            ('Date', date_str),
            ('Time', time_str),
            ('IsArr', 'True'),
        ],
        'timeout': 15,
    }

    proxy_domain = os.getenv("PROXY_DOMAIN")
    proxy_port = os.getenv("PROXY_PORT")
    proxy_username = os.getenv("PROXY_USERNAME")
    proxy_password = os.getenv("PROXY_PASSWORD")

    if proxy_domain is not None:
        # Authenticated proxy URL, used for both schemes.
        proxy_url = f"http://{proxy_username}:{proxy_password}@{proxy_domain}:{proxy_port}"
        request_kwargs['proxies'] = {'http': proxy_url, 'https': proxy_url}

    try:
        response = requests.post(url, **request_kwargs)
        response.raise_for_status()
    except requests.RequestException as e:
        raise requests.HTTPError(f"Failed to retrieve data from {url}.") from e

    page = BeautifulSoup(response.content, 'html.parser')

    connection_head = page.find(class_='connection-head')
    if not connection_head:
        raise ValueError("No elements found with the class 'connection-head'.")

    duration_tag = connection_head.find('strong')
    if not duration_tag:
        raise ValueError("No <strong> tag found within the first 'connection-head' element.")

    return parse_time_to_minutes(duration_tag.get_text(strip=True))
271
+
272
# Successful lookups are kept for 24 hours. The lock is required because the
# cache is hit concurrently from ThreadPoolExecutor workers and cachetools
# caches are not thread-safe on their own.
@cached(cache=TTLCache(maxsize=10**6, ttl=24*60*60), lock=threading.Lock())
def _get_total_minutes_cached(from_stop: str, to_stop: str, dt: datetime) -> int:
    """Cached pass-through to get_total_minutes; a raised error is not stored."""
    return get_total_minutes(from_stop, to_stop, dt)


def get_total_minutes_with_retries(from_stop: str, to_stop: str, dt: datetime) -> int:
    """
    Fetch the travel time for a stop pair, retrying on failure.

    Up to three attempts are made with a two-second pause between them.
    Results of successful lookups are cached for 24 hours; failures are not
    cached, so a transient error does not hide the pair for the cache lifetime.

    Args:
        from_stop (str): The departure stop.
        to_stop (str): The arrival stop.
        dt (datetime.datetime): The date and time for the query.

    Returns:
        int: Travel time in minutes, or None when every attempt failed.
    """
    max_retries = 3
    retry_delay = 2

    for attempt in range(1, max_retries + 1):
        try:
            return _get_total_minutes_cached(from_stop, to_stop, dt)
        except Exception as e:
            # Best-effort scraping: any failure of an attempt is retried.
            if attempt < max_retries:
                print(f"Error processing pair ({from_stop}, {to_stop}): {e}. Retrying in {retry_delay} seconds... (Attempt {attempt}/{max_retries})")
                time.sleep(retry_delay)
            else:
                print(f"Failed to process pair ({from_stop}, {to_stop}) after {max_retries} attempts.")

    return None
290
+
291
def get_time_optimal_stop(method, selected_stops, target_stops, event_datetime, show_top=20):
    """
    Rank candidate target stops by travel time from the selected stops.

    Travel times for every (selected stop, target stop) pair are fetched with
    get_total_minutes_with_retries on a pool of five worker threads. Per
    target, the largest and the summed travel time are computed, the table is
    sorted according to method, and rows containing nulls are dropped.

    Args:
        method (str): "minimize-worst-case" sorts by the largest travel time,
            "minimize-total" by the summed travel time; other values skip the
            sort.
        selected_stops (list[str]): Starting stops.
        target_stops (list[str]): Candidate meeting stops.
        event_datetime (datetime.datetime): Moment passed to the lookup.
        show_top (int): Number of leading rows to return.

    Returns:
        polars.DataFrame: Columns "target_stop", "t<i> mins" per selected
        stop, "Worst Case Minutes" and "Total Minutes"; at most show_top rows.
    """
    def process_target_stop(args):
        # Build one result row: the target plus one travel time per origin.
        target_stop, selected_stops, event_datetime = args
        row = {"target_stop": target_stop}
        for si, from_stop in enumerate(selected_stops):
            column = f"total_minutes_{si}"
            try:
                row[column] = get_total_minutes_with_retries(from_stop, target_stop, event_datetime)
            except Exception as e:
                print(f"Error processing pair ({from_stop}, {target_stop}): {e}")
                traceback.print_exc()
                row[column] = None
        return row

    minute_columns = [f"total_minutes_{si}" for si in range(len(selected_stops))]
    arguments = [(target_stop, selected_stops, event_datetime) for target_stop in target_stops]

    rows = []
    with ThreadPoolExecutor(max_workers=5) as executor:
        # Map each future back to its target stop for error reporting.
        futures = {}
        for arg in arguments:
            futures[executor.submit(process_target_stop, arg)] = arg[0]

        for future in tqdm(as_completed(futures), total=len(arguments)):
            try:
                rows.append(future.result())
            except Exception as e:
                print(f"An error occurred with target_stop={futures[future]}: {e}")

    df_times = pl.DataFrame(rows).with_columns(
        pl.max_horizontal(*minute_columns).alias("worst_case_minutes"),
        pl.sum_horizontal(*minute_columns).alias("total_minutes"),
    )

    if method == "minimize-worst-case":
        df_times = df_times.sort("worst_case_minutes")
    elif method == "minimize-total":
        df_times = df_times.sort("total_minutes")

    # Switch to the display names in one pass.
    display_names = {
        "worst_case_minutes": "Worst Case Minutes",
        "total_minutes": "Total Minutes",
    }
    for si, column in enumerate(minute_columns):
        display_names[column] = f"t{si+1} mins"
    df_times = df_times.rename(display_names)

    # Targets with any failed lookup are removed only after the sort.
    return df_times.drop_nulls().head(show_top)
336
+
337
def cerate_app():
    """
    Build the Gradio Blocks UI of the optimal meeting stop finder.

    The UI consists of a slider for the number of people, a radio selector
    for the optimisation method, date and time text boxes (pre-filled with
    the next Friday 20:00 slot), one starting-stop dropdown per person, a
    search button and a results table. A search first shortlists targets
    with get_geo_optimal_stop and then ranks the shortlist with
    get_time_optimal_stop.

    Returns:
        gr.Blocks: The assembled application, not yet launched.
    """
    max_people = 12      # number of dropdowns created (slider maximum)
    default_people = 3   # slider default and dropdowns shown on load

    with gr.Blocks() as app:
        gr.Markdown("## Optimal Public Transport Stop Finder in Prague")
        gr.Markdown("""
Consider you are in Prague and you want to meet with your friends. What is the optimal stop to meet? Now you can find that with this app!

Time table data are being scraped from IDOS API, IDOS uses PID timetable data.""")

        # Slider to select the number of people
        number_of_stops = gr.Slider(
            minimum=2,
            maximum=max_people,
            step=1,
            value=default_people,
            label="Number of People"
        )

        # Radio buttons to select the optimization method
        method = gr.Radio(
            choices=["Minimize worst case for each", "Minimize total time"],
            value="Minimize worst case for each",
            label="Optimization Method"
        )

        next_dt = get_next_meetup_time(4, 20)  # Friday 20:00
        next_date = next_dt.strftime("%d/%m/%Y")
        next_time = next_dt.strftime("%H:%M")

        # Date input in DD/MM/YYYY format
        date_input = gr.Textbox(
            label="Date (DD/MM/YYYY)",
            placeholder=f"e.g., {next_date}",
            value=next_date
        )

        # Time input in HH:MM format
        time_input = gr.Textbox(
            label="Time (HH:MM)",
            placeholder=f"e.g., {next_time}",
            value=next_time
        )

        # One dropdown per possible person; all start hidden and are revealed
        # according to the slider value.
        dropdowns = [
            gr.Dropdown(
                choices=ALL_STOPS,
                label=f"Choose Starting Stop #{i+1}",
                visible=False
            )
            for i in range(max_people)
        ]

        def update_dropdowns(n):
            # Show the first n dropdowns and hide the remaining ones.
            return [gr.update(visible=i < n) for i in range(max_people)]

        # Update the visibility of dropdowns based on the number of stops selected
        number_of_stops.change(
            fn=update_dropdowns,
            inputs=number_of_stops,
            outputs=dropdowns
        )

        # Search button to trigger the optimization
        search_button = gr.Button("Search")

        def search_optimal_stop(num_stops, chosen_method, date_str, time_str, *all_stops):
            # Validate date and time before doing any work.
            is_valid, error_message = validate_date_time(date_str, time_str)
            if not is_valid:
                raise gr.Error(error_message)

            # The slider value is used as a slice bound, which must be an int.
            num_stops = int(num_stops)

            # Only the visible dropdowns count; unfilled ones are ignored.
            selected_stops = [stop for stop in all_stops[:num_stops] if stop]
            if not selected_stops:
                # Without this the ranking would fail with an internal error.
                raise gr.Error("Please choose at least one starting stop.")

            print("Number of stops:", num_stops)
            print("Method selected:", chosen_method)
            print("Selected stops:", selected_stops)
            print("Selected date:", date_str)
            print("Selected time:", time_str)

            if chosen_method == "Minimize worst case for each":
                method = "minimize-worst-case"
            else:
                method = "minimize-total"

            try:
                event_datetime = datetime.strptime(f"{date_str} {time_str}", "%d/%m/%Y %H:%M")
                print("Event DateTime:", event_datetime)
            except ValueError as e:
                # This should not happen due to prior validation, but added for safety
                raise gr.Error(f"Error parsing date and time: {e}")

            # Shortlist by distance (a few extra candidates), then rank the
            # shortlist by scraped travel time.
            df_top = get_geo_optimal_stop(method, selected_stops, show_top=SHOW_TOP+5)
            target_stops = df_top["target_stop"].to_list()
            df_times = get_time_optimal_stop(method, selected_stops, target_stops, event_datetime, show_top=SHOW_TOP)
            df_times = df_times.with_row_index("#", offset=1)

            return df_times

        # Dataframe to display the results
        results_table = gr.Dataframe(
            headers=["Target Stop", "Worst Case Minutes", "Total Minutes"],
            datatype=["str", "number", "str"],
            label="Optimal Stops"
        )

        # Configure the search button to call the callback with the new inputs
        search_button.click(
            fn=search_optimal_stop,
            inputs=[number_of_stops, method, date_input, time_input] + dropdowns,
            outputs=results_table
        )

        # On load, display the default number of dropdowns and hide the rest
        app.load(
            lambda: update_dropdowns(default_people),
            inputs=[],
            outputs=dropdowns
        )

        gr.Markdown("---")
        gr.Markdown("""
Created by [Daniel Herman](https://www.hermandaniel.com), check out the code [detrin/pub-finder](https://github.com/detrin/pub-finder).
""")
    return app
469
+
470
# Module start-up: load the stop coordinates and build the pairwise distance
# table that the ranking functions read through DISTANCE_TABLE / ALL_STOPS.
print("Loading time table file ...")
prague_stops = pl.read_csv('Prague_stops_geo.csv')


def _geodesic_km(pair):
    """Return the geodesic distance in km between the two points of a row."""
    origin = (pair['lat'], pair['lon'])
    destination = (pair['lat_right'], pair['lon_right'])
    return geopy.distance.geodesic(origin, destination).km


print("Calculating distances between stops ...")
# Cross join of the stop table with itself: one row per ordered stop pair,
# the second stop's columns carrying the "_right" suffix.
stops_geo_dist = (
    prague_stops.join(prague_stops, how='cross')
    .with_columns(
        pl.struct(['lat', 'lon', 'lat_right', 'lon_right'])
        .map_elements(_geodesic_km, return_dtype=pl.Float64)
        .alias('distance_in_km')
    )
    .rename({"name": "from", "name_right": "to"})
    .select(["from", "to", "distance_in_km"])
)
# NOTE: the author left a disabled alternative here that reads the table from
# "Prague_stops_geo_dist.csv" instead of computing it.
DISTANCE_TABLE = stops_geo_dist
from_stops = DISTANCE_TABLE["from"].unique().sort().to_list()
to_stops = DISTANCE_TABLE["to"].unique().sort().to_list()
# Stops that occur both as an origin and as a destination, alphabetically.
ALL_STOPS = sorted(set(from_stops) & set(to_stops))
SHOW_TOP = 15

if __name__ == "__main__":
    app = cerate_app()
    print("starting app ...")
    app.launch()