gabcares commited on
Commit
b38e594
·
verified ·
1 Parent(s): b552759

Update utils/data.py

Browse files

Store the requests cache in the app directory (./)

Files changed (1) hide show
  1. utils/data.py +163 -163
utils/data.py CHANGED
@@ -1,163 +1,163 @@
1
- import logging
2
- import requests
3
- import requests_cache
4
- from geopy.geocoders import Nominatim
5
- from utils.config import WEATHER_FILE, WEATHER_FILE_URL, TEST_FILE, TEST_FILE_URL, TRAIN_FILE, TRAIN_FILE_URL, ONE_WEEK_SEC
6
- from cachetools import TTLCache, cached
7
- from typing import List, Dict
8
- from pathlib import Path
9
- import time
10
-
11
- import pandas as pd
12
- # Set pandas to display all columns
13
- pd.set_option("display.max_columns", None)
14
-
15
- # High precision longitudes and Latitudes
16
- pd.set_option('display.float_format', '{:.16f}'.format)
17
-
18
- # Install persistent cache
19
- requests_cache.install_cache('yassir_requests_cache', expire_after=ONE_WEEK_SEC) # Cache expires after 1 week
20
-
21
-
22
- # Log
23
- logging.basicConfig(level=logging.ERROR,
24
- format='%(asctime)s - %(levelname)s - %(message)s')
25
-
26
- # Date columns to parse
27
- parse_dates = ['Timestamp']
28
-
29
- dtype = {
30
- 'Origin_lat': 'float64',
31
- 'Origin_lon': 'float64',
32
- 'Destination_lat': 'float64',
33
- 'Destination_lon': 'float64',
34
- }
35
-
36
- dtype_weather = {
37
- 'dewpoint_2m_temperature': 'float64',
38
- 'maximum_2m_air_temperature': 'float64',
39
- 'mean_2m_air_temperature': 'float64',
40
- 'mean_sea_level_pressure': 'float64',
41
- 'minimum_2m_air_temperature': 'float64',
42
- 'surface_pressure': 'float64',
43
- 'total_precipitation': 'float64',
44
- 'u_component_of_wind_10m': 'float64',
45
- 'v_component_of_wind_10m': 'float64',
46
- }
47
-
48
- # Load CSV files
49
-
50
-
51
- # @cached(cache=TTLCache(maxsize=100000, ttl=ONE_WEEK_SEC)) # Memory # change cache library
52
- def get_data_df(file: Path, file_url: str, parse_dates: List[str], dtype: Dict[str, str]) -> pd.DataFrame:
53
- df = None
54
- try:
55
- df = pd.read_csv(file_url, parse_dates=parse_dates, dtype=dtype)
56
- except Exception as e:
57
- df = pd.read_csv(file, parse_dates=parse_dates, dtype=dtype)
58
- logging.error(
59
- f"Oops, the file is not available on the url, trying a local version: {e}")
60
- finally:
61
- return df
62
-
63
-
64
- # @cached(cache=TTLCache(maxsize=100000, ttl=ONE_WEEK_SEC)) # Memory # unhassable dict. change cache library
65
- def clean_df(df: pd.DataFrame) -> pd.DataFrame:
66
- df.columns = [col.lower() for col in df.columns]
67
- return df
68
-
69
-
70
- # Read and get cleaned data
71
- test_df = clean_df(get_data_df(TEST_FILE, TEST_FILE_URL,
72
- parse_dates=parse_dates, dtype=dtype))
73
- train_df = clean_df(get_data_df(TRAIN_FILE, TRAIN_FILE_URL,
74
- parse_dates=parse_dates, dtype=dtype))
75
- weather_df = clean_df(get_data_df(
76
- WEATHER_FILE, WEATHER_FILE_URL, parse_dates=['date'], dtype=dtype_weather))
77
-
78
-
79
- def time_sec_hms(sec: float) -> str:
80
- return time.strftime("%H:%M:%S", time.gmtime(sec))
81
-
82
- def full_time_sec_hms(sec: float) -> str:
83
- hours = sec // 3600
84
- minutes = (sec % 3600) // 60
85
- remaining_sec = sec % 60
86
- return f"{round(hours):,}h: {round(minutes)}m: {round(remaining_sec)}s"
87
-
88
-
89
- @cached(cache=TTLCache(maxsize=100000, ttl=ONE_WEEK_SEC)) # Memory
90
- def get_country_geojson():
91
- data = (
92
- pd.concat(
93
- [
94
- train_df[['origin_lat', 'origin_lon', ]].rename(columns={'origin_lat': 'latitude', 'origin_lon': 'longitude'}),
95
- train_df[['destination_lat', 'destination_lon']].rename(columns={'destination_lat': 'latitude', 'destination_lon': 'longitude'})
96
- ],
97
- ignore_index=True
98
- )
99
- .drop_duplicates()
100
- )
101
-
102
- # Initialize the Nominatim Geocoder
103
- geolocator = Nominatim(user_agent="yassirAPP")
104
-
105
- # Function to reverse geocode
106
- def reverse_geocode(lat, lon):
107
- location = geolocator.reverse((lat, lon), exactly_one=True)
108
- address = location.raw['address']
109
- country = address.get('country', '')
110
- return country
111
-
112
- # Apply reverse geocoding to min latitude and longitude pair and also the maximum in the DataFrame
113
- # Find the minimum latitude and longitude
114
- min_lat = data['latitude'].min()
115
- min_lon = data['longitude'].min()
116
- max_lat = data['latitude'].max()
117
- max_lon = data['longitude'].max()
118
- country_min = reverse_geocode(min_lat, min_lon)
119
- country_max = reverse_geocode(max_lat, max_lon)
120
-
121
- if country_min == country_max:
122
- country = country_min
123
-
124
-
125
- # Get the location for Kenya
126
- location = geolocator.geocode(country, exactly_one=True)
127
-
128
- # If the location is found
129
- if location:
130
- # Get the bounding box for Kenya
131
- bounding_box = location.raw['boundingbox']
132
- print(f"Bounding Box: {bounding_box}")
133
-
134
- # Nominatim API URL with query parameters
135
- url = "https://nominatim.openstreetmap.org/search"
136
-
137
- # Parameters for the request
138
- params = {
139
- 'q': country, # Kenya
140
- 'format': 'json',
141
- 'polygon_geojson': 1 # Request GeoJSON polygons in the response
142
- }
143
-
144
- # Headers for the request
145
- headers = {
146
- 'User-Agent': 'yassirAPP'
147
- }
148
-
149
- # Send the request to Nominatim with headers
150
- response = requests.get(url, params=params, headers=headers)
151
-
152
- # Check if the request was successful
153
- if response.status_code == 200:
154
- country_geojson = response.json()
155
-
156
- geojson = country_geojson[0]['geojson']
157
-
158
-
159
- return country, geojson, data
160
-
161
-
162
-
163
-
 
1
+ import logging
2
+ import requests
3
+ import requests_cache
4
+ from geopy.geocoders import Nominatim
5
+ from utils.config import WEATHER_FILE, WEATHER_FILE_URL, TEST_FILE, TEST_FILE_URL, TRAIN_FILE, TRAIN_FILE_URL, ONE_WEEK_SEC
6
+ from cachetools import TTLCache, cached
7
+ from typing import List, Dict
8
+ from pathlib import Path
9
+ import time
10
+
11
+ import pandas as pd
12
# Show every column when displaying a DataFrame
pd.set_option("display.max_columns", None)

# Render floats at full precision so high-precision lat/lon values are visible
pd.set_option('display.float_format', '{:.16f}'.format)

# Persistent HTTP cache stored in the app directory; entries expire after one week
requests_cache.install_cache('./yassir_requests_cache', expire_after=ONE_WEEK_SEC)


# Error-level logging with a timestamped format
logging.basicConfig(level=logging.ERROR,
                    format='%(asctime)s - %(levelname)s - %(message)s')

# Date columns to parse in the trip CSVs
parse_dates = ['Timestamp']

# Trip coordinate columns: all plain float64
dtype = {col: 'float64' for col in (
    'Origin_lat',
    'Origin_lon',
    'Destination_lat',
    'Destination_lon',
)}

# Weather measurement columns: every one is float64 as well
dtype_weather = {col: 'float64' for col in (
    'dewpoint_2m_temperature',
    'maximum_2m_air_temperature',
    'mean_2m_air_temperature',
    'mean_sea_level_pressure',
    'minimum_2m_air_temperature',
    'surface_pressure',
    'total_precipitation',
    'u_component_of_wind_10m',
    'v_component_of_wind_10m',
)}
47
+
48
+ # Load CSV files
49
+
50
+
51
# @cached(cache=TTLCache(maxsize=100000, ttl=ONE_WEEK_SEC)) # Memory # change cache library
def get_data_df(file: Path, file_url: str, parse_dates: List[str], dtype: Dict[str, str]) -> pd.DataFrame:
    """Load a CSV into a DataFrame, preferring the remote URL.

    Tries ``file_url`` first; on any failure logs the error and falls back
    to the local ``file`` copy.

    Args:
        file: Local fallback path to the CSV.
        file_url: Remote URL of the CSV (tried first).
        parse_dates: Column names to parse as datetimes.
        dtype: Column-name -> dtype mapping passed to ``pd.read_csv``.

    Returns:
        The loaded DataFrame.

    Raises:
        Whatever ``pd.read_csv`` raises when the local fallback also fails.
        (The original ``return df`` inside ``finally`` swallowed that
        exception and silently returned None, which then crashed callers
        such as clean_df with a confusing AttributeError.)
    """
    try:
        return pd.read_csv(file_url, parse_dates=parse_dates, dtype=dtype)
    except Exception as e:
        # Log before attempting the fallback so the failure is recorded
        # even if the local read raises too.
        logging.error(
            f"Oops, the file is not available on the url, trying a local version: {e}")
        return pd.read_csv(file, parse_dates=parse_dates, dtype=dtype)
62
+
63
+
64
# @cached(cache=TTLCache(maxsize=100000, ttl=ONE_WEEK_SEC)) # Memory # unhassable dict. change cache library
def clean_df(df: pd.DataFrame) -> pd.DataFrame:
    """Lowercase every column name (in place) and return the DataFrame."""
    df.columns = df.columns.str.lower()
    return df
68
+
69
+
70
# Load each dataset (remote first, local fallback) and normalize its column names.
test_df = clean_df(
    get_data_df(TEST_FILE, TEST_FILE_URL, parse_dates=parse_dates, dtype=dtype)
)
train_df = clean_df(
    get_data_df(TRAIN_FILE, TRAIN_FILE_URL, parse_dates=parse_dates, dtype=dtype)
)
weather_df = clean_df(
    get_data_df(WEATHER_FILE, WEATHER_FILE_URL, parse_dates=['date'], dtype=dtype_weather)
)
77
+
78
+
79
def time_sec_hms(sec: float) -> str:
    """Format a duration in seconds as zero-padded "HH:MM:SS" (wraps past 24h)."""
    parts = time.gmtime(sec)
    return time.strftime("%H:%M:%S", parts)
81
+
82
def full_time_sec_hms(sec: float) -> str:
    """Format a duration in seconds as "Hh: Mm: Ss" without wrapping at 24h.

    Unlike time_sec_hms, hours are unbounded and comma-grouped.

    The total is rounded once up front and then split with divmod so the
    parts stay mutually consistent: the original rounded each part
    separately, which produced e.g. "0h: 0m: 60s" for sec=59.6.

    Args:
        sec: Duration in seconds (may be fractional).

    Returns:
        The formatted duration, e.g. "1,000h: 5m: 30s".
    """
    total = round(sec)
    hours, rem = divmod(total, 3600)
    minutes, remaining_sec = divmod(rem, 60)
    return f"{hours:,}h: {minutes}m: {remaining_sec}s"
87
+
88
+
89
@cached(cache=TTLCache(maxsize=100000, ttl=ONE_WEEK_SEC))  # Memory
def get_country_geojson():
    """Resolve the country covering the training data and fetch its GeoJSON outline.

    Builds the set of unique (latitude, longitude) points from the train
    origins and destinations, reverse-geocodes the min/max corner points to
    confirm both fall in the same country, then queries Nominatim for that
    country's polygon.

    Returns:
        (country, geojson, data): the country name, its GeoJSON geometry
        (None when the Nominatim request fails or returns no results), and
        the deduplicated coordinate DataFrame.

    Raises:
        ValueError: if the two corner points resolve to different
            countries. (The original left `country` unbound in that case
            and crashed later with a NameError.)
    """
    # Unique coordinate pairs across origins and destinations.
    data = (
        pd.concat(
            [
                train_df[['origin_lat', 'origin_lon']].rename(
                    columns={'origin_lat': 'latitude', 'origin_lon': 'longitude'}),
                train_df[['destination_lat', 'destination_lon']].rename(
                    columns={'destination_lat': 'latitude', 'destination_lon': 'longitude'})
            ],
            ignore_index=True
        )
        .drop_duplicates()
    )

    # Initialize the Nominatim Geocoder
    geolocator = Nominatim(user_agent="yassirAPP")

    def reverse_geocode(lat, lon):
        # Map one coordinate pair to its country name ('' when absent).
        location = geolocator.reverse((lat, lon), exactly_one=True)
        address = location.raw['address']
        return address.get('country', '')

    # Reverse-geocode the min and max corners of the data's bounding box;
    # both are expected to land in the same country.
    min_lat = data['latitude'].min()
    min_lon = data['longitude'].min()
    max_lat = data['latitude'].max()
    max_lon = data['longitude'].max()
    country_min = reverse_geocode(min_lat, min_lon)
    country_max = reverse_geocode(max_lat, max_lon)

    if country_min != country_max:
        # Fail loudly with a clear message instead of the NameError the
        # original hit when `country` was never assigned.
        raise ValueError(
            f"Data spans more than one country: {country_min!r} vs {country_max!r}")
    country = country_min

    # Log the country's bounding box when the forward geocode succeeds.
    location = geolocator.geocode(country, exactly_one=True)
    if location:
        bounding_box = location.raw['boundingbox']
        print(f"Bounding Box: {bounding_box}")

    # Ask Nominatim's search endpoint for the country polygon as GeoJSON.
    url = "https://nominatim.openstreetmap.org/search"
    params = {
        'q': country,
        'format': 'json',
        'polygon_geojson': 1  # Request GeoJSON polygons in the response
    }
    # Nominatim requires an identifying User-Agent.
    headers = {
        'User-Agent': 'yassirAPP'
    }
    response = requests.get(url, params=params, headers=headers)

    # Default to None so a failed request no longer raises NameError at return.
    geojson = None
    if response.status_code == 200:
        results = response.json()
        if results:  # guard against an empty result list
            geojson = results[0]['geojson']

    return country, geojson, data
160
+
161
+
162
+
163
+