Spaces:
Sleeping
Sleeping
Update utils/data.py
Browse filesRequest cache on app directory ./
- utils/data.py +163 -163
utils/data.py
CHANGED
@@ -1,163 +1,163 @@
|
|
1 |
-
import logging
|
2 |
-
import requests
|
3 |
-
import requests_cache
|
4 |
-
from geopy.geocoders import Nominatim
|
5 |
-
from utils.config import WEATHER_FILE, WEATHER_FILE_URL, TEST_FILE, TEST_FILE_URL, TRAIN_FILE, TRAIN_FILE_URL, ONE_WEEK_SEC
|
6 |
-
from cachetools import TTLCache, cached
|
7 |
-
from typing import List, Dict
|
8 |
-
from pathlib import Path
|
9 |
-
import time
|
10 |
-
|
11 |
-
import pandas as pd
|
12 |
-
# Set pandas to display all columns
|
13 |
-
pd.set_option("display.max_columns", None)
|
14 |
-
|
15 |
-
# High precision longitudes and Latitudes
|
16 |
-
pd.set_option('display.float_format', '{:.16f}'.format)
|
17 |
-
|
18 |
-
# Install persistent cache
|
19 |
-
requests_cache.install_cache('yassir_requests_cache', expire_after=ONE_WEEK_SEC) # Cache expires after 1 week
|
20 |
-
|
21 |
-
|
22 |
-
# Log
|
23 |
-
logging.basicConfig(level=logging.ERROR,
|
24 |
-
format='%(asctime)s - %(levelname)s - %(message)s')
|
25 |
-
|
26 |
-
# Date columns to parse
|
27 |
-
parse_dates = ['Timestamp']
|
28 |
-
|
29 |
-
dtype = {
|
30 |
-
'Origin_lat': 'float64',
|
31 |
-
'Origin_lon': 'float64',
|
32 |
-
'Destination_lat': 'float64',
|
33 |
-
'Destination_lon': 'float64',
|
34 |
-
}
|
35 |
-
|
36 |
-
dtype_weather = {
|
37 |
-
'dewpoint_2m_temperature': 'float64',
|
38 |
-
'maximum_2m_air_temperature': 'float64',
|
39 |
-
'mean_2m_air_temperature': 'float64',
|
40 |
-
'mean_sea_level_pressure': 'float64',
|
41 |
-
'minimum_2m_air_temperature': 'float64',
|
42 |
-
'surface_pressure': 'float64',
|
43 |
-
'total_precipitation': 'float64',
|
44 |
-
'u_component_of_wind_10m': 'float64',
|
45 |
-
'v_component_of_wind_10m': 'float64',
|
46 |
-
}
|
47 |
-
|
48 |
-
# Load CSV files
|
49 |
-
|
50 |
-
|
51 |
-
# @cached(cache=TTLCache(maxsize=100000, ttl=ONE_WEEK_SEC)) # Memory # change cache library
|
52 |
-
def get_data_df(file: Path, file_url: str, parse_dates: List[str], dtype: Dict[str, str]) -> pd.DataFrame:
|
53 |
-
df = None
|
54 |
-
try:
|
55 |
-
df = pd.read_csv(file_url, parse_dates=parse_dates, dtype=dtype)
|
56 |
-
except Exception as e:
|
57 |
-
df = pd.read_csv(file, parse_dates=parse_dates, dtype=dtype)
|
58 |
-
logging.error(
|
59 |
-
f"Oops, the file is not available on the url, trying a local version: {e}")
|
60 |
-
finally:
|
61 |
-
return df
|
62 |
-
|
63 |
-
|
64 |
-
# @cached(cache=TTLCache(maxsize=100000, ttl=ONE_WEEK_SEC)) # Memory # unhassable dict. change cache library
|
65 |
-
def clean_df(df: pd.DataFrame) -> pd.DataFrame:
|
66 |
-
df.columns = [col.lower() for col in df.columns]
|
67 |
-
return df
|
68 |
-
|
69 |
-
|
70 |
-
# Read and get cleaned data
|
71 |
-
test_df = clean_df(get_data_df(TEST_FILE, TEST_FILE_URL,
|
72 |
-
parse_dates=parse_dates, dtype=dtype))
|
73 |
-
train_df = clean_df(get_data_df(TRAIN_FILE, TRAIN_FILE_URL,
|
74 |
-
parse_dates=parse_dates, dtype=dtype))
|
75 |
-
weather_df = clean_df(get_data_df(
|
76 |
-
WEATHER_FILE, WEATHER_FILE_URL, parse_dates=['date'], dtype=dtype_weather))
|
77 |
-
|
78 |
-
|
79 |
-
def time_sec_hms(sec: float) -> str:
|
80 |
-
return time.strftime("%H:%M:%S", time.gmtime(sec))
|
81 |
-
|
82 |
-
def full_time_sec_hms(sec: float) -> str:
|
83 |
-
hours = sec // 3600
|
84 |
-
minutes = (sec % 3600) // 60
|
85 |
-
remaining_sec = sec % 60
|
86 |
-
return f"{round(hours):,}h: {round(minutes)}m: {round(remaining_sec)}s"
|
87 |
-
|
88 |
-
|
89 |
-
@cached(cache=TTLCache(maxsize=100000, ttl=ONE_WEEK_SEC)) # Memory
|
90 |
-
def get_country_geojson():
|
91 |
-
data = (
|
92 |
-
pd.concat(
|
93 |
-
[
|
94 |
-
train_df[['origin_lat', 'origin_lon', ]].rename(columns={'origin_lat': 'latitude', 'origin_lon': 'longitude'}),
|
95 |
-
train_df[['destination_lat', 'destination_lon']].rename(columns={'destination_lat': 'latitude', 'destination_lon': 'longitude'})
|
96 |
-
],
|
97 |
-
ignore_index=True
|
98 |
-
)
|
99 |
-
.drop_duplicates()
|
100 |
-
)
|
101 |
-
|
102 |
-
# Initialize the Nominatim Geocoder
|
103 |
-
geolocator = Nominatim(user_agent="yassirAPP")
|
104 |
-
|
105 |
-
# Function to reverse geocode
|
106 |
-
def reverse_geocode(lat, lon):
|
107 |
-
location = geolocator.reverse((lat, lon), exactly_one=True)
|
108 |
-
address = location.raw['address']
|
109 |
-
country = address.get('country', '')
|
110 |
-
return country
|
111 |
-
|
112 |
-
# Apply reverse geocoding to min latitude and longitude pair and also the maximum in the DataFrame
|
113 |
-
# Find the minimum latitude and longitude
|
114 |
-
min_lat = data['latitude'].min()
|
115 |
-
min_lon = data['longitude'].min()
|
116 |
-
max_lat = data['latitude'].max()
|
117 |
-
max_lon = data['longitude'].max()
|
118 |
-
country_min = reverse_geocode(min_lat, min_lon)
|
119 |
-
country_max = reverse_geocode(max_lat, max_lon)
|
120 |
-
|
121 |
-
if country_min == country_max:
|
122 |
-
country = country_min
|
123 |
-
|
124 |
-
|
125 |
-
# Get the location for Kenya
|
126 |
-
location = geolocator.geocode(country, exactly_one=True)
|
127 |
-
|
128 |
-
# If the location is found
|
129 |
-
if location:
|
130 |
-
# Get the bounding box for Kenya
|
131 |
-
bounding_box = location.raw['boundingbox']
|
132 |
-
print(f"Bounding Box: {bounding_box}")
|
133 |
-
|
134 |
-
# Nominatim API URL with query parameters
|
135 |
-
url = "https://nominatim.openstreetmap.org/search"
|
136 |
-
|
137 |
-
# Parameters for the request
|
138 |
-
params = {
|
139 |
-
'q': country, # Kenya
|
140 |
-
'format': 'json',
|
141 |
-
'polygon_geojson': 1 # Request GeoJSON polygons in the response
|
142 |
-
}
|
143 |
-
|
144 |
-
# Headers for the request
|
145 |
-
headers = {
|
146 |
-
'User-Agent': 'yassirAPP'
|
147 |
-
}
|
148 |
-
|
149 |
-
# Send the request to Nominatim with headers
|
150 |
-
response = requests.get(url, params=params, headers=headers)
|
151 |
-
|
152 |
-
# Check if the request was successful
|
153 |
-
if response.status_code == 200:
|
154 |
-
country_geojson = response.json()
|
155 |
-
|
156 |
-
geojson = country_geojson[0]['geojson']
|
157 |
-
|
158 |
-
|
159 |
-
return country, geojson, data
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
|
|
1 |
+
import logging
|
2 |
+
import requests
|
3 |
+
import requests_cache
|
4 |
+
from geopy.geocoders import Nominatim
|
5 |
+
from utils.config import WEATHER_FILE, WEATHER_FILE_URL, TEST_FILE, TEST_FILE_URL, TRAIN_FILE, TRAIN_FILE_URL, ONE_WEEK_SEC
|
6 |
+
from cachetools import TTLCache, cached
|
7 |
+
from typing import List, Dict
|
8 |
+
from pathlib import Path
|
9 |
+
import time
|
import pandas as pd

# Set pandas to display all columns
pd.set_option("display.max_columns", None)

# High precision longitudes and Latitudes
pd.set_option('display.float_format', '{:.16f}'.format)

# Install persistent cache
# All `requests` calls made by this module go through a SQLite-backed cache
# stored in the app directory; entries expire after ONE_WEEK_SEC.
requests_cache.install_cache('./yassir_requests_cache', expire_after=ONE_WEEK_SEC)  # Cache expires after 1 week


# Log
# Only ERROR and above are emitted (see get_data_df's fallback logging).
logging.basicConfig(level=logging.ERROR,
                    format='%(asctime)s - %(levelname)s - %(message)s')

# Date columns to parse
parse_dates = ['Timestamp']

# Column dtypes for the trip (train/test) CSVs: coordinates kept as float64
# so the high-precision display option above is meaningful.
dtype = {
    'Origin_lat': 'float64',
    'Origin_lon': 'float64',
    'Destination_lat': 'float64',
    'Destination_lon': 'float64',
}

# Column dtypes for the weather CSV.
dtype_weather = {
    'dewpoint_2m_temperature': 'float64',
    'maximum_2m_air_temperature': 'float64',
    'mean_2m_air_temperature': 'float64',
    'mean_sea_level_pressure': 'float64',
    'minimum_2m_air_temperature': 'float64',
    'surface_pressure': 'float64',
    'total_precipitation': 'float64',
    'u_component_of_wind_10m': 'float64',
    'v_component_of_wind_10m': 'float64',
}

# Load CSV files
# @cached(cache=TTLCache(maxsize=100000, ttl=ONE_WEEK_SEC)) # Memory # change cache library
def get_data_df(file: Path, file_url: str, parse_dates: List[str], dtype: Dict[str, str]) -> pd.DataFrame:
    """Load a CSV into a DataFrame, preferring the remote URL with a local fallback.

    Args:
        file: Local path holding a fallback copy of the data.
        file_url: Remote location tried first (requests-cache makes repeat reads cheap).
        parse_dates: Column names pandas should parse as datetimes.
        dtype: Column-name -> dtype mapping applied on read.

    Returns:
        The loaded DataFrame.

    Raises:
        Whatever pandas raises when BOTH the URL and the local file are
        unreadable. (The previous version used `finally: return df`, which
        swallowed that second error and silently handed callers None.)
    """
    try:
        return pd.read_csv(file_url, parse_dates=parse_dates, dtype=dtype)
    except Exception as e:
        # Remote read failed (network down, URL moved, ...): log, then fall
        # back to the local copy. If that fails too, let it propagate —
        # a loud error beats a mysterious None downstream.
        logging.error(
            f"Oops, the file is not available on the url, trying a local version: {e}")
        return pd.read_csv(file, parse_dates=parse_dates, dtype=dtype)
62 |
+
|
63 |
+
|
# NOTE: TTLCache memoisation cannot be applied here as-is — a DataFrame
# argument is unhashable (a different cache strategy would be needed).
def clean_df(df: pd.DataFrame) -> pd.DataFrame:
    """Normalise all column names to lower-case, in place, and return the frame."""
    lowered = [name.lower() for name in df.columns]
    df.columns = lowered
    return df
68 |
+
|
69 |
+
|
# Read and get cleaned data
# NOTE: these run at import time and may hit the network; get_data_df falls
# back to the bundled local files when the URL is unavailable, and the
# requests cache installed above softens the cost of repeat imports.
test_df = clean_df(get_data_df(TEST_FILE, TEST_FILE_URL,
                               parse_dates=parse_dates, dtype=dtype))
train_df = clean_df(get_data_df(TRAIN_FILE, TRAIN_FILE_URL,
                                parse_dates=parse_dates, dtype=dtype))
# The weather file is keyed on a 'date' column instead of 'Timestamp'.
weather_df = clean_df(get_data_df(
    WEATHER_FILE, WEATHER_FILE_URL, parse_dates=['date'], dtype=dtype_weather))
77 |
+
|
78 |
+
|
def time_sec_hms(sec: float) -> str:
    """Format a duration in seconds as HH:MM:SS (wraps at 24h, per gmtime)."""
    broken_down = time.gmtime(sec)
    return time.strftime("%H:%M:%S", broken_down)
81 |
+
|
def full_time_sec_hms(sec: float) -> str:
    """Format a duration in seconds as '{H:,}h: {M}m: {S}s' without a 24h wrap.

    Fixes the original per-component rounding, which could display
    impossible values (e.g. 3599.7 -> '0h: 59m: 60s'): the total is now
    rounded once, then split, so 3599.7 -> '1h: 0m: 0s'.
    """
    total_sec = round(sec)
    hours, rest = divmod(total_sec, 3600)
    minutes, seconds = divmod(rest, 60)
    # Thousands separator on hours, as in the original format string.
    return f"{hours:,}h: {minutes}m: {seconds}s"
87 |
+
|
88 |
+
|
@cached(cache=TTLCache(maxsize=100000, ttl=ONE_WEEK_SEC))  # Memory: recomputed at most weekly
def get_country_geojson():
    """Infer the country covering the training trips and fetch its GeoJSON outline.

    Returns:
        (country, geojson, data): the resolved country name, its Nominatim
        GeoJSON polygon, and the de-duplicated DataFrame of every trip
        end-point as 'latitude'/'longitude' columns.

    Raises:
        ValueError: if the min and max coordinate pairs resolve to different
            countries. (The original left `country` unbound here, producing a
            confusing NameError further down.)
        RuntimeError: if the Nominatim search fails or returns no result.
            (The original left `geojson` unbound, raising NameError at return.)
    """
    # Stack origin and destination coordinates into one frame of unique points.
    data = (
        pd.concat(
            [
                train_df[['origin_lat', 'origin_lon']].rename(
                    columns={'origin_lat': 'latitude', 'origin_lon': 'longitude'}),
                train_df[['destination_lat', 'destination_lon']].rename(
                    columns={'destination_lat': 'latitude', 'destination_lon': 'longitude'}),
            ],
            ignore_index=True,
        )
        .drop_duplicates()
    )

    # Initialize the Nominatim Geocoder
    geolocator = Nominatim(user_agent="yassirAPP")

    def reverse_geocode(lat, lon):
        # Resolve a coordinate pair to a country name ('' when absent).
        location = geolocator.reverse((lat, lon), exactly_one=True)
        return location.raw['address'].get('country', '')

    # Sanity check: both extreme corners of the data should land in one country.
    min_lat = data['latitude'].min()
    min_lon = data['longitude'].min()
    max_lat = data['latitude'].max()
    max_lon = data['longitude'].max()
    country_min = reverse_geocode(min_lat, min_lon)
    country_max = reverse_geocode(max_lat, max_lon)

    if country_min != country_max:
        raise ValueError(
            f"Data spans more than one country: {country_min!r} vs {country_max!r}")
    country = country_min

    # Diagnostic only: print the country's bounding box when resolvable.
    location = geolocator.geocode(country, exactly_one=True)
    if location:
        bounding_box = location.raw['boundingbox']
        print(f"Bounding Box: {bounding_box}")

    # Ask Nominatim's search API for the country's polygon as GeoJSON.
    url = "https://nominatim.openstreetmap.org/search"
    params = {
        'q': country,
        'format': 'json',
        'polygon_geojson': 1  # Request GeoJSON polygons in the response
    }
    # Nominatim's usage policy requires an identifying User-Agent.
    headers = {
        'User-Agent': 'yassirAPP'
    }
    response = requests.get(url, params=params, headers=headers)

    if response.status_code != 200:
        raise RuntimeError(
            f"Nominatim search failed for {country!r} (HTTP {response.status_code})")
    results = response.json()
    if not results:
        raise RuntimeError(f"Nominatim returned no results for {country!r}")
    geojson = results[0]['geojson']

    return country, geojson, data
160 |
+
|
161 |
+
|
162 |
+
|
163 |
+
|