import logging
import requests
import requests_cache
from geopy.geocoders import Nominatim
from utils.config import WEATHER_FILE, WEATHER_FILE_URL, TEST_FILE, TEST_FILE_URL, TRAIN_FILE, TRAIN_FILE_URL, ONE_WEEK_SEC
from cachetools import TTLCache, cached
from typing import List, Dict
from pathlib import Path
import time
import pandas as pd
# Set pandas to display all columns
pd.set_option("display.max_columns", None)
# High-precision longitudes and latitudes
pd.set_option('display.float_format', '{:.16f}'.format)
# Install persistent cache
# requests_cache.install_cache('/client/requests_cache/yassir_requests_cache', expire_after=ONE_WEEK_SEC) # Cache expires after 1 week
requests_cache.install_cache(backend='memory', expire_after=ONE_WEEK_SEC) # Cache expires after 1 week
# Logging configuration
logging.basicConfig(level=logging.ERROR,
                    format='%(asctime)s - %(levelname)s - %(message)s')
# Date columns to parse
parse_dates = ['Timestamp']
dtype = {
    'Origin_lat': 'float64',
    'Origin_lon': 'float64',
    'Destination_lat': 'float64',
    'Destination_lon': 'float64',
}
dtype_weather = {
    'dewpoint_2m_temperature': 'float64',
    'maximum_2m_air_temperature': 'float64',
    'mean_2m_air_temperature': 'float64',
    'mean_sea_level_pressure': 'float64',
    'minimum_2m_air_temperature': 'float64',
    'surface_pressure': 'float64',
    'total_precipitation': 'float64',
    'u_component_of_wind_10m': 'float64',
    'v_component_of_wind_10m': 'float64',
}
# Load CSV files
# @cached(cache=TTLCache(maxsize=100000, ttl=ONE_WEEK_SEC)) # Memory # change cache library
def get_data_df(file: Path, file_url: str, parse_dates: List[str], dtype: Dict[str, str]) -> pd.DataFrame:
    # Prefer the remote copy; fall back to the local file if the URL is unavailable
    try:
        df = pd.read_csv(file_url, parse_dates=parse_dates, dtype=dtype)
    except Exception as e:
        logging.error(
            f"The file is not available at the URL, falling back to the local copy: {e}")
        df = pd.read_csv(file, parse_dates=parse_dates, dtype=dtype)
    return df
# @cached(cache=TTLCache(maxsize=100000, ttl=ONE_WEEK_SEC)) # Memory # unhashable dict, change cache library
def clean_df(df: pd.DataFrame) -> pd.DataFrame:
    df.columns = [col.lower() for col in df.columns]
    return df
# Read and get cleaned data
test_df = clean_df(get_data_df(TEST_FILE, TEST_FILE_URL,
                               parse_dates=parse_dates, dtype=dtype))
train_df = clean_df(get_data_df(TRAIN_FILE, TRAIN_FILE_URL,
                                parse_dates=parse_dates, dtype=dtype))
weather_df = clean_df(get_data_df(
    WEATHER_FILE, WEATHER_FILE_URL, parse_dates=['date'], dtype=dtype_weather))
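# A quick sanity check on the loaded frames (illustrative sketch, not part of the
# original pipeline; uncomment while debugging locally):
# for name, frame in {"train": train_df, "test": test_df, "weather": weather_df}.items():
#     print(name, frame.shape)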
def time_sec_hms(sec: float) -> str:
    return time.strftime("%H:%M:%S", time.gmtime(sec))
def full_time_sec_hms(sec: float) -> str:
    hours = sec // 3600
    minutes = (sec % 3600) // 60
    remaining_sec = sec % 60
    return f"{round(hours):,}h: {round(minutes)}m: {round(remaining_sec)}s"
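# For reference (values worked out by hand, assuming an integer-second input):
#   time_sec_hms(3725)      -> "01:02:05"
#   full_time_sec_hms(3725) -> "1h: 2m: 5s"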
# @cached(cache=TTLCache(maxsize=100000, ttl=ONE_WEEK_SEC)) # Memory
def get_country_geojson():
    # Collect all unique origin/destination coordinates from the training data
    data = (
        pd.concat(
            [
                train_df[['origin_lat', 'origin_lon']].rename(
                    columns={'origin_lat': 'latitude', 'origin_lon': 'longitude'}),
                train_df[['destination_lat', 'destination_lon']].rename(
                    columns={'destination_lat': 'latitude', 'destination_lon': 'longitude'})
            ],
            ignore_index=True
        )
        .drop_duplicates()
    )
    # Initialize the Nominatim geocoder
    geolocator = Nominatim(user_agent="yassirAPP")
    # Reverse geocode a coordinate pair to its country name
    def reverse_geocode(lat, lon):
        location = geolocator.reverse((lat, lon), exactly_one=True)
        address = location.raw['address']
        country = address.get('country', '')
        return country
    # Reverse geocode the minimum and maximum latitude/longitude pairs in the DataFrame
    min_lat = data['latitude'].min()
    min_lon = data['longitude'].min()
    max_lat = data['latitude'].max()
    max_lon = data['longitude'].max()
    country_min = reverse_geocode(min_lat, min_lon)
    country_max = reverse_geocode(max_lat, max_lon)
    if country_min == country_max:
        country = country_min
        # Geocode the detected country
        location = geolocator.geocode(country, exactly_one=True)
        # If the location is found
        if location:
            # Print the country's bounding box
            bounding_box = location.raw['boundingbox']
            print(f"Bounding Box: {bounding_box}")
            # Nominatim search API URL
            url = "https://nominatim.openstreetmap.org/search"
            # Parameters for the request
            params = {
                'q': country,
                'format': 'json',
                'polygon_geojson': 1  # Request GeoJSON polygons in the response
            }
            # Headers for the request
            headers = {
                'User-Agent': 'yassirAPP'
            }
            # Send the request to Nominatim with headers
            response = requests.get(url, params=params, headers=headers)
            # Check if the request was successful
            if response.status_code == 200:
                country_geojson = response.json()
                geojson = country_geojson[0]['geojson']
                return country, geojson, data
    # Fall through: the bounds span different countries, the geocode lookup failed,
    # or the polygon request was unsuccessful
    return None
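# Minimal usage sketch (an assumption: running this module directly is only for
# local experimentation; the call below hits the live Nominatim API):
if __name__ == "__main__":
    result = get_country_geojson()
    if result is not None:
        country_name, country_geometry, coords = result
        print(f"Detected country: {country_name}")
        print(f"GeoJSON geometry type: {country_geometry.get('type')}")
        print(f"Unique coordinate pairs: {len(coords)}")
    else:
        print("Could not resolve a single country for the data bounds.")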