Spaces:
Sleeping
Sleeping
Shrijayan Rajendran8
committed on
Commit
·
5754a38
1
Parent(s):
fda387d
Add initial project setup with FastAPI, Docker, and geocoding functionality
Browse files- Dockerfile +9 -0
- distance_calculator.py +27 -0
- docker-compose.yml +9 -0
- main.py +225 -0
- place2geocode.py +60 -0
- requirements.txt +8 -0
- utils.py +75 -0
Dockerfile
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Slim Python base image; 3.9 matches the syntax used in the app code.
FROM python:3.9-slim

WORKDIR /app

# NOTE(review): copying all sources before installing dependencies defeats
# Docker layer caching; consider COPYing requirements.txt and running pip
# install first so code-only changes don't re-install dependencies.
COPY . .

RUN pip install --no-cache-dir -r requirements.txt

# Serve the FastAPI app; port 7860 matches docker-compose.yml's mapping.
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
|
distance_calculator.py
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import requests
|
2 |
+
import os
|
3 |
+
from dotenv import load_dotenv
|
4 |
+
load_dotenv()
|
5 |
+
|
6 |
+
def get_distance(lon1, lat1, lon2, lat2):
    """Query the routing service for a driving route between two points.

    Coordinates are longitude-first, matching the OSRM-style
    ``/route/v1/driving/{lon},{lat};{lon},{lat}`` URL format.

    Args:
        lon1, lat1: Origin longitude/latitude.
        lon2, lat2: Destination longitude/latitude.

    Returns:
        The decoded JSON response on success (callers read
        ``result['routes'][0]['distance']``, presumably meters — per the
        usage in main.py), or ``None`` on any failure so callers can
        test ``if result is None``.
    """
    api_url = f'{os.getenv("ENDPOINT")}/route/v1/driving/{lon1},{lat1};{lon2},{lat2}'
    api_token = os.getenv("HUGGINGFACE_API_KEY", "")
    headers = {"Authorization": f"Bearer {api_token}"} if api_token else {}

    try:
        # Bounded timeout so a hung routing service cannot stall processing.
        response = requests.get(api_url, headers=headers, timeout=10)
    except requests.RequestException:
        # Network-level failure (DNS, connect, timeout) counts as "no route".
        return None

    if response.status_code == 200:
        return response.json()

    # Previously this returned 0 on failure, but the caller in main.py checks
    # `if distance is None` and then indexes the result — returning an int
    # there crashed with TypeError. None is the documented failure signal.
    return None
21 |
+
|
22 |
+
if __name__ == "__main__":
    # Smoke test: route between two points in central Berlin.
    # Coordinates are (longitude, latitude) pairs, as the API expects.
    origin = (13.388860, 52.517037)
    destination = (13.397634, 52.529407)

    result = get_distance(origin[0], origin[1], destination[0], destination[1])
    print(result)
|
docker-compose.yml
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Compose definition for the geocoding/distance API service.
version: '3'
services:
  app:
    build: .
    ports:
      - "7860:7860"  # host:container — matches the uvicorn port in the Dockerfile
    environment:
      # Passed through from the host environment; consumed by
      # distance_calculator.py (routing endpoint and optional bearer token).
      - ENDPOINT=${ENDPOINT}
      - HUGGINGFACE_API_KEY=${HUGGINGFACE_API_KEY}
|
main.py
ADDED
@@ -0,0 +1,225 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from fastapi import FastAPI, File, UploadFile, HTTPException
|
2 |
+
from fastapi.responses import StreamingResponse
|
3 |
+
import pandas as pd
|
4 |
+
from io import BytesIO
|
5 |
+
import time
|
6 |
+
from place2geocode import get_lat_long
|
7 |
+
from utils import setup_logging, clean_address, handle_empty_values, validate_excel_file, meters_to_miles
|
8 |
+
from distance_calculator import get_distance
|
9 |
+
from fastapi.middleware.cors import CORSMiddleware
|
10 |
+
|
11 |
+
# FastAPI application instance; served by uvicorn (see Dockerfile CMD).
app = FastAPI()
# Module-wide logger; setup_logging configures file + console handlers.
logger = setup_logging()


# NOTE(review): wildcard origins together with allow_credentials=True is
# permissive — fine for a demo Space, but should be restricted for production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
22 |
+
|
23 |
+
def process_uploaded_file(file_stream):
    """Process an uploaded Excel stream and return output Excel bytes.

    Validates the stream's magic number, extracts per-sheet address data,
    computes route distances, and writes one result sheet per input sheet
    into an in-memory workbook.

    Args:
        file_stream: Seekable binary stream containing the uploaded file.

    Returns:
        BytesIO positioned at 0, containing the output .xlsx workbook.

    Raises:
        ValueError: if the stream is not an Excel file or no sheet
            contains usable address data.
    """
    # Validate the file signature before attempting to parse it.
    is_valid, message = validate_excel_file(file_stream)
    if not is_valid:
        raise ValueError(message)

    # Reset stream position: validation consumed the header bytes.
    file_stream.seek(0)

    # Extract data *before* opening the writer. Previously the "no valid
    # data" ValueError was raised inside the ExcelWriter context, so the
    # writer's close-on-exit ran against an empty workbook — which can
    # itself fail and mask the real error message.
    sheet_data = extract_address_data(file_stream)
    if not sheet_data:
        raise ValueError("No valid data found in any sheet.")

    # Create in-memory output workbook.
    output = BytesIO()
    with pd.ExcelWriter(output) as writer:
        for sheet_name, df in sheet_data.items():
            addresses = create_address_strings(df)
            results = get_route_distances(addresses)
            output_df = create_output_dataframe(sheet_name, results)
            output_df.to_excel(writer, sheet_name=sheet_name, index=False)

    output.seek(0)
    return output
|
51 |
+
|
52 |
+
def extract_address_data(file_stream):
    """Extract address data from Excel file stream.

    Reads every sheet of the workbook and returns a dict of
    ``{sheet_name: DataFrame}`` containing only sheets that have all
    required pickup/drop columns and at least one row where both
    addresses are non-blank. Returns ``{}`` if the file cannot be read.
    """
    sheet_data = {}

    try:
        # Read the Excel file directly from the file stream.
        # sheet_name=None loads *all* sheets as a {name: DataFrame} dict;
        # header=1 assumes the real column headers sit on the second row
        # of each sheet — TODO confirm against the actual input templates.
        excel_file = pd.read_excel(file_stream, sheet_name=None, header=1)
    except Exception as e:
        logger.error(f"Error reading Excel file: {str(e)}")
        return {}

    required_columns = [
        'Address', 'City', 'Zipcode',
        'Drop Address', 'Drop City', 'Drop Zipcode'
    ]

    for sheet_name, df in excel_file.items():
        # Sheets whose name starts with '!' are treated as excluded.
        if sheet_name.startswith('!'):
            continue

        logger.info(f"Processing sheet: {sheet_name}")

        try:
            if not all(col in df.columns for col in required_columns):
                missing_cols = [col for col in required_columns if col not in df.columns]
                logger.warning(f"Sheet '{sheet_name}' missing columns: {missing_cols}")
                continue

            # Fill NaNs with "" so the .str accessor below is safe on
            # columns that contain empty cells.
            df = handle_empty_values(df, required_columns)
            # Keep only rows where both pickup and drop addresses are non-blank.
            df = df.loc[(df['Address'].str.strip() != '') &
                        (df['Drop Address'].str.strip() != '')]

            if df.empty:
                logger.warning(f"Sheet '{sheet_name}' has no valid data")
                continue

            sheet_data[sheet_name] = df

        except Exception as e:
            # A bad sheet is logged and skipped; the others still process.
            logger.error(f"Sheet {sheet_name} error: {str(e)}")

    return sheet_data
|
94 |
+
|
95 |
+
@app.post("/upload")
async def upload_files(files: list[UploadFile] = File(...)):
    """Handle multiple file uploads and return processed ZIP.

    Each uploaded Excel file is validated and processed sheet-by-sheet;
    the resulting workbooks are returned together as one ZIP archive.
    Files without an .xlsx/.xls extension are skipped silently.

    Raises:
        HTTPException: 500 with the underlying error message on any
            processing failure.
    """
    # Function-scope import: this module previously used zipfile without
    # importing it anywhere, so the first upload raised NameError.
    import zipfile

    try:
        zip_buffer = BytesIO()
        # 'w' (not 'a') — the buffer is freshly created, so we are always
        # writing a new archive.
        with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
            for file in files:
                if not allowed_file(file.filename):
                    continue

                file_stream = await file.read()
                output = process_uploaded_file(BytesIO(file_stream))
                zip_file.writestr(f"processed_{file.filename}", output.getvalue())

        zip_buffer.seek(0)
        return StreamingResponse(
            zip_buffer,
            media_type="application/zip",
            headers={"Content-Disposition": "attachment; filename=processed_files.zip"}
        )

    except Exception as e:
        logger.error(f"Processing error: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
|
119 |
+
|
120 |
+
def allowed_file(filename):
    """Return True when *filename* carries an Excel extension (.xlsx/.xls)."""
    if '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[1].lower()
    return extension in {'xlsx', 'xls'}
|
124 |
+
|
125 |
+
def create_address_strings(df):
    """Create formatted address strings for geocoding.

    Returns a list of (start_address, drop_address) tuples, one per row,
    each cleaned via clean_address as "Address, City".
    """
    return [
        (
            clean_address(f"{row['Address']}, {row['City']}"),
            clean_address(f"{row['Drop Address']}, {row['Drop City']}"),
        )
        for _, row in df.iterrows()
    ]
|
139 |
+
|
140 |
+
def get_route_distances(addresses):
    """Resolve each (start, drop) address pair to a driving distance.

    For every pair: geocode both addresses, query the routing API, and
    convert the route length to miles. Any failure (geocoding miss, API
    failure, malformed response) records ``None`` for that pair instead
    of aborting the batch.

    Args:
        addresses: Iterable of (start_address, drop_address) strings.

    Returns:
        List of (start_address, drop_address, distance_in_miles_or_None).
    """
    # Uses the module-level logger; the previous local setup_logging()
    # call needlessly re-ran logging configuration on every invocation.
    results = []

    for start_address, drop_address in addresses:
        try:
            logger.info(f"Geocoding start address: '{start_address}'")
            start_coords = get_lat_long(start_address)
            logger.info(f"Start coordinates: {start_coords}")
            if not start_coords:
                logger.warning(f"Could not geocode start address: '{start_address}'")
                results.append((start_address, drop_address, None))
                continue

            logger.info(f"Geocoding drop address: '{drop_address}'")
            drop_coords = get_lat_long(drop_address)
            logger.info(f"Drop coordinates: {drop_coords}")
            if not drop_coords:
                logger.warning(f"Could not geocode drop address: '{drop_address}'")
                results.append((start_address, drop_address, None))
                continue

            # Geopy returns coordinates as (latitude, longitude).
            lat1, lon1 = start_coords
            lat2, lon2 = drop_coords

            logger.info(f"Getting route from ({lat1}, {lon1}) to ({lat2}, {lon2})")

            # The routing API wants longitude-first coordinates.
            route = get_distance(lon1, lat1, lon2, lat2)

            # get_distance returns a falsy value (0/None) on failure.
            # The old code checked `is None` only — which never matched
            # the 0 failure value — and then crashed indexing an int.
            # It also "fell back" by calling the same API with lat/lon
            # swapped, which is not a direct-distance calculation.
            if not route or not route.get('routes'):
                logger.warning(
                    f"Routing failed for '{start_address}' -> '{drop_address}'")
                results.append((start_address, drop_address, None))
            else:
                distance_in_miles = round(
                    meters_to_miles(route['routes'][0]['distance']), 2)
                logger.info(f"Route distance calculated: {distance_in_miles} miles")
                results.append((start_address, drop_address, distance_in_miles))

            # Small delay to avoid overwhelming the geocoding service.
            time.sleep(0.5)

        except Exception as e:
            logger.error(f"Error processing route from '{start_address}' to '{drop_address}': {str(e)}")
            results.append((start_address, drop_address, None))

    return results
|
197 |
+
|
198 |
+
def create_output_dataframe(sheet_name, results):
    """Build the output DataFrame for one sheet.

    Args:
        sheet_name: Name of the sheet (kept for interface compatibility;
            not used in construction).
        results: List of (start_address, drop_address, distance) tuples.

    Returns:
        DataFrame with columns 'start', 'drop', 'distance'.
    """
    starts, drops, distances = [], [], []
    for start, drop, distance in results:
        starts.append(start)
        drops.append(drop)
        distances.append(distance)

    return pd.DataFrame({
        'start': starts,
        'drop': drops,
        'distance': distances,
    })
|
218 |
+
|
219 |
+
@app.get("/")
def index():
    """Root endpoint: short usage hint pointing clients at /upload."""
    return "Welcome to the Excel Processor API. Use the /upload endpoint to upload an Excel file."
|
222 |
+
|
223 |
+
# Local development entry point; in Docker the server is started by the
# uvicorn CLI instead (see Dockerfile CMD). Same host/port in both paths.
if __name__ == '__main__':
    import uvicorn
    uvicorn.run(app, host='0.0.0.0', port=7860, log_level="debug")
|
place2geocode.py
ADDED
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from geopy.geocoders import Nominatim
|
2 |
+
import logging
|
3 |
+
import time
|
4 |
+
|
5 |
+
def get_lat_long(address):
    """Geocode *address* with Nominatim (OpenStreetMap).

    Returns (latitude, longitude) on success, None on failure. On a
    miss, retries once with everything before the first comma (dropping
    city/zip), since a shorter query sometimes geocodes when the full
    string does not. Performs network I/O on every call.
    """
    logger = logging.getLogger()

    # 5-second timeout guards against a hung geocoder request.
    geolocator = Nominatim(user_agent="coordinate_finder", timeout=5)

    try:
        logger.info(f"Geocoding address: '{address}'")

        location = geolocator.geocode(address)

        if location:
            logger.info(f"Geocoded '{address}' to ({location.latitude}, {location.longitude})")
            return (location.latitude, location.longitude)

        else:
            logger.warning(f"Failed to geocode address: '{address}'")

            # Try a simpler version of the address by removing zip code if present
            if ',' in address:
                simpler_address = address.split(',')[0]
                logger.info(f"Trying simpler address: '{simpler_address}'")
                time.sleep(1)  # Wait a bit before trying again
                location = geolocator.geocode(simpler_address)

                if location:
                    logger.info(f"Geocoded simplified '{simpler_address}' to ({location.latitude}, {location.longitude})")
                    return (location.latitude, location.longitude)
                else:
                    logger.warning(f"Also failed with simpler address: '{simpler_address}'")

            # No result from either the full or the simplified address.
            return None
    except Exception as e:
        # Nominatim can raise timeout/service errors; treat as a miss.
        logger.error(f"Error geocoding '{address}': {str(e)}")
        return None
|
39 |
+
|
40 |
+
if __name__ == "__main__":
    # Standalone smoke test: geocode two sample Bay Area addresses.
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s'
    )
    logger = logging.getLogger()

    start_address = "2665 SOUTH DR, SANTA CLARA, 95051"
    end_address = "450 E PERSIAN DR, SUNNYVALE, 94089"

    logger.info(f"Testing geocoding with two addresses")

    coordinates = []
    for sample in (start_address, end_address):
        resolved = get_lat_long(sample)
        logger.info(f"Coordinates for '{sample}': {resolved}")
        coordinates.append(resolved)

    if all(coordinates):
        # Calculate distance (this would require additional imports)
        logger.info(f"Coordinates successfully retrieved for both addresses")
|
requirements.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
fastapi>=0.68.0
|
2 |
+
uvicorn>=0.15.0
|
3 |
+
python-dotenv>=0.19.0
|
4 |
+
pandas>=1.3.0
|
5 |
+
openpyxl>=3.0.9
|
6 |
+
geopy>=2.2.0
|
7 |
+
requests>=2.26.0
|
8 |
+
python-multipart>=0.0.5
|
utils.py
ADDED
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
from io import BytesIO
|
3 |
+
|
4 |
+
|
5 |
+
def setup_logging():
    """Configure root logging (file + console) and return a module logger.

    Writes to 'address_processing.log' and mirrors records to stderr.
    basicConfig is a no-op if the root logger is already configured.
    """
    log_sinks = [
        logging.FileHandler("address_processing.log"),
        logging.StreamHandler(),
    ]
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=log_sinks,
    )
    return logging.getLogger(__name__)
|
16 |
+
|
17 |
+
def meters_to_miles(meters):
    """Convert a distance in meters to miles (1 m == 0.000621371 mi)."""
    miles_per_meter = 0.000621371
    return meters * miles_per_meter
|
20 |
+
|
21 |
+
def validate_excel_file(file_stream: BytesIO) -> tuple[bool, str]:
    """Validate that a stream looks like an Excel file via magic numbers.

    Reads the first four bytes, resets the stream, and checks for the
    ZIP signature (.xlsx) or the OLE2 Compound File signature (.xls).

    Returns:
        (True, "Valid Excel file") on a signature match, otherwise
        (False, <reason>).
    """
    XLSX_SIGNATURE = b'\x50\x4B\x03\x04'  # ZIP archive (xlsx)
    XLS_SIGNATURE = b'\xD0\xCF\x11\xE0'   # Compound File (xls)

    try:
        signature = file_stream.read(4)
        file_stream.seek(0)  # Reset stream position for further processing
    except Exception as e:
        return False, f"Validation error: {str(e)}"

    if signature in (XLSX_SIGNATURE, XLS_SIGNATURE):
        return True, "Valid Excel file"
    return False, "Invalid file type: Not an Excel file"
|
37 |
+
|
38 |
+
def clean_address(address):
    """Clean and standardize an address string.

    Collapses runs of whitespace and expands common street-type
    abbreviations (ST -> STREET, AVE -> AVENUE, RD -> ROAD,
    BLVD -> BOULEVARD, DR -> DRIVE), with or without a trailing period,
    preserving any trailing comma on the token.

    Matches whole tokens only: the previous substring replacement of
    "ST " / "DR " etc. corrupted words that merely contain the
    abbreviation (e.g. "WEST AVE" became "WESTREET AVENUE").

    Args:
        address: The raw address; non-string input yields "".

    Returns:
        The cleaned address string.
    """
    if not isinstance(address, str):
        return ""

    # Abbreviation -> expansion, keyed on the uppercase token with the
    # trailing comma (if any) stripped off.
    expansions = {
        "ST": "STREET", "ST.": "STREET",
        "AVE": "AVENUE", "AVE.": "AVENUE",
        "RD": "ROAD", "RD.": "ROAD",
        "BLVD": "BOULEVARD", "BLVD.": "BOULEVARD",
        "DR": "DRIVE", "DR.": "DRIVE",
    }

    cleaned_tokens = []
    # split() with no argument also collapses repeated whitespace.
    for token in address.split():
        core = token.rstrip(",")
        trailing = token[len(core):]  # preserve a trailing comma, if any
        cleaned_tokens.append(expansions.get(core, core) + trailing)

    return " ".join(cleaned_tokens)
|
64 |
+
|
65 |
+
def handle_empty_values(df, required_columns):
    """Return a copy of *df* with NaNs in the given columns replaced by "".

    Columns listed in *required_columns* but absent from the frame are
    ignored. The input DataFrame is never modified.
    """
    clean_df = df.copy()

    present_columns = [col for col in required_columns if col in clean_df.columns]
    for col in present_columns:
        clean_df[col] = clean_df[col].fillna("")

    return clean_df
|