Shrijayan Rajendran8 committed on
Commit
5754a38
·
1 Parent(s): fda387d

Add initial project setup with FastAPI, Docker, and geocoding functionality

Browse files
Files changed (7) hide show
  1. Dockerfile +9 -0
  2. distance_calculator.py +27 -0
  3. docker-compose.yml +9 -0
  4. main.py +225 -0
  5. place2geocode.py +60 -0
  6. requirements.txt +8 -0
  7. utils.py +75 -0
Dockerfile ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
# Slim Python base image.
FROM python:3.9-slim

WORKDIR /app

# Install dependencies before copying the rest of the source so this layer
# is rebuilt only when requirements.txt changes, not on every code edit.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Application source.
COPY . .

# Serve the FastAPI app defined in main.py on port 7860.
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
distance_calculator.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import os
3
+ from dotenv import load_dotenv
4
+ load_dotenv()
5
+
6
def get_distance(lon1, lat1, lon2, lat2):
    """Request a driving route between two points from the routing service.

    The request URL is ``$ENDPOINT/route/v1/driving/{lon1},{lat1};{lon2},{lat2}``.
    When the ``HUGGINGFACE_API_KEY`` environment variable is set, it is sent
    as a bearer token.

    Args:
        lon1: Longitude of the start point.
        lat1: Latitude of the start point.
        lon2: Longitude of the end point.
        lat2: Latitude of the end point.

    Returns:
        The decoded JSON body when the service answers with HTTP 200.
        ``0`` when the request fails, times out, returns another status
        code, or returns a body that is not valid JSON.
    """
    api_url = f'{os.getenv("ENDPOINT")}/route/v1/driving/{lon1},{lat1};{lon2},{lat2}'
    print(api_url)
    api_token = os.getenv("HUGGINGFACE_API_KEY", "")
    headers = {"Authorization": f"Bearer {api_token}"} if api_token else {}

    try:
        # Without a timeout a stalled server would block the caller forever.
        response = requests.get(api_url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print(f"Routing request failed: {exc}")
        return 0

    if response.status_code != 200:
        return 0

    try:
        return response.json()
    except ValueError:
        # A 200 response whose body is not JSON is treated like any other failure.
        return 0
21
+
22
if __name__ == "__main__":
    # Manual smoke test: request a route between two sample
    # (longitude, latitude) points and print whatever comes back.
    origin = (13.388860, 52.517037)
    destination = (13.397634, 52.529407)

    print(get_distance(*origin, *destination))
docker-compose.yml ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ version: '3'
2
+ services:
3
+ app:
4
+ build: .
5
+ ports:
6
+ - "7860:7860"
7
+ environment:
8
+ - ENDPOINT=${ENDPOINT}
9
+ - HUGGINGFACE_API_KEY=${HUGGINGFACE_API_KEY}
main.py ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, File, UploadFile, HTTPException
2
+ from fastapi.responses import StreamingResponse
3
+ import pandas as pd
4
+ from io import BytesIO
5
+ import time
6
+ from place2geocode import get_lat_long
7
+ from utils import setup_logging, clean_address, handle_empty_values, validate_excel_file, meters_to_miles
8
+ from distance_calculator import get_distance
9
+ from fastapi.middleware.cors import CORSMiddleware
10
+
11
# Application object and the logger shared by the handlers in this module.
app = FastAPI()
logger = setup_logging()

# Cross-origin policy: every origin, method and header is accepted.
# NOTE(review): wildcard origins together with allow_credentials=True is a
# very permissive combination — confirm this is intended before exposing the
# service publicly.
app.add_middleware(
    CORSMiddleware,
    allow_headers=["*"],
    allow_methods=["*"],
    allow_credentials=True,
    allow_origins=["*"],
)
22
+
23
def process_uploaded_file(file_stream):
    """Process one uploaded Excel stream and return the result workbook.

    Args:
        file_stream: Seekable binary stream holding the uploaded workbook.

    Returns:
        A ``BytesIO`` positioned at 0 containing one output sheet per usable
        input sheet, each with the start address, drop address and distance.

    Raises:
        ValueError: If the stream is not recognised as an Excel file, or no
            sheet contains usable address data.
    """
    is_valid, message = validate_excel_file(file_stream)
    if not is_valid:
        raise ValueError(message)

    # Rewind so the reader sees the workbook from its first byte.
    file_stream.seek(0)

    # Extract before opening the writer: raising from inside the writer's
    # ``with`` block makes it try to save a workbook with no sheets on exit,
    # which can fail and hide the error below.
    sheet_data = extract_address_data(file_stream)
    if not sheet_data:
        raise ValueError("No valid data found in any sheet.")

    output = BytesIO()
    with pd.ExcelWriter(output) as writer:
        for sheet_name, df in sheet_data.items():
            addresses = create_address_strings(df)
            results = get_route_distances(addresses)
            output_df = create_output_dataframe(sheet_name, results)
            output_df.to_excel(writer, sheet_name=sheet_name, index=False)

    output.seek(0)
    return output
51
+
52
def extract_address_data(file_stream):
    """Read every sheet of an Excel stream and keep the ones with address rows.

    Sheets are read with the second row as the header. A sheet is skipped
    when its name starts with ``!``, when any required column is missing, or
    when no row has both a pickup and a drop address.

    Args:
        file_stream: Binary stream containing the workbook.

    Returns:
        Dict mapping sheet name to its filtered DataFrame. Empty when the
        workbook cannot be read or no sheet qualifies.
    """
    try:
        workbook = pd.read_excel(file_stream, sheet_name=None, header=1)
    except Exception as e:
        logger.error(f"Error reading Excel file: {str(e)}")
        return {}

    required_columns = [
        'Address', 'City', 'Zipcode',
        'Drop Address', 'Drop City', 'Drop Zipcode',
    ]
    usable_sheets = {}

    for sheet_name, df in workbook.items():
        if sheet_name.startswith('!'):
            continue

        logger.info(f"Processing sheet: {sheet_name}")

        try:
            missing_cols = [col for col in required_columns if col not in df.columns]
            if missing_cols:
                logger.warning(f"Sheet '{sheet_name}' missing columns: {missing_cols}")
                continue

            df = handle_empty_values(df, required_columns)

            # Keep only rows where both endpoints are non-blank.
            has_pickup = df['Address'].str.strip() != ''
            has_drop = df['Drop Address'].str.strip() != ''
            df = df.loc[has_pickup & has_drop]

            if df.empty:
                logger.warning(f"Sheet '{sheet_name}' has no valid data")
                continue

            usable_sheets[sheet_name] = df

        except Exception as e:
            # One bad sheet is logged and skipped; the others still go through.
            logger.error(f"Sheet {sheet_name} error: {str(e)}")

    return usable_sheets
94
+
95
@app.post("/upload")
async def upload_files(files: list[UploadFile] = File(...)):
    """Process the uploaded Excel files and return them bundled in a ZIP.

    Files without a name or without an ``.xlsx``/``.xls`` extension are
    skipped. Each remaining file is processed and stored in the archive as
    ``processed_<original filename>``.

    Args:
        files: The uploaded files.

    Returns:
        A streaming ``application/zip`` response named ``processed_files.zip``.

    Raises:
        HTTPException: With status 500 when processing any file fails.
    """
    # Imported here because the module's import block does not bring in zipfile.
    import zipfile

    try:
        zip_buffer = BytesIO()
        with zipfile.ZipFile(zip_buffer, 'a', zipfile.ZIP_DEFLATED) as zip_file:
            for file in files:
                # An upload can arrive without a filename; treat it as unsupported.
                if not file.filename or not allowed_file(file.filename):
                    continue

                file_stream = await file.read()
                output = process_uploaded_file(BytesIO(file_stream))
                zip_file.writestr(f"processed_{file.filename}", output.getvalue())

        zip_buffer.seek(0)
        return StreamingResponse(
            zip_buffer,
            media_type="application/zip",
            headers={"Content-Disposition": "attachment; filename=processed_files.zip"}
        )

    except Exception as e:
        logger.error(f"Processing error: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
119
+
120
def allowed_file(filename):
    """Return True when ``filename`` ends in ``.xlsx`` or ``.xls``.

    The extension check is case-insensitive. Anything that is not a string
    (for example ``None``) or has no dot is rejected rather than raising.
    """
    if not isinstance(filename, str) or '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[1].lower()
    return extension in {'xlsx', 'xls'}
124
+
125
def create_address_strings(df):
    """Build a cleaned (start, drop) address pair for every row of ``df``.

    Each address is "<street>, <city>" taken from the ``Address``/``City``
    and ``Drop Address``/``Drop City`` columns and passed through
    ``clean_address``.

    Returns:
        List of ``(start_address, drop_address)`` tuples in row order.
    """
    return [
        (
            clean_address(f"{row['Address']}, {row['City']}"),
            clean_address(f"{row['Drop Address']}, {row['Drop City']}"),
        )
        for _, row in df.iterrows()
    ]
139
+
140
def get_route_distances(addresses):
    """Geocode each address pair and look up its driving distance in miles.

    Args:
        addresses: Iterable of ``(start_address, drop_address)`` string pairs.

    Returns:
        List of ``(start_address, drop_address, distance)`` tuples in input
        order. ``distance`` is miles rounded to two decimals, or ``None``
        when geocoding or routing failed for that pair.
    """
    results = []

    for start_address, drop_address in addresses:
        try:
            logger.info(f"Geocoding start address: '{start_address}'")
            start_coords = get_lat_long(start_address)
            logger.info(f"Start coordinates: {start_coords}")
            if not start_coords:
                logger.warning(f"Could not geocode start address: '{start_address}'")
                results.append((start_address, drop_address, None))
                continue

            logger.info(f"Geocoding drop address: '{drop_address}'")
            drop_coords = get_lat_long(drop_address)
            logger.info(f"Drop coordinates: {drop_coords}")
            if not drop_coords:
                logger.warning(f"Could not geocode drop address: '{drop_address}'")
                results.append((start_address, drop_address, None))
                continue

            # The geocoder returns (latitude, longitude); the routing call
            # takes longitude first.
            lat1, lon1 = start_coords
            lat2, lon2 = drop_coords

            logger.info(f"Getting route from ({lat1}, {lon1}) to ({lat2}, {lon2})")
            route = get_distance(lon1, lat1, lon2, lat2)

            # get_distance signals failure with a falsy value, and a response
            # without any route is equally unusable. Record the pair as
            # unresolved instead of retrying with swapped coordinates.
            routes = route.get('routes') if isinstance(route, dict) else None
            if not routes:
                logger.warning(
                    f"Routing API returned no route from '{start_address}' to '{drop_address}'"
                )
                results.append((start_address, drop_address, None))
                continue

            distance_in_miles = round(meters_to_miles(routes[0]['distance']), 2)
            logger.info(f"Route distance calculated: {distance_in_miles} miles")
            results.append((start_address, drop_address, distance_in_miles))

            # Add a small delay to avoid overwhelming the geocoding service
            time.sleep(0.5)

        except Exception as e:
            logger.error(f"Error processing route from '{start_address}' to '{drop_address}': {str(e)}")
            results.append((start_address, drop_address, None))

    return results
197
+
198
def create_output_dataframe(sheet_name, results):
    """Turn route results into the DataFrame written to the output sheet.

    Args:
        sheet_name: Name of the sheet the results belong to (not used when
            building the frame).
        results: Sequence of ``(start, drop, distance)`` tuples.

    Returns:
        DataFrame with the columns ``start``, ``drop`` and ``distance``, one
        row per result.
    """
    column_names = ('start', 'drop', 'distance')
    data = {
        name: [entry[position] for entry in results]
        for position, name in enumerate(column_names)
    }
    return pd.DataFrame(data)
218
+
219
@app.get("/")
def index():
    """Return a short welcome message pointing clients at ``/upload``."""
    welcome_message = "Welcome to the Excel Processor API. Use the /upload endpoint to upload an Excel file."
    return welcome_message
222
+
223
if __name__ == '__main__':
    # Running this module directly starts the server without the uvicorn CLI.
    import uvicorn

    uvicorn.run(app, log_level="debug", port=7860, host='0.0.0.0')
place2geocode.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from geopy.geocoders import Nominatim
2
+ import logging
3
+ import time
4
+
5
def get_lat_long(address):
    """Geocode ``address`` with Nominatim and return its coordinates.

    If the full address is not found, the lookup is retried with
    progressively simpler addresses: the last comma-separated component is
    dropped each time (for example zip code first, then city), waiting one
    second before every retry.

    Args:
        address: Free-form address string.

    Returns:
        ``(latitude, longitude)`` for the first variant that geocodes, or
        ``None`` when every variant fails or an error occurs.
    """
    logger = logging.getLogger()

    geolocator = Nominatim(user_agent="coordinate_finder", timeout=5)

    try:
        logger.info(f"Geocoding address: '{address}'")

        location = geolocator.geocode(address)
        if location:
            logger.info(f"Geocoded '{address}' to ({location.latitude}, {location.longitude})")
            return (location.latitude, location.longitude)

        logger.warning(f"Failed to geocode address: '{address}'")

        # Strip one trailing component at a time so the most specific
        # variant that still resolves wins, instead of jumping straight to
        # the street alone.
        simpler_address = address
        while ',' in simpler_address:
            simpler_address = simpler_address.rsplit(',', 1)[0].strip()
            if not simpler_address:
                break

            logger.info(f"Trying simpler address: '{simpler_address}'")
            time.sleep(1)  # Wait a bit before trying again
            location = geolocator.geocode(simpler_address)

            if location:
                logger.info(f"Geocoded simplified '{simpler_address}' to ({location.latitude}, {location.longitude})")
                return (location.latitude, location.longitude)

            logger.warning(f"Also failed with simpler address: '{simpler_address}'")

        return None
    except Exception as e:
        logger.error(f"Error geocoding '{address}': {str(e)}")
        return None
39
+
40
if __name__ == "__main__":
    # Manual smoke test: geocode two sample addresses and log the outcome.
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(message)s',
        level=logging.INFO,
    )
    logger = logging.getLogger()

    start_address = "2665 SOUTH DR, SANTA CLARA, 95051"
    end_address = "450 E PERSIAN DR, SUNNYVALE, 94089"

    logger.info("Testing geocoding with two addresses")

    lookups = []
    for sample_address in (start_address, end_address):
        coordinates = get_lat_long(sample_address)
        logger.info(f"Coordinates for '{sample_address}': {coordinates}")
        lookups.append(coordinates)

    if all(lookups):
        # Calculate distance (this would require additional imports)
        logger.info("Coordinates successfully retrieved for both addresses")
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ fastapi>=0.68.0
2
+ uvicorn>=0.15.0
3
+ python-dotenv>=0.19.0
4
+ pandas>=1.3.0
5
+ openpyxl>=3.0.9
6
+ geopy>=2.2.0
7
+ requests>=2.26.0
8
+ python-multipart>=0.0.5
utils.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from io import BytesIO
3
+
4
+
5
def setup_logging():
    """Configure root logging once and return this module's logger.

    On the first call the root logger is set to INFO and given a file
    handler (``address_processing.log``) and a stream handler. Later calls
    leave the configuration alone.

    Returns:
        The logger named after this module.
    """
    # basicConfig ignores the call once the root logger has handlers, but the
    # handler list would still be built first, opening the log file again on
    # every call. Skip that work when logging is already configured.
    if not logging.getLogger().handlers:
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler("address_processing.log"),
                logging.StreamHandler()
            ]
        )
    return logging.getLogger(__name__)
16
+
17
def meters_to_miles(meters):
    """Return the distance ``meters`` expressed in miles."""
    miles_per_meter = 0.000621371
    return meters * miles_per_meter
20
+
21
def validate_excel_file(file_stream: BytesIO) -> tuple[bool, str]:
    """Check the stream's first four bytes against the Excel file signatures.

    Two signatures are accepted: the ZIP header used by ``.xlsx`` files and
    the compound-file header used by ``.xls`` files. The stream is left
    positioned at 0.

    Args:
        file_stream: Seekable binary stream holding the uploaded file.

    Returns:
        ``(True, "Valid Excel file")`` when a signature matches, otherwise
        ``(False, <reason>)``. Errors while reading are reported in the
        reason rather than raised.
    """
    excel_signatures = (
        b'\x50\x4B\x03\x04',  # ZIP archive (xlsx)
        b'\xD0\xCF\x11\xE0',  # Compound File (xls)
    )
    try:
        # Always inspect the start of the file, even if the caller has
        # already read from the stream.
        file_stream.seek(0)
        header = file_stream.read(4)
        file_stream.seek(0)  # Reset stream position for further processing

        if header in excel_signatures:
            return True, "Valid Excel file"
        return False, "Invalid file type: Not an Excel file"
    except Exception as e:
        return False, f"Validation error: {str(e)}"
37
+
38
def clean_address(address):
    """Normalise whitespace and expand common street-type abbreviations.

    Runs of whitespace are collapsed to single spaces. The upper-case
    abbreviations ST, AVE, RD, BLVD and DR (with or without a trailing
    period) are expanded to STREET, AVENUE, ROAD, BOULEVARD and DRIVE, but
    only when they stand as whole words, so names such as WEST or ORCHARD
    are left untouched.

    Args:
        address: Address string to clean.

    Returns:
        The cleaned address, or ``""`` when ``address`` is not a string.
    """
    import re

    if not isinstance(address, str):
        return ""

    # Remove extra whitespace
    cleaned = " ".join(address.split())

    expansions = {
        "ST": "STREET",
        "AVE": "AVENUE",
        "RD": "ROAD",
        "BLVD": "BOULEVARD",
        "DR": "DRIVE",
    }

    # Word boundaries stop the abbreviations from matching inside longer
    # words; the optional period is consumed along with the abbreviation.
    pattern = r"\b(" + "|".join(expansions) + r")\b\.?"
    return re.sub(pattern, lambda match: expansions[match.group(1)], cleaned)
64
+
65
def handle_empty_values(df, required_columns):
    """Return a copy of ``df`` with missing values in the required columns blanked.

    Args:
        df: Input DataFrame; it is not modified.
        required_columns: Column names whose missing values become ``""``.
            Names not present in ``df`` are ignored.

    Returns:
        A new DataFrame with the replacements applied.
    """
    result = df.copy()

    present = [name for name in required_columns if name in result.columns]
    for name in present:
        result[name] = result[name].fillna("")

    return result