Spaces:
Sleeping
Sleeping
from fastapi import FastAPI | |
import uvicorn | |
import pandas as pd | |
import numpy as np | |
import requests | |
from urllib.parse import urlparse, quote | |
import re | |
from bs4 import BeautifulSoup | |
import time | |
from joblib import Parallel, delayed | |
from nltk import ngrams | |
# Application object for this service.
app = FastAPI()

# Endpoints

# Root endpoint
def root():
    """Return a small static payload identifying this API."""
    # NOTE(review): no route decorator (e.g. ``@app.get("/")``) is visible on
    # this function in this view -- confirm it is actually registered on `app`.
    return {"API": "Google Address Scrap"}
def normalize_string(string):
    """Lower-case *string* and remove every character that is neither a word
    character (letters, digits, underscore) nor whitespace.

    Whitespace is left exactly as it was; only punctuation-like characters
    are dropped.
    """
    normalized_string = string.lower()
    normalized_string = re.sub(r'[^\w\s]', '', normalized_string)
    return normalized_string
def jaccard_similarity(string1, string2, n=2, normalize=True):
    """Return the Jaccard similarity of the character n-gram sets of two strings.

    Args:
        string1: First string.
        string2: Second string.
        n: Size of the character n-grams (default 2).
        normalize: When true, both strings are passed through
            ``normalize_string`` before the n-grams are built.

    Returns:
        ``len(intersection) / len(union)`` of the two n-gram sets, or ``0``
        when the inputs are not strings, when neither string produces any
        n-gram, or when ``string2`` equals ``'did not extract address'``
        (after normalization, if enabled).
    """
    try:
        if normalize:
            string1, string2 = normalize_string(string1), normalize_string(string2)
        grams1 = set(ngrams(string1, n))
        grams2 = set(ngrams(string2, n))
    except (AttributeError, TypeError):
        # Non-string input (e.g. None or NaN) cannot be normalized or iterated.
        return 0

    # The "no address found" placeholder must never count as a match.
    if string2 == 'did not extract address':
        return 0

    union = grams1 | grams2
    if not union:
        # Both strings are shorter than n: nothing to compare, avoid 0/0.
        return 0
    return len(grams1 & grams2) / len(union)
def jaccard_sim_split_word_number(string1, string2):
    """Average the Jaccard similarity of the numeric and alphabetic parts.

    For each string, runs of digits are collected into one space-joined
    string and purely alphabetic words into another. The digit strings and
    the word strings are then compared separately with
    ``jaccard_similarity`` and the two scores are averaged.

    Returns:
        The mean of the number similarity and the word similarity.
    """
    numbers1 = ' '.join(re.findall(r'\d+', string1))
    words1 = ' '.join(re.findall(r'\b[A-Za-z]+\b', string1))
    numbers2 = ' '.join(re.findall(r'\d+', string2))
    words2 = ' '.join(re.findall(r'\b[A-Za-z]+\b', string2))

    number_similarity = jaccard_similarity(numbers1, numbers2)
    words_similarity = jaccard_similarity(words1, words2)
    return (number_similarity + words_similarity) / 2
def extract_website_domain(url):
    """Return the network-location part (``netloc``) of *url*.

    The result is an empty string when *url* has no ``scheme://host`` part.
    """
    parsed_url = urlparse(url)
    return parsed_url.netloc
def google_address(address):
    """Search Google for *address* and tabulate the result links.

    The function requests ``https://www.google.com/search?q=<address>``,
    keeps every anchor whose href starts with ``/url?q=http`` and whose text
    is longer than 20 characters, and builds one row per kept anchor.

    Columns of the returned DataFrame:
        Address Input, Title, Link, Description, Address Output, Website,
        Square Footage, Beds, Baths, Year Built, Match Percent,
        Google Search Result (1-based position among the kept anchors).

    Args:
        address: Free-text address to search for.

    Returns:
        pandas.DataFrame with one row per kept search-result anchor.

    Raises:
        requests.RequestException: if the HTTP request fails or times out.
    """
    search_query = quote(address)
    url = f'https://www.google.com/search?q={search_query}'
    # A timeout keeps one stalled request from blocking its worker forever.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, "html.parser")

    texts_links = []
    for link in soup.find_all("a"):
        title = link.get_text()
        # Anchors without an href return None; treat them as non-matching.
        href = link.get("href") or ""
        if href.startswith('/url?q=http') and len(title) > 20:
            texts_links.append((title, href))

    # A result's description is the page text running from its own title up
    # to the next kept title (or up to 'Related searches' for the last one).
    text = soup.get_text()
    texts_links_des = []
    for i, (title, href) in enumerate(texts_links):
        start = text.find(title[:50])
        if i + 1 < len(texts_links):
            end = text.find(texts_links[i + 1][0][:50])
        else:
            end = text.find('Related searches')
        if end == -1:
            # Marker not found: take everything to the end of the page
            # instead of silently dropping the final character.
            end = len(text)
        texts_links_des.append((title, href, text[start:end]))

    df = pd.DataFrame(texts_links_des, columns=['Title', 'Link', 'Description'])
    df['Description'] = df['Description'].bfill()
    df['Address Output'] = (
        df['Title']
        .str.extract(r'(.+? \d{5})', expand=False)
        .fillna("**DID NOT EXTRACT ADDRESS**")
    )
    # Drop the leading '/url?q=' (7 chars) and everything from '&sa=' onward.
    df['Link'] = [href[7:].split('&sa=', 1)[0] for href in df['Link']]
    df['Website'] = df['Link'].apply(extract_website_domain)

    df['Square Footage'] = df['Description'].str.extract(r"((\d+) Square Feet|(\d+) sq. ft.|(\d+) sqft|(\d+) Sq. Ft.|(\d+) sq|(\d+(?:,\d+)?) Sq\. Ft\.|(\d+(?:,\d+)?) sq)")[0]
    try:
        # Keep digits only. regex=True is explicit because pandas 2.0 made
        # literal matching the default for Series.str.replace.
        df['Square Footage'] = df['Square Footage'].str.replace(r'\D', '', regex=True)
    except AttributeError:
        # Best effort: leave the raw match if the column is not string-like.
        pass

    # Shared clean-up so patterns such as "3-bed" / "2 total baths" match.
    cleaned = df['Description'].replace({'-': ' ', 'total': ''}, regex=True)
    df['Beds'] = cleaned.str.extract(r"(\d+) bed", expand=False)
    df['Baths'] = cleaned.str.extract(r"((\d+) bath|(\d+(?:\.\d+)?) bath)")[0]
    df['Baths'] = df['Baths'].str.extract(r'([\d.]+)', expand=False).astype(float)
    df['Year Built'] = df['Description'].str.extract(r"built in (\d{4})", expand=False)

    df['Match Percent'] = [
        jaccard_sim_split_word_number(address, found) * 100
        for found in df['Address Output']
    ]
    df['Google Search Result'] = list(range(1, len(df) + 1))
    df.insert(0, 'Address Input', address)
    return df
def catch_errors(addresses):
    """Run ``google_address`` for one address, degrading gracefully on failure.

    Args:
        addresses: A single address string (despite the plural name).

    Returns:
        The DataFrame produced by ``google_address``; if that call raises,
        a one-row DataFrame containing only the ``'Address Input'`` column so
        the address still appears in the combined output.
    """
    import logging  # local import: keeps this block self-contained

    try:
        return google_address(addresses)
    except Exception:
        # Deliberate best-effort boundary: one failing lookup must not abort
        # the whole batch, but the failure is logged rather than hidden.
        logging.getLogger(__name__).exception(
            "Address lookup failed for %r", addresses
        )
        return pd.DataFrame({'Address Input': [addresses]})
def process_multiple_address(addresses, n_jobs=32):
    """Look up several addresses concurrently using a thread-based pool.

    Args:
        addresses: Iterable of address strings.
        n_jobs: Number of joblib workers (default 32, the previous
            hard-coded value).

    Returns:
        A list with one DataFrame per input address, in input order, each
        produced by ``catch_errors``.
    """
    results = Parallel(n_jobs=n_jobs, prefer="threads")(
        delayed(catch_errors)(address) for address in addresses
    )
    return results
async def predict(address_input: str):
    """Look up one or more ``;``-separated addresses and return the results as JSON.

    Args:
        address_input: One address, or several separated by ``;``.
            Surrounding whitespace is stripped and empty segments are ignored.

    Returns:
        The combined results table serialized with ``DataFrame.to_json()``.
        Columns missing from the results (for example when every lookup
        failed) are present and filled with nulls; an input with no usable
        address yields a table with the expected columns and no rows.
    """
    # NOTE(review): no route decorator is visible on this function in this
    # view -- confirm it is registered on `app`.
    # NOTE(review): the lookups below are blocking calls made from an
    # ``async def``; confirm this is acceptable for the event loop.
    columns = ['Address Input', 'Address Output', 'Match Percent', 'Website',
               'Square Footage', 'Beds', 'Baths', 'Year Built',
               'Link', 'Google Search Result', 'Description']

    stripped = (part.strip() for part in address_input.split(';'))
    addresses = [part for part in stripped if part]
    if not addresses:
        # pd.concat raises on an empty list; return an empty table instead.
        return pd.DataFrame(columns=columns).to_json()

    results = process_multiple_address(addresses)
    results = pd.concat(results).reset_index(drop=True)
    # reindex (rather than results[columns]) tolerates missing columns, which
    # happens when every lookup fell back to the 'Address Input'-only frame.
    prediction = results.reindex(columns=columns)
    return prediction.to_json()