Spaces:
Sleeping
Sleeping
File size: 3,895 Bytes
3671cba ee259c3 3671cba 2b16351 2a74417 eac0454 2b16351 267df2c 3671cba 697988f c4f7382 d687e0e 2b16351 eac0454 2a74417 2b16351 2fb9f93 2b16351 2fb9f93 2b16351 2fb9f93 2b16351 2fb9f93 2b16351 aa9579e 2b16351 aa9579e 2b16351 aa9579e 2b16351 aa9579e 2b16351 aa9579e 2b16351 aa9579e c4f7382 6291600 2b16351 1fb03fe 2b16351 aa9579e 9db1d02 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 |
from fastapi import FastAPI
import uvicorn
import pandas as pd
import numpy as np
import requests
from urllib.parse import urlparse, quote
import re
from bs4 import BeautifulSoup
import time
from joblib import Parallel, delayed
from nltk import ngrams
from googlesearch import search
# ASGI application object; the route decorators in this module register
# their handlers on it.
app = FastAPI()
# Endpoints
# Root endpoint
@app.get("/")
def root():
    """Return a small JSON payload that names this API."""
    payload = {"API": "AdressScrap"}
    return payload
def normalize_string(string):
    """Lower-case *string* and strip punctuation.

    Every character that is neither a regex word character (``\\w``, which
    includes digits and the underscore) nor whitespace is removed.
    """
    return re.sub(r'[^\w\s]', '', string.lower())
def jaccard_similarity(string1, string2, n=2, normalize=True):
    """Return the Jaccard similarity of the character n-grams of two strings.

    Args:
        string1: First string to compare.
        string2: Second string to compare. If it equals the sentinel
            ``'did not extract address'`` (after normalization when
            ``normalize`` is true) the result is forced to 0.
        n: Size of the character n-grams (default 2, i.e. bigrams).
        normalize: When true, both strings are passed through
            ``normalize_string`` before comparison.

    Returns:
        ``|A & B| / |A | B|`` for the two n-gram sets, as a float in
        ``[0, 1]``. Returns ``0.0`` when either input is not a string, when
        both n-gram sets are empty, or when the n-grams cannot be built.
    """
    # Missing values (None / NaN) cannot be compared; treat them as "no match"
    # explicitly instead of relying on a catch-all exception handler.
    if not isinstance(string1, str) or not isinstance(string2, str):
        return 0.0

    if normalize:
        string1, string2 = normalize_string(string1), normalize_string(string2)

    # Placeholder used when no address could be extracted: never a match.
    if string2 == 'did not extract address':
        return 0.0

    try:
        grams1 = set(ngrams(string1, n))
        grams2 = set(ngrams(string2, n))
    except (TypeError, ValueError):
        # Unusable ``n`` (e.g. negative or non-integer): keep the original
        # best-effort contract of scoring 0 rather than raising.
        return 0.0

    union = grams1 | grams2
    if not union:
        # Both strings are shorter than ``n``; avoid dividing by zero.
        return 0.0
    return len(grams1 & grams2) / len(union)
def jaccard_sim_split_word_number(string1, string2):
    """Score two strings by comparing their digits and their words separately.

    The digit runs of each string are joined with spaces and compared with
    ``jaccard_similarity``; the same is done for the purely alphabetic words.
    The result is the arithmetic mean of those two scores.
    """
    digit_pattern = r'\d+'
    word_pattern = r'\b[A-Za-z]+\b'

    def joined(pattern, text):
        # Space-separated concatenation of every match of *pattern* in *text*.
        return ' '.join(re.findall(pattern, text))

    digits_a, words_a = joined(digit_pattern, string1), joined(word_pattern, string1)
    digits_b, words_b = joined(digit_pattern, string2), joined(word_pattern, string2)

    digit_score = jaccard_similarity(digits_a, digits_b)
    word_score = jaccard_similarity(words_a, words_b)
    return (digit_score + word_score) / 2
def extract_website_domain(url):
    """Return the network-location (``netloc``) component of *url*.

    The value is whatever ``urllib.parse.urlparse`` reports, so a URL without
    a ``scheme://`` prefix yields an empty string.
    """
    return urlparse(url).netloc
def _unwrap_google_link(link):
    """Return the destination URL of a Google result link.

    Redirect-style links of the form ``/url?q=<target>&sa=...`` are reduced to
    ``<target>``; any other string is returned unchanged. Non-string values
    (missing links) become an empty string.
    """
    if not isinstance(link, str):
        return ''
    prefix = '/url?q='
    if link.startswith(prefix):
        link = link[len(prefix):]
        cut = link.find('&sa=')
        # Only trim when the marker is present; slicing with find()'s -1
        # would silently drop the last character of the URL.
        if cut != -1:
            link = link[:cut]
    return link


def google_address(address):
    """Search Google for *address* and tabulate property details per result.

    Args:
        address: Free-text address used both as the search query and as the
            reference for the match score.

    Returns:
        A ``pandas.DataFrame`` with one row per search result that has a
        title, containing the columns ``Address Input``, ``Title``, ``Link``,
        ``Description``, ``Address Output``, ``Website``, ``Square Footage``,
        ``Beds``, ``Baths``, ``Year Built``, ``Match Percent`` and
        ``Google Search Result`` (1-based position within the kept rows).
        Fields that cannot be extracted are left missing, except
        ``Address Output`` which falls back to
        ``"**DID NOT EXTRACT ADDRESS**"``.
    """
    # NOTE(review): ssl_verify=False disables TLS certificate verification for
    # the search request. Kept as in the original; confirm it is intentional.
    all_data = list(search(address, ssl_verify=False, advanced=True,
                           num_results=11))
    df = pd.DataFrame({
        'Title': [hit.title for hit in all_data],
        'Link': [hit.url for hit in all_data],
        'Description': [hit.description for hit in all_data],
    })

    # Drop results without a title; copy so the column assignments below
    # operate on an independent frame rather than a slice of the original.
    df = df[df['Title'].notna()].copy()

    # Everything up to and including the first 5-digit group (ZIP code) in the
    # title is taken as the address.
    df['Address Output'] = (
        df['Title']
        .str.extract(r'(.+? \d{5})', expand=False)
        .fillna("**DID NOT EXTRACT ADDRESS**")
    )

    df['Link'] = [_unwrap_google_link(link) for link in df['Link']]
    df['Website'] = df['Link'].apply(extract_website_domain)

    square_footage = df['Description'].str.extract(
        r"((\d+) Square Feet|(\d+) sq. ft.|(\d+) sqft|(\d+) Sq. Ft.|(\d+) sq|(\d+(?:,\d+)?) Sq\. Ft\.|(\d+(?:,\d+)?) sq)"
    )[0]
    # Keep digits only. regex=True is explicit because the default of
    # Series.str.replace differs between pandas versions.
    df['Square Footage'] = square_footage.str.replace(r'\D', '', regex=True)

    # "3-bed" / "3 total bed" style phrasing is flattened before matching.
    description = df['Description'].replace({'-': ' ', 'total': ''}, regex=True)
    df['Beds'] = description.str.extract(r"(\d+) bed", expand=False)
    baths = description.str.extract(r"((\d+) bath|(\d+(?:\.\d+)?) bath)")[0]
    df['Baths'] = baths.str.extract(r'([\d.]+)', expand=False).astype(float)

    df['Year Built'] = df['Description'].str.extract(r"built in (\d{4})", expand=False)
    df['Match Percent'] = [
        jaccard_sim_split_word_number(address, candidate) * 100
        for candidate in df['Address Output']
    ]
    df['Google Search Result'] = list(range(1, len(df) + 1))
    df.insert(0, 'Address Input', address)
    return df
def _frame_to_records(frame):
    """Convert *frame* to a list of row dicts with missing values as None.

    Casting to ``object`` first turns NumPy scalars into plain Python values,
    and NaN is replaced by ``None`` so the rows can be encoded as JSON.
    """
    as_objects = frame.astype(object)
    return as_objects.where(frame.notna(), None).to_dict(orient='records')


@app.get('/AddressScrap')
async def predict(address: str):
    """Look up *address* through ``google_address`` and return the rows.

    Args:
        address: Address to search for (query parameter).

    Returns:
        A list of dicts, one per search result, with the keys listed in
        ``columns`` below. If the lookup fails for any reason, a single row
        containing only ``Address Input`` is returned instead.
    """
    import logging

    columns = ['Address Input', 'Address Output', 'Match Percent', 'Website',
               'Square Footage', 'Beds', 'Baths', 'Year Built',
               'Link', 'Google Search Result', 'Description']
    # NOTE(review): google_address performs blocking network I/O inside an
    # async handler; consider running it in a thread pool.
    try:
        results = google_address(address)[columns]
    except Exception:
        # Best-effort endpoint: record the failure and echo the input back
        # rather than surfacing a server error.
        logging.getLogger(__name__).exception(
            "Address lookup failed for %r", address)
        results = pd.DataFrame({'Address Input': [address]})
    return _frame_to_records(results)
|