from fastapi import FastAPI
import uvicorn
import pandas as pd
from urllib.parse import urlparse
import re
from nltk import ngrams
from googlesearch import search

app = FastAPI()


# Endpoints

# Root endpoint
@app.get("/")
def root():
    return {"API": "AdressScrap"}
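

# Assumed helper: the original file references normalize_string without defining
# it. A minimal sketch that lowercases and strips non-alphanumeric characters, so
# "**DID NOT EXTRACT ADDRESS**" normalizes to 'did not extract address' and
# matches the sentinel check in jaccard_similarity below.
def normalize_string(string):
    string = re.sub(r'[^a-z0-9\s]', '', string.lower())
    return re.sub(r'\s+', ' ', string).strip()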


def jaccard_similarity(string1, string2, n=2, normalize=True):
    # Character n-gram Jaccard similarity in [0, 1].
    try:
        if normalize:
            string1, string2 = normalize_string(string1), normalize_string(string2)
        grams1 = set(ngrams(string1, n))
        grams2 = set(ngrams(string2, n))
        similarity = len(grams1.intersection(grams2)) / len(grams1.union(grams2))
    except Exception:
        # Empty or non-string inputs produce no n-grams; treat as no overlap.
        similarity = 0
    # Sentinel produced by google_address when no address could be extracted.
    if string2 == 'did not extract address':
        similarity = 0
    return similarity
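

# Example (hypothetical strings): the bigram sets of "123 main st" and
# "123 main street" overlap heavily, so
# jaccard_similarity("123 Main St", "123 Main Street") scores high but below 1.0.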


def jaccard_sim_split_word_number(string1, string2):
    # Score numeric tokens and alphabetic tokens separately, then average,
    # so house numbers and street names each contribute half the score.
    numbers1 = ' '.join(re.findall(r'\d+', string1))
    words1 = ' '.join(re.findall(r'\b[A-Za-z]+\b', string1))
    numbers2 = ' '.join(re.findall(r'\d+', string2))
    words2 = ' '.join(re.findall(r'\b[A-Za-z]+\b', string2))
    number_similarity = jaccard_similarity(numbers1, numbers2)
    words_similarity = jaccard_similarity(words1, words2)
    return (number_similarity + words_similarity) / 2
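

# Example (hypothetical inputs): for "450 Oak Ave" vs "450 Oak Avenue", the
# digits "450" match exactly while the words differ only in the suffix, so the
# averaged score stays high despite the abbreviation.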


def extract_website_domain(url):
    # Return the network-location (domain) part of a URL.
    parsed_url = urlparse(url)
    return parsed_url.netloc
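

# e.g. extract_website_domain("https://www.zillow.com/homes/123-main-st")
# returns "www.zillow.com" (urlparse keeps any "www." prefix in netloc).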


def google_address(address):
    # Run a Google search for the address; advanced=True yields result objects
    # with title, url, and description.
    all_data = [i for i in search(address, ssl_verify=False, advanced=True,
                                  num_results=11)]
    df = pd.DataFrame({'Title': [i.title for i in all_data],
                       'Link': [i.url for i in all_data],
                       'Description': [i.description for i in all_data]})
    df = df.query("Title==Title")  # drop rows whose Title is NaN

    # Unwrap Google redirect links ("/url?q=<target>&sa=...") when present,
    # then add a scheme to any leftover scheme-less "/www." links.
    df['Link'] = [i[len('/url?q='):] if i.startswith('/url?q=') else i for i in df['Link']]
    df['Link'] = [i[:i.find('&sa=')] if '&sa=' in i else i for i in df['Link']]
    df['Link'] = df['Link'].str.replace(r'^/www\.', 'https://www.', regex=True)
    df['Website'] = df['Link'].apply(extract_website_domain)

    # Parse listing details out of the result title and description.
    df['Address Output'] = df['Title'].str.extract(r'(.+? \d{5})').fillna("**DID NOT EXTRACT ADDRESS**")
    df['Square Footage'] = df['Description'].str.extract(
        r"((\d+) Square Feet|(\d+) sq. ft.|(\d+) sqft|(\d+) Sq. Ft.|(\d+) sq|(\d+(?:,\d+)?) Sq\. Ft\.|(\d+(?:,\d+)?) sq)")[0]
    try:
        df['Square Footage'] = df['Square Footage'].replace({',': ''}, regex=True).str.replace(r'\D', '', regex=True)
    except AttributeError:
        # Column is all-NaN (no square footage found), so the .str accessor fails.
        pass
    df['Beds'] = df['Description'].replace({'-': ' ', 'total': ''}, regex=True).str.extract(r"(\d+) bed")
    df['Baths'] = df['Description'].replace({'-': ' ', 'total': ''}, regex=True).str.extract(r"((\d+) bath|(\d+(?:\.\d+)?) bath)")[0]
    df['Baths'] = df['Baths'].str.extract(r'([\d.]+)').astype(float)
    df['Year Built'] = df['Description'].str.extract(r"built in (\d{4})")
    df['Match Percent'] = [jaccard_sim_split_word_number(address, i) * 100 for i in df['Address Output']]
    df['Google Search Result'] = [*range(1, df.shape[0] + 1)]
    df.insert(0, 'Address Input', address)
    return df
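

# Example (hypothetical input): google_address("123 Main St, Springfield, IL 62704")
# returns one row per search result, with the address parsed from the result
# title, listing stats parsed from the description, and a match percentage
# against the input address.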


@app.get('/Hail_Docker_Data')
async def predict(address: str):
    try:
        results = google_address(address)
        results = results[['Address Input', 'Address Output', 'Match Percent', 'Website',
                           'Square Footage', 'Beds', 'Baths', 'Year Built',
                           'Link', 'Google Search Result', 'Description']]
    except Exception:
        # Fall back to echoing the input if the search or parsing fails.
        results = pd.DataFrame({'Address Input': [address]})
    # FastAPI cannot serialize a DataFrame directly; return JSON-friendly records.
    return results.to_dict(orient='records')
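

# A minimal sketch for running the API locally; the host and port values are
# assumptions, not part of the original file. Example query once running:
#   GET http://localhost:8000/Hail_Docker_Data?address=123%20Main%20St%2C%20Springfield%2C%20IL%2062704
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)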