Commit 9ec1981 · Parent: 0e8d848
Create app.py
app.py
ADDED
@@ -0,0 +1,134 @@
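# Streamlit app: search Google for a street address and scrape the
# results into a table of links plus any square footage, beds, baths,
# and year-built details found in the snippets.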
import streamlit as st
import pandas as pd
import numpy as np
import requests
from urllib.parse import urlparse, quote
import re
from bs4 import BeautifulSoup
import time
from joblib import Parallel, delayed

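# Cache the CSV conversion so Streamlit doesn't redo it on each rerun.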
@st.cache_data
def convert_df(df):
    return df.to_csv()

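# Return just the domain (netloc) portion of a URL.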
def extract_website_domain(url):
    parsed_url = urlparse(url)
    return parsed_url.netloc

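# Search Google for the address and mine the result snippets for
# property details. Google serves this HTML to unauthenticated
# scrapers on a best-effort basis, so results can vary or be blocked.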
def google_address(address):

    address_number = re.findall(r'\b\d+\b', address)[0]

    search_query = quote(address)
    url = f'https://www.google.com/search?q={search_query}'
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

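    # Keep anchors that wrap external results ('/url?q=http...') and
    # have a reasonably long label (short ones are typically navigation).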
    texts_links = []
    for link in soup.find_all("a"):
        t, l = link.get_text(), link.get("href")
        if l and l[:11] == '/url?q=http' and len(t) > 20:
            texts_links.append((t, l))


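    # Recover each result's description by slicing the page text
    # between consecutive result titles.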
    text = soup.get_text()

    texts_links_des = []
    for i, t_l in enumerate(texts_links):
        start = text.find(t_l[0][:50])
        try:
            end = text.find(texts_links[i + 1][0][:50])
        except IndexError:  # last result: read up to the trailer text
            end = text.find('Related searches')

        description = text[start:end]
        texts_links_des.append((t_l[0], t_l[1], description))

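    # Build the results frame: parse the address out of each title, the
    # destination URL out of Google's '/url?q=...&sa=' wrapper, and any
    # property attributes out of the description.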
    df = pd.DataFrame(texts_links_des, columns=['Title', 'Link', 'Description'])
    df['Description'] = df['Description'].bfill()
    df['Address'] = df['Title'].str.extract(r'(.+? \d{5})')
    df['Link'] = [i[7:i.find('&sa=')] for i in df['Link']]
    df['Website'] = df['Link'].apply(extract_website_domain)

    df['Square Footage'] = df['Description'].str.extract(
        r"((\d+) Square Feet|(\d+) sq\. ft\.|(\d+) sqft|(\d+) Sq\. Ft\.|(\d+) sq|(\d+(?:,\d+)?) Sq\. Ft\.)")[0]
    df['Square Footage'] = df['Square Footage'].replace({',': ''}, regex=True).str.replace(r'\D', '', regex=True)

    df['Beds'] = df['Description'].replace({'-': ' ', 'total': ''}, regex=True).str.extract(r"(\d+) bed")
    df['Baths'] = df['Description'].replace({'-': ' ', 'total': ''}, regex=True).str.extract(r"(\d+) bath")
    df['Year Built'] = df['Description'].str.extract(r"built in (\d{4})")

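    # Drop rows whose parsed address is missing or doesn't contain the
    # house number we searched for (unrelated hits).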
    df_final = df[df['Address'].notnull()]
    df_final = df_final[df_final['Address'].str.contains(str(address_number))]
    return df_final

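# Each address lookup is independent, so fan them out across threads.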
def process_multiple_address(addresses):
    results = Parallel(n_jobs=-1, prefer="threads")(delayed(google_address)(i) for i in addresses)
    return results


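# Sidebar controls: single-address mode or a batch file upload.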
st.set_page_config(layout="wide")
# col1, col2 = st.columns((2))
address_file = st.sidebar.radio('Choose', ('Single Address', 'File'))

address = st.sidebar.text_input("Address", "190 Pebble Creek Dr Etna, OH 43062")
uploaded_file = st.sidebar.file_uploader("Choose a file")
# uploaded_file='C:/Users/mritchey/Documents/addresses 100 generated.xlsx'
return_sq = st.sidebar.radio('Return Only Results with Square Footage', ('No', 'Yes'))

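# Batch mode: read the uploaded CSV/Excel, normalize the zip column,
# build one full address string per row, and scrape them in parallel.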
if address_file == 'File' and uploaded_file is not None:
    try:
        df = pd.read_csv(uploaded_file)
    except Exception:
        df = pd.read_excel(uploaded_file)

    address_cols = list(df.columns[:4])
    df[address_cols[-1]] = df[address_cols[-1]].astype(str).str[:5].astype(int).astype(str)
    df[address_cols[-1]] = df[address_cols[-1]].apply(lambda x: x.zfill(5))

    df['Address All'] = df[address_cols[0]] + ', ' + df[address_cols[1]] + ', ' + df[address_cols[2]] + ' ' + df[address_cols[3]]

    results = process_multiple_address(df['Address All'].values)
    # Tag each per-address frame with its input so the shared column
    # selection below works in both modes.
    for addr, res in zip(df['Address All'], results):
        res.insert(0, 'Address Input', addr)
    results = pd.concat(results).reset_index(drop=True)
    results.index = results.index + 1

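# Single-address mode: one scrape, tagged with the input address.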
else:
    results = google_address(address).reset_index(drop=True)
    results.index = results.index + 1
    results.insert(0, 'Address Input', address)


results = results[['Address Input', 'Address', 'Website', 'Square Footage',
                   'Beds', 'Baths', 'Year Built', 'Link', 'Description']]

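# "x == x" is false only for NaN, so this query keeps rows where a
# square footage was actually parsed.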
if return_sq == 'Yes':
    results = results.query("`Square Footage` == `Square Footage`").reset_index(drop=True)
    results.index = results.index + 1

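# Render the table with clickable links, then offer a CSV download.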
st.dataframe(
    results,
    column_config={
        "Link": st.column_config.LinkColumn("Link"),
    },
    hide_index=True,
)

csv2 = convert_df(results)
st.download_button(
    label="Download data as CSV",
    data=csv2,
    file_name=f'{address}.csv',
    mime='text/csv')

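# Hide Streamlit's default menu and footer chrome.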
st.markdown(""" <style>
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
</style> """, unsafe_allow_html=True)