mattritchey committed on
Commit
9ec1981
·
1 Parent(s): 0e8d848

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +134 -0
app.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ import streamlit as st
4
+ import pandas as pd
5
+ import numpy as np
6
+ import requests
7
+ from urllib.parse import urlparse, quote
8
+ import re
9
+ from bs4 import BeautifulSoup
10
+ import time
11
+ from joblib import Parallel, delayed
12
+
13
+
14
@st.cache_data
def convert_df(df):
    """Serialize *df* to CSV text; cached by Streamlit across reruns."""
    csv_text = df.to_csv()
    return csv_text
17
+
18
def extract_website_domain(url):
    """Return the network-location (host) part of *url*, '' if absent."""
    return urlparse(url).netloc
21
+
22
+
23
def google_address(address):
    """Scrape the Google results page for *address* and return a DataFrame
    of candidate property listings.

    Columns: Title, Link, Description, Address, Website, Square Footage,
    Beds, Baths, Year Built.  Only rows whose parsed address contains the
    street number of the input address are kept.
    """
    # Street number of the query, used to filter rows at the end.
    # Guarded: an address with no digits previously raised IndexError.
    numbers = re.findall(r'\b\d+\b', address)
    address_number = numbers[0] if numbers else ''

    search_query = quote(address)
    url = f'https://www.google.com/search?q={search_query}'
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    # Collect (anchor text, href) pairs that look like organic result links.
    texts_links = []
    for link in soup.find_all("a"):
        t, l = link.get_text(), link.get("href")
        # href can be None for anchors without a target; skip them
        # (previously `l[:11]` raised TypeError on such anchors).
        if l and l[:11] == '/url?q=http' and len(t) > 20:
            texts_links.append((t, l))

    text = soup.get_text()

    # Slice the page text between consecutive result titles to recover each
    # result's "description" snippet.
    texts_links_des = []
    for i, t_l in enumerate(texts_links):
        start = text.find(texts_links[i][0][:50])
        try:
            end = text.find(texts_links[i + 1][0][:50])
        except IndexError:  # last result: run up to the "Related searches" block
            end = text.find('Related searches')
        description = text[start:end]
        texts_links_des.append((t_l[0], t_l[1], description))

    df = pd.DataFrame(texts_links_des, columns=['Title', 'Link', 'Description'])
    df['Description'] = df['Description'].bfill()
    # Anything up to a 5-digit ZIP is treated as the listing's address.
    df['Address'] = df['Title'].str.extract(r'(.+? \d{5})')
    # Strip the '/url?q=' prefix and the '&sa=' tracking suffix from hrefs.
    df['Link'] = [i[7:i.find('&sa=')] for i in df['Link']]
    df['Website'] = df['Link'].apply(extract_website_domain)

    # Dots escaped: the original pattern's bare '.' matched any character.
    df['Square Footage'] = df['Description'].str.extract(
        r"((\d+) Square Feet|(\d+) sq\. ft\.|(\d+) sqft|"
        r"(\d+) Sq\. Ft\.|(\d+) sq|(\d+(?:,\d+)?) Sq\. Ft\.)")[0]
    # regex=True is required: pandas >= 2.0 treats str.replace patterns
    # literally by default, so r'\D' would never match without it.
    df['Square Footage'] = (df['Square Footage']
                            .replace({',': ''}, regex=True)
                            .str.replace(r'\D', '', regex=True))

    df['Beds'] = df['Description'].replace(
        {'-': ' ', 'total': ''}, regex=True).str.extract(r"(\d+) bed")
    df['Baths'] = df['Description'].replace(
        {'-': ' ', 'total': ''}, regex=True).str.extract(r"(\d+) bath")
    df['Year Built'] = df['Description'].str.extract(r"built in (\d{4})")

    # Keep only rows with a parsed address containing the query street number.
    df_final = df[df['Address'].notnull()]
    df_final = df_final[df_final['Address'].str.contains(str(address_number))]
    return df_final
68
+
69
def process_multiple_address(addresses):
    """Look up every address in *addresses* concurrently (thread-backed).

    Returns a list of DataFrames, one per input address, in input order.
    """
    jobs = (delayed(google_address)(addr) for addr in addresses)
    return Parallel(n_jobs=-1, prefer="threads")(jobs)
72
+
73
+
74
st.set_page_config(layout="wide")

# --- Sidebar inputs -------------------------------------------------------
address_file = st.sidebar.radio('Choose', ('Single Address', 'File'))
address = st.sidebar.text_input("Address", "190 Pebble Creek Dr Etna, OH 43062")
uploaded_file = st.sidebar.file_uploader("Choose a file")
return_sq = st.sidebar.radio('Return Only Results with Square Footage', ('No', 'Yes'))

# Fixed: the original condition was `address_file == 'File' and not None`,
# where `not None` is always True — the upload is now checked explicitly.
if address_file == 'File' and uploaded_file is not None:
    # Accept either a CSV or an Excel upload.
    try:
        df = pd.read_csv(uploaded_file)
    except Exception:
        df = pd.read_excel(uploaded_file)

    # First four columns are assumed to be street, city, state, zip.
    # TODO(review): confirm the expected upload schema.
    address_cols = list(df.columns[:4])
    # Normalize ZIPs to 5-digit, zero-padded strings.
    df[address_cols[-1]] = df[address_cols[-1]].astype(str).str[:5].astype(int).astype(str)
    df[address_cols[-1]] = df[address_cols[-1]].apply(lambda x: x.zfill(5))

    df['Address All'] = (df[address_cols[0]] + ', ' + df[address_cols[1]] + ', '
                         + df[address_cols[2]] + ' ' + df[address_cols[3]])

    # Fixed: the original call passed an unsupported `trial=True` keyword,
    # which raised a TypeError at runtime.
    frames = process_multiple_address(df['Address All'].values)
    # Tag each result frame with the address that produced it so the
    # 'Address Input' column exists in this branch too (the original only
    # created it in the single-address branch, causing a KeyError below).
    for addr, frame in zip(df['Address All'].values, frames):
        frame.insert(0, 'Address Input', addr)
    results = pd.concat(frames).reset_index(drop=True)
    results.index = results.index + 1
else:
    results = google_address(address).reset_index(drop=True)
    results.index = results.index + 1
    results.insert(0, 'Address Input', address)

results = results[['Address Input', 'Address', 'Website', 'Square Footage',
                   'Beds', 'Baths', 'Year Built', 'Link', 'Description']]

if return_sq == 'Yes':
    # `x == x` is False only for NaN: keeps rows that have a square footage.
    results = results.query("`Square Footage`==`Square Footage`").reset_index(drop=True)
    results.index = results.index + 1

st.dataframe(
    results,
    column_config={
        "Link": st.column_config.LinkColumn("Link"),
    },
    hide_index=True,
)

csv2 = convert_df(results)
st.download_button(
    label="Download data as CSV",
    data=csv2,
    file_name=f'{address}.csv',
    mime='text/csv')

# Hide Streamlit's default menu and footer chrome.
st.markdown(""" <style>
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
</style> """, unsafe_allow_html=True)