mattritchey committed on
Commit
7149684
·
1 Parent(s): 56aaa78

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -26
app.py CHANGED
@@ -7,12 +7,29 @@ import re
7
  from bs4 import BeautifulSoup
8
  import time
9
  from joblib import Parallel, delayed
10
-
11
 
12
  @st.cache_data
13
  def convert_df(df):
14
  return df.to_csv()
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  def extract_website_domain(url):
17
  parsed_url = urlparse(url)
18
  return parsed_url.netloc
@@ -50,7 +67,7 @@ def google_address(address):
50
 
51
  df=pd.DataFrame(texts_links_des,columns=['Title','Link','Description'])
52
  df['Description']=df['Description'].bfill()
53
- df['Address']=df['Title'].str.extract(r'(.+? \d{5})')
54
  df['Link']=[i[7:i.find('&sa=')] for i in df['Link']]
55
  df['Website'] = df['Link'].apply(extract_website_domain)
56
 
@@ -64,12 +81,15 @@ def google_address(address):
64
  df['Baths']=df['Baths'].str.extract(r'([\d.]+)').astype(float)
65
 
66
  df['Year Built']=df['Description'].str.extract(r"built in (\d{4})")
 
 
67
 
68
- df_final=df[df['Address'].notnull()]
69
- df_final=df_final[(df_final['Address'].str.contains(str(address_number))) & (df_final['Address'].str.contains(str(address_zip)))]
70
 
71
- df_final.insert(0,'Address Input',address)
72
- return df_final
 
73
 
74
  def catch_errors(addresses):
75
  try:
@@ -84,18 +104,14 @@ def process_multiple_address(addresses):
84
 
85
 
86
 
87
-
88
-
89
  st.set_page_config(layout="wide")
90
- # col1, col2 = st.columns((2))
91
- address_file = st.sidebar.radio('Choose',('Single Address', 'File'))
92
 
93
  address = st.sidebar.text_input("Address", "190 Pebble Creek Dr Etna, OH 43062")
94
  uploaded_file = st.sidebar.file_uploader("Choose a file")
95
- # uploaded_file='C:/Users/mritchey/Documents/addresses 100 generated.xlsx'
 
96
  return_sq = st.sidebar.radio('Return Only Results with Square Footage',('No', 'Yes'))
97
 
98
-
99
  if address_file == 'File' and not None:
100
  try:
101
  df = pd.read_csv(uploaded_file)
@@ -110,30 +126,38 @@ if address_file == 'File' and not None:
110
 
111
  results= process_multiple_address(df['Address All'].values)
112
  results=pd.concat(results).reset_index(drop=1)
113
- results.index=results.index+1
114
 
115
  else:
116
  results=google_address(address).reset_index(drop=1)
117
- results.index=results.index+1
118
 
119
 
120
-
121
- results=results[['Address Input', 'Address', 'Website','Square Footage', 'Beds', 'Baths', 'Year Built',
122
  'Link', 'Description',
123
  ]]
124
 
 
 
125
  if return_sq=='Yes':
126
  results=results.query("`Square Footage`==`Square Footage`").reset_index(drop=1)
127
- results.index=results.index+1
128
-
129
- st.dataframe(
130
- results,
131
- column_config={
132
-
133
- "Link": st.column_config.LinkColumn("Link"),
134
- },
135
- hide_index=True,
136
- )
 
 
 
 
 
 
 
137
 
138
  csv2 = convert_df(results)
139
  st.download_button(
 
7
  from bs4 import BeautifulSoup
8
  import time
9
  from joblib import Parallel, delayed
10
+ from nltk import ngrams
11
 
12
@st.cache_data
def convert_df(df):
    """Serialize *df* to CSV text.

    Cached by Streamlit so repeated reruns with the same DataFrame
    do not re-serialize; used to feed st.download_button.
    """
    csv_text = df.to_csv()
    return csv_text
15
 
16
def normalize_string(string):
    """Normalize *string* for fuzzy comparison.

    Lowercases the text and strips every character that is neither a
    word character nor whitespace (i.e. punctuation), so that address
    strings differing only in case or punctuation compare equal.
    """
    return re.sub(r'[^\w\s]', '', string.lower())
21
+
22
def jaccard_similarity(string1, string2, n=2, normalize=True):
    """Return the Jaccard similarity of two strings over character n-grams.

    Args:
        string1: First string to compare.
        string2: Second string to compare.
        n: n-gram size (default 2, i.e. character bigrams).
        normalize: When True, lowercase and strip punctuation from both
            strings (via normalize_string) before comparing.

    Returns:
        float in [0.0, 1.0]: |intersection| / |union| of the two n-gram
        sets. Strings too short to produce any n-gram are compared by
        exact equality instead of raising ZeroDivisionError (the
        original crashed on e.g. two empty or single-character strings).
    """
    if normalize:
        string1, string2 = normalize_string(string1), normalize_string(string2)

    # Character n-grams via zip of shifted slices — same tuples that
    # nltk.ngrams(s, n) yields, without the third-party dependency.
    grams1 = set(zip(*(string1[i:] for i in range(n))))
    grams2 = set(zip(*(string2[i:] for i in range(n))))

    union = grams1 | grams2
    if not union:
        # Neither string is long enough to produce an n-gram; fall back
        # to exact equality rather than dividing by zero.
        return 1.0 if string1 == string2 else 0.0
    return len(grams1 & grams2) / len(union)
31
+
32
+
33
def extract_website_domain(url):
    """Return the network-location (domain) component of *url*,
    e.g. 'www.zillow.com' for 'https://www.zillow.com/homes/...'."""
    return urlparse(url).netloc
 
67
 
68
  df=pd.DataFrame(texts_links_des,columns=['Title','Link','Description'])
69
  df['Description']=df['Description'].bfill()
70
+ df['Address Output']=df['Title'].str.extract(r'(.+? \d{5})')
71
  df['Link']=[i[7:i.find('&sa=')] for i in df['Link']]
72
  df['Website'] = df['Link'].apply(extract_website_domain)
73
 
 
81
  df['Baths']=df['Baths'].str.extract(r'([\d.]+)').astype(float)
82
 
83
  df['Year Built']=df['Description'].str.extract(r"built in (\d{4})")
84
+ df['Match Percent']=[jaccard_similarity(address,i)*100 for i in df['Address Output']]
85
+
86
 
87
+ # df_final=df[df['Address Output'].notnull()]
88
+ # df_final=df_final[(df_final['Address Output'].str.contains(str(address_number))) & (df_final['Address Output'].str.contains(str(address_zip)))]
89
 
90
+ df.insert(0,'Address Input',address)
91
+
92
+ return df
93
 
94
  def catch_errors(addresses):
95
  try:
 
104
 
105
 
106
 
 
 
107
  st.set_page_config(layout="wide")
 
 
108
 
109
  address = st.sidebar.text_input("Address", "190 Pebble Creek Dr Etna, OH 43062")
110
  uploaded_file = st.sidebar.file_uploader("Choose a file")
111
+ address_file = st.sidebar.radio('Choose',('Single Address', 'File'))
112
+ match_percent = st.sidebar.selectbox('Address Match Percentage At Least:',(70, 80, 90, 100, 0))
113
  return_sq = st.sidebar.radio('Return Only Results with Square Footage',('No', 'Yes'))
114
 
 
115
  if address_file == 'File' and not None:
116
  try:
117
  df = pd.read_csv(uploaded_file)
 
126
 
127
  results= process_multiple_address(df['Address All'].values)
128
  results=pd.concat(results).reset_index(drop=1)
129
+ # results.index=results.index+1
130
 
131
  else:
132
  results=google_address(address).reset_index(drop=1)
133
+ # results.index=results.index+1
134
 
135
 
136
+ results=results[['Address Input', 'Address Output','Match Percent', 'Website','Square Footage', 'Beds', 'Baths', 'Year Built',
 
137
  'Link', 'Description',
138
  ]]
139
 
140
+ results=results.query(f"`Match Percent`>={match_percent}")
141
+
142
  if return_sq=='Yes':
143
  results=results.query("`Square Footage`==`Square Footage`").reset_index(drop=1)
144
+ # results.index=results.index+1
145
+
146
+
147
+
148
+ with st.container():
149
+
150
+ st.dataframe(
151
+ results,
152
+ column_config={
153
+
154
+ "Link": st.column_config.LinkColumn("Link"),
155
+ 'Match Percent': st.column_config.NumberColumn(format='%.2f %%'),
156
+ },
157
+ hide_index=True,
158
+ # height=500,
159
+ # width=500,
160
+ )
161
 
162
  csv2 = convert_df(results)
163
  st.download_button(