Spaces:
Sleeping
Sleeping
Commit
·
7149684
1
Parent(s):
56aaa78
Update app.py
Browse files
app.py
CHANGED
@@ -7,12 +7,29 @@ import re
|
|
7 |
from bs4 import BeautifulSoup
|
8 |
import time
|
9 |
from joblib import Parallel, delayed
|
10 |
-
|
11 |
|
12 |
@st.cache_data
def convert_df(df):
    """Return the CSV form of ``df`` as produced by ``df.to_csv()``.

    The function is wrapped in ``st.cache_data``, so the decorator decides
    when the conversion is actually re-run.
    """
    csv_text = df.to_csv()
    return csv_text
|
15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
def extract_website_domain(url):
    """Return the ``netloc`` (network location) component of ``url``."""
    return urlparse(url).netloc
|
@@ -50,7 +67,7 @@ def google_address(address):
|
|
50 |
|
51 |
df=pd.DataFrame(texts_links_des,columns=['Title','Link','Description'])
|
52 |
df['Description']=df['Description'].bfill()
|
53 |
-
df['Address']=df['Title'].str.extract(r'(.+? \d{5})')
|
54 |
df['Link']=[i[7:i.find('&sa=')] for i in df['Link']]
|
55 |
df['Website'] = df['Link'].apply(extract_website_domain)
|
56 |
|
@@ -64,12 +81,15 @@ def google_address(address):
|
|
64 |
df['Baths']=df['Baths'].str.extract(r'([\d.]+)').astype(float)
|
65 |
|
66 |
df['Year Built']=df['Description'].str.extract(r"built in (\d{4})")
|
|
|
|
|
67 |
|
68 |
-
df_final=df[df['Address'].notnull()]
|
69 |
-
df_final=df_final[(df_final['Address'].str.contains(str(address_number))) & (df_final['Address'].str.contains(str(address_zip)))]
|
70 |
|
71 |
-
|
72 |
-
|
|
|
73 |
|
74 |
def catch_errors(addresses):
|
75 |
try:
|
@@ -84,18 +104,14 @@ def process_multiple_address(addresses):
|
|
84 |
|
85 |
|
86 |
|
87 |
-
|
88 |
-
|
89 |
st.set_page_config(layout="wide")
|
90 |
-
# col1, col2 = st.columns((2))
|
91 |
-
address_file = st.sidebar.radio('Choose',('Single Address', 'File'))
|
92 |
|
93 |
address = st.sidebar.text_input("Address", "190 Pebble Creek Dr Etna, OH 43062")
|
94 |
uploaded_file = st.sidebar.file_uploader("Choose a file")
|
95 |
-
|
|
|
96 |
return_sq = st.sidebar.radio('Return Only Results with Square Footage',('No', 'Yes'))
|
97 |
|
98 |
-
|
99 |
if address_file == 'File' and not None:
|
100 |
try:
|
101 |
df = pd.read_csv(uploaded_file)
|
@@ -110,30 +126,38 @@ if address_file == 'File' and not None:
|
|
110 |
|
111 |
results= process_multiple_address(df['Address All'].values)
|
112 |
results=pd.concat(results).reset_index(drop=1)
|
113 |
-
results.index=results.index+1
|
114 |
|
115 |
else:
|
116 |
results=google_address(address).reset_index(drop=1)
|
117 |
-
results.index=results.index+1
|
118 |
|
119 |
|
120 |
-
|
121 |
-
results=results[['Address Input', 'Address', 'Website','Square Footage', 'Beds', 'Baths', 'Year Built',
|
122 |
'Link', 'Description',
|
123 |
]]
|
124 |
|
|
|
|
|
125 |
if return_sq=='Yes':
|
126 |
results=results.query("`Square Footage`==`Square Footage`").reset_index(drop=1)
|
127 |
-
results.index=results.index+1
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
137 |
|
138 |
csv2 = convert_df(results)
|
139 |
st.download_button(
|
|
|
7 |
from bs4 import BeautifulSoup
|
8 |
import time
|
9 |
from joblib import Parallel, delayed
|
10 |
+
from nltk import ngrams
|
11 |
|
12 |
@st.cache_data
def convert_df(df):
    """Return the CSV form of ``df`` as produced by ``df.to_csv()``.

    The function is wrapped in ``st.cache_data``, so the decorator decides
    when the conversion is actually re-run.
    """
    csv_text = df.to_csv()
    return csv_text
|
15 |
|
16 |
+
def normalize_string(string):
    """Lower-case ``string`` and drop every character that is neither a
    word character nor whitespace (i.e. strip punctuation)."""
    return re.sub(r'[^\w\s]', '', string.lower())
|
21 |
+
|
22 |
+
def jaccard_similarity(string1, string2, n=2, normalize=True):
    """Return the Jaccard similarity of the character n-grams of two strings.

    The score is ``|A & B| / |A | B|`` where ``A`` and ``B`` are the sets of
    length-``n`` character n-grams of each input.

    Args:
        string1: First string to compare.
        string2: Second string to compare.
        n: Size of the character n-grams (default 2, i.e. bigrams).
        normalize: When true, both inputs are passed through
            ``normalize_string`` before the n-grams are built.

    Returns:
        A float in ``[0.0, 1.0]``. ``0.0`` is returned when either input is
        not a string, or when neither input yields any n-gram (for example
        both are shorter than ``n``), instead of raising.
    """
    # Non-string inputs (e.g. a missing value coming out of a DataFrame
    # column) have no n-grams to compare; treat them as "no match" rather
    # than failing on ``.lower()``.
    if not isinstance(string1, str) or not isinstance(string2, str):
        return 0.0

    if normalize:
        string1, string2 = normalize_string(string1), normalize_string(string2)

    # zip over n shifted views of the string yields every tuple of n
    # consecutive characters, stopping at the shortest view.
    grams1 = set(zip(*(string1[i:] for i in range(n))))
    grams2 = set(zip(*(string2[i:] for i in range(n))))

    union = grams1 | grams2
    if not union:
        # Both n-gram sets are empty: avoid dividing by zero.
        return 0.0
    return len(grams1 & grams2) / len(union)
|
31 |
+
|
32 |
+
|
33 |
def extract_website_domain(url):
    """Return the ``netloc`` (network location) component of ``url``."""
    return urlparse(url).netloc
|
|
|
67 |
|
68 |
df=pd.DataFrame(texts_links_des,columns=['Title','Link','Description'])
|
69 |
df['Description']=df['Description'].bfill()
|
70 |
+
df['Address Output']=df['Title'].str.extract(r'(.+? \d{5})')
|
71 |
df['Link']=[i[7:i.find('&sa=')] for i in df['Link']]
|
72 |
df['Website'] = df['Link'].apply(extract_website_domain)
|
73 |
|
|
|
81 |
df['Baths']=df['Baths'].str.extract(r'([\d.]+)').astype(float)
|
82 |
|
83 |
df['Year Built']=df['Description'].str.extract(r"built in (\d{4})")
|
84 |
+
df['Match Percent']=[jaccard_similarity(address,i)*100 for i in df['Address Output']]
|
85 |
+
|
86 |
|
87 |
+
# df_final=df[df['Address Output'].notnull()]
|
88 |
+
# df_final=df_final[(df_final['Address Output'].str.contains(str(address_number))) & (df_final['Address Output'].str.contains(str(address_zip)))]
|
89 |
|
90 |
+
df.insert(0,'Address Input',address)
|
91 |
+
|
92 |
+
return df
|
93 |
|
94 |
def catch_errors(addresses):
|
95 |
try:
|
|
|
104 |
|
105 |
|
106 |
|
|
|
|
|
107 |
st.set_page_config(layout="wide")
|
|
|
|
|
108 |
|
109 |
address = st.sidebar.text_input("Address", "190 Pebble Creek Dr Etna, OH 43062")
|
110 |
uploaded_file = st.sidebar.file_uploader("Choose a file")
|
111 |
+
address_file = st.sidebar.radio('Choose',('Single Address', 'File'))
|
112 |
+
match_percent = st.sidebar.selectbox('Address Match Percentage At Least:',(70, 80, 90, 100, 0))
|
113 |
return_sq = st.sidebar.radio('Return Only Results with Square Footage',('No', 'Yes'))
|
114 |
|
|
|
115 |
if address_file == 'File' and not None:
|
116 |
try:
|
117 |
df = pd.read_csv(uploaded_file)
|
|
|
126 |
|
127 |
results= process_multiple_address(df['Address All'].values)
|
128 |
results=pd.concat(results).reset_index(drop=1)
|
129 |
+
# results.index=results.index+1
|
130 |
|
131 |
else:
|
132 |
results=google_address(address).reset_index(drop=1)
|
133 |
+
# results.index=results.index+1
|
134 |
|
135 |
|
136 |
+
results=results[['Address Input', 'Address Output','Match Percent', 'Website','Square Footage', 'Beds', 'Baths', 'Year Built',
|
|
|
137 |
'Link', 'Description',
|
138 |
]]
|
139 |
|
140 |
+
results=results.query(f"`Match Percent`>={match_percent}")
|
141 |
+
|
142 |
if return_sq=='Yes':
|
143 |
results=results.query("`Square Footage`==`Square Footage`").reset_index(drop=1)
|
144 |
+
# results.index=results.index+1
|
145 |
+
|
146 |
+
|
147 |
+
|
148 |
+
with st.container():
|
149 |
+
|
150 |
+
st.dataframe(
|
151 |
+
results,
|
152 |
+
column_config={
|
153 |
+
|
154 |
+
"Link": st.column_config.LinkColumn("Link"),
|
155 |
+
'Match Percent': st.column_config.NumberColumn(format='%.2f %%'),
|
156 |
+
},
|
157 |
+
hide_index=True,
|
158 |
+
# height=500,
|
159 |
+
# width=500,
|
160 |
+
)
|
161 |
|
162 |
csv2 = convert_df(results)
|
163 |
st.download_button(
|