joshuadunlop committed on
Commit 68329f5 · verified · 1 Parent(s): 4aff20a

Create app.py

Files changed (1)
  1. app.py +283 -0
app.py ADDED
@@ -0,0 +1,283 @@
import advertools as adv
import streamlit as st
import tempfile
import pandas as pd
from urllib.parse import urlparse
import base64
import requests
import time

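# Query the SEO PowerSuite backlinks API for each domain's inlink rank and
# referring-domain count. Targets are sent in batches of 100 per request;
# the response shapes assumed below (the "pages" and "metrics" keys) follow
# the payloads this app was written against and may need adjusting if the
# API changes.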
def get_seo_powersuite_data(domains, api_key):
    url_domain_inlink_rank = "https://api.seopowersuite.com/backlinks/v1.0/get-domain-inlink-rank"
    url_refdomains_count = "https://api.seopowersuite.com/backlinks/v1.0/get-refdomains-count"
    headers = {"Content-Type": "application/json"}

    results = []
    for i in range(0, len(domains), 100):
        batch_domains = domains[i:i+100]

        # Get domain inlink rank
        start_time = time.time()
        payload_domain_inlink_rank = {"target": list(batch_domains)}
        params_domain_inlink_rank = {"apikey": api_key, "output": "json"}
        response_domain_inlink_rank = requests.post(url_domain_inlink_rank, json=payload_domain_inlink_rank, headers=headers, params=params_domain_inlink_rank)
        duration = time.time() - start_time
        print(f"get-domain-inlink-rank API call for {len(batch_domains)} domains took {duration:.2f} seconds")

        if response_domain_inlink_rank.status_code == 200:
            data_domain_inlink_rank = response_domain_inlink_rank.json()
            domain_inlink_rank_dict = {page["url"]: page["domain_inlink_rank"] for page in data_domain_inlink_rank["pages"]}
        else:
            st.error(f"Error fetching domain inlink rank data from SEO PowerSuite API: {response_domain_inlink_rank.status_code}")
            st.error("Error Response:")
            st.write(response_domain_inlink_rank.text)
            return None

        # Get refdomains count
        start_time = time.time()
        payload_refdomains_count = {"target": list(batch_domains), "mode": "domain"}
        params_refdomains_count = {"apikey": api_key, "output": "json"}
        response_refdomains_count = requests.post(url_refdomains_count, json=payload_refdomains_count, headers=headers, params=params_refdomains_count)
        duration = time.time() - start_time
        print(f"get-refdomains-count API call for {len(batch_domains)} domains took {duration:.2f} seconds")

        if response_refdomains_count.status_code == 200:
            data_refdomains_count = response_refdomains_count.json()
            for metric in data_refdomains_count["metrics"]:
                result = {
                    "target": metric["target"],
                    "domain_inlink_rank": domain_inlink_rank_dict.get(metric["target"], None),
                    "refdomains": metric["refdomains"]
                }
                results.append(result)
        else:
            st.error(f"Error fetching refdomains count data from SEO PowerSuite API: {response_refdomains_count.status_code}")
            st.error("Error Response:")
            st.write(response_refdomains_count.text)
            return None

    return pd.DataFrame(results)

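# Peter Lowe's ad/tracking server list, served in Adblock Plus format where
# each entry looks like "||example.com^"; stripping the markers recovers the
# bare domain names.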
def get_peter_lowe_domains():
    url = "https://pgl.yoyo.org/adservers/serverlist.php?hostformat=adblockplus&mimetype=plaintext"
    response = requests.get(url)
    lines = response.text.split('\n')
    domains = [line.strip('|^') for line in lines if line.startswith('||')]
    return set(domains)

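# Return the hostname portion of a URL, e.g. "https://example.com/page" -> "example.com".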
def extract_hostname(url):
    return urlparse(url).netloc

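# Collapse a hostname to its last two labels ("blog.example.com" -> "example.com").
# Note this is a naive heuristic: multi-part TLDs such as "example.co.uk" are
# reduced to "co.uk" and may not match the blocklist as intended.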
def remove_subdomain(domain):
    parts = domain.split('.')
    if len(parts) > 2:
        return '.'.join(parts[-2:])
    return domain

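# Crawl a site with advertools (Scrapy under the hood), writing the crawl to a
# temporary JSON-lines file, then load it back and keep only the columns this
# app uses downstream.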
def crawl_website(url, exclude_url_regex, page_count, concurrent_requests, depth_limit):
    with tempfile.NamedTemporaryFile(delete=False, suffix='.jl') as temp_file:
        custom_settings = {
            "CLOSESPIDER_PAGECOUNT": page_count,
            "CONCURRENT_REQUESTS_PER_DOMAIN": concurrent_requests,
            "DEPTH_LIMIT": depth_limit
        }
        if exclude_url_regex:
            adv.crawl(url, output_file=temp_file.name, follow_links=True, exclude_url_params=True, exclude_url_regex=exclude_url_regex, custom_settings=custom_settings)
        else:
            adv.crawl(url, output_file=temp_file.name, follow_links=True, exclude_url_params=True, custom_settings=custom_settings)
        crawl_df = pd.read_json(temp_file.name, lines=True)
        crawl_df['hostname'] = crawl_df['url'].apply(extract_hostname)
        # Select only the desired columns; reindex fills any column the crawl
        # did not produce with NaN instead of raising a KeyError.
        desired_columns = ['url', 'hostname', 'title', 'meta_desc', 'status', 'links_url', 'links_text', 'links_nofollow']
        crawl_df = crawl_df.reindex(columns=desired_columns)
        return crawl_df

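# Build an in-page download link by base64-encoding the CSV into a data: URI,
# a common workaround for offering file downloads via Streamlit markdown.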
def download_csv(df, filename):
    csv = df.to_csv(index=False)
    b64 = base64.b64encode(csv.encode()).decode()
    href = f'<a href="data:file/csv;base64,{b64}" download="{filename}.csv">Download {filename} CSV</a>'
    return href

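# Streamlit entry point: collect crawl settings, crawl each domain, and render
# (or offer as CSV downloads) the link, domain, and analytics tables.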
def main():
    st.title("Website Crawler")

    domains = st.text_area("Enter the website URLs (one per line):", value="")
    page_count = st.number_input("Enter the maximum number of pages to crawl:", value=100, min_value=1, step=1)
    col1, col2 = st.columns(2)
    with col1:
        concurrent_requests = st.number_input("Enter the number of concurrent requests per domain:", value=8, min_value=1, step=1)
    with col2:
        depth_limit = st.number_input("Enter the depth limit for the crawl:", value=0, min_value=0, step=1)

    col1, col2 = st.columns(2)
    with col1:
        exclude_url_regex_input = st.text_area("Enter exclude URL regex patterns (one per line):", value="cdn")
    with col2:
        domain_filter_regex_input = st.text_area("Filter out unique domains with regex (one per line):", value="instagram\nfacebook\ntwitter\nlinkedin\nsnapchat\ntiktok\nreddit\npinterest\namazon\ncdn\nyoutube\nyoutu.be")

    col1, col2 = st.columns([2, 5])
    with col1:
        use_seo_powersuite = st.checkbox("Use SEO PowerSuite")
    with col2:
        api_key = None
        if use_seo_powersuite:
            api_key = st.text_input("Enter the SEO PowerSuite API key:", type="password", value="")
    download_links = st.checkbox("Show Download Links")

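    # Everything below runs only after the user clicks "Crawl".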
    if st.button("Crawl"):
        if domains:
            domain_list = [domain.strip() for domain in domains.split('\n') if domain.strip()]
            if domain_list:
                # Drop blank lines before joining; an empty alternative in the
                # combined regex ("" matches everything) would exclude every URL.
                exclude_url_regex_patterns = [p for p in exclude_url_regex_input.split('\n') if p.strip()]
                exclude_url_regex = '|'.join(exclude_url_regex_patterns) if exclude_url_regex_patterns else None

                # Fetch Peter Lowe's blocklist once, rather than once per crawled domain.
                peter_lowe_domains = get_peter_lowe_domains()

                all_crawl_results = []
                all_link_df = []
                all_unique_links_df = []
                all_unique_outbound_links_df = []
                all_domain_df = []
                all_final_df = []
                all_analysis_df = []

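                # Crawl each domain and derive its link tables: all links,
                # unique links, unique outbound links, and outbound domains
                # checked against Peter Lowe's blocklist.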
                for domain in domain_list:
                    with st.spinner(f"Crawling {domain}..."):
                        crawl_results = crawl_website(domain, exclude_url_regex, page_count, concurrent_requests, depth_limit)
                        crawl_results.insert(0, 'Originating Domain', domain)
                        all_crawl_results.append(crawl_results)

                        if not crawl_results.empty:
                            link_df = adv.crawlytics.links(crawl_results, internal_url_regex=extract_hostname(domain))
                            link_df.insert(0, 'Originating Domain', domain)
                            all_link_df.append(link_df)

                            unique_links_df = link_df['link'].value_counts().reset_index()
                            unique_links_df.columns = ['Link', 'Count']
                            unique_links_df.insert(0, 'Originating Domain', domain)
                            all_unique_links_df.append(unique_links_df)

                            # .copy() avoids pandas' SettingWithCopyWarning when the
                            # 'link' column is overwritten below.
                            outbound_links_df = link_df[link_df['internal'] == False].copy()
                            unique_outbound_links_df = outbound_links_df['link'].value_counts().reset_index()
                            unique_outbound_links_df.columns = ['Link', 'Count']
                            unique_outbound_links_df.insert(0, 'Originating Domain', domain)
                            all_unique_outbound_links_df.append(unique_outbound_links_df)

                            outbound_links_df['link'] = outbound_links_df['link'].astype(str)
                            domain_df = outbound_links_df['link'].apply(extract_hostname).value_counts().reset_index()
                            domain_df.columns = ['Domain', 'Count']
                            domain_df = domain_df[domain_df['Domain'] != '']
                            domain_df['In Peter Lowe List'] = domain_df['Domain'].apply(lambda x: 'Yes' if remove_subdomain(x) in peter_lowe_domains else 'No')
                            domain_df.insert(0, 'Originating Domain', domain)
                            all_domain_df.append(domain_df)

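                            # Optionally filter the outbound-domain table and
                            # enrich it with SEO PowerSuite metrics before
                            # building the per-domain analytics rows.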
                            if not domain_df.empty:
                                if domain_filter_regex_input:
                                    # Same guard against blank patterns as the exclude regex above.
                                    domain_filter_regex_patterns = [p for p in domain_filter_regex_input.split('\n') if p.strip()]
                                    if domain_filter_regex_patterns:
                                        domain_filter_regex = '|'.join(domain_filter_regex_patterns)
                                        domain_df = domain_df[~domain_df['Domain'].str.contains(domain_filter_regex, case=False, regex=True)]

                                if use_seo_powersuite and api_key:
                                    seo_powersuite_df = get_seo_powersuite_data(domain_df['Domain'].tolist(), api_key)
                                    if seo_powersuite_df is not None:
                                        domain_df = pd.merge(domain_df, seo_powersuite_df, left_on='Domain', right_on='target', how='left')
                                        domain_df.drop('target', axis=1, inplace=True)

                                        desired_columns = ['Originating Domain', 'Domain', 'Count', 'In Peter Lowe List', 'domain_inlink_rank', 'refdomains']
                                        final_df = domain_df[desired_columns]
                                        all_final_df.append(final_df)

                                        # Assumes at least one domain survived the filter;
                                        # an empty final_df would raise ZeroDivisionError here.
                                        total_domains = len(final_df)
                                        peter_lowe_percentage = round((final_df['In Peter Lowe List'] == 'No').sum() / total_domains * 100, 2)
                                        avg_domain_inlink_rank = round(final_df['domain_inlink_rank'].mean(), 2)
                                        avg_domain_inlink_rank_less_than_70 = round(final_df[final_df['domain_inlink_rank'] < 70]['domain_inlink_rank'].mean(), 2)
                                        avg_refdomains = round(final_df['refdomains'].mean(), 2)

                                        analysis_data = {
                                            'Originating Domain': [domain] * 4,
                                            'Metric': [
                                                'Percentage of domains not in Peter Lowe\'s list',
                                                'Average domain inlink rank',
                                                'Average domain inlink rank (< 70)',
                                                'Average number of refdomains'
                                            ],
                                            'Value': [
                                                f"{peter_lowe_percentage}%",
                                                avg_domain_inlink_rank,
                                                avg_domain_inlink_rank_less_than_70,
                                                avg_refdomains
                                            ]
                                        }

                                        analysis_df = pd.DataFrame(analysis_data)
                                        all_analysis_df.append(analysis_df)

                                else:
                                    desired_columns = ['Originating Domain', 'Domain', 'Count', 'In Peter Lowe List']
                                    final_df = domain_df[desired_columns]
                                    all_final_df.append(final_df)

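                # Render each combined table, or just a download link when
                # "Show Download Links" is ticked.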
                if all_crawl_results:
                    st.subheader("Crawl Results")
                    combined_crawl_results = pd.concat(all_crawl_results, ignore_index=True)
                    if download_links:
                        st.markdown(download_csv(combined_crawl_results, "Crawl Results"), unsafe_allow_html=True)
                    else:
                        st.write(combined_crawl_results)

                if all_link_df:
                    st.subheader("All Links")
                    combined_link_df = pd.concat(all_link_df, ignore_index=True)
                    if download_links:
                        st.markdown(download_csv(combined_link_df, "All Links"), unsafe_allow_html=True)
                    else:
                        st.write(combined_link_df)

                if all_unique_links_df:
                    st.subheader("Unique Links")
                    combined_unique_links_df = pd.concat(all_unique_links_df, ignore_index=True)
                    if download_links:
                        st.markdown(download_csv(combined_unique_links_df, "Unique Links"), unsafe_allow_html=True)
                    else:
                        st.write(combined_unique_links_df)

                if all_unique_outbound_links_df:
                    st.subheader("Unique Outbound Links")
                    combined_unique_outbound_links_df = pd.concat(all_unique_outbound_links_df, ignore_index=True)
                    if download_links:
                        st.markdown(download_csv(combined_unique_outbound_links_df, "Unique Outbound Links"), unsafe_allow_html=True)
                    else:
                        st.write(combined_unique_outbound_links_df)

                if all_final_df:
                    st.subheader("Unique Outbound Domains")
                    combined_final_df = pd.concat(all_final_df, ignore_index=True)
                    if download_links:
                        st.markdown(download_csv(combined_final_df, "Unique Outbound Domains"), unsafe_allow_html=True)
                    else:
                        st.write(combined_final_df)

                if all_analysis_df:
                    st.subheader("Analytics")
                    combined_analysis_df = pd.concat(all_analysis_df)
                    # Pivot the long metric/value rows into one row per domain.
                    combined_analysis_df = combined_analysis_df.pivot(index='Originating Domain', columns='Metric', values='Value').reset_index()
                    combined_analysis_df.columns.name = None
                    # Assumes every domain has values for these metrics; a NaN
                    # here would make astype(int) raise.
                    numeric_columns = ['Average domain inlink rank', 'Average domain inlink rank (< 70)', 'Average number of refdomains']
                    combined_analysis_df[numeric_columns] = combined_analysis_df[numeric_columns].astype(int)
                    combined_analysis_df = combined_analysis_df[[
                        'Originating Domain',
                        'Percentage of domains not in Peter Lowe\'s list',
                        'Average domain inlink rank',
                        'Average domain inlink rank (< 70)',
                        'Average number of refdomains'
                    ]]
                    if download_links:
                        st.markdown(download_csv(combined_analysis_df, "Analytics"), unsafe_allow_html=True)
                    else:
                        st.table(combined_analysis_df)
            else:
                st.warning("Please enter at least one website URL.")
        else:
            st.warning("Please enter the website URLs.")

if __name__ == '__main__':
    main()