Create app.py
app.py
ADDED
@@ -0,0 +1,283 @@
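"""Streamlit app: crawl one or more websites with advertools, list their
outbound links and unique external domains, flag domains that appear in
Peter Lowe's ad-server blocklist, and optionally enrich each domain with
SEO PowerSuite backlink metrics."""
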
import advertools as adv
import streamlit as st
import tempfile
import pandas as pd
from urllib.parse import urlparse
import base64
import requests
import time

def get_seo_powersuite_data(domains, api_key):
    """Fetch domain-level InLink Rank and referring-domain counts for a list
    of domains, batching the API calls 100 domains at a time."""
    url_domain_inlink_rank = "https://api.seopowersuite.com/backlinks/v1.0/get-domain-inlink-rank"
    url_refdomains_count = "https://api.seopowersuite.com/backlinks/v1.0/get-refdomains-count"
    headers = {"Content-Type": "application/json"}

    results = []
    for i in range(0, len(domains), 100):
        batch_domains = domains[i:i + 100]

        # Get domain InLink Rank for the batch
        start_time = time.time()
        payload_domain_inlink_rank = {"target": list(batch_domains)}
        params_domain_inlink_rank = {"apikey": api_key, "output": "json"}
        response_domain_inlink_rank = requests.post(url_domain_inlink_rank, json=payload_domain_inlink_rank, headers=headers, params=params_domain_inlink_rank)
        duration = time.time() - start_time
        print(f"get-domain-inlink-rank API call for {len(batch_domains)} domains took {duration:.2f} seconds")

        if response_domain_inlink_rank.status_code == 200:
            data_domain_inlink_rank = response_domain_inlink_rank.json()
            domain_inlink_rank_dict = {page["url"]: page["domain_inlink_rank"] for page in data_domain_inlink_rank["pages"]}
        else:
            st.error(f"Error fetching domain inlink rank data from SEO PowerSuite API: {response_domain_inlink_rank.status_code}")
            st.error("Error Response:")
            st.write(response_domain_inlink_rank.text)
            return None

        # Get referring-domain counts for the batch
        start_time = time.time()
        payload_refdomains_count = {"target": list(batch_domains), "mode": "domain"}
        params_refdomains_count = {"apikey": api_key, "output": "json"}
        response_refdomains_count = requests.post(url_refdomains_count, json=payload_refdomains_count, headers=headers, params=params_refdomains_count)
        duration = time.time() - start_time
        print(f"get-refdomains-count API call for {len(batch_domains)} domains took {duration:.2f} seconds")

        if response_refdomains_count.status_code == 200:
            data_refdomains_count = response_refdomains_count.json()
            for metric in data_refdomains_count["metrics"]:
                result = {
                    "target": metric["target"],
                    "domain_inlink_rank": domain_inlink_rank_dict.get(metric["target"], None),
                    "refdomains": metric["refdomains"]
                }
                results.append(result)
        else:
            st.error(f"Error fetching refdomains count data from SEO PowerSuite API: {response_refdomains_count.status_code}")
            st.error("Error Response:")
            st.write(response_refdomains_count.text)
            return None

    return pd.DataFrame(results)

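# Example (hypothetical key): returns a DataFrame with columns
# ["target", "domain_inlink_rank", "refdomains"], or None on any API error.
#   df = get_seo_powersuite_data(["example.com", "example.org"], api_key="YOUR-KEY")
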
def get_peter_lowe_domains():
    """Download Peter Lowe's ad/tracking server list (Adblock Plus format)
    and return the blocked domains as a set."""
    url = "https://pgl.yoyo.org/adservers/serverlist.php?hostformat=adblockplus&mimetype=plaintext"
    response = requests.get(url)
    lines = response.text.split('\n')
    # Adblock Plus entries look like "||example.com^"; strip the markers.
    domains = [line.strip('|^') for line in lines if line.startswith('||')]
    return set(domains)

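# Downstream, membership is tested against registrable domains, e.g.:
#   remove_subdomain("ads.example.com") in get_peter_lowe_domains()
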
def extract_hostname(url):
    return urlparse(url).netloc

def remove_subdomain(domain):
    # Keep only the last two labels, e.g. "blog.example.com" -> "example.com".
    parts = domain.split('.')
    if len(parts) > 2:
        return '.'.join(parts[-2:])
    return domain

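# Note: a naive heuristic; multi-part public suffixes are over-trimmed
# (e.g. "news.bbc.co.uk" -> "co.uk"). A library such as tldextract would
# handle those cases if precision matters.
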
def crawl_website(url, exclude_url_regex, page_count, concurrent_requests, depth_limit):
    with tempfile.NamedTemporaryFile(delete=False, suffix='.jl') as temp_file:
        custom_settings = {
            "CLOSESPIDER_PAGECOUNT": page_count,
            "CONCURRENT_REQUESTS_PER_DOMAIN": concurrent_requests,
            "DEPTH_LIMIT": depth_limit
        }
        # exclude_url_regex defaults to None in adv.crawl, so one call covers
        # both the with-regex and without-regex cases.
        adv.crawl(url, output_file=temp_file.name, follow_links=True, exclude_url_params=True, exclude_url_regex=exclude_url_regex or None, custom_settings=custom_settings)
        crawl_df = pd.read_json(temp_file.name, lines=True)
        crawl_df['hostname'] = crawl_df['url'].apply(extract_hostname)
        # Select only the columns used downstream
        desired_columns = ['url', 'hostname', 'title', 'meta_desc', 'status', 'links_url', 'links_text', 'links_nofollow']
        crawl_df = crawl_df[desired_columns]
        return crawl_df

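# Example (illustrative values): crawl up to 100 pages of one site, skipping
# URLs matching "cdn", with 8 concurrent requests and no depth limit:
#   df = crawl_website("https://example.com", "cdn", 100, 8, 0)
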
def download_csv(df, filename):
    csv = df.to_csv(index=False)
    b64 = base64.b64encode(csv.encode()).decode()
    href = f'<a href="data:file/csv;base64,{b64}" download="{filename}.csv">Download {filename} CSV</a>'
    return href

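# Design note: rendering a base64 data-URI link (via st.markdown with
# unsafe_allow_html=True) sidesteps the script rerun that st.download_button
# normally triggers on click; st.download_button would be the more idiomatic
# choice if reruns are acceptable.
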
def main():
    st.title("Website Crawler")

    domains = st.text_area("Enter the website URLs (one per line):", value="")
    page_count = st.number_input("Enter the maximum number of pages to crawl:", value=100, min_value=1, step=1)
    col1, col2 = st.columns(2)
    with col1:
        concurrent_requests = st.number_input("Enter the number of concurrent requests per domain:", value=8, min_value=1, step=1)
    with col2:
        depth_limit = st.number_input("Enter the depth limit for the crawl:", value=0, min_value=0, step=1)

    col1, col2 = st.columns(2)
    with col1:
        exclude_url_regex_input = st.text_area("Enter exclude URL regex patterns (one per line):", value="cdn")
    with col2:
        domain_filter_regex_input = st.text_area("Filter out unique domains with regex (one per line):", value="instagram\nfacebook\ntwitter\nlinkedin\nsnapchat\ntiktok\nreddit\npinterest\namazon\ncdn\nyoutube\nyoutu.be")

    col1, col2 = st.columns([2, 5])
    with col1:
        use_seo_powersuite = st.checkbox("Use SEO PowerSuite")
    with col2:
        api_key = None
        if use_seo_powersuite:
            api_key = st.text_input("Enter the SEO PowerSuite API key:", type="password", value="")
    download_links = st.checkbox("Show Download Links")

    if st.button("Crawl"):
        if domains:
            domain_list = [domain.strip() for domain in domains.split('\n') if domain.strip()]
            if domain_list:
                # Join per-line patterns into one alternation; drop blank lines,
                # since an empty alternation branch would match every URL.
                exclude_url_regex_patterns = [p.strip() for p in exclude_url_regex_input.split('\n') if p.strip()]
                exclude_url_regex = '|'.join(exclude_url_regex_patterns) if exclude_url_regex_patterns else None

                # Fetch Peter Lowe's list once, rather than once per crawled domain.
                peter_lowe_domains = get_peter_lowe_domains()

                all_crawl_results = []
                all_link_df = []
                all_unique_links_df = []
                all_unique_outbound_links_df = []
                all_domain_df = []
                all_final_df = []
                all_analysis_df = []

                for domain in domain_list:
                    with st.spinner(f"Crawling {domain}..."):
                        crawl_results = crawl_website(domain, exclude_url_regex, page_count, concurrent_requests, depth_limit)
                        crawl_results.insert(0, 'Originating Domain', domain)
                        all_crawl_results.append(crawl_results)

                        if not crawl_results.empty:
                            link_df = adv.crawlytics.links(crawl_results, internal_url_regex=extract_hostname(domain))
                            link_df.insert(0, 'Originating Domain', domain)
                            all_link_df.append(link_df)

                            unique_links_df = link_df['link'].value_counts().reset_index()
                            unique_links_df.columns = ['Link', 'Count']
                            unique_links_df.insert(0, 'Originating Domain', domain)
                            all_unique_links_df.append(unique_links_df)

                            # Outbound = links whose target is not internal to the crawled site.
                            # .copy() avoids pandas' SettingWithCopyWarning on the assignment below.
                            outbound_links_df = link_df[link_df['internal'] == False].copy()
                            unique_outbound_links_df = outbound_links_df['link'].value_counts().reset_index()
                            unique_outbound_links_df.columns = ['Link', 'Count']
                            unique_outbound_links_df.insert(0, 'Originating Domain', domain)
                            all_unique_outbound_links_df.append(unique_outbound_links_df)

                            outbound_links_df['link'] = outbound_links_df['link'].astype(str)
                            domain_df = outbound_links_df['link'].apply(extract_hostname).value_counts().reset_index()
                            domain_df.columns = ['Domain', 'Count']
                            domain_df = domain_df[domain_df['Domain'] != '']
                            domain_df['In Peter Lowe List'] = domain_df['Domain'].apply(lambda x: 'Yes' if remove_subdomain(x) in peter_lowe_domains else 'No')
                            domain_df.insert(0, 'Originating Domain', domain)
                            all_domain_df.append(domain_df)

                            if not domain_df.empty:
                                if domain_filter_regex_input:
                                    # Drop blank lines for the same reason as above.
                                    domain_filter_regex_patterns = [p.strip() for p in domain_filter_regex_input.split('\n') if p.strip()]
                                    domain_filter_regex = '|'.join(domain_filter_regex_patterns)
                                    domain_df = domain_df[~domain_df['Domain'].str.contains(domain_filter_regex, case=False, regex=True)]

                                if use_seo_powersuite and api_key:
                                    seo_powersuite_df = get_seo_powersuite_data(domain_df['Domain'].tolist(), api_key)
                                    if seo_powersuite_df is not None:
                                        domain_df = pd.merge(domain_df, seo_powersuite_df, left_on='Domain', right_on='target', how='left')
                                        domain_df.drop('target', axis=1, inplace=True)

                                        desired_columns = ['Originating Domain', 'Domain', 'Count', 'In Peter Lowe List', 'domain_inlink_rank', 'refdomains']
                                        final_df = domain_df[desired_columns]
                                        all_final_df.append(final_df)

                                        total_domains = len(final_df)
                                        # Guard against division by zero when filtering leaves no domains.
                                        peter_lowe_percentage = round((final_df['In Peter Lowe List'] == 'No').sum() / total_domains * 100, 2) if total_domains else 0.0
                                        avg_domain_inlink_rank = round(final_df['domain_inlink_rank'].mean(), 2)
                                        avg_domain_inlink_rank_less_than_70 = round(final_df[final_df['domain_inlink_rank'] < 70]['domain_inlink_rank'].mean(), 2)
                                        avg_refdomains = round(final_df['refdomains'].mean(), 2)

                                        # Long format: one row per (domain, metric); pivoted to wide later.
                                        analysis_data = {
                                            'Originating Domain': [domain] * 4,
                                            'Metric': [
                                                'Percentage of domains not in Peter Lowe\'s list',
                                                'Average domain inlink rank',
                                                'Average domain inlink rank (< 70)',
                                                'Average number of refdomains'
                                            ],
                                            'Value': [
                                                f"{peter_lowe_percentage}%",
                                                avg_domain_inlink_rank,
                                                avg_domain_inlink_rank_less_than_70,
                                                avg_refdomains
                                            ]
                                        }

                                        analysis_df = pd.DataFrame(analysis_data)
                                        all_analysis_df.append(analysis_df)

                                else:
                                    desired_columns = ['Originating Domain', 'Domain', 'Count', 'In Peter Lowe List']
                                    final_df = domain_df[desired_columns]
                                    all_final_df.append(final_df)

                if all_crawl_results:
                    st.subheader("Crawl Results")
                    combined_crawl_results = pd.concat(all_crawl_results, ignore_index=True)
                    if download_links:
                        st.markdown(download_csv(combined_crawl_results, "Crawl Results"), unsafe_allow_html=True)
                    else:
                        st.write(combined_crawl_results)

                if all_link_df:
                    st.subheader("All Links")
                    combined_link_df = pd.concat(all_link_df, ignore_index=True)
                    if download_links:
                        st.markdown(download_csv(combined_link_df, "All Links"), unsafe_allow_html=True)
                    else:
                        st.write(combined_link_df)

                if all_unique_links_df:
                    st.subheader("Unique Links")
                    combined_unique_links_df = pd.concat(all_unique_links_df, ignore_index=True)
                    if download_links:
                        st.markdown(download_csv(combined_unique_links_df, "Unique Links"), unsafe_allow_html=True)
                    else:
                        st.write(combined_unique_links_df)

                if all_unique_outbound_links_df:
                    st.subheader("Unique Outbound Links")
                    combined_unique_outbound_links_df = pd.concat(all_unique_outbound_links_df, ignore_index=True)
                    if download_links:
                        st.markdown(download_csv(combined_unique_outbound_links_df, "Unique Outbound Links"), unsafe_allow_html=True)
                    else:
                        st.write(combined_unique_outbound_links_df)

                if all_final_df:
                    st.subheader("Unique Outbound Domains")
                    combined_final_df = pd.concat(all_final_df, ignore_index=True)
                    if download_links:
                        st.markdown(download_csv(combined_final_df, "Unique Outbound Domains"), unsafe_allow_html=True)
                    else:
                        st.write(combined_final_df)

                if all_analysis_df:
                    st.subheader("Analytics")
                    combined_analysis_df = pd.concat(all_analysis_df)
                    # Pivot from long to wide: one row per originating domain.
                    combined_analysis_df = combined_analysis_df.pivot(index='Originating Domain', columns='Metric', values='Value').reset_index()
                    combined_analysis_df.columns.name = None
                    numeric_columns = ['Average domain inlink rank', 'Average domain inlink rank (< 70)', 'Average number of refdomains']
                    # Nullable Int64 tolerates missing values, unlike a plain int cast.
                    combined_analysis_df[numeric_columns] = combined_analysis_df[numeric_columns].astype(float).round().astype('Int64')
                    combined_analysis_df = combined_analysis_df[[
                        'Originating Domain',
                        'Percentage of domains not in Peter Lowe\'s list',
                        'Average domain inlink rank',
                        'Average domain inlink rank (< 70)',
                        'Average number of refdomains'
                    ]]
                    if download_links:
                        st.markdown(download_csv(combined_analysis_df, "Analytics"), unsafe_allow_html=True)
                    else:
                        st.table(combined_analysis_df)
            else:
                st.warning("Please enter at least one website URL.")
        else:
            st.warning("Please enter the website URLs.")

if __name__ == '__main__':
    main()
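
# To launch locally (standard Streamlit CLI):
#   streamlit run app.py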