Update app.py
app.py CHANGED
@@ -6,6 +6,8 @@ from urllib.parse import urlparse
 import base64
 import requests
 import time
+from bs4 import BeautifulSoup
+import re
 
 def get_seo_powersuite_data(domains, api_key):
     url_domain_inlink_rank = "https://api.seopowersuite.com/backlinks/v1.0/get-domain-inlink-rank"
@@ -74,23 +76,65 @@ def remove_subdomain(domain):
         return '.'.join(parts[-2:])
     return domain
 
+def domain_matches_blacklist(domain, regex_patterns):
+    for pattern in regex_patterns:
+        if re.search(pattern, domain, re.IGNORECASE):
+            return 'Yes'
+    return 'No'
+
+def find_sitemap(url):
+    robots_url = f"{urlparse(url).scheme}://{urlparse(url).netloc}/robots.txt"
+    try:
+        robots_response = requests.get(robots_url)
+        if robots_response.status_code == 200:
+            for line in robots_response.text.split("\n"):
+                if line.startswith("Sitemap:"):
+                    return line.split(":", 1)[1].strip()
+    except requests.exceptions.RequestException:
+        pass
+
+    sitemap_urls = [
+        "/sitemap.xml", "/wp-sitemap.xml", "/?sitemap=1", "/sitemap_index/xml",
+        "/sitemap-index.xml", "/sitemap.php", "/sitemap.txt", "/sitemap.xml.gz",
+        "/sitemap/", "/sitemap/sitemap.xml", "/sitemapindex.xml", "/sitemap/index.xml", "/sitemap1.xml"
+    ]
+
+    for sitemap_url in sitemap_urls:
+        try:
+            sitemap_response = requests.get(f"{urlparse(url).scheme}://{urlparse(url).netloc}{sitemap_url}")
+            if sitemap_response.status_code == 200:
+                return f"{urlparse(url).scheme}://{urlparse(url).netloc}{sitemap_url}"
+        except requests.exceptions.RequestException:
+            pass
+
+    return None
+
+def crawl_posts(df, page_count):
+    crawl_results = []
+    for i, row in df.head(page_count).iterrows():
+        url = row['loc']
+        try:
+            response = requests.get(url)
+            if response.status_code == 200:
+                html = response.text
+                soup = BeautifulSoup(html, 'html.parser')
+                title = soup.title.text if soup.title else ''
+                meta_desc = soup.find('meta', attrs={'name': 'description'})['content'] if soup.find('meta', attrs={'name': 'description'}) else ''
+                links = []
+                for a in soup.find_all('a', href=True):
+                    link_url = a['href']
+                    link_text = a.text.strip()
+                    link_nofollow = 'nofollow' in a.get('rel', [])
+                    links.append({'url': link_url, 'text': link_text, 'nofollow': link_nofollow})
+                crawl_results.append({
+                    'url': url,
+                    'title': title,
+                    'meta_desc': meta_desc,
+                    'links': links
+                })
+        except requests.exceptions.RequestException:
+            pass
+    return pd.DataFrame(crawl_results)
 
 def download_csv(df, filename):
     csv = df.to_csv(index=False)
@@ -101,183 +145,169 @@ def download_csv(df, filename):
 def main():
     st.title("Website Crawler")
 
-    with col1:
-        concurrent_requests = st.number_input("Enter the number of concurrent requests per domain:", value=8, min_value=1, step=1)
-    with col2:
-        depth_limit = st.number_input("Enter the depth limit for the crawl:", value=0, min_value=0, step=1)
+    urls = st.text_area("Enter the website URLs (one per line):", value="")
+    page_count = st.number_input("Enter the number of pages to crawl:", value=1000, min_value=1, step=1)
+
     col1, col2 = st.columns(2)
     with col1:
+        domain_filter_regex_input = st.text_area("Filter out Unique Outbound Domains:", help="This uses a regex filter to find domains in the unique outbound domains list. Enter one regex per line.", value="instagram\nfacebook\ntwitter\nlinkedin\nsnapchat\ntiktok\nreddit\npinterest\namazon\ncdn\nyoutube\nyoutu.be")
     with col2:
-    with col2:
-    api_key = None
-    if use_seo_powersuite:
-        api_key = st.text_input("Enter the SEO PowerSuite API key:", type="password", value="")
+        domain_match_regex_input = st.text_area("Domain Blacklist:", help="This uses a regex filter to match domains in the Unique Outbound Domains to the blacklist entered. Enter one regex per line.", value="xyz\ncasino\ncbd\nessay")
+    use_seo_powersuite = st.checkbox("Use SEO PowerSuite")
+    api_key = None
+    if use_seo_powersuite:
+        api_key = st.text_input("Enter the SEO PowerSuite API key:", type="password")
     download_links = st.checkbox("Show Download Links")
+
     if st.button("Crawl"):
+        if urls:
+            url_list = [url.strip() for url in urls.split('\n') if url.strip()]
+            if url_list:
+                all_link_df = pd.DataFrame()
+                all_unique_outbound_links_df = pd.DataFrame()
+                all_final_df = pd.DataFrame()
+                all_analysis_df = pd.DataFrame()
 
-                link_df = adv.crawlytics.links(crawl_results, internal_url_regex=extract_hostname(domain))
-                link_df.insert(0, 'Originating Domain', domain)
-                all_link_df.append(link_df)
-
-                unique_links_df = link_df['link'].value_counts().reset_index()
-                unique_links_df.columns = ['Link', 'Count']
-                unique_links_df.insert(0, 'Originating Domain', domain)
-                all_unique_links_df.append(unique_links_df)
-
-                outbound_links_df = link_df[link_df['internal'] == False]
-                unique_outbound_links_df = outbound_links_df['link'].value_counts().reset_index()
-                unique_outbound_links_df.columns = ['Link', 'Count']
-                unique_outbound_links_df.insert(0, 'Originating Domain', domain)
-                all_unique_outbound_links_df.append(unique_outbound_links_df)
-
-                outbound_links_df['link'] = outbound_links_df['link'].astype(str)
-                domain_df = outbound_links_df['link'].apply(extract_hostname).value_counts().reset_index()
-                domain_df.columns = ['Domain', 'Count']
-                domain_df = domain_df[domain_df['Domain'] != '']
-                peter_lowe_domains = get_peter_lowe_domains()
-                domain_df['In Peter Lowe List'] = domain_df['Domain'].apply(lambda x: 'Yes' if remove_subdomain(x) in peter_lowe_domains else 'No')
-                domain_df.insert(0, 'Originating Domain', domain)
-                all_domain_df.append(domain_df)
-
-                if not domain_df.empty:
-                    if domain_filter_regex_input:
-                        domain_filter_regex_patterns = domain_filter_regex_input.split('\n')
-                        domain_filter_regex = '|'.join(domain_filter_regex_patterns)
-                        domain_df = domain_df[~domain_df['Domain'].str.contains(domain_filter_regex, case=False, regex=True)]
-
-                if use_seo_powersuite and api_key:
-                    seo_powersuite_df = get_seo_powersuite_data(domain_df['Domain'].tolist(), api_key)
-                    if seo_powersuite_df is not None:
-                        domain_df = pd.merge(domain_df, seo_powersuite_df, left_on='Domain', right_on='target', how='left')
-                        domain_df.drop('target', axis=1, inplace=True)
+                for url in url_list:
+                    with st.spinner(f"Finding sitemap for {url}..."):
+                        sitemap_url = find_sitemap(url)
+                    if sitemap_url:
+                        with st.spinner(f"Crawling {url}..."):
+                            sitemap_df = adv.sitemap_to_df(sitemap_url)
+                            crawl_results = crawl_posts(sitemap_df, page_count)
+
+                        if not crawl_results.empty:
+                            link_df = pd.DataFrame(crawl_results['links'].explode().tolist())
+                            link_df = link_df[~link_df['url'].str.startswith(('/','#'))]
+                            link_df['internal'] = link_df['url'].apply(lambda x: extract_hostname(url) in extract_hostname(x))
+                            link_df = link_df[link_df['internal'] == False]  # Filter out internal links
+                            link_df.insert(0, 'Originating Domain', url)  # Add 'Originating Domain' column
+                            link_df = link_df[['Originating Domain', 'url', 'text', 'nofollow']]  # Remove the 'internal' column
 
+                            outbound_links_df = link_df.copy()  # Create a copy of link_df for outbound links
+                            unique_links_df = link_df['url'].value_counts().reset_index()
+                            unique_links_df = unique_links_df[~unique_links_df['url'].str.startswith(('/','#'))]
+                            unique_links_df.columns = ['Link', 'Count']
+                            unique_links_df.insert(0, 'Originating Domain', url)
 
-            avg_refdomains = round(final_df['refdomains'].mean(), 2)
+                            unique_outbound_links_df = outbound_links_df['url'].value_counts().reset_index()
+                            unique_outbound_links_df = unique_outbound_links_df[~unique_outbound_links_df['url'].str.startswith(('/','#'))]
+                            unique_outbound_links_df.columns = ['Link', 'Count']
+                            unique_outbound_links_df.insert(0, 'Originating Domain', url)
 
-                ],
-                'Value': [
-                    f"{peter_lowe_percentage}%",
-                    avg_domain_inlink_rank,
-                    avg_domain_inlink_rank_less_than_70,
-                    avg_refdomains
-                ]
-            }
+                            outbound_links_df['url'] = outbound_links_df['url'].astype(str)
+                            domain_df = outbound_links_df['url'].apply(extract_hostname).value_counts().reset_index()
+                            domain_df.columns = ['Domain', 'Count']
+                            domain_df = domain_df[domain_df['Domain'] != '']
+                            peter_lowe_domains = get_peter_lowe_domains()
+                            domain_df['In Peter Lowe List'] = domain_df['Domain'].apply(lambda x: 'Yes' if remove_subdomain(x) in peter_lowe_domains else 'No')
+                            domain_df.insert(0, 'Originating Domain', url)
 
+                            # Determine the 'DoFollow' value for each domain
+                            domain_df['DoFollow'] = domain_df['Domain'].apply(lambda x: any(outbound_links_df[(outbound_links_df['url'].str.contains(x)) & (outbound_links_df['nofollow'] == False)]))
 
+                            if not domain_df.empty:
+                                if domain_filter_regex_input:
+                                    domain_filter_regex_patterns = domain_filter_regex_input.split('\n')
+                                    domain_filter_regex = '|'.join(domain_filter_regex_patterns)
+                                    domain_df = domain_df[~domain_df['Domain'].str.contains(domain_filter_regex, case=False, regex=True)]
+
+                                if not domain_df.empty:
+                                    if domain_match_regex_input:
+                                        domain_match_regex_patterns = domain_match_regex_input.split('\n')
+                                        domain_df['Blacklist'] = domain_df['Domain'].apply(lambda x: domain_matches_blacklist(x, domain_match_regex_patterns) == 'Yes')
+                                    else:
+                                        domain_df['Blacklist'] = False
+
+                                    total_domains = len(domain_df)
+                                    peter_lowe_percentage = round((domain_df['In Peter Lowe List'] == 'No').sum() / total_domains * 100, 2)
+                                    blacklist_percentage = round((domain_df['Blacklist'] == True).sum() / total_domains * 100, 2)
+
+                                    analysis_data = {
+                                        'Originating Domain': [url] * 2,
+                                        'Metric': ['Percentage of domains not in Peter Lowe\'s list', 'Percentage of domains in the Blacklist'],
+                                        'Value': [f"{peter_lowe_percentage}%", f"{blacklist_percentage}%"]
+                                    }
+
+                                    analysis_df = pd.DataFrame(analysis_data)
+
+                                    if use_seo_powersuite and api_key:
+                                        seo_powersuite_df = get_seo_powersuite_data(domain_df['Domain'].tolist(), api_key)
+                                        if seo_powersuite_df is not None:
+                                            domain_df = pd.merge(domain_df, seo_powersuite_df, left_on='Domain', right_on='target', how='left')
+                                            domain_df.drop('target', axis=1, inplace=True)
+
+                                            avg_domain_inlink_rank = round(domain_df['domain_inlink_rank'].mean(), 2)
+                                            avg_domain_inlink_rank_less_than_70 = round(domain_df[domain_df['domain_inlink_rank'] < 70]['domain_inlink_rank'].mean(), 2)
+                                            avg_refdomains = round(domain_df['refdomains'].mean(), 2)
+
+                                            additional_analysis_data = {
+                                                'Originating Domain': [url] * 3,
+                                                'Metric': [
+                                                    'Average domain inlink rank',
+                                                    'Average domain inlink rank (< 70)',
+                                                    'Average number of refdomains'
+                                                ],
+                                                'Value': [
+                                                    avg_domain_inlink_rank,
+                                                    avg_domain_inlink_rank_less_than_70,
+                                                    avg_refdomains
+                                                ]
+                                            }
+
+                                            analysis_df = pd.concat([analysis_df, pd.DataFrame(additional_analysis_data)], ignore_index=True)
+
+                                            desired_columns = ['Originating Domain', 'Domain', 'Count', 'In Peter Lowe List', 'DoFollow', 'Blacklist', 'domain_inlink_rank', 'refdomains']
+                                            final_df = domain_df[desired_columns]
+                                    else:
+                                        desired_columns = ['Originating Domain', 'Domain', 'Count', 'In Peter Lowe List', 'DoFollow', 'Blacklist']
+                                        final_df = domain_df[desired_columns]
+                                else:
+                                    st.warning(f"No unique outbound domains found for {url} after filtering.")
+                            else:
+                                st.warning(f"No unique outbound domains found for {url}.")
+
+                            all_link_df = pd.concat([all_link_df, link_df], ignore_index=True)
+                            all_unique_outbound_links_df = pd.concat([all_unique_outbound_links_df, unique_outbound_links_df], ignore_index=True)
+                            all_final_df = pd.concat([all_final_df, final_df], ignore_index=True)
+                            all_analysis_df = pd.concat([all_analysis_df, analysis_df], ignore_index=True)
+                        else:
+                            st.warning(f"No posts found in the sitemap for {url}.")
+                    else:
+                        st.warning(f"Sitemap not found for {url}.")
+
+                st.subheader("Outbound Links")
+                if download_links:
+                    st.markdown(download_csv(all_link_df, "Outbound Links"), unsafe_allow_html=True)
+                else:
+                    st.write(all_link_df)
+
+                st.subheader("Unique Outbound Links")
+                if download_links:
+                    st.markdown(download_csv(all_unique_outbound_links_df, "Unique Outbound Links"), unsafe_allow_html=True)
+                else:
+                    st.write(all_unique_outbound_links_df)
+
+                st.subheader("Unique Outbound Domains")
+                if download_links:
+                    st.markdown(download_csv(all_final_df, "Unique Outbound Domains"), unsafe_allow_html=True)
+                else:
+                    st.write(all_final_df)
+
+                st.subheader("Analytics")
+                all_analysis_df = all_analysis_df.pivot(index='Originating Domain', columns='Metric', values='Value').reset_index()
+                all_analysis_df.columns.name = None
+                if use_seo_powersuite and api_key:
                     numeric_columns = ['Average domain inlink rank', 'Average domain inlink rank (< 70)', 'Average number of refdomains']
-                'Average domain inlink rank (< 70)',
-                'Average number of refdomains'
-            ]]
-            if download_links:
-                st.markdown(download_csv(combined_analysis_df, "Analytics"), unsafe_allow_html=True)
-            else:
-                st.table(combined_analysis_df)
+                    all_analysis_df[numeric_columns] = all_analysis_df[numeric_columns].astype(int)
+                if download_links:
+                    st.markdown(download_csv(all_analysis_df, "Analytics"), unsafe_allow_html=True)
+                else:
+                    st.table(all_analysis_df)
            else:
                st.warning("Please enter at least one website URL.")
        else:
-            st.warning("Please enter at least one website URL.")
+            st.warning("Please enter website URLs.")
 
 if __name__ == '__main__':
     main()
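
For context, this commit replaces the old advertools-driven crawl (adv.crawlytics.links over a crawl run, with concurrency and depth controls) with sitemap-driven fetching: main() now locates a sitemap via find_sitemap, expands it with adv.sitemap_to_df, and downloads each page with requests/BeautifulSoup through crawl_posts, then flags outbound domains against Peter Lowe's list and a user-supplied blacklist. A minimal sketch of how the new helpers compose outside Streamlit, assuming app.py is importable as a module; the example domain, the page count, and the blacklist patterns are illustrative and not part of the commit:

# Sketch only: exercises the helpers added in this commit outside of Streamlit.
# Assumes app.py is on the import path; example.com is a placeholder domain.
from urllib.parse import urlparse

import advertools as adv
import pandas as pd

from app import find_sitemap, crawl_posts, domain_matches_blacklist

site = "https://example.com"
sitemap_url = find_sitemap(site)  # checks robots.txt first, then common sitemap paths
if sitemap_url is None:
    print("No sitemap found")
else:
    sitemap_df = adv.sitemap_to_df(sitemap_url)   # one row per <loc> entry in the sitemap
    pages = crawl_posts(sitemap_df, 25)           # title, meta description, and links per page

    # Flatten the per-page link dicts the same way main() does.
    links = pd.DataFrame(pages['links'].explode().dropna().tolist())
    links['domain'] = links['url'].astype(str).apply(lambda u: urlparse(u).netloc)

    # Flag domains matching the blacklist patterns (helper returns 'Yes'/'No').
    blacklist_patterns = ['casino', 'cbd', 'essay']
    links['blacklisted'] = links['domain'].apply(
        lambda d: domain_matches_blacklist(d, blacklist_patterns) == 'Yes'
    )
    print(links.head())

Note that domain_matches_blacklist returns the strings 'Yes'/'No' rather than booleans, which is why both main() and the sketch above compare its result to 'Yes' before storing the flag.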