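"""Streamlit app that finds a site's blog/post sitemap, crawls its pages,
extracts outbound links, flags the linked domains against Peter Lowe's
ad-server list and a user-supplied blacklist, and can optionally pull
domain metrics from the SEO PowerSuite backlinks API."""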
import advertools as adv
import streamlit as st
import tempfile
import pandas as pd
from urllib.parse import urlparse
import base64
import requests
import time
from bs4 import BeautifulSoup
import re
import concurrent.futures

def get_seo_powersuite_data(domains, api_key):
    url_domain_inlink_rank = "https://api.seopowersuite.com/backlinks/v1.0/get-domain-inlink-rank"
    url_refdomains_count = "https://api.seopowersuite.com/backlinks/v1.0/get-refdomains-count"
    headers = {"Content-Type": "application/json"}
    results = []
    for i in range(0, len(domains), 100):
        batch_domains = domains[i:i+100]
        # Get domain inlink rank
        start_time = time.time()
        payload_domain_inlink_rank = {"target": list(batch_domains)}
        params_domain_inlink_rank = {"apikey": api_key, "output": "json"}
        response_domain_inlink_rank = requests.post(url_domain_inlink_rank, json=payload_domain_inlink_rank, headers=headers, params=params_domain_inlink_rank)
        duration = time.time() - start_time
        print(f"get-domain-inlink-rank API call for {len(batch_domains)} domains took {duration:.2f} seconds")
        if response_domain_inlink_rank.status_code == 200:
            data_domain_inlink_rank = response_domain_inlink_rank.json()
            domain_inlink_rank_dict = {page["url"]: page["domain_inlink_rank"] for page in data_domain_inlink_rank["pages"]}
        else:
            st.error(f"Error fetching domain inlink rank data from SEO PowerSuite API: {response_domain_inlink_rank.status_code}")
            st.error("Error Response:")
            st.write(response_domain_inlink_rank.text)
            return None
        # Get refdomains count
        start_time = time.time()
        payload_refdomains_count = {"target": list(batch_domains), "mode": "domain"}
        params_refdomains_count = {"apikey": api_key, "output": "json"}
        response_refdomains_count = requests.post(url_refdomains_count, json=payload_refdomains_count, headers=headers, params=params_refdomains_count)
        duration = time.time() - start_time
        print(f"get-refdomains-count API call for {len(batch_domains)} domains took {duration:.2f} seconds")
        if response_refdomains_count.status_code == 200:
            data_refdomains_count = response_refdomains_count.json()
            for metric in data_refdomains_count["metrics"]:
                result = {
                    "target": metric["target"],
                    "domain_inlink_rank": domain_inlink_rank_dict.get(metric["target"], None),
                    "refdomains": metric["refdomains"]
                }
                results.append(result)
        else:
            st.error(f"Error fetching refdomains count data from SEO PowerSuite API: {response_refdomains_count.status_code}")
            st.error("Error Response:")
            st.write(response_refdomains_count.text)
            return None
    return pd.DataFrame(results)
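
# Example usage (hypothetical key and domains; the returned columns follow the
# result dicts assembled above):
#   seo_df = get_seo_powersuite_data(["example.com", "example.org"], api_key="YOUR_API_KEY")
#   list(seo_df.columns)  ->  ['target', 'domain_inlink_rank', 'refdomains']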

def get_peter_lowe_domains():
    url = "https://pgl.yoyo.org/adservers/serverlist.php?hostformat=adblockplus&mimetype=plaintext"
    response = requests.get(url)
    lines = response.text.split('\n')
    domains = [line.strip('|^') for line in lines if line.startswith('||')]
    return set(domains)
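
# The list is fetched in Adblock Plus host format, where blocking entries look
# like "||example.com^"; stripping the leading "||" and trailing "^" leaves the
# bare domain (e.g. "||example.com^" -> "example.com").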

def extract_hostname(url):
    return urlparse(url).netloc


def remove_subdomain(domain):
    parts = domain.split('.')
    if len(parts) > 2:
        return '.'.join(parts[-2:])
    return domain


def domain_matches_blacklist(domain, regex_patterns):
    for pattern in regex_patterns:
        if re.search(pattern, domain, re.IGNORECASE):
            return 'Yes'
    return 'No'
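
# Illustration (made-up values): each blacklist line is treated as a
# case-insensitive regex searched anywhere in the domain, so
#   domain_matches_blacklist("mycasino.example", ["casino", "xyz"])  ->  'Yes'
#   domain_matches_blacklist("example.org", ["casino", "xyz"])       ->  'No'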

def find_sitemap(url):
    robots_url = f"{urlparse(url).scheme}://{urlparse(url).netloc}/robots.txt"
    try:
        robots_response = requests.get(robots_url)
        if robots_response.status_code == 200:
            for line in robots_response.text.split("\n"):
                if line.startswith("Sitemap:"):
                    sitemap_url = line.split(":", 1)[1].strip()
                    if "post" in sitemap_url.lower() or "blog" in sitemap_url.lower():
                        return sitemap_url
    except requests.exceptions.RequestException:
        pass
    sitemap_urls = [
        "/post-sitemap.xml", "/blog-sitemap.xml", "/sitemap-posts.xml",
        "/sitemap.xml", "/wp-sitemap.xml", "/?sitemap=1", "/sitemap_index.xml",
        "/sitemap-index.xml", "/sitemap.php", "/sitemap.txt", "/sitemap.xml.gz",
        "/sitemap/", "/sitemap/sitemap.xml", "/sitemapindex.xml", "/sitemap/index.xml", "/sitemap1.xml"
    ]
    for sitemap_url in sitemap_urls:
        try:
            sitemap_response = requests.get(f"{urlparse(url).scheme}://{urlparse(url).netloc}{sitemap_url}")
            if sitemap_response.status_code == 200:
                return f"{urlparse(url).scheme}://{urlparse(url).netloc}{sitemap_url}"
        except requests.exceptions.RequestException:
            pass
    return None
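
# find_sitemap prefers a post/blog sitemap advertised in robots.txt and only then
# probes the common paths above; any 200 response is accepted as a sitemap
# without validating the XML.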

def crawl_posts(df, page_count, url, concurrent_scrapes):
    crawl_results = []
    crawl_status = st.empty()

    def crawl_page(row):
        page_url = row['loc']
        try:
            response = requests.get(page_url)
            if response.status_code == 200:
                html = response.text
                soup = BeautifulSoup(html, 'html.parser')
                title = soup.title.text if soup.title else ''
                meta_tag = soup.find('meta', attrs={'name': 'description'})
                meta_desc = meta_tag.get('content', '') if meta_tag else ''
                links = []
                for a in soup.find_all('a', href=True):
                    link_url = a['href']
                    link_text = a.text.strip()
                    link_nofollow = 'nofollow' in a.get('rel', [])
                    links.append({'url': link_url, 'text': link_text, 'nofollow': link_nofollow})
                return {
                    'url': page_url,  # Use page_url instead of url
                    'title': title,
                    'meta_desc': meta_desc,
                    'links': links
                }
        except requests.exceptions.RequestException:
            return None
    # Cap the pool at the requested number of concurrent scrapes; the default
    # ThreadPoolExecutor size would otherwise ignore this setting.
    with concurrent.futures.ThreadPoolExecutor(max_workers=concurrent_scrapes) as executor:
        futures = []
        for i in range(0, page_count, concurrent_scrapes):
            batch_df = df.iloc[i:i+concurrent_scrapes]
            batch_futures = [executor.submit(crawl_page, row) for _, row in batch_df.iterrows()]
            futures.extend(batch_futures)
        for i, future in enumerate(concurrent.futures.as_completed(futures)):
            result = future.result()
            if result is not None:
                crawl_results.append(result)
            crawl_status.text(f"Crawling {url} - Page {i+1}/{page_count}")
    crawl_status.empty()
    return pd.DataFrame(crawl_results)
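
# crawl_posts expects the sitemap DataFrame produced by adv.sitemap_to_df (each
# row's 'loc' column is the page URL) and returns one row per successfully
# fetched page with 'url', 'title', 'meta_desc' and a 'links' list of anchors.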

def download_csv(df, filename):
    csv = df.to_csv(index=False)
    b64 = base64.b64encode(csv.encode()).decode()
    href = f'<a href="data:file/csv;base64,{b64}" download="{filename}.csv">Download {filename} CSV</a>'
    return href
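
# download_csv returns an HTML anchor whose href is a base64 data URI, so it is
# rendered via st.markdown(..., unsafe_allow_html=True) below; very large
# DataFrames produce correspondingly large inline links.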

def main():
    st.title("Website Crawler")
    urls = st.text_area("Enter the website URLs (one per line):", value="")
    page_count = st.number_input("Enter the number of pages to crawl:", value=1000, min_value=1, step=1)
    concurrent_scrapes = st.number_input("Enter the number of concurrent scrapes:", value=20, min_value=1, step=1)
    col1, col2 = st.columns(2)
    with col1:
        domain_filter_regex_input = st.text_area("Filter out Unique Outbound Domains:", help="This uses a regex filter to find domains in the unique outbound domains list. Enter one regex per line.", value="instagram\nfacebook\ntwitter\nlinkedin\nsnapchat\ntiktok\nreddit\npinterest\namazon\ncdn\nyoutube\nyoutu.be")
    with col2:
        domain_match_regex_input = st.text_area("Domain Blacklist:", help="This uses a regex filter to match domains in the Unique Outbound Domains to the blacklist entered. Enter one regex per line.", value="xyz\ncasino\ncbd\nessay")
    use_seo_powersuite = st.checkbox("Use SEO PowerSuite")
    api_key = None
    if use_seo_powersuite:
        api_key = st.text_input("Enter the SEO PowerSuite API key:", type="password")
    download_links = st.checkbox("Show Download Links")
    if st.button("Crawl"):
        if urls:
            url_list = [url.strip() for url in urls.split('\n') if url.strip()]
            if url_list:
                all_link_df = pd.DataFrame()
                all_unique_outbound_links_df = pd.DataFrame()
                all_final_df = pd.DataFrame()
                all_analysis_df = pd.DataFrame()
                for url in url_list:
                    with st.spinner(f"Finding sitemap for {url}..."):
                        sitemap_url = find_sitemap(url)
                    if sitemap_url:
                        with st.spinner(f"Crawling {url}..."):
                            sitemap_df = adv.sitemap_to_df(sitemap_url)
                            # Sort by lastmod (newest first) when the sitemap provides it
                            if 'lastmod' in sitemap_df.columns:
                                sitemap_df = sitemap_df.sort_values(by="lastmod", ascending=False)
                            crawl_results = crawl_posts(sitemap_df, page_count, url, concurrent_scrapes)
                            if not crawl_results.empty:
                                # Defaults so the per-URL concats further down still work when
                                # filtering leaves no outbound domains for this URL
                                final_df = pd.DataFrame()
                                analysis_df = pd.DataFrame()
                                crawled_pages_df = pd.DataFrame({'Originating Domain': url, 'Crawled Page': crawl_results['url']})
                                all_crawled_pages_df = pd.concat([all_crawled_pages_df, crawled_pages_df], ignore_index=True)
                                # Drop pages with no links before exploding; the resulting NaN rows
                                # would otherwise break the string filters below
                                link_df = pd.DataFrame(crawl_results['links'].explode().dropna().tolist())
                                link_df = link_df[~link_df['url'].str.startswith(('/', '#'))]
                                link_df['internal'] = link_df['url'].apply(lambda x: extract_hostname(url) in extract_hostname(x))
                                link_df = link_df[link_df['internal'] == False]  # Filter out internal links
                                link_df.insert(0, 'Originating Domain', url)  # Add 'Originating Domain' column
                                link_df = link_df[['Originating Domain', 'url', 'text', 'nofollow']]  # Remove the 'internal' column
                                outbound_links_df = link_df.copy()  # Create a copy of link_df for outbound links
                                unique_links_df = link_df['url'].value_counts().reset_index()
                                unique_links_df = unique_links_df[~unique_links_df['url'].str.startswith(('/', '#'))]
                                unique_links_df.columns = ['Link', 'Count']
                                unique_links_df.insert(0, 'Originating Domain', url)
                                unique_outbound_links_df = outbound_links_df['url'].value_counts().reset_index()
                                unique_outbound_links_df = unique_outbound_links_df[~unique_outbound_links_df['url'].str.startswith(('/', '#'))]
                                unique_outbound_links_df.columns = ['Link', 'Count']
                                unique_outbound_links_df.insert(0, 'Originating Domain', url)
                                outbound_links_df['url'] = outbound_links_df['url'].astype(str)
                                domain_df = outbound_links_df['url'].apply(extract_hostname).value_counts().reset_index()
                                domain_df.columns = ['Domain', 'Count']
                                domain_df = domain_df[domain_df['Domain'] != '']
                                peter_lowe_domains = get_peter_lowe_domains()
                                domain_df['In Peter Lowe List'] = domain_df['Domain'].apply(lambda x: 'Yes' if remove_subdomain(x) in peter_lowe_domains else 'No')
                                domain_df.insert(0, 'Originating Domain', url)
                                # Determine the 'DoFollow' value for each domain.
                                # Note: any(DataFrame) iterates over column labels and is always True for a
                                # non-empty frame, so check for matching followed rows explicitly instead
                                # (plain substring match, not regex).
                                domain_df['DoFollow'] = domain_df['Domain'].apply(
                                    lambda x: not outbound_links_df[
                                        (outbound_links_df['url'].str.contains(x, regex=False)) &
                                        (outbound_links_df['nofollow'] == False)
                                    ].empty
                                )
                                if not domain_df.empty:
                                    if domain_filter_regex_input:
                                        domain_filter_regex_patterns = domain_filter_regex_input.split('\n')
                                        domain_filter_regex = '|'.join(domain_filter_regex_patterns)
                                        domain_df = domain_df[~domain_df['Domain'].str.contains(domain_filter_regex, case=False, regex=True)]
                                    if not domain_df.empty:
                                        if domain_match_regex_input:
                                            domain_match_regex_patterns = domain_match_regex_input.split('\n')
                                            domain_df['Blacklist'] = domain_df['Domain'].apply(lambda x: domain_matches_blacklist(x, domain_match_regex_patterns) == 'Yes')
                                        else:
                                            domain_df['Blacklist'] = False
                                        total_domains = len(domain_df)
                                        peter_lowe_percentage = round((domain_df['In Peter Lowe List'] == 'No').sum() / total_domains * 100, 2)
                                        blacklist_percentage = round((domain_df['Blacklist'] == True).sum() / total_domains * 100, 2)
                                        analysis_data = {
                                            'Originating Domain': [url] * 2,
                                            'Metric': ['Percentage of domains not in Peter Lowe\'s list', 'Percentage of domains in the Blacklist'],
                                            'Value': [f"{peter_lowe_percentage}%", f"{blacklist_percentage}%"]
                                        }
                                        analysis_df = pd.DataFrame(analysis_data)
                                        if use_seo_powersuite and api_key:
                                            seo_powersuite_df = get_seo_powersuite_data(domain_df['Domain'].tolist(), api_key)
                                            if seo_powersuite_df is not None:
                                                domain_df = pd.merge(domain_df, seo_powersuite_df, left_on='Domain', right_on='target', how='left')
                                                domain_df.drop('target', axis=1, inplace=True)
                                                avg_domain_inlink_rank = round(domain_df['domain_inlink_rank'].mean(), 2)
                                                avg_domain_inlink_rank_less_than_70 = round(domain_df[domain_df['domain_inlink_rank'] < 70]['domain_inlink_rank'].mean(), 2)
                                                avg_refdomains = round(domain_df['refdomains'].mean(), 2)
                                                additional_analysis_data = {
                                                    'Originating Domain': [url] * 3,
                                                    'Metric': [
                                                        'Average domain inlink rank',
                                                        'Average domain inlink rank (< 70)',
                                                        'Average number of refdomains'
                                                    ],
                                                    'Value': [
                                                        avg_domain_inlink_rank,
                                                        avg_domain_inlink_rank_less_than_70,
                                                        avg_refdomains
                                                    ]
                                                }
                                                analysis_df = pd.concat([analysis_df, pd.DataFrame(additional_analysis_data)], ignore_index=True)
                                                desired_columns = ['Originating Domain', 'Domain', 'Count', 'In Peter Lowe List', 'DoFollow', 'Blacklist', 'domain_inlink_rank', 'refdomains']
                                                final_df = domain_df[desired_columns]
                                        else:
                                            desired_columns = ['Originating Domain', 'Domain', 'Count', 'In Peter Lowe List', 'DoFollow', 'Blacklist']
                                            final_df = domain_df[desired_columns]
                                    else:
                                        st.warning(f"No unique outbound domains found for {url} after filtering.")
                                else:
                                    st.warning(f"No unique outbound domains found for {url}.")
                                all_link_df = pd.concat([all_link_df, link_df], ignore_index=True)
                                all_unique_outbound_links_df = pd.concat([all_unique_outbound_links_df, unique_outbound_links_df], ignore_index=True)
                                all_final_df = pd.concat([all_final_df, final_df], ignore_index=True)
                                all_analysis_df = pd.concat([all_analysis_df, analysis_df], ignore_index=True)
                            else:
                                st.warning(f"No posts found in the sitemap for {url}.")
                    else:
                        st.warning(f"Sitemap not found for {url}.")
st.subheader("Crawled Pages") | |
if download_links: | |
st.markdown(download_csv(all_crawled_pages_df, "Crawled Pages"), unsafe_allow_html=True) | |
else: | |
st.write(all_crawled_pages_df) | |
st.subheader("Outbound Links") | |
if download_links: | |
st.markdown(download_csv(all_link_df, "Outbound Links"), unsafe_allow_html=True) | |
else: | |
st.write(all_link_df) | |
st.subheader("Unique Outbound Links") | |
if download_links: | |
st.markdown(download_csv(all_unique_outbound_links_df, "Unique Outbound Links"), unsafe_allow_html=True) | |
else: | |
st.write(all_unique_outbound_links_df) | |
st.subheader("Unique Outbound Domains") | |
if download_links: | |
st.markdown(download_csv(all_final_df, "Unique Outbound Domains"), unsafe_allow_html=True) | |
else: | |
st.write(all_final_df) | |
st.subheader("Analytics") | |
all_analysis_df = all_analysis_df.pivot(index='Originating Domain', columns='Metric', values='Value').reset_index() | |
all_analysis_df.columns.name = None | |
if use_seo_powersuite and api_key: | |
numeric_columns = ['Average domain inlink rank', 'Average domain inlink rank (< 70)', 'Average number of refdomains'] | |
all_analysis_df[numeric_columns] = all_analysis_df[numeric_columns].astype(int) | |
if download_links: | |
st.markdown(download_csv(all_analysis_df, "Analytics"), unsafe_allow_html=True) | |
else: | |
st.table(all_analysis_df) | |
            else:
                st.warning("Please enter at least one website URL.")
        else:
            st.warning("Please enter website URLs.")


if __name__ == '__main__':
    main()