import advertools as adv
import streamlit as st
import tempfile
import re
import pandas as pd
from urllib.parse import urlparse
import base64
import requests
import time

def get_seo_powersuite_data(domains, api_key):
    """Fetch domain inlink rank and referring-domain counts for a list of domains.

    Returns a DataFrame with columns target / domain_inlink_rank / refdomains,
    or None if any API call fails.
    """
    url_domain_inlink_rank = "https://api.seopowersuite.com/backlinks/v1.0/get-domain-inlink-rank"
    url_refdomains_count = "https://api.seopowersuite.com/backlinks/v1.0/get-refdomains-count"
    headers = {"Content-Type": "application/json"}

    results = []
    # Query the API in batches of 100 domains per request.
    for i in range(0, len(domains), 100):
        batch_domains = domains[i:i+100]

        start_time = time.time()
        payload_domain_inlink_rank = {"target": list(batch_domains)}
        params_domain_inlink_rank = {"apikey": api_key, "output": "json"}
        response_domain_inlink_rank = requests.post(url_domain_inlink_rank, json=payload_domain_inlink_rank, headers=headers, params=params_domain_inlink_rank, timeout=60)  # timeout is a defensive default
        duration = time.time() - start_time
        print(f"get-domain-inlink-rank API call for {len(batch_domains)} domains took {duration:.2f} seconds")

        if response_domain_inlink_rank.status_code == 200:
            data_domain_inlink_rank = response_domain_inlink_rank.json()
            domain_inlink_rank_dict = {page["url"]: page["domain_inlink_rank"] for page in data_domain_inlink_rank["pages"]}
        else:
            st.error(f"Error fetching domain inlink rank data from SEO PowerSuite API: {response_domain_inlink_rank.status_code}")
            st.error("Error Response:")
            st.write(response_domain_inlink_rank.text)
            return None

        start_time = time.time()
        payload_refdomains_count = {"target": list(batch_domains), "mode": "domain"}
        params_refdomains_count = {"apikey": api_key, "output": "json"}
        response_refdomains_count = requests.post(url_refdomains_count, json=payload_refdomains_count, headers=headers, params=params_refdomains_count, timeout=60)  # timeout is a defensive default
        duration = time.time() - start_time
        print(f"get-refdomains-count API call for {len(batch_domains)} domains took {duration:.2f} seconds")

        if response_refdomains_count.status_code == 200:
            data_refdomains_count = response_refdomains_count.json()
            for metric in data_refdomains_count["metrics"]:
                result = {
                    "target": metric["target"],
                    "domain_inlink_rank": domain_inlink_rank_dict.get(metric["target"], None),
                    "refdomains": metric["refdomains"]
                }
                results.append(result)
        else:
            st.error(f"Error fetching refdomains count data from SEO PowerSuite API: {response_refdomains_count.status_code}")
            st.error("Error Response:")
            st.write(response_refdomains_count.text)
            return None

    return pd.DataFrame(results)
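
# A minimal usage sketch (hypothetical domains and key; requires a valid
# SEO PowerSuite API key):
#
#   metrics_df = get_seo_powersuite_data(["example.com", "example.org"], "YOUR-API-KEY")
#   if metrics_df is not None:
#       print(metrics_df.head())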

def get_peter_lowe_domains():
    """Download Peter Lowe's ad/tracking server blocklist and return it as a set of domains."""
    url = "https://pgl.yoyo.org/adservers/serverlist.php?hostformat=adblockplus&mimetype=plaintext"
    response = requests.get(url, timeout=30)  # timeout is a defensive default
    lines = response.text.split('\n')
    # Entries use Adblock Plus syntax; strip the leading "||" and trailing "^".
    domains = [line.strip('|^') for line in lines if line.startswith('||')]
    return set(domains)
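
# Example of the transformation performed above (assuming the list contains
# this entry): "||doubleclick.net^" -> "doubleclick.net"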

def extract_hostname(url):
    # urlparse only fills netloc for absolute URLs, so inputs should include a scheme.
    return urlparse(url).netloc

def remove_subdomain(domain):
    # Naive registered-domain extraction: keep the last two labels.
    parts = domain.split('.')
    if len(parts) > 2:
        return '.'.join(parts[-2:])
    return domain
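
# The two-label heuristic above mishandles multi-part public suffixes
# ("www.bbc.co.uk" -> "co.uk"). A sketch of a stricter variant, assuming the
# third-party tldextract package is an acceptable dependency:
#
#   import tldextract
#
#   def remove_subdomain_strict(domain):
#       ext = tldextract.extract(domain)
#       return f"{ext.domain}.{ext.suffix}" if ext.suffix else domain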

def crawl_website(url, exclude_url_regex, page_count, concurrent_requests, depth_limit):
    with tempfile.NamedTemporaryFile(delete=False, suffix='.jl') as temp_file:
        custom_settings = {
            "CLOSESPIDER_PAGECOUNT": page_count,
            "CONCURRENT_REQUESTS_PER_DOMAIN": concurrent_requests,
            "DEPTH_LIMIT": depth_limit
        }
        if exclude_url_regex:
            adv.crawl(url, output_file=temp_file.name, follow_links=True, exclude_url_params=True, exclude_url_regex=exclude_url_regex, custom_settings=custom_settings)
        else:
            adv.crawl(url, output_file=temp_file.name, follow_links=True, exclude_url_params=True, custom_settings=custom_settings)
        crawl_df = pd.read_json(temp_file.name, lines=True)
        crawl_df['hostname'] = crawl_df['url'].apply(extract_hostname)

        desired_columns = ['url', 'hostname', 'title', 'meta_desc', 'status', 'links_url', 'links_text', 'links_nofollow']
        # reindex tolerates columns a small crawl may not produce (plain [] indexing would raise KeyError).
        crawl_df = crawl_df.reindex(columns=desired_columns)
        return crawl_df
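
# A minimal usage sketch (hypothetical URL and limits):
#
#   df = crawl_website("https://example.com", exclude_url_regex="cdn|wp-content",
#                      page_count=100, concurrent_requests=8, depth_limit=2)
#   print(df[['url', 'status']].head())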

def download_csv(df, filename):
    csv = df.to_csv(index=False)
    b64 = base64.b64encode(csv.encode()).decode()
    href = f'<a href="data:file/csv;base64,{b64}" download="{filename}.csv">Download {filename} CSV</a>'
    return href
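
# Note: base64 data URIs grow quickly for large frames; Streamlit's built-in
# st.download_button is a simpler alternative when an inline HTML link isn't required:
#
#   st.download_button("Download CSV", df.to_csv(index=False), file_name="data.csv")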

def main():
    st.title("Website Crawler")

    domains = st.text_area("Enter the website URLs (one per line):", value="")
    page_count = st.number_input("Enter the maximum number of pages to crawl:", value=5000, min_value=1, step=1)
    col1, col2 = st.columns(2)
    with col1:
        concurrent_requests = st.number_input("Enter the number of concurrent requests per domain:", value=8, min_value=1, step=1)
    with col2:
        depth_limit = st.number_input("Enter the depth limit for the crawl:", value=0, min_value=0, step=1)

    col1, col2 = st.columns(2)
    with col1:
        exclude_url_regex_input = st.text_area("Enter exclude URL regex patterns (one per line):", value="cdn\nwp-content")
    with col2:
        domain_filter_regex_input = st.text_area("Filter out unique domains with regex (one per line):", value="instagram\nfacebook\ntwitter\nlinkedin\nsnapchat\ntiktok\nreddit\npinterest\namazon\ncdn\nyoutube\nyoutu.be")

    col1, col2 = st.columns([2, 5])
    with col1:
        use_seo_powersuite = st.checkbox("Use SEO PowerSuite")
    with col2:
        api_key = None
        if use_seo_powersuite:
            api_key = st.text_input("Enter the SEO PowerSuite API key:", type="password", value="")
    download_links = st.checkbox("Show Download Links")

    if st.button("Crawl"):
        if domains:
            domain_list = [domain.strip() for domain in domains.split('\n') if domain.strip()]
            if domain_list:
                # Drop blank lines before joining: an empty alternative ("a||b") matches every URL.
                exclude_url_regex_patterns = [p.strip() for p in exclude_url_regex_input.split('\n') if p.strip()]
                exclude_url_regex = '|'.join(exclude_url_regex_patterns) if exclude_url_regex_patterns else None
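                # For example, the default patterns "cdn" and "wp-content" join into the
                # single alternation "cdn|wp-content", which the crawl then uses to
                # filter out matching URLs.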

                all_crawl_results = []
                all_link_df = []
                all_unique_links_df = []
                all_unique_outbound_links_df = []
                all_domain_df = []
                all_final_df = []
                all_analysis_df = []

                # Fetch the blocklist once for all domains instead of once per crawl.
                peter_lowe_domains = get_peter_lowe_domains()

                for domain in domain_list:
                    with st.spinner(f"Crawling {domain}..."):
                        crawl_results = crawl_website(domain, exclude_url_regex, page_count, concurrent_requests, depth_limit)
                        crawl_results.insert(0, 'Originating Domain', domain)
                        all_crawl_results.append(crawl_results)

                        if not crawl_results.empty:
                            # Escape the hostname so its dots match literally in the internal-link regex.
                            link_df = adv.crawlytics.links(crawl_results, internal_url_regex=re.escape(extract_hostname(domain)))
                            link_df.insert(0, 'Originating Domain', domain)
                            all_link_df.append(link_df)

                            unique_links_df = link_df['link'].value_counts().reset_index()
                            unique_links_df.columns = ['Link', 'Count']
                            unique_links_df.insert(0, 'Originating Domain', domain)
                            all_unique_links_df.append(unique_links_df)

                            # .copy() avoids pandas' SettingWithCopyWarning on the assignment below.
                            outbound_links_df = link_df[link_df['internal'] == False].copy()
                            unique_outbound_links_df = outbound_links_df['link'].value_counts().reset_index()
                            unique_outbound_links_df.columns = ['Link', 'Count']
                            unique_outbound_links_df.insert(0, 'Originating Domain', domain)
                            all_unique_outbound_links_df.append(unique_outbound_links_df)

                            outbound_links_df['link'] = outbound_links_df['link'].astype(str)
                            domain_df = outbound_links_df['link'].apply(extract_hostname).value_counts().reset_index()
                            domain_df.columns = ['Domain', 'Count']
                            domain_df = domain_df[domain_df['Domain'] != '']
                            domain_df['In Peter Lowe List'] = domain_df['Domain'].apply(lambda x: 'Yes' if remove_subdomain(x) in peter_lowe_domains else 'No')
                            domain_df.insert(0, 'Originating Domain', domain)
                            all_domain_df.append(domain_df)

                            if not domain_df.empty:
                                if domain_filter_regex_input:
                                    # Drop blank lines: an empty pattern would match (and filter out) every domain.
                                    domain_filter_regex_patterns = [p.strip() for p in domain_filter_regex_input.split('\n') if p.strip()]
                                    if domain_filter_regex_patterns:
                                        domain_filter_regex = '|'.join(domain_filter_regex_patterns)
                                        domain_df = domain_df[~domain_df['Domain'].str.contains(domain_filter_regex, case=False, regex=True)]

                                if use_seo_powersuite and api_key:
                                    seo_powersuite_df = get_seo_powersuite_data(domain_df['Domain'].tolist(), api_key)
                                    if seo_powersuite_df is not None:
                                        domain_df = pd.merge(domain_df, seo_powersuite_df, left_on='Domain', right_on='target', how='left')
                                        domain_df.drop('target', axis=1, inplace=True)

                                        desired_columns = ['Originating Domain', 'Domain', 'Count', 'In Peter Lowe List', 'domain_inlink_rank', 'refdomains']
                                        final_df = domain_df[desired_columns]
                                        all_final_df.append(final_df)

                                        total_domains = len(final_df)
                                        # Guard against division by zero when the regex filter removes every domain.
                                        if total_domains > 0:
                                            peter_lowe_percentage = round((final_df['In Peter Lowe List'] == 'No').sum() / total_domains * 100, 2)
                                            avg_domain_inlink_rank = round(final_df['domain_inlink_rank'].mean(), 2)
                                            avg_domain_inlink_rank_less_than_70 = round(final_df[final_df['domain_inlink_rank'] < 70]['domain_inlink_rank'].mean(), 2)
                                            avg_refdomains = round(final_df['refdomains'].mean(), 2)

                                            analysis_data = {
                                                'Originating Domain': [domain] * 4,
                                                'Metric': [
                                                    'Percentage of domains not in Peter Lowe\'s list',
                                                    'Average domain inlink rank',
                                                    'Average domain inlink rank (< 70)',
                                                    'Average number of refdomains'
                                                ],
                                                'Value': [
                                                    f"{peter_lowe_percentage}%",
                                                    avg_domain_inlink_rank,
                                                    avg_domain_inlink_rank_less_than_70,
                                                    avg_refdomains
                                                ]
                                            }

                                            analysis_df = pd.DataFrame(analysis_data)
                                            all_analysis_df.append(analysis_df)

                                else:
                                    desired_columns = ['Originating Domain', 'Domain', 'Count', 'In Peter Lowe List']
                                    final_df = domain_df[desired_columns]
                                    all_final_df.append(final_df)

                if all_crawl_results:
                    st.subheader("Crawl Results")
                    combined_crawl_results = pd.concat(all_crawl_results, ignore_index=True)
                    if download_links:
                        st.markdown(download_csv(combined_crawl_results, "Crawl Results"), unsafe_allow_html=True)
                    else:
                        st.write(combined_crawl_results)

                if all_link_df:
                    st.subheader("All Links")
                    combined_link_df = pd.concat(all_link_df, ignore_index=True)
                    if download_links:
                        st.markdown(download_csv(combined_link_df, "All Links"), unsafe_allow_html=True)
                    else:
                        st.write(combined_link_df)

                if all_unique_links_df:
                    st.subheader("Unique Links")
                    combined_unique_links_df = pd.concat(all_unique_links_df, ignore_index=True)
                    if download_links:
                        st.markdown(download_csv(combined_unique_links_df, "Unique Links"), unsafe_allow_html=True)
                    else:
                        st.write(combined_unique_links_df)

                if all_unique_outbound_links_df:
                    st.subheader("Unique Outbound Links")
                    combined_unique_outbound_links_df = pd.concat(all_unique_outbound_links_df, ignore_index=True)
                    if download_links:
                        st.markdown(download_csv(combined_unique_outbound_links_df, "Unique Outbound Links"), unsafe_allow_html=True)
                    else:
                        st.write(combined_unique_outbound_links_df)

                if all_final_df:
                    st.subheader("Unique Outbound Domains")
                    combined_final_df = pd.concat(all_final_df, ignore_index=True)
                    if download_links:
                        st.markdown(download_csv(combined_final_df, "Unique Outbound Domains"), unsafe_allow_html=True)
                    else:
                        st.write(combined_final_df)

                if all_analysis_df:
                    st.subheader("Analytics")
                    combined_analysis_df = pd.concat(all_analysis_df)
                    combined_analysis_df = combined_analysis_df.pivot(index='Originating Domain', columns='Metric', values='Value').reset_index()
                    combined_analysis_df.columns.name = None
                    numeric_columns = ['Average domain inlink rank', 'Average domain inlink rank (< 70)', 'Average number of refdomains']
                    # Nullable Int64 tolerates NaN averages (a plain int cast raises on missing values).
                    combined_analysis_df[numeric_columns] = combined_analysis_df[numeric_columns].astype(float).round().astype('Int64')
                    combined_analysis_df = combined_analysis_df[[
                        'Originating Domain',
                        'Percentage of domains not in Peter Lowe\'s list',
                        'Average domain inlink rank',
                        'Average domain inlink rank (< 70)',
                        'Average number of refdomains'
                    ]]
                    if download_links:
                        st.markdown(download_csv(combined_analysis_df, "Analytics"), unsafe_allow_html=True)
                    else:
                        st.table(combined_analysis_df)
            else:
                st.warning("Please enter at least one website URL.")
        else:
            st.warning("Please enter the website URLs.")


if __name__ == '__main__':
    main()