Spaces:

joshuadunlop
/

Bulk-Website-Crawler

Running

File size: 16,117 Bytes

68329f5

import base64
import os
import tempfile
import time
from urllib.parse import urlparse

import advertools as adv
import pandas as pd
import requests
import streamlit as st

def _post_seo_powersuite(url, payload, api_key, endpoint, description, timeout=60):
    """POST one batch to an SEO PowerSuite endpoint and return the decoded JSON.

    Args:
        url: Full endpoint URL.
        payload: JSON body; must contain a ``"target"`` list (used for logging).
        api_key: API key sent as the ``apikey`` query parameter.
        endpoint: Short endpoint name used in the timing log line.
        description: Human-readable data name used in error messages.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body on HTTP 200, otherwise ``None`` after the
        problem has been reported through Streamlit.
    """
    headers = {"Content-Type": "application/json"}
    params = {"apikey": api_key, "output": "json"}

    start_time = time.time()
    try:
        response = requests.post(url, json=payload, headers=headers, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Report transport failures (timeouts, DNS, connection resets) the same
        # way as HTTP errors instead of letting them escape as a traceback.
        st.error(f"Error fetching {description} data from SEO PowerSuite API: {exc}")
        return None
    duration = time.time() - start_time
    print(f"{endpoint} API call for {len(payload['target'])} domains took {duration:.2f} seconds")

    if response.status_code != 200:
        st.error(f"Error fetching {description} data from SEO PowerSuite API: {response.status_code}")
        st.error("Error Response:")
        st.write(response.text)
        return None
    return response.json()


def get_seo_powersuite_data(domains, api_key):
    """Look up domain inlink rank and referring-domain counts for ``domains``.

    Domains are sent in batches of 100. For every batch two endpoints are
    queried (``get-domain-inlink-rank`` and ``get-refdomains-count``) and the
    results are joined on the target domain.

    Args:
        domains: Sequence of domain names.
        api_key: SEO PowerSuite API key.

    Returns:
        A DataFrame with the columns ``target``, ``domain_inlink_rank`` and
        ``refdomains`` (always present, even when there are no rows), or
        ``None`` if any API call failed.
    """
    url_domain_inlink_rank = "https://api.seopowersuite.com/backlinks/v1.0/get-domain-inlink-rank"
    url_refdomains_count = "https://api.seopowersuite.com/backlinks/v1.0/get-refdomains-count"
    batch_size = 100

    results = []
    for start in range(0, len(domains), batch_size):
        batch_domains = list(domains[start:start + batch_size])

        rank_data = _post_seo_powersuite(
            url_domain_inlink_rank,
            {"target": batch_domains},
            api_key,
            "get-domain-inlink-rank",
            "domain inlink rank",
        )
        if rank_data is None:
            return None
        rank_by_url = {page["url"]: page["domain_inlink_rank"] for page in rank_data["pages"]}

        refdomains_data = _post_seo_powersuite(
            url_refdomains_count,
            {"target": batch_domains, "mode": "domain"},
            api_key,
            "get-refdomains-count",
            "refdomains count",
        )
        if refdomains_data is None:
            return None

        results.extend(
            {
                "target": metric["target"],
                # Targets missing from the rank response get None.
                "domain_inlink_rank": rank_by_url.get(metric["target"]),
                "refdomains": metric["refdomains"],
            }
            for metric in refdomains_data["metrics"]
        )

    # Fix the schema explicitly so an empty result still exposes "target";
    # callers merge on that column and would otherwise hit a KeyError.
    return pd.DataFrame(results, columns=["target", "domain_inlink_rank", "refdomains"])

def get_peter_lowe_domains():
    """Download Peter Lowe's ad-server list and return its domains as a set.

    The list is requested in Adblock Plus host format; only lines starting
    with ``||`` are kept, and the surrounding ``|`` / ``^`` marker characters
    are stripped from each kept line.

    Returns:
        A set of domain strings.

    Raises:
        requests.RequestException: If the download fails or times out.
    """
    url = "https://pgl.yoyo.org/adservers/serverlist.php?hostformat=adblockplus&mimetype=plaintext"
    # Bound the wait so a stalled server cannot hang the app indefinitely.
    response = requests.get(url, timeout=30)
    # splitlines() also handles CRLF line endings; with split('\n') a trailing
    # '\r' would shield the closing '^' from being stripped.
    return {
        line.strip().strip('|^')
        for line in response.text.splitlines()
        if line.startswith('||')
    }

def extract_hostname(url):
    """Return the network-location (``netloc``) part of ``url``.

    This is whatever sits between ``//`` and the path, so it includes a port
    if one is given. A string without a ``//`` authority marker yields ``''``.
    """
    parsed = urlparse(url)
    return parsed.netloc

def remove_subdomain(domain):
    """Reduce ``domain`` to its last two dot-separated labels.

    Names with two labels or fewer come back unchanged. No public-suffix
    handling is done, so ``www.example.co.uk`` becomes ``co.uk``.
    """
    # Splitting from the right at most twice isolates the final two labels.
    trailing_labels = domain.rsplit('.', 2)[-2:]
    return '.'.join(trailing_labels)

def crawl_website(url, exclude_url_regex, page_count, concurrent_requests, depth_limit):
    """Crawl ``url`` with advertools and return a trimmed DataFrame of the pages.

    Args:
        url: Start URL of the crawl.
        exclude_url_regex: Regex of URLs to skip; falsy means no exclusion.
        page_count: Passed as the ``CLOSESPIDER_PAGECOUNT`` setting.
        concurrent_requests: Passed as ``CONCURRENT_REQUESTS_PER_DOMAIN``.
        depth_limit: Passed as ``DEPTH_LIMIT``.

    Returns:
        A DataFrame with exactly the columns ``url``, ``hostname``, ``title``,
        ``meta_desc``, ``status``, ``links_url``, ``links_text`` and
        ``links_nofollow``. Columns absent from the crawl output are filled
        with missing values; an empty crawl yields an empty frame.
    """
    desired_columns = ['url', 'hostname', 'title', 'meta_desc', 'status', 'links_url', 'links_text', 'links_nofollow']
    crawl_kwargs = {
        "follow_links": True,
        "exclude_url_params": True,
        "custom_settings": {
            "CLOSESPIDER_PAGECOUNT": page_count,
            "CONCURRENT_REQUESTS_PER_DOMAIN": concurrent_requests,
            "DEPTH_LIMIT": depth_limit,
        },
    }
    if exclude_url_regex:
        crawl_kwargs["exclude_url_regex"] = exclude_url_regex

    # Reserve a unique ".jl" path and release our own handle before crawling.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.jl') as temp_file:
        output_path = temp_file.name
    try:
        adv.crawl(url, output_file=output_path, **crawl_kwargs)
        if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
            crawl_df = pd.read_json(output_path, lines=True)
        else:
            # Nothing was written: represent that as an empty frame.
            crawl_df = pd.DataFrame()
    finally:
        # The file was created with delete=False, so remove it ourselves;
        # otherwise every crawl leaves a file behind in the temp directory.
        try:
            os.remove(output_path)
        except FileNotFoundError:
            pass

    if 'url' in crawl_df.columns:
        crawl_df['hostname'] = crawl_df['url'].apply(extract_hostname)

    # Add any column the crawl did not produce, so the selection below cannot
    # raise KeyError. Object dtype keeps them usable with string operations.
    for column in desired_columns:
        if column not in crawl_df.columns:
            crawl_df[column] = pd.Series(index=crawl_df.index, dtype=object)
    return crawl_df[desired_columns]

def download_csv(df, filename):
    """Return an HTML anchor that downloads ``df`` as ``<filename>.csv``.

    The frame is serialised without its index and embedded in the link as a
    base64 ``data:`` URI, so no file is written.
    """
    csv_bytes = df.to_csv(index=False).encode()
    encoded_payload = base64.b64encode(csv_bytes).decode()
    return (
        f'<a href="data:file/csv;base64,{encoded_payload}" '
        f'download="{filename}.csv">Download {filename} CSV</a>'
    )

def _split_patterns(text):
    """Return the stripped, non-blank lines of a multi-line text box."""
    return [line.strip() for line in text.splitlines() if line.strip()]


def _count_links(links, domain):
    """Count occurrences of each value in ``links`` and tag rows with ``domain``."""
    counts = links.value_counts().reset_index()
    counts.columns = ['Link', 'Count']
    counts.insert(0, 'Originating Domain', domain)
    return counts


def _int_or_none(value):
    """Truncate a numeric value to ``int``; map a missing value to ``None``."""
    return None if pd.isna(value) else int(value)


def _analytics_row(domain, final_df):
    """Summarise one site's non-empty outbound-domain table as one analytics row."""
    # Coerce to numeric so None / non-numeric API values become NaN instead of
    # breaking the "< 70" comparison or the mean.
    ranks = pd.to_numeric(final_df['domain_inlink_rank'], errors='coerce')
    refdomains = pd.to_numeric(final_df['refdomains'], errors='coerce')
    not_listed = (final_df['In Peter Lowe List'] == 'No').sum()
    peter_lowe_percentage = round(not_listed / len(final_df) * 100, 2)
    return {
        'Originating Domain': domain,
        'Percentage of domains not in Peter Lowe\'s list': f"{peter_lowe_percentage}%",
        'Average domain inlink rank': _int_or_none(round(ranks.mean(), 2)),
        'Average domain inlink rank (< 70)': _int_or_none(round(ranks[ranks < 70].mean(), 2)),
        'Average number of refdomains': _int_or_none(round(refdomains.mean(), 2)),
    }


def _show_section(title, frames, download_links):
    """Show the concatenated ``frames`` under ``title`` as a table or a CSV link."""
    if not frames:
        return
    st.subheader(title)
    combined = pd.concat(frames, ignore_index=True)
    if download_links:
        st.markdown(download_csv(combined, title), unsafe_allow_html=True)
    else:
        st.write(combined)


def main():
    """Render the crawler UI and, on "Crawl", analyse each entered site.

    For every site the page crawl, all links, unique links, unique outbound
    links and unique outbound domains are collected. When SEO PowerSuite is
    enabled and an API key is given, outbound domains are enriched with rank
    and referring-domain metrics and a per-site analytics row is produced.
    Each result set is shown as a table or, if requested, as a CSV link.
    """
    st.title("Website Crawler")

    domains = st.text_area("Enter the website URLs (one per line):", value="")
    page_count = st.number_input("Enter the maximum number of pages to crawl:", value=100, min_value=1, step=1)
    col1, col2 = st.columns(2)
    with col1:
        concurrent_requests = st.number_input("Enter the number of concurrent requests per domain:", value=8, min_value=1, step=1)
    with col2:
        depth_limit = st.number_input("Enter the depth limit for the crawl:", value=0, min_value=0, step=1)

    col1, col2 = st.columns(2)
    with col1:
        exclude_url_regex_input = st.text_area("Enter exclude URL regex patterns (one per line):", value="cdn")
    with col2:
        domain_filter_regex_input = st.text_area("Filter our unique domains with regex (one per line):", value="instagram\nfacebook\ntwitter\nlinkedin\nsnapchat\ntiktok\nreddit\npinterest\namazon\ncdn\nyoutube\nyoutu.be")

    col1, col2 = st.columns([2, 5])
    with col1:
        use_seo_powersuite = st.checkbox("Use SEO PowerSuite")
    with col2:
        api_key = None
        if use_seo_powersuite:
            api_key = st.text_input("Enter the SEO PowerSuite API key:", type="password", value="")
    download_links = st.checkbox("Show Download Links")

    if not st.button("Crawl"):
        return
    if not domains:
        st.warning("Please enter the website URLs.")
        return
    domain_list = _split_patterns(domains)
    if not domain_list:
        st.warning("Please enter at least one website URL.")
        return

    # Blank lines are dropped before joining: an empty alternative ("cdn|")
    # would match every URL or domain and silently exclude everything.
    exclude_url_regex = '|'.join(_split_patterns(exclude_url_regex_input)) or None
    domain_filter_regex = '|'.join(_split_patterns(domain_filter_regex_input))

    basic_columns = ['Originating Domain', 'Domain', 'Count', 'In Peter Lowe List']
    seo_columns = basic_columns + ['domain_inlink_rank', 'refdomains']

    all_crawl_results = []
    all_link_df = []
    all_unique_links_df = []
    all_unique_outbound_links_df = []
    all_final_df = []
    analytics_rows = []
    # Downloaded lazily on first use and then shared by all sites.
    peter_lowe_domains = None

    for domain in domain_list:
        with st.spinner(f"Crawling {domain}..."):
            crawl_results = crawl_website(domain, exclude_url_regex, page_count, concurrent_requests, depth_limit)
            crawl_results.insert(0, 'Originating Domain', domain)
            all_crawl_results.append(crawl_results)
            if crawl_results.empty:
                continue

            link_df = adv.crawlytics.links(crawl_results, internal_url_regex=extract_hostname(domain))
            link_df.insert(0, 'Originating Domain', domain)
            all_link_df.append(link_df)
            all_unique_links_df.append(_count_links(link_df['link'], domain))

            outbound_links_df = link_df[link_df['internal'].eq(False)]
            all_unique_outbound_links_df.append(_count_links(outbound_links_df['link'], domain))

            # Derive hostnames from a converted copy of the column rather than
            # assigning back into the filtered slice.
            outbound_hosts = outbound_links_df['link'].astype(str).apply(extract_hostname)
            domain_df = outbound_hosts.value_counts().reset_index()
            domain_df.columns = ['Domain', 'Count']
            domain_df = domain_df[domain_df['Domain'] != ''].copy()

            if peter_lowe_domains is None:
                peter_lowe_domains = get_peter_lowe_domains()
            domain_df['In Peter Lowe List'] = [
                'Yes' if remove_subdomain(host) in peter_lowe_domains else 'No'
                for host in domain_df['Domain']
            ]
            domain_df.insert(0, 'Originating Domain', domain)

            if domain_filter_regex and not domain_df.empty:
                filtered_out = domain_df['Domain'].str.contains(domain_filter_regex, case=False, regex=True)
                domain_df = domain_df[~filtered_out]

            seo_powersuite_df = None
            if use_seo_powersuite and api_key and not domain_df.empty:
                seo_powersuite_df = get_seo_powersuite_data(domain_df['Domain'].tolist(), api_key)

            if seo_powersuite_df is None or 'target' not in seo_powersuite_df.columns:
                # SEO lookup disabled, not applicable, or failed (the error has
                # already been shown): still report the crawl-derived table.
                all_final_df.append(domain_df[basic_columns])
                continue

            merged_df = pd.merge(domain_df, seo_powersuite_df, left_on='Domain', right_on='target', how='left')
            final_df = merged_df.drop(columns='target')[seo_columns]
            all_final_df.append(final_df)
            if not final_df.empty:
                analytics_rows.append(_analytics_row(domain, final_df))

    _show_section("Crawl Results", all_crawl_results, download_links)
    _show_section("All Links", all_link_df, download_links)
    _show_section("Unique Links", all_unique_links_df, download_links)
    _show_section("Unique Outbound Links", all_unique_outbound_links_df, download_links)
    _show_section("Unique Outbound Domains", all_final_df, download_links)

    if analytics_rows:
        st.subheader("Analytics")
        # dtype=object keeps whole-number averages as ints next to missing
        # values; rows are ordered by originating domain.
        combined_analysis_df = (
            pd.DataFrame(analytics_rows, dtype=object)
            .sort_values('Originating Domain', kind='stable')
            .reset_index(drop=True)
        )
        if download_links:
            st.markdown(download_csv(combined_analysis_df, "Analytics"), unsafe_allow_html=True)
        else:
            st.table(combined_analysis_df)
            
# Start the app only when this file is executed as the main module,
# not when it is imported.
if __name__ == '__main__':
    main()