import streamlit as st
import requests
import pandas as pd
from bs4 import BeautifulSoup
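
# Dependency note (assumptions about the environment): BeautifulSoup's "lxml-xml"
# parser used below requires the lxml package, and DataFrame.to_excel needs an
# Excel writer such as openpyxl installed alongside the imports above.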

# Helper Functions
COMMON_SITEMAP_LOCATIONS = [
    "/sitemap.xml",
    "/sitemap_index.xml",
    "/sitemap-index.xml",
    "/sitemap.php",
    "/sitemap.txt",
    "/sitemap.xml.gz",
    "/sitemap/",
    "/sitemap/sitemap.xml",
    "/sitemapindex.xml",
    "/sitemap/index.xml",
    "/sitemap1.xml",
    "/rss/",
    "/rss.xml",
    "/atom.xml",
]


def find_sitemap(domain):
    """Locate the sitemap URL by checking common locations and robots.txt."""
    # Probe the common sitemap paths first.
    for path in COMMON_SITEMAP_LOCATIONS:
        sitemap_url = domain.rstrip("/") + path
        try:
            response = requests.get(sitemap_url, timeout=5)
            if response.status_code == 200:
                return sitemap_url
        except requests.RequestException:
            continue
    # Fall back to robots.txt, which may declare the sitemap in a "Sitemap:" directive.
    robots_url = domain.rstrip("/") + "/robots.txt"
    try:
        response = requests.get(robots_url, timeout=5)
        if response.status_code == 200:
            for line in response.text.splitlines():
                if line.lower().startswith("sitemap:"):
                    return line.split(":", 1)[1].strip()
    except requests.RequestException:
        pass
    return None
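
# Illustrative behavior (hypothetical domain): if the site serves a sitemap at the
# default path, find_sitemap("https://example.com") returns
# "https://example.com/sitemap.xml"; if neither the common paths nor robots.txt
# yield one, it returns None.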


def get_sitemap_links(sitemap_url):
    """Fetch all links from a sitemap."""
    # A timeout keeps the app from hanging on an unresponsive server.
    response = requests.get(sitemap_url, timeout=10)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch sitemap: {response.status_code}")
    soup = BeautifulSoup(response.content, "lxml-xml")
    links = [loc.text.strip() for loc in soup.find_all("loc")]
    return links
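
# Note: if sitemap_url points to a sitemap index, the <loc> entries collected here
# are themselves child sitemap URLs; they would need another get_sitemap_links call
# to expand into page links, since the function does not recurse automatically.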


def is_blog_link(link):
    """Filter links for blog-related URLs."""
    return "blog" in link.lower()


def extract_article_info(url):
    """Extract the article content from a URL."""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, "html.parser")
        h1 = soup.find("h1")
        heading = h1.get_text(strip=True) if h1 else None
        all_paragraphs = [p.get_text(strip=True) for p in soup.find_all("p")]
        article_text = "\n\n".join(all_paragraphs)
        full_article_text = f"{heading}\n\n{article_text}" if heading else article_text
        return full_article_text
    except Exception as e:
        return f"Error fetching article: {e}"


# Streamlit App
st.title("Blog Article Scraper")

# Input Website URL
website_url = st.text_input("Enter the website URL (e.g., https://example.com):")

if st.button("Start Scraping"):
    if not website_url:
        st.error("Please enter a website URL.")
    else:
        st.write("Locating sitemap...")
        sitemap_url = find_sitemap(website_url)
        if not sitemap_url:
            st.error("Could not locate a sitemap.")
        else:
            st.success(f"Sitemap found: {sitemap_url}")
            st.write("Fetching links from sitemap...")
            try:
                links = get_sitemap_links(sitemap_url)
                blog_links = [link for link in links if is_blog_link(link)]
                st.success(f"Found {len(blog_links)} blog links.")

                # Scrape each blog article
                st.write("Extracting article content...")
                data = []
                progress = st.progress(0)
                for i, link in enumerate(blog_links):
                    article_text = extract_article_info(link)
                    data.append({"URL": link, "Article Text": article_text})
                    progress.progress((i + 1) / len(blog_links))

                # Save results to a DataFrame and display them
                df = pd.DataFrame(data)
                st.write("Scraping completed.")
                st.dataframe(df)

                # Provide a download link for the Excel file
                file_name = "blog_articles.xlsx"
                df.to_excel(file_name, index=False)
                with open(file_name, "rb") as file:
                    st.download_button(
                        label="Download Excel File",
                        data=file,
                        file_name="blog_articles.xlsx",
                        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                    )
            except Exception as e:
                st.error(f"Error during scraping: {e}")