import streamlit as st
import requests
import pandas as pd
from bs4 import BeautifulSoup
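# Third-party requirements (assumed installed): streamlit, requests, pandas,
# beautifulsoup4, lxml (for the "lxml-xml" parser), and an Excel writer such
# as openpyxl for DataFrame.to_excel().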

# Helper Functions
COMMON_SITEMAP_LOCATIONS = [
    "/sitemap.xml",
    "/sitemap_index.xml",
    "/sitemap-index.xml",
    "/sitemap.php",
    "/sitemap.txt",
    "/sitemap.xml.gz",
    "/sitemap/",
    "/sitemap/sitemap.xml",
    "/sitemapindex.xml",
    "/sitemap/index.xml",
    "/sitemap1.xml",
    "/rss/",
    "/rss.xml",
    "/atom.xml",
]
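
# The paths above are probed in order against the domain root; find_sitemap()
# falls back to the Sitemap directive in robots.txt if none return HTTP 200.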


def find_sitemap(domain):
    """Locate the sitemap URL by checking common locations and robots.txt."""
    for path in COMMON_SITEMAP_LOCATIONS:
        sitemap_url = domain.rstrip("/") + path
        try:
            response = requests.get(sitemap_url, timeout=5)
            if response.status_code == 200:
                return sitemap_url
        except requests.RequestException:
            continue

    robots_url = domain.rstrip("/") + "/robots.txt"
    try:
        response = requests.get(robots_url, timeout=5)
        if response.status_code == 200:
            for line in response.text.splitlines():
                if line.lower().startswith("sitemap:"):
                    return line.split(":", 1)[1].strip()
    except requests.RequestException:
        pass
    return None
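
# Example (hypothetical): find_sitemap("https://example.com") would return
# "https://example.com/sitemap.xml" if that URL responds with HTTP 200, the
# URL declared after "Sitemap:" in robots.txt otherwise, or None if neither
# check succeeds.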


def get_sitemap_links(sitemap_url):
    """Fetch all links from a sitemap, following nested sitemap index files."""
    response = requests.get(sitemap_url, timeout=10)
    if response.status_code != 200:
        raise RuntimeError(f"Failed to fetch sitemap: {response.status_code}")

    soup = BeautifulSoup(response.content, "lxml-xml")

    # A sitemap index lists other sitemaps in its <loc> tags; recurse into each
    # sub-sitemap so the caller always receives page URLs.
    if soup.find("sitemapindex"):
        links = []
        for loc in soup.find_all("loc"):
            links.extend(get_sitemap_links(loc.text.strip()))
        return links

    return [loc.text.strip() for loc in soup.find_all("loc")]


def is_blog_link(link):
    """Return True if the URL looks blog-related (simple "blog" substring check)."""
    return "blog" in link.lower()


def extract_article_info(url):
    """Extract the article content from a URL."""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, "html.parser")

        heading = soup.find("h1").get_text(strip=True) if soup.find("h1") else None
        all_paragraphs = [p.get_text(strip=True) for p in soup.find_all("p")]
        article_text = "\n\n".join(all_paragraphs)
        full_article_text = f"{heading}\n\n{article_text}" if heading else article_text
        return full_article_text
    except Exception as e:
        return f"Error fetching article: {e}"


# Streamlit App
st.title("Blog Article Scraper")

# Input Website URL
website_url = st.text_input("Enter the website URL (e.g., https://example.com):")

if st.button("Start Scraping"):
    if not website_url:
        st.error("Please enter a website URL.")
    else:
        st.write("Locating sitemap...")
        sitemap_url = find_sitemap(website_url)

        if not sitemap_url:
            st.error("Could not locate a sitemap.")
        else:
            st.success(f"Sitemap found: {sitemap_url}")
            st.write("Fetching links from sitemap...")
            try:
                links = get_sitemap_links(sitemap_url)
                blog_links = [link for link in links if is_blog_link(link)]
                st.success(f"Found {len(blog_links)} blog links.")

                # Scraping articles
                st.write("Extracting article content...")
                data = []
                progress = st.progress(0)
                for i, link in enumerate(blog_links):
                    article_text = extract_article_info(link)
                    data.append({"URL": link, "Article Text": article_text})
                    progress.progress((i + 1) / len(blog_links))

                # Save results to DataFrame and Display
                df = pd.DataFrame(data)
                st.write("Scraping completed.")
                st.dataframe(df)

                # Provide download link for Excel file
                file_name = "blog_articles.xlsx"
                df.to_excel(file_name, index=False)
                with open(file_name, "rb") as file:
                    st.download_button(
                        label="Download Excel File",
                        data=file,
                        file_name="blog_articles.xlsx",
                        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                    )
            except Exception as e:
                st.error(f"Error during scraping: {e}")