import streamlit as st
import requests
import pandas as pd
from bs4 import BeautifulSoup
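
# Dependency note (assumptions about the environment): BeautifulSoup's "lxml-xml"
# parser used below requires the lxml package, and DataFrame.to_excel needs an
# Excel writer such as openpyxl installed alongside the imports above.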

# Helper Functions
COMMON_SITEMAP_LOCATIONS = [
    "/sitemap.xml",
    "/sitemap_index.xml",
    "/sitemap-index.xml",
    "/sitemap.php",
    "/sitemap.txt",
    "/sitemap.xml.gz",
    "/sitemap/",
    "/sitemap/sitemap.xml",
    "/sitemapindex.xml",
    "/sitemap/index.xml",
    "/sitemap1.xml",
    "/rss/",
    "/rss.xml",
    "/atom.xml",
]


def find_sitemap(domain):
    """Locate the sitemap URL by checking common locations and robots.txt."""
    # Probe the common sitemap paths first.
    for path in COMMON_SITEMAP_LOCATIONS:
        sitemap_url = domain.rstrip("/") + path
        try:
            response = requests.get(sitemap_url, timeout=5)
            if response.status_code == 200:
                return sitemap_url
        except requests.RequestException:
            continue
    # Fall back to robots.txt, which may declare the sitemap in a "Sitemap:" directive.
    robots_url = domain.rstrip("/") + "/robots.txt"
    try:
        response = requests.get(robots_url, timeout=5)
        if response.status_code == 200:
            for line in response.text.splitlines():
                if line.lower().startswith("sitemap:"):
                    return line.split(":", 1)[1].strip()
    except requests.RequestException:
        pass
    return None
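
# Illustrative behavior (hypothetical domain): if the site serves a sitemap at the
# default path, find_sitemap("https://example.com") returns
# "https://example.com/sitemap.xml"; if neither the common paths nor robots.txt
# yield one, it returns None.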


def get_sitemap_links(sitemap_url):
    """Fetch all links from a sitemap."""
    # A timeout keeps the app from hanging on an unresponsive server.
    response = requests.get(sitemap_url, timeout=10)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch sitemap: {response.status_code}")
    soup = BeautifulSoup(response.content, "lxml-xml")
    links = [loc.text.strip() for loc in soup.find_all("loc")]
    return links
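
# Note: if sitemap_url points to a sitemap index, the <loc> entries collected here
# are themselves child sitemap URLs; they would need another get_sitemap_links call
# to expand into page links, since the function does not recurse automatically.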


def is_blog_link(link):
    """Filter links for blog-related URLs."""
    return "blog" in link.lower()


def extract_article_info(url):
    """Extract the article content from a URL."""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, "html.parser")
        h1 = soup.find("h1")
        heading = h1.get_text(strip=True) if h1 else None
        all_paragraphs = [p.get_text(strip=True) for p in soup.find_all("p")]
        article_text = "\n\n".join(all_paragraphs)
        full_article_text = f"{heading}\n\n{article_text}" if heading else article_text
        return full_article_text
    except Exception as e:
        return f"Error fetching article: {e}"


# Streamlit App
st.title("Blog Article Scraper")

# Input Website URL
website_url = st.text_input("Enter the website URL (e.g., https://example.com):")

if st.button("Start Scraping"):
    if not website_url:
        st.error("Please enter a website URL.")
    else:
        st.write("Locating sitemap...")
        sitemap_url = find_sitemap(website_url)
        if not sitemap_url:
            st.error("Could not locate a sitemap.")
        else:
            st.success(f"Sitemap found: {sitemap_url}")
            st.write("Fetching links from sitemap...")
            try:
                links = get_sitemap_links(sitemap_url)
                blog_links = [link for link in links if is_blog_link(link)]
                st.success(f"Found {len(blog_links)} blog links.")

                # Scrape each blog article
                st.write("Extracting article content...")
                data = []
                progress = st.progress(0)
                for i, link in enumerate(blog_links):
                    article_text = extract_article_info(link)
                    data.append({"URL": link, "Article Text": article_text})
                    progress.progress((i + 1) / len(blog_links))

                # Save results to a DataFrame and display them
                df = pd.DataFrame(data)
                st.write("Scraping completed.")
                st.dataframe(df)

                # Provide a download link for the Excel file
                file_name = "blog_articles.xlsx"
                df.to_excel(file_name, index=False)
                with open(file_name, "rb") as file:
                    st.download_button(
                        label="Download Excel File",
                        data=file,
                        file_name="blog_articles.xlsx",
                        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                    )
            except Exception as e:
                st.error(f"Error during scraping: {e}")