import gradio as gr
import requests
from bs4 import BeautifulSoup
import pdfkit
import os
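# Note: pdfkit is a thin wrapper around the wkhtmltopdf binary; wkhtmltopdf must be
# installed and available on the PATH for pdfkit.from_file() below to work.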
# Function to extract all links from a website
def extract_links(url):
    try:
        response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
        if response.status_code != 200:
            return f"Error: Unable to fetch page (Status Code {response.status_code})", []

        soup = BeautifulSoup(response.text, "html.parser")
        base_url = "/".join(url.split("/")[:3])  # Extract scheme + domain as the base URL

        links = []
        for a_tag in soup.find_all("a", href=True):
            href = a_tag["href"]
            if not href.startswith("http"):  # Convert relative links to absolute
                href = base_url + href if href.startswith("/") else base_url + "/" + href
            links.append(href)

        links = list(set(links))  # Remove duplicates

        if not links:
            return "No links found on the website.", []

        return f"{len(links)} links found! Select which ones to convert into PDFs:", links
    except Exception as e:
        return f"Error: {str(e)}", []
# Function to clean unwanted content (like headers, footers, etc.)
def clean_content(soup):
    # Remove common unwanted elements
    for tag in soup.find_all(["header", "footer", "nav", "aside", "script", "style"]):
        tag.decompose()  # Remove the tag completely

    # You can also remove specific classes or IDs if necessary, for example:
    # for tag in soup.find_all(attrs={"class": "footer"}):
    #     tag.decompose()

    # Get the cleaned text from the remaining content
    return soup.get_text(separator="\n", strip=True)
# Function to scrape selected links and generate PDFs
def scrape_and_generate_pdfs(selected_links):
    try:
        if not selected_links:
            return None  # Nothing selected, so there are no files to return

        pdf_files = []
        batch_size = 4  # Each PDF contains up to 4 links

        # Process selected links in batches of 4
        for i in range(0, len(selected_links), batch_size):
            batch_links = selected_links[i:i + batch_size]
            all_text = ""

            # Scrape text content from each selected link
            for link in batch_links:
                try:
                    response = requests.get(link, headers={"User-Agent": "Mozilla/5.0"})
                    if response.status_code == 200:
                        soup = BeautifulSoup(response.text, "html.parser")
                        page_text = clean_content(soup)
                        all_text += f"--- Content from: {link} ---\n\n" + page_text + "\n\n"
                except Exception:
                    all_text += f"Failed to fetch content from {link}\n\n"

            if all_text:
                pdf_filename = f"output_{(i // batch_size) + 1}.pdf"

                # Save as a temporary HTML file
                html_path = f"temp_{i}.html"
                with open(html_path, "w", encoding="utf-8") as f:
                    f.write(f"<html><body><pre>{all_text}</pre></body></html>")

                # Convert HTML to PDF, then remove the temporary file
                pdfkit.from_file(html_path, pdf_filename)
                os.remove(html_path)

                pdf_files.append(pdf_filename)

        return pdf_files  # Return the list of generated PDF paths
    except Exception as e:
        # The output component is a gr.File, so log the error and return no files
        print(f"Error: {str(e)}")
        return None
# Gradio UI with link selection
def show_links_and_generate_pdfs(url):
    # Extract links from the URL and populate the checkbox group with them
    message, links = extract_links(url)
    if not links:
        return message, gr.update(choices=[], value=[])
    return message, gr.update(choices=links, value=[])

iface = gr.Blocks()

with iface:
    gr.Markdown("### Web Scraper & PDF Generator")
    gr.Markdown("Enter a website URL to extract internal links, then select which links to convert into PDFs (4 links per PDF).")

    url_input = gr.Textbox(label="Enter Website URL")
    extract_btn = gr.Button("Extract Links")
    message_output = gr.Markdown("")
    link_selector = gr.CheckboxGroup([], label="Select Links to Convert", interactive=True)

    generate_btn = gr.Button("Generate PDFs")
    # file_count="multiple" lets the component accept the list of PDFs returned above
    pdf_output = gr.File(label="Download Generated PDFs", file_count="multiple")

    extract_btn.click(show_links_and_generate_pdfs, inputs=url_input, outputs=[message_output, link_selector])
    generate_btn.click(scrape_and_generate_pdfs, inputs=link_selector, outputs=pdf_output)

iface.launch()