import gradio as gr
import requests
from bs4 import BeautifulSoup
import pdfkit
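# Note: pdfkit is a wrapper around the wkhtmltopdf CLI, so the wkhtmltopdf binary
# must be installed on the system (e.g. via apt, or packages.txt on a Hugging Face Space).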
import os

# Function to extract all links from a website
def extract_links(url):
    try:
        response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
        if response.status_code != 200:
            return f"Error: Unable to fetch page (Status Code {response.status_code})", []

        soup = BeautifulSoup(response.text, "html.parser")
        base_url = "/".join(url.split("/")[:3])  # Extract base domain

        links = []
        for a_tag in soup.find_all("a", href=True):
            href = a_tag["href"]
            if not href.startswith("http"):  # Convert relative links to absolute
                href = base_url + href if href.startswith("/") else base_url + "/" + href
            links.append(href)

        links = list(set(links))  # Remove duplicates
        if not links:
            return "No links found on the website.", []

        return f"{len(links)} links found! Select which ones to convert into PDFs:", links
    except Exception as e:
        return f"Error: {str(e)}", []

# Function to clean unwanted content (like headers, footers, etc.)
def clean_content(soup):
    # Remove common unwanted elements
    for tag in soup.find_all(["header", "footer", "nav", "aside", "script", "style"]):
        tag.decompose()  # Remove the tag completely

    # You can also remove specific classes or IDs if necessary, for example:
    # for tag in soup.find_all(attrs={"class": "footer"}):
    #     tag.decompose()

    # Get the cleaned text from the remaining content
    return soup.get_text(separator="\n", strip=True)

# Function to scrape selected links and generate PDFs
def scrape_and_generate_pdfs(selected_links):
    try:
        if not selected_links:
            return None  # Nothing selected, so no files to return

        pdf_files = []
        batch_size = 4  # Each PDF contains up to 4 links

        # Process selected links in batches of 4
        for i in range(0, len(selected_links), batch_size):
            batch_links = selected_links[i:i + batch_size]
            all_text = ""

            # Scrape text content from each selected link
            for link in batch_links:
                try:
                    response = requests.get(link, headers={"User-Agent": "Mozilla/5.0"})
                    if response.status_code == 200:
                        soup = BeautifulSoup(response.text, "html.parser")
                        page_text = clean_content(soup)
                        all_text += f"--- Content from: {link} ---\n\n" + page_text + "\n\n"
                except Exception:
                    all_text += f"Failed to fetch content from {link}\n\n"

            if all_text:
                pdf_filename = f"output_{(i // batch_size) + 1}.pdf"

                # Save as a temporary HTML file, convert it to PDF, then clean up
                html_path = f"temp_{i}.html"
                with open(html_path, "w", encoding="utf-8") as f:
                    f.write(f"<html><body><pre>{all_text}</pre></body></html>")

                pdfkit.from_file(html_path, pdf_filename)
                os.remove(html_path)
                pdf_files.append(pdf_filename)

        return pdf_files  # Return list of generated PDF paths
    except Exception as e:
        print(f"Error: {str(e)}")
        return None

# Gradio UI with link selection
def show_links_and_generate_pdfs(url):
    message, links = extract_links(url)
    if not links:
        return message, gr.update(choices=[], value=[])
    return message, gr.update(choices=links, value=[])

iface = gr.Blocks()

with iface:
    gr.Markdown("### Web Scraper & PDF Generator")
    gr.Markdown("Enter a website URL to extract its links, then select which links to convert into PDFs (4 links per PDF).")

    url_input = gr.Textbox(label="Enter Website URL")
    extract_btn = gr.Button("Extract Links")
    message_output = gr.Markdown("")
    link_selector = gr.CheckboxGroup([], label="Select Links to Convert", interactive=True)
    generate_btn = gr.Button("Generate PDFs")
    pdf_output = gr.File(label="Download Generated PDFs", file_count="multiple")

    extract_btn.click(show_links_and_generate_pdfs, inputs=url_input, outputs=[message_output, link_selector])
    generate_btn.click(scrape_and_generate_pdfs, inputs=link_selector, outputs=pdf_output)

iface.launch()
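
# launch() starts the Gradio server and blocks; on a Hugging Face Space the app is served
# automatically when this script runs, and locally you can pass share=True
# (i.e. iface.launch(share=True)) to get a temporary public URL.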