import gradio as gr
import requests
from bs4 import BeautifulSoup
import pdfkit
import os
import html
from urllib.parse import urljoin, urlparse
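
# NOTE: pdfkit is a thin wrapper around the wkhtmltopdf command-line tool; the
# wkhtmltopdf binary must be installed on the host (e.g. via the system package
# manager), otherwise pdfkit.from_file() raises an OSError.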


# Function to extract all internal links from a website
def extract_links(url):
    try:
        response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
        if response.status_code != 200:
            return f"Error: Unable to fetch page (Status Code {response.status_code})", []

        soup = BeautifulSoup(response.text, "html.parser")
        base_domain = urlparse(url).netloc  # Domain used to identify internal links

        links = []
        for a_tag in soup.find_all("a", href=True):
            # Resolve relative links to absolute against the page URL
            href = urljoin(url, a_tag["href"])
            # Keep only http(s) links on the same domain
            if urlparse(href).scheme in ("http", "https") and urlparse(href).netloc == base_domain:
                links.append(href)

        links = sorted(set(links))  # Remove duplicates

        if not links:
            return "No links found on the website.", []

        return f"✅ {len(links)} links found! Select which ones to convert into PDFs:", links
    except Exception as e:
        return f"Error: {str(e)}", []


# Function to clean unwanted content (like headers, footers, etc.)
def clean_content(soup):
    # Remove common unwanted elements
    for tag in soup.find_all(["header", "footer", "nav", "aside", "script", "style"]):
        tag.decompose()  # Remove the tag completely

    # You can also remove specific classes or IDs if necessary, for example:
    # for tag in soup.find_all(attrs={"class": "footer"}):
    #     tag.decompose()
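    # Or by element id (the id "sidebar" below is a hypothetical placeholder;
    # adjust it to the site being scraped):
    # for tag in soup.find_all(id="sidebar"):
    #     tag.decompose()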

    # Get the cleaned text from the remaining content
    return soup.get_text(separator="\n", strip=True)


# Function to scrape selected links and generate PDFs
def scrape_and_generate_pdfs(selected_links):
    try:
        if not selected_links:
            return None  # The UI has a single file output, so return one value

        pdf_files = []
        batch_size = 4  # Each PDF contains up to 4 links

        # Process selected links in batches of 4
        for i in range(0, len(selected_links), batch_size):
            batch_links = selected_links[i:i + batch_size]
            all_text = ""

            # Scrape text content from each selected link
            for link in batch_links:
                try:
                    response = requests.get(link, headers={"User-Agent": "Mozilla/5.0"})
                    if response.status_code == 200:
                        soup = BeautifulSoup(response.text, "html.parser")
                        page_text = clean_content(soup)
                        all_text += f"--- Content from: {link} ---\n\n" + page_text + "\n\n"
                except requests.RequestException:
                    all_text += f"Failed to fetch content from {link}\n\n"

            if all_text:
                pdf_filename = f"output_{(i // batch_size) + 1}.pdf"

                # Save as a temporary HTML file; escape the scraped text so stray
                # "<" or "&" characters cannot break the markup
                html_path = f"temp_{i}.html"
                with open(html_path, "w", encoding="utf-8") as f:
                    f.write(f"<html><body><pre>{html.escape(all_text)}</pre></body></html>")

                # Convert HTML to PDF, then clean up the temporary file
                pdfkit.from_file(html_path, pdf_filename)
                os.remove(html_path)

                pdf_files.append(pdf_filename)

        return pdf_files  # Return list of generated PDFs
    except Exception as e:
        print(f"Error: {e}")  # Log the error; the file output stays empty
        return None


# Gradio UI with link selection
def show_links_and_generate_pdfs(url):
    message, links = extract_links(url)
    if not links:
        return message, gr.update(choices=[], value=[])
    return message, gr.update(choices=links, value=[])


iface = gr.Blocks()

with iface:
    gr.Markdown("### 🌐 Web Scraper & PDF Generator")
    gr.Markdown("Enter a website URL to extract internal links, then select which links to convert into PDFs (4 links per PDF).")

    url_input = gr.Textbox(label="Enter Website URL")
    extract_btn = gr.Button("Extract Links")
    message_output = gr.Markdown("")
    link_selector = gr.CheckboxGroup(choices=[], label="Select Links to Convert", interactive=True)
    generate_btn = gr.Button("Generate PDFs")
    pdf_output = gr.File(label="Download Generated PDFs")

    extract_btn.click(show_links_and_generate_pdfs, inputs=url_input, outputs=[message_output, link_selector])
    generate_btn.click(scrape_and_generate_pdfs, inputs=link_selector, outputs=pdf_output)
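
# Launch the app; locally Gradio serves on its default http://127.0.0.1:7860,
# while Hugging Face Spaces handles hosting automatically.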
iface.launch()