import gradio as gr
import requests
from bs4 import BeautifulSoup
import pdfkit
import html
import os


# Function to extract all links from a website
def extract_links(url):
    try:
        response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
        if response.status_code != 200:
            return f"Error: Unable to fetch page (Status Code {response.status_code})", []

        soup = BeautifulSoup(response.text, "html.parser")
        base_url = "/".join(url.split("/")[:3])  # Extract base domain (scheme + host)

        links = []
        for a_tag in soup.find_all("a", href=True):
            href = a_tag["href"]
            if not href.startswith("http"):  # Convert relative links to absolute
                href = base_url + href if href.startswith("/") else base_url + "/" + href
            links.append(href)

        links = list(set(links))  # Remove duplicates

        if not links:
            return "No links found on the website.", []

        return f"✅ {len(links)} links found! Select which ones to convert into PDFs:", links
    except Exception as e:
        return f"Error: {str(e)}", []


# Function to clean unwanted content (like headers, footers, etc.)
def clean_content(soup):
    # Remove common unwanted elements
    for tag in soup.find_all(["header", "footer", "nav", "aside", "script", "style"]):
        tag.decompose()  # Remove the tag completely

    # You can also remove specific classes or IDs if necessary, for example:
    # for tag in soup.find_all(attrs={"class": "footer"}):
    #     tag.decompose()

    # Get the cleaned text from the remaining content
    return soup.get_text(separator="\n", strip=True)


# Function to scrape selected links and generate PDFs
def scrape_and_generate_pdfs(selected_links):
    try:
        if not selected_links:
            return None  # Nothing selected, so nothing to hand to the File component

        pdf_files = []
        batch_size = 4  # Each PDF contains up to 4 links

        # Process selected links in batches of 4
        for i in range(0, len(selected_links), batch_size):
            batch_links = selected_links[i:i + batch_size]
            all_text = ""

            # Scrape text content from each selected link
            for link in batch_links:
                try:
                    response = requests.get(link, headers={"User-Agent": "Mozilla/5.0"})
                    if response.status_code == 200:
                        soup = BeautifulSoup(response.text, "html.parser")
                        page_text = clean_content(soup)
                        all_text += f"--- Content from: {link} ---\n\n" + page_text + "\n\n"
                except requests.RequestException:
                    all_text += f"Failed to fetch content from {link}\n\n"

            if all_text:
                pdf_filename = f"output_{(i // batch_size) + 1}.pdf"

                # Save as a temporary HTML file (escape the text so stray <, >, & don't break the markup)
                html_path = f"temp_{i}.html"
                with open(html_path, "w", encoding="utf-8") as f:
                    f.write(
                        "<html><head><meta charset='utf-8'></head>"
                        f"<body><pre>{html.escape(all_text)}</pre></body></html>"
                    )

                # Convert HTML to PDF
                pdfkit.from_file(html_path, pdf_filename)
                os.remove(html_path)

                pdf_files.append(pdf_filename)

        return pdf_files  # Return list of generated PDFs
    except Exception as e:
        print(f"Error: {str(e)}")
        return None


# Gradio UI with link selection
def show_links_and_generate_pdfs(url):
    message, links = extract_links(url)
    if not links:
        return message, gr.update(choices=[], value=[])
    return message, gr.update(choices=links, value=[])


iface = gr.Blocks()

with iface:
    gr.Markdown("### 🌐 Web Scraper & PDF Generator")
    gr.Markdown("Enter a website URL to extract internal links, then select which links to convert into PDFs (4 links per PDF).")

    url_input = gr.Textbox(label="Enter Website URL")
    extract_btn = gr.Button("Extract Links")
    message_output = gr.Markdown("")
    link_selector = gr.CheckboxGroup([], label="Select Links to Convert", interactive=True)
    generate_btn = gr.Button("Generate PDFs")
    pdf_output = gr.File(label="Download Generated PDFs", file_count="multiple")

    extract_btn.click(show_links_and_generate_pdfs, inputs=url_input, outputs=[message_output, link_selector])
    generate_btn.click(scrape_and_generate_pdfs, inputs=link_selector, outputs=pdf_output)

iface.launch()