import gradio as gr
import requests
from bs4 import BeautifulSoup
import pdfkit
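# Note: pdfkit is a wrapper around the wkhtmltopdf CLI, so the wkhtmltopdf binary
# must be installed on the system (e.g. via apt, or packages.txt on a Hugging Face Space).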
import os

# Function to extract all links from a website
def extract_links(url):
    try:
        response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
        if response.status_code != 200:
            return f"Error: Unable to fetch page (Status Code {response.status_code})", []

        soup = BeautifulSoup(response.text, "html.parser")
        base_url = "/".join(url.split("/")[:3])  # Extract base domain

        links = []
        for a_tag in soup.find_all("a", href=True):
            href = a_tag["href"]
            if not href.startswith("http"):  # Convert relative links to absolute
                href = base_url + href if href.startswith("/") else base_url + "/" + href
            links.append(href)

        links = list(set(links))  # Remove duplicates
        if not links:
            return "No links found on the website.", []

        return f"{len(links)} links found! Select which ones to convert into PDFs:", links
    except Exception as e:
        return f"Error: {str(e)}", []

# Function to clean unwanted content (like headers, footers, etc.)
def clean_content(soup):
    # Remove common unwanted elements
    for tag in soup.find_all(["header", "footer", "nav", "aside", "script", "style"]):
        tag.decompose()  # Remove the tag completely

    # You can also remove specific classes or IDs if necessary, for example:
    # for tag in soup.find_all(attrs={"class": "footer"}):
    #     tag.decompose()

    # Get the cleaned text from the remaining content
    return soup.get_text(separator="\n", strip=True)

# Function to scrape selected links and generate PDFs
def scrape_and_generate_pdfs(selected_links):
    try:
        if not selected_links:
            return None  # Nothing selected, so no files to return

        pdf_files = []
        batch_size = 4  # Each PDF contains up to 4 links

        # Process selected links in batches of 4
        for i in range(0, len(selected_links), batch_size):
            batch_links = selected_links[i:i + batch_size]
            all_text = ""

            # Scrape text content from each selected link
            for link in batch_links:
                try:
                    response = requests.get(link, headers={"User-Agent": "Mozilla/5.0"})
                    if response.status_code == 200:
                        soup = BeautifulSoup(response.text, "html.parser")
                        page_text = clean_content(soup)
                        all_text += f"--- Content from: {link} ---\n\n" + page_text + "\n\n"
                except Exception:
                    all_text += f"Failed to fetch content from {link}\n\n"

            if all_text:
                pdf_filename = f"output_{(i // batch_size) + 1}.pdf"

                # Save as a temporary HTML file, convert it to PDF, then clean up
                html_path = f"temp_{i}.html"
                with open(html_path, "w", encoding="utf-8") as f:
                    f.write(f"<html><body><pre>{all_text}</pre></body></html>")

                pdfkit.from_file(html_path, pdf_filename)
                os.remove(html_path)
                pdf_files.append(pdf_filename)

        return pdf_files  # Return list of generated PDF paths
    except Exception as e:
        print(f"Error: {str(e)}")
        return None

# Gradio UI with link selection
def show_links_and_generate_pdfs(url):
    message, links = extract_links(url)
    if not links:
        return message, gr.update(choices=[], value=[])
    return message, gr.update(choices=links, value=[])

iface = gr.Blocks()

with iface:
    gr.Markdown("### Web Scraper & PDF Generator")
    gr.Markdown("Enter a website URL to extract its links, then select which links to convert into PDFs (4 links per PDF).")

    url_input = gr.Textbox(label="Enter Website URL")
    extract_btn = gr.Button("Extract Links")
    message_output = gr.Markdown("")
    link_selector = gr.CheckboxGroup([], label="Select Links to Convert", interactive=True)
    generate_btn = gr.Button("Generate PDFs")
    pdf_output = gr.File(label="Download Generated PDFs", file_count="multiple")

    extract_btn.click(show_links_and_generate_pdfs, inputs=url_input, outputs=[message_output, link_selector])
    generate_btn.click(scrape_and_generate_pdfs, inputs=link_selector, outputs=pdf_output)

iface.launch()
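
# launch() starts the Gradio server and blocks; on a Hugging Face Space the app is served
# automatically when this script runs, and locally you can pass share=True
# (i.e. iface.launch(share=True)) to get a temporary public URL.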