import gradio as gr
import requests
from bs4 import BeautifulSoup
import pdfkit
import os
import html
from urllib.parse import urljoin
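
# Note: pdfkit is a thin wrapper around the wkhtmltopdf command-line tool,
# which must be installed separately and available on PATH for the
# HTML-to-PDF conversion below to work.
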
# Function to extract all links from a website
def extract_links(url):
    try:
        response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=15)
        if response.status_code != 200:
            return f"Error: Unable to fetch page (Status Code {response.status_code})", []

        soup = BeautifulSoup(response.text, "html.parser")

        links = []
        for a_tag in soup.find_all("a", href=True):
            # Resolve relative links ("/about", "../docs", etc.) against the page URL
            href = urljoin(url, a_tag["href"])
            if href.startswith(("http://", "https://")):  # Skip mailto:, javascript:, and similar schemes
                links.append(href)

        links = sorted(set(links))  # Remove duplicates and keep a stable order
        if not links:
            return "No links found on the website.", []

        return f"βœ… {len(links)} links found! Select which ones to convert into PDFs:", links

    except Exception as e:
        return f"Error: {str(e)}", []

# Function to clean unwanted content (like headers, footers, etc.)
def clean_content(soup):
    # Remove common unwanted elements
    for tag in soup.find_all(["header", "footer", "nav", "aside", "script", "style"]):
        tag.decompose()  # Remove the tag completely
    
    # You can also remove specific classes or IDs if necessary, for example:
    # for tag in soup.find_all(attrs={"class": "footer"}):
    #     tag.decompose()
    
    # Get the cleaned text from the remaining content
    return soup.get_text(separator="\n", strip=True)
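
# For example (illustrative): given soup built from
# "<nav>Menu</nav><p>Hello</p><footer>Legal</footer>",
# clean_content(soup) returns just "Hello".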

# Function to scrape selected links and generate PDFs
def scrape_and_generate_pdfs(selected_links):
    if not selected_links:
        # The output is a File component, so report problems via gr.Error
        # instead of returning a message string
        raise gr.Error("No links selected.")

    try:
        pdf_files = []
        batch_size = 4  # Each PDF contains up to 4 links

        # Process selected links in batches of 4
        for i in range(0, len(selected_links), batch_size):
            batch_links = selected_links[i:i + batch_size]
            all_text = ""

            # Scrape text content from each selected link
            for link in batch_links:
                try:
                    response = requests.get(link, headers={"User-Agent": "Mozilla/5.0"}, timeout=15)
                    if response.status_code == 200:
                        soup = BeautifulSoup(response.text, "html.parser")
                        page_text = clean_content(soup)
                        all_text += f"--- Content from: {link} ---\n\n" + page_text + "\n\n"
                except requests.RequestException:
                    all_text += f"Failed to fetch content from {link}\n\n"

            if all_text:
                pdf_filename = f"output_{(i // batch_size) + 1}.pdf"

                # Save as a temporary HTML file; escape the scraped text so
                # stray "<" or "&" characters cannot break the markup
                html_path = f"temp_{i}.html"
                with open(html_path, "w", encoding="utf-8") as f:
                    f.write(f"<html><body><pre>{html.escape(all_text)}</pre></body></html>")

                # Convert HTML to PDF, then remove the temporary file
                pdfkit.from_file(html_path, pdf_filename)
                os.remove(html_path)

                pdf_files.append(pdf_filename)

        return pdf_files  # Return the list of generated PDF paths

    except Exception as e:
        raise gr.Error(f"PDF generation failed: {e}")

# Gradio UI helper: extract links and populate the checkbox group
def show_links(url):
    message, links = extract_links(url)
    # gr.update replaces the CheckboxGroup choices and clears any previous
    # selection; an empty `links` list simply resets the component
    return message, gr.update(choices=links, value=[])

iface = gr.Blocks()

with iface:
    gr.Markdown("### 🌐 Web Scraper & PDF Generator")
    gr.Markdown("Enter a website URL to extract internal links, then select which links to convert into PDFs (4 links per PDF).")

    url_input = gr.Textbox(label="Enter Website URL")
    extract_btn = gr.Button("Extract Links")

    message_output = gr.Markdown("")
    link_selector = gr.CheckboxGroup([], label="Select Links to Convert", interactive=True)
    generate_btn = gr.Button("Generate PDFs")
    
    pdf_output = gr.File(label="Download Generated PDFs", file_count="multiple")

    extract_btn.click(show_links, inputs=url_input, outputs=[message_output, link_selector])
    generate_btn.click(scrape_and_generate_pdfs, inputs=link_selector, outputs=pdf_output)

iface.launch()
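
# By default launch() serves the app locally at http://127.0.0.1:7860;
# pass share=True to launch() for a temporary public link.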