Mishal23 committed
Commit d64f230 · verified · 1 Parent(s): 46416f9

Create app.py

Files changed (1):
  1. app.py +118 -0
app.py ADDED
@@ -0,0 +1,118 @@
+ import gradio as gr
+ import requests
+ from bs4 import BeautifulSoup
+ import pdfkit
+ import os
+ import html
+ from urllib.parse import urljoin
+
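+ # Note: pdfkit is a thin wrapper around the external wkhtmltopdf binary;
+ # it must be installed separately (e.g. via the system package manager,
+ # or packages.txt on a Hugging Face Space) or PDF conversion will fail.
+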
+ # Function to extract all links from a website
+ def extract_links(url):
+     try:
+         response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
+         if response.status_code != 200:
+             return f"Error: Unable to fetch page (Status Code {response.status_code})", []
+
+         soup = BeautifulSoup(response.text, "html.parser")
+
+         links = []
+         for a_tag in soup.find_all("a", href=True):
+             # Resolve relative hrefs against the page URL
+             href = urljoin(url, a_tag["href"])
+             if href.startswith(("http://", "https://")):  # Skip mailto:, javascript:, etc.
+                 links.append(href)
+
+         links = sorted(set(links))  # Remove duplicates, keep a stable order
+         if not links:
+             return "No links found on the website.", []
+
+         return f"✅ {len(links)} links found! Select which ones to convert into PDFs:", links
+
+     except Exception as e:
+         return f"Error: {str(e)}", []
+
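+ # Note on the urljoin call above: it resolves hrefs like "/about",
+ # "page.html", or "../pricing" against the page URL, e.g.
+ # urljoin("https://example.com/docs/", "../pricing") -> "https://example.com/pricing",
+ # which plain string concatenation with the base domain would mishandle
+ # for nested relative paths.
+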
+ # Function to clean unwanted content (like headers, footers, etc.)
+ def clean_content(soup):
+     # Remove common unwanted elements
+     for tag in soup.find_all(["header", "footer", "nav", "aside", "script", "style"]):
+         tag.decompose()  # Remove the tag completely
+
+     # You can also remove specific classes or IDs if necessary, for example:
+     # for tag in soup.find_all(attrs={"class": "footer"}):
+     #     tag.decompose()
+
+     # Get the cleaned text from the remaining content
+     return soup.get_text(separator="\n", strip=True)
+
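+ # get_text(separator="\n", strip=True) flattens whatever remains in the
+ # DOM into plain text, one stripped string per text node joined by
+ # newlines; that text is what ends up inside the generated PDFs.
+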
+ # Function to scrape selected links and generate PDFs
+ def scrape_and_generate_pdfs(selected_links):
+     try:
+         if not selected_links:
+             return None
+
+         pdf_files = []
+         batch_size = 4  # Each PDF contains up to 4 links
+
+         # Process selected links in batches of 4
+         for i in range(0, len(selected_links), batch_size):
+             batch_links = selected_links[i:i + batch_size]
+             all_text = ""
+
+             # Scrape text content from each selected link
+             for link in batch_links:
+                 try:
+                     response = requests.get(link, headers={"User-Agent": "Mozilla/5.0"}, timeout=15)
+                     if response.status_code == 200:
+                         soup = BeautifulSoup(response.text, "html.parser")
+                         page_text = clean_content(soup)
+                         all_text += f"--- Content from: {link} ---\n\n" + page_text + "\n\n"
+                 except requests.RequestException:
+                     all_text += f"Failed to fetch content from {link}\n\n"
+
+             if all_text:
+                 pdf_filename = f"output_{(i // batch_size) + 1}.pdf"
+
+                 # Save as a temporary HTML file, escaping the scraped text so
+                 # stray "<" or "&" characters cannot break the markup
+                 html_path = f"temp_{i}.html"
+                 with open(html_path, "w", encoding="utf-8") as f:
+                     f.write(f"<html><body><pre>{html.escape(all_text)}</pre></body></html>")
+
+                 # Convert HTML to PDF
+                 pdfkit.from_file(html_path, pdf_filename)
+                 os.remove(html_path)
+
+                 pdf_files.append(pdf_filename)
+
+         return pdf_files  # Return list of generated PDFs
+
+     except Exception as e:
+         # Surface the failure in the UI instead of returning a (str, None)
+         # tuple, which the gr.File output could not render
+         raise gr.Error(f"PDF generation failed: {e}")
+
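+ # Caveat: temp_{i}.html and output_N.pdf are written to the working
+ # directory with names derived only from the batch index, so two users
+ # generating PDFs at the same time could overwrite each other's files;
+ # unique names (e.g. via tempfile.mkstemp) would avoid that.
+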
+ # Gradio callback: extract links and populate the checkbox group
+ def show_links(url):
+     message, links = extract_links(url)
+     return message, gr.update(choices=links, value=[])
+
+ iface = gr.Blocks()
+
+ with iface:
+     gr.Markdown("### 🌐 Web Scraper & PDF Generator")
+     gr.Markdown("Enter a website URL to extract its links, then select which links to convert into PDFs (4 links per PDF).")
+
+     url_input = gr.Textbox(label="Enter Website URL")
+     extract_btn = gr.Button("Extract Links")
+
+     message_output = gr.Markdown("")
+     link_selector = gr.CheckboxGroup([], label="Select Links to Convert", interactive=True)
+     generate_btn = gr.Button("Generate PDFs")
+
+     # file_count="multiple" lets the component list every generated PDF
+     pdf_output = gr.File(label="Download Generated PDFs", file_count="multiple")
+
+     extract_btn.click(show_links, inputs=url_input, outputs=[message_output, link_selector])
+     generate_btn.click(scrape_and_generate_pdfs, inputs=link_selector, outputs=pdf_output)
+
+ iface.launch()
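+
+ # To run locally: install the Python packages imported above plus the
+ # wkhtmltopdf binary noted at the top, then `python app.py` and open the
+ # printed URL (Gradio defaults to http://127.0.0.1:7860).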