import gradio as gr
import requests
from bs4 import BeautifulSoup
import os
import math
from docx import Document # Import for Word file generation
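# Overview (summary of the UI defined below): the user enters a website URL,
# the app lists the links found on that page, and the selected links are
# scraped and saved into downloadable Word documents, up to 4 links per .docx.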
# Function to extract all links from a website
def extract_links(url):
    try:
        response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
        if response.status_code != 200:
            return f"Error: Unable to fetch page (Status Code {response.status_code})", []

        soup = BeautifulSoup(response.text, "html.parser")
        base_url = "/".join(url.split("/")[:3])  # Extract base domain (scheme + host)
        links = []
        for a_tag in soup.find_all("a", href=True):
            href = a_tag["href"]
            if not href.startswith("http"):  # Convert relative links to absolute
                href = base_url + href if href.startswith("/") else base_url + "/" + href
            links.append(href)

        links = list(set(links))  # Remove duplicates
        if not links:
            return "No links found on the website.", []

        return f"{len(links)} links found! Select which ones to convert into Word files:", links
    except Exception as e:
        return f"Error: {str(e)}", []
# Function to clean unwanted content (like headers, footers, etc.)
def clean_content(soup):
    for tag in soup.find_all(["header", "footer", "nav", "aside", "script", "style"]):
        tag.decompose()  # Remove the tag and its contents completely
    return soup.get_text(separator="\n", strip=True)
# Function to scrape selected links and generate Word files
def scrape_and_generate_word(selected_links):
    try:
        if not selected_links:
            return None  # Nothing selected, so there is no file to return

        word_files = []
        batch_size = 4  # Each Word file contains up to 4 links
        for i in range(0, len(selected_links), batch_size):
            batch_links = selected_links[i:i + batch_size]
            doc = Document()
            for link in batch_links:
                try:
                    response = requests.get(link, headers={"User-Agent": "Mozilla/5.0"})
                    if response.status_code == 200:
                        soup = BeautifulSoup(response.text, "html.parser")
                        page_text = clean_content(soup)
                        # Add a heading for each link, followed by its cleaned text
                        doc.add_heading(f"Content from: {link}", level=1)
                        doc.add_paragraph(page_text)
                        doc.add_page_break()  # Start the next link's content on a new page
                except Exception:
                    doc.add_paragraph(f"Failed to fetch content from {link}\n\n")

            # Save the Word file for this batch
            word_filename = f"output_{(i // batch_size) + 1}.docx"
            doc.save(word_filename)
            word_files.append(word_filename)

        return word_files  # Return list of generated Word files
    except Exception:
        return None  # On unexpected errors, leave the File output empty
# Gradio UI with link selection
def show_links_and_generate_word(url):
    message, links = extract_links(url)
    if not links:
        return message, gr.update(choices=[], value=[])
    return message, gr.update(choices=links, value=[])
iface = gr.Blocks()

with iface:
    gr.Markdown("### Web Scraper & Word Document Generator")
    gr.Markdown("Enter a website URL to extract internal links, then select which links to convert into Word files (4 links per file).")

    url_input = gr.Textbox(label="Enter Website URL")
    extract_btn = gr.Button("Extract Links")
    message_output = gr.Markdown("")
    link_selector = gr.CheckboxGroup([], label="Select Links to Convert", interactive=True)
    generate_btn = gr.Button("Generate Word Files")
    word_output = gr.File(label="Download Generated Word Files")

    extract_btn.click(show_links_and_generate_word, inputs=url_input, outputs=[message_output, link_selector])
    generate_btn.click(scrape_and_generate_word, inputs=link_selector, outputs=word_output)

iface.launch()
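# Dependency note: based on the imports above, this app relies on gradio,
# requests, beautifulsoup4 (bs4), and python-docx; in a Hugging Face Space
# these are usually listed in the Space's requirements.txt.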