import gradio as gr
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin  # For resolving relative links to absolute URLs
from docx import Document  # For Word file generation
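# Beyond the standard library, this script assumes the packages gradio,
# requests, beautifulsoup4, and python-docx are installed, e.g.:
#   pip install gradio requests beautifulsoup4 python-docx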

# Function to extract all links from a website
def extract_links(url):
    try:
        response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=30)
        if response.status_code != 200:
            return f"Error: Unable to fetch page (Status Code {response.status_code})", []

        soup = BeautifulSoup(response.text, "html.parser")

        links = []
        for a_tag in soup.find_all("a", href=True):
            # Resolve relative links against the page URL; skip non-HTTP
            # schemes such as mailto: and javascript:
            href = urljoin(url, a_tag["href"])
            if href.startswith(("http://", "https://")):
                links.append(href)

        links = list(set(links))  # Remove duplicates

        if not links:
            return "No links found on the website.", []

        return f"✅ {len(links)} links found! Select which ones to convert into Word files:", links
    except Exception as e:
        return f"Error: {str(e)}", []

# Function to clean unwanted content (headers, footers, navigation, etc.)
def clean_content(soup):
    for tag in soup.find_all(["header", "footer", "nav", "aside", "script", "style"]):
        tag.decompose()  # Remove the tag and its contents completely
    return soup.get_text(separator="\n", strip=True)
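# get_text(separator="\n", strip=True) then flattens what remains into plain
# text, one line per text fragment, with surrounding whitespace trimmed.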

# Function to scrape selected links and generate Word files
def scrape_and_generate_word(selected_links):
    if not selected_links:
        # Surface the problem in the UI; gr.Error renders as a modal message
        raise gr.Error("No links selected.")

    word_files = []
    batch_size = 4  # Each Word file contains up to 4 links

    for i in range(0, len(selected_links), batch_size):
        batch_links = selected_links[i:i + batch_size]
        doc = Document()

        for link in batch_links:
            try:
                response = requests.get(link, headers={"User-Agent": "Mozilla/5.0"}, timeout=30)
                if response.status_code == 200:
                    soup = BeautifulSoup(response.text, "html.parser")
                    page_text = clean_content(soup)

                    # Add a heading for each link, followed by its cleaned text
                    doc.add_heading(f"Content from: {link}", level=1)
                    doc.add_paragraph(page_text)
                    doc.add_page_break()  # Start the next link on a fresh page
                else:
                    doc.add_paragraph(f"Failed to fetch content from {link} (Status Code {response.status_code})\n\n")
            except Exception:
                doc.add_paragraph(f"Failed to fetch content from {link}\n\n")

        # Save the Word file for this batch
        word_filename = f"output_{(i // batch_size) + 1}.docx"
        doc.save(word_filename)
        word_files.append(word_filename)

    return word_files  # gr.File accepts a list of file paths

# Gradio callback: extract links and populate the checkbox group
def show_links_and_generate_word(url):
    message, links = extract_links(url)
    if not links:
        return message, gr.update(choices=[], value=[])
    return message, gr.update(choices=links, value=[])
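# gr.update(choices=..., value=[]) repopulates the CheckboxGroup in place and
# clears any previous selection.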

# Gradio UI with link selection
iface = gr.Blocks()

with iface:
    gr.Markdown("### 🌐 Web Scraper & Word Document Generator")
    gr.Markdown("Enter a website URL to extract its links, then select which ones to convert into Word files (4 links per file).")

    url_input = gr.Textbox(label="Enter Website URL")
    extract_btn = gr.Button("Extract Links")

    message_output = gr.Markdown("")
    link_selector = gr.CheckboxGroup([], label="Select Links to Convert", interactive=True)

    generate_btn = gr.Button("Generate Word Files")
    word_output = gr.File(label="Download Generated Word Files")

    extract_btn.click(show_links_and_generate_word, inputs=url_input, outputs=[message_output, link_selector])
    generate_btn.click(scrape_and_generate_word, inputs=link_selector, outputs=word_output)

iface.launch()
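# Assuming this file is saved as app.py, run it with `python app.py`;
# launch() serves the UI on http://127.0.0.1:7860 by default, and
# launch(share=True) additionally creates a temporary public link.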