import gradio as gr
import requests
from bs4 import BeautifulSoup
import os
import tempfile
from urllib.parse import urljoin
from docx import Document  # python-docx, for Word file generation


# Extract all links from a website
def extract_links(url):
    try:
        response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
        if response.status_code != 200:
            return f"Error: Unable to fetch page (Status Code {response.status_code})", []

        soup = BeautifulSoup(response.text, "html.parser")
        links = []
        for a_tag in soup.find_all("a", href=True):
            # Resolve relative links against the page URL
            href = urljoin(url, a_tag["href"])
            # Keep only http(s) links (skips mailto:, javascript:, fragments, etc.)
            if href.startswith(("http://", "https://")):
                links.append(href)

        links = sorted(set(links))  # Remove duplicates, stable order
        if not links:
            return "No links found on the website.", []
        return f"✅ {len(links)} links found! Select which ones to convert into Word files:", links
    except Exception as e:
        return f"Error: {e}", []


# Strip boilerplate (headers, footers, navigation, scripts, styles) and return plain text
def clean_content(soup):
    for tag in soup.find_all(["header", "footer", "nav", "aside", "script", "style"]):
        tag.decompose()  # Remove the tag and its contents completely
    return soup.get_text(separator="\n", strip=True)


# Scrape the selected links and generate Word files, 4 links per file
def scrape_and_generate_word(selected_links):
    if not selected_links:
        return None  # Nothing selected; the File output accepts None

    try:
        output_dir = tempfile.mkdtemp()  # Avoid clobbering files between runs
        word_files = []
        batch_size = 4  # Each Word file contains up to 4 links

        for i in range(0, len(selected_links), batch_size):
            batch_links = selected_links[i:i + batch_size]
            doc = Document()
            for link in batch_links:
                try:
                    response = requests.get(link, headers={"User-Agent": "Mozilla/5.0"})
                    if response.status_code == 200:
                        soup = BeautifulSoup(response.text, "html.parser")
                        page_text = clean_content(soup)
                        doc.add_heading(f"Content from: {link}", level=1)  # Title for each link
                        doc.add_paragraph(page_text)
                        doc.add_page_break()  # Keep each page's content separate
                    else:
                        doc.add_paragraph(f"Failed to fetch {link} (Status Code {response.status_code})")
                except Exception:
                    doc.add_paragraph(f"Failed to fetch content from {link}")

            word_filename = os.path.join(output_dir, f"output_{(i // batch_size) + 1}.docx")
            doc.save(word_filename)
            word_files.append(word_filename)

        return word_files  # List of generated Word files
    except Exception:
        return None  # word_output is a File component, so return None on failure


# Populate the checkbox group after extracting links
def show_links(url):
    message, links = extract_links(url)
    return message, gr.update(choices=links, value=[])


iface = gr.Blocks()
with iface:
    gr.Markdown("### 🌐 Web Scraper & Word Document Generator")
    gr.Markdown(
        "Enter a website URL to extract its links, then select which links "
        "to convert into Word files (4 links per file)."
    )

    url_input = gr.Textbox(label="Enter Website URL")
    extract_btn = gr.Button("Extract Links")
    message_output = gr.Markdown("")
    link_selector = gr.CheckboxGroup(choices=[], label="Select Links to Convert", interactive=True)
    generate_btn = gr.Button("Generate Word Files")
    word_output = gr.File(label="Download Generated Word Files", file_count="multiple")

    extract_btn.click(show_links, inputs=url_input, outputs=[message_output, link_selector])
    generate_btn.click(scrape_and_generate_word, inputs=link_selector, outputs=word_output)

iface.launch()
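

# --- Optional: restrict extraction to internal links (a minimal sketch) ---
# extract_links() keeps every absolute http(s) link it finds, external ones
# included. If you only want links on the same domain as the page you scraped,
# a filter like the hypothetical helper below would work; you would define it
# above extract_links() and apply it to `links` there (it sits here only for
# illustration, since iface.launch() above blocks until the app is closed).
from urllib.parse import urlparse


def keep_internal_links(links, page_url):
    """Return only the links whose host matches the host of page_url."""
    host = urlparse(page_url).netloc
    return [link for link in links if urlparse(link).netloc == host]

# Example:
# keep_internal_links(["https://example.com/a", "https://other.org/b"],
#                     "https://example.com")
# -> ["https://example.com/a"]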