File size: 3,945 Bytes
d64f230
 
 
 
 
dcfc8fd
 
d64f230
 
 
 
 
 
dcfc8fd
d64f230
 
dcfc8fd
d64f230
 
 
 
 
dcfc8fd
d64f230
 
 
 
 
dcfc8fd
 
d64f230
 
 
 
 
 
 
 
dcfc8fd
d64f230
 
dcfc8fd
 
d64f230
 
 
 
dcfc8fd
 
 
d64f230
 
dcfc8fd
d64f230
 
 
 
 
 
 
 
dcfc8fd
 
 
 
 
 
d64f230
dcfc8fd
 
 
 
d64f230
dcfc8fd
d64f230
 
 
 
 
dcfc8fd
d64f230
 
 
dcfc8fd
d64f230
 
 
 
 
dcfc8fd
 
d64f230
 
 
 
 
 
dcfc8fd
 
 
d64f230
dcfc8fd
 
d64f230
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import math
import os
from urllib.parse import urljoin

import gradio as gr
import requests
from bs4 import BeautifulSoup
from docx import Document  # Import for Word file generation

# Function to extract all links from a website
def extract_links(url):
    """Fetch *url* and collect every anchor link on the page.

    Args:
        url: Absolute URL of the page to scan.

    Returns:
        A ``(message, links)`` tuple: a human-readable status string and a
        de-duplicated list of absolute URLs (empty on any failure).
    """
    try:
        # timeout prevents a dead host from hanging the Gradio handler forever.
        response = requests.get(
            url, headers={"User-Agent": "Mozilla/5.0"}, timeout=15
        )
        if response.status_code != 200:
            return f"Error: Unable to fetch page (Status Code {response.status_code})", []

        soup = BeautifulSoup(response.text, "html.parser")

        # urljoin resolves relative hrefs ("../x", "page", "?q=1", "//host/p")
        # per RFC 3986 — naive base-URL string concatenation gets these wrong.
        links = list({
            urljoin(url, a_tag["href"])
            for a_tag in soup.find_all("a", href=True)
        })
        if not links:
            return "No links found on the website.", []

        return f"βœ… {len(links)} links found! Select which ones to convert into Word files:", links

    except Exception as e:
        # Broad catch is deliberate: any failure becomes a UI status message.
        return f"Error: {str(e)}", []

# Function to clean unwanted content (like headers, footers, etc.)
def clean_content(soup):
    """Strip boilerplate tags from *soup* and return the remaining visible text.

    Removes header/footer/nav/aside chrome plus script/style blocks in place,
    then flattens what is left to newline-separated, whitespace-trimmed text.
    """
    boilerplate = ["header", "footer", "nav", "aside", "script", "style"]
    for unwanted in soup.find_all(boilerplate):
        unwanted.decompose()  # drop the tag and its entire subtree
    return soup.get_text(separator="\n", strip=True)

# Function to scrape selected links and generate Word files
def scrape_and_generate_word(selected_links):
    """Scrape each selected link and bundle the extracted text into .docx files.

    Args:
        selected_links: List of URLs chosen in the UI; batched 4 per document.

    Returns:
        A list of generated .docx filenames on success.
        NOTE(review): the guard and outer-error paths return a
        ``(message, None)`` tuple instead of a list — the single ``gr.File``
        output likely mishandles that shape; confirm intended behavior before
        changing the interface.
    """
    try:
        if not selected_links:
            return "No links selected.", None

        word_files = []
        batch_size = 4  # Each Word file contains up to 4 links

        for i in range(0, len(selected_links), batch_size):
            batch_links = selected_links[i:i + batch_size]
            doc = Document()

            for link in batch_links:
                try:
                    # timeout keeps one dead host from stalling the whole batch.
                    response = requests.get(
                        link,
                        headers={"User-Agent": "Mozilla/5.0"},
                        timeout=15,
                    )
                    if response.status_code == 200:
                        soup = BeautifulSoup(response.text, "html.parser")
                        page_text = clean_content(soup)

                        # Add title for each link
                        doc.add_heading(f"Content from: {link}", level=1)
                        doc.add_paragraph(page_text)
                        doc.add_page_break()  # Ensure proper formatting
                # `except Exception` (not bare `except:`) keeps per-link
                # best-effort behavior without swallowing KeyboardInterrupt.
                except Exception:
                    doc.add_paragraph(f"Failed to fetch content from {link}\n\n")

            # Save the Word file; numbering is 1-based per batch.
            word_filename = f"output_{(i // batch_size) + 1}.docx"
            doc.save(word_filename)
            word_files.append(word_filename)

        return word_files  # Return list of generated Word files

    except Exception as e:
        return f"Error: {str(e)}", None

# Gradio UI with link selection
def show_links_and_generate_word(url):
    """Extract links from *url* and refresh the checkbox group's choices.

    Returns the status message plus a ``gr.update`` that repopulates the
    link selector (cleared selection either way).
    """
    message, found_links = extract_links(url)
    # extract_links yields [] on failure, so one update call covers both cases.
    return message, gr.update(choices=found_links or [], value=[])

# Build the Gradio Blocks UI: URL input → link extraction → checkbox
# selection → Word-file generation/download.
iface = gr.Blocks()

with iface:
    gr.Markdown("### 🌐 Web Scraper & Word Document Generator")
    gr.Markdown("Enter a website URL to extract internal links, then select which links to convert into Word files (4 links per file).")

    url_input = gr.Textbox(label="Enter Website URL")
    extract_btn = gr.Button("Extract Links")

    message_output = gr.Markdown("")
    # Starts empty; choices are filled by show_links_and_generate_word via gr.update.
    link_selector = gr.CheckboxGroup([], label="Select Links to Convert", interactive=True)
    generate_btn = gr.Button("Generate Word Files")

    word_output = gr.File(label="Download Generated Word Files")

    # Wire buttons: extraction updates both the status text and the checkbox
    # choices; generation feeds the selected links straight to the scraper.
    extract_btn.click(show_links_and_generate_word, inputs=url_input, outputs=[message_output, link_selector])
    generate_btn.click(scrape_and_generate_word, inputs=link_selector, outputs=word_output)

# Launch blocks here; module import has the side effect of starting the server.
iface.launch()