Spaces:

Nymbo
/

Fetch

Running

App Files Community

Nymbo committed 6 days ago

Commit

5832786

verified ·

1 Parent(s): 301aafb

Update app.py

Browse files

Files changed (1) hide show

app.py +37 -226

app.py CHANGED Viewed

@@ -1,241 +1,52 @@
-# File: app.py
-# Purpose: Fetch only relevant text (not raw HTML) from a URL, with a smart extractor and a clean fallback.
-import re
 import gradio as gr
 import requests
-from urllib.parse import urlparse
 from bs4 import BeautifulSoup
-# Try to import the smart extractor — if unavailable for any reason, we'll gracefully fall back.
-try:
-    import trafilatura  # Best-in-class main-content extractor
-except Exception:
-    trafilatura = None
-# ---------------------------
-# Helpers
-# ---------------------------
-def _normalize_url(url: str) -> str:
-    """
-    Make sure the URL has a scheme; default to https:// if missing.
-    This avoids 'Invalid URL' errors for inputs like 'example.com'.
-    """
-    url = (url or "").strip()
-    if not url:
-        raise ValueError("Please enter a URL.")
-    parsed = urlparse(url)
-    if not parsed.scheme:
-        url = "https://" + url
-    return url
-def _fetch(url: str, timeout: int = 15) -> requests.Response:
-    """
-    Fetch the page with a reasonable User-Agent and a timeout.
-    We allow redirects and raise on HTTP errors for clearer feedback.
-    """
-    headers = {
-        "User-Agent": "Mozilla/5.0 (compatible; SmartTextFetcher/1.0; +https://huggingface.co/spaces)",
-        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
-    }
-    resp = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True)
-    resp.raise_for_status()
-    return resp
-def _extract_title_from_html(html: str) -> str | None:
-    """
-    Pull the <title> tag text, if present, for a nicer header.
-    """
-    try:
-        soup = BeautifulSoup(html, "lxml")
-    except Exception:
-        soup = BeautifulSoup(html, "html.parser")
-    title_tag = soup.find("title")
-    if title_tag and title_tag.string:
-        return title_tag.string.strip()
-    return None
-def _visible_text_from_html(html: str) -> str:
-    """
-    Fallback extractor: strip scripts/styles/nav/ads and return visible text.
-    This is a heuristic but works well when the smart extractor isn't available.
-    """
-    try:
-        soup = BeautifulSoup(html, "lxml")
-    except Exception:
-        soup = BeautifulSoup(html, "html.parser")
-    # Remove obviously non-content elements (scripts, styles, nav, ads, etc.)
-    for tag in soup(["script", "style", "noscript", "svg", "path", "form",
-                     "header", "footer", "nav", "aside", "iframe"]):
-        tag.decompose()
-    # Also drop things that *look* like boilerplate (by id/class)
-    kill_words = (
-        "nav", "menu", "footer", "header", "cookie", "banner", "subscribe",
-        "newsletter", "sidebar", "social", "share", "comment", "promo",
-        "advert", "ad", "breadcrumbs", "breadcrumb"
-    )
-    for el in soup.find_all(True):
-        meta = " ".join(el.get("class", []) + [el.get("id", "")]).lower()
-        if any(k in meta for k in kill_words):
-            el.decompose()
-    # Prefer the main/article region when available
-    main = soup.find("article") or soup.find("main") or soup.body or soup
-    # Gather block-level text for nicer spacing
-    blocks = main.find_all(["h1","h2","h3","h4","h5","h6","p","li","blockquote"])
-    lines = []
-    for b in blocks:
-        text = b.get_text(" ", strip=True)
-        if len(text) >= 3:
-            lines.append(text)
-    text = "\n\n".join(lines) if lines else main.get_text(" ", strip=True)
-    # Tidy whitespace a bit
-    text = re.sub(r"\n{3,}", "\n\n", text)
-    text = re.sub(r"[ \t]{2,}", " ", text)
-    return text.strip()
-def _smart_main_text(html: str, url: str) -> str | None:
     """
-    Use Trafilatura to pull the main/article text when available.
-    Returns None if extraction fails.
     """
-    if not trafilatura:
-        return None
     try:
-        # Trafilatura works best when we give it the page content as a string.
-        extracted = trafilatura.extract(
-            html,
-            include_comments=False,
-            favor_recall=True,   # a bit more inclusive; better for varied sites
-            url=url
-        )
-        return (extracted or None)
-    except Exception:
-        return None
-def _truncate(text: str, max_chars: int) -> str:
-    """
-    Optional safety guard so outputs stay small and responsive.
-    """
-    if max_chars is None or max_chars <= 0:
-        return text
-    if len(text) <= max_chars:
-        return text
-    return text[:max_chars].rstrip() + "\n\n… [truncated]"
-# ---------------------------
-# Gradio callback
-# ---------------------------
-def fetch_relevant_text(
-    url: str,
-    mode: str = "Main article (smart)",
-    max_chars: int = 8000,
-    include_title: bool = True
-) -> str:
-    """
-    Main entry point powered by the UI.
-    - Validates the URL
-    - Fetches the page
-    - Extracts relevant text based on the selected mode
-    - Optionally prefixes the page <title>
-    """
-    try:
-        url = _normalize_url(url)
-        resp = _fetch(url)
-        content_type = (resp.headers.get("Content-Type") or "").lower()
-        # If it's plain text, just return it directly.
-        if "text/plain" in content_type and resp.text:
-            text = resp.text.strip()
-        # If it's HTML/XHTML, run extractors.
-        elif "text/html" in content_type or "application/xhtml+xml" in content_type or "<html" in resp.text.lower():
-            html = resp.text
-            if mode.startswith("Main article"):
-                text = _smart_main_text(html, url) or _visible_text_from_html(html)
-            elif mode.startswith("Visible text"):
-                text = _visible_text_from_html(html)
-            else:  # Raw HTML (debug) — exposed in UI but not the default
-                text = html
-            # Prepend title if requested and available (but don't do it in Raw HTML mode)
-            if include_title and not mode.startswith("Raw HTML"):
-                title = _extract_title_from_html(html)
-                if title:
-                    text = f"{title}\n\n{text}".strip()
-        else:
-            # Not HTML or plain text — provide a helpful hint.
-            return f"Unsupported content type: {content_type or 'unknown'}. This tool extracts text from HTML pages."
-        # Keep response snappy by trimming overly long outputs.
-        return _truncate(text, max_chars)
     except requests.exceptions.RequestException as e:
-        return f"Network error while fetching the URL: {e}"
-    except ValueError as ve:
-        return f"{ve}"
-    except Exception as e:
-        return f"Unexpected error: {e}"
-# ---------------------------
-# UI (Gradio)
-# ---------------------------
-with gr.Blocks(theme="Nymbo/Nymbo_Theme", title="Fetch MCP — Smart Text") as demo:
-    # Headline & quick explainer (human-friendly)
-    gr.Markdown(
-        """
-        # Fetch MCP — Smart Text
-        Enter a URL and get the **relevant text** back (not the raw HTML).
-        Use “Main article (smart)” for best results; switch to “Visible text” if needed.
-        """
-    )
-    with gr.Row():
-        url_in = gr.Textbox(label="URL", placeholder="https://example.com/some-article", scale=4)
-    with gr.Row():
-        mode_in = gr.Radio(
-            label="Extraction mode",
-            choices=[
-                "Main article (smart)",
-                "Visible text (fallback)",
-                "Raw HTML (debug)"
-            ],
-            value="Main article (smart)",
-            scale=3
-        )
-        include_title_in = gr.Checkbox(label="Include page title", value=True, scale=1)
-        max_chars_in = gr.Slider(
-            label="Max characters (to keep responses fast)",
-            minimum=500,
-            maximum=40000,
-            step=500,
-            value=8000,
-            scale=3
-        )
-    out = gr.Textbox(label="Extracted Text", lines=22)
-    go = gr.Button("Fetch")
-    go.click(fetch_relevant_text, inputs=[url_in, mode_in, max_chars_in, include_title_in], outputs=out)
-# Keep MCP server flag for your Space
 if __name__ == "__main__":
-    demo.launch(mcp_server=True)

 import gradio as gr
 import requests
 from bs4 import BeautifulSoup
+def fetch_content(url):
     """
+    This function takes a URL as input, fetches its HTML content,
+    extracts the clean text, and returns it as a string.
+    It includes error handling for common request issues.
     """
     try:
+        # Send a GET request to the URL with a user-agent header
+        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
+        # Raise an exception for bad status codes (4xx or 5xx)
+        response.raise_for_status()
+        # Create a BeautifulSoup object to parse the HTML content
+        soup = BeautifulSoup(response.text, 'html.parser')
+        # Find and remove all script and style elements from the parsed HTML
+        for script_or_style in soup(['script', 'style']):
+            script_or_style.decompose()
+        # Get the text from the soup and clean up whitespace
+        text = soup.get_text()
+        # Split the text into lines and strip leading/trailing whitespace from each
+        lines = (line.strip() for line in text.splitlines())
+        # Further break down lines into phrases and strip whitespace
+        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+        # Join the chunks back together with a single newline, removing any blank lines
+        clean_text = '\n'.join(chunk for chunk in chunks if chunk)
+        return clean_text
     except requests.exceptions.RequestException as e:
+        # Handle any network-related errors
+        return f"An error occurred: {e}"
+# Define the Gradio interface
+demo = gr.Interface(
+    fn=fetch_content,
+    inputs=gr.Textbox(label="URL", placeholder="https://www.google.com"),
+    outputs=gr.Textbox(label="Cleaned Page Content"),
+    title="Webpage Text Extractor",
+    description="Enter a URL to fetch the clean text content of the web page, stripped of HTML, scripts, and styles.",
+    allow_flagging="never",
+    theme="Nymbo/Nymbo_Theme"
+)
 if __name__ == "__main__":
+    # Launch the Gradio app
+    demo.launch()