Nymbo committed on
Commit ed27cf5 · verified · 1 Parent(s): 39ae379

Update app.py

Files changed (1)
  1. app.py +188 -175
app.py CHANGED
@@ -1,228 +1,241 @@
  # File: app.py
- # Purpose: Fetch only the readable text from a web page and return it as Markdown
- # Notes: This version is more efficient and user-friendly than returning raw HTML.

  import re
- import time
  import gradio as gr
  import requests
  from urllib.parse import urlparse
- from bs4 import BeautifulSoup  # used as a fallback cleaner
- from readability import Document  # isolates the "main content" like reader view
- import html2text  # converts HTML to Markdown
-
- # ----------------------------
- # Simple in-memory cache (tiny LRU-ish)
- # ----------------------------
- # layman's terms: we remember recent results so repeated requests for the same URL are instant
- _CACHE = {}
- _CACHE_ORDER = []
- _CACHE_MAX = 64
- _CACHE_TTL_SECONDS = 10 * 60  # 10 minutes
-
- def _cache_get(key):
-     # layman's terms: give me the saved value if it's still fresh
-     item = _CACHE.get(key)
-     if not item:
-         return None
-     value, ts = item
-     if time.time() - ts > _CACHE_TTL_SECONDS:
-         _CACHE.pop(key, None)
-         return None
-     # refresh order
-     if key in _CACHE_ORDER:
-         _CACHE_ORDER.remove(key)
-     _CACHE_ORDER.append(key)
-     return value
-
- def _cache_set(key, value):
-     # layman's terms: save a result and keep the list from growing too large
-     _CACHE[key] = (value, time.time())
-     if key in _CACHE_ORDER:
-         _CACHE_ORDER.remove(key)
-     _CACHE_ORDER.append(key)
-     while len(_CACHE_ORDER) > _CACHE_MAX:
-         oldest = _CACHE_ORDER.pop(0)
-         _CACHE.pop(oldest, None)
-
- # ----------------------------
  # Helpers
- # ----------------------------

  def _normalize_url(url: str) -> str:
      """
-     layman's terms: if the user forgot 'https://', add it.
      """
-     url = url.strip()
      parsed = urlparse(url)
      if not parsed.scheme:
          url = "https://" + url
      return url

- def _too_large_via_head(url: str, max_bytes: int = 2_500_000) -> bool:
      """
-     layman's terms: do a quick HEAD request; if the server says the page is huge, we skip it.
      """
      try:
-         head = requests.head(
-             url,
-             allow_redirects=True,
-             timeout=(5, 10),
-             headers={
-                 "User-Agent": "Mozilla/5.0",
-                 "Accept": "text/html,application/xhtml+xml",
-                 "Accept-Encoding": "gzip, deflate, br",
-             },
-         )
-         size = head.headers.get("Content-Length")
-         if size and size.isdigit():
-             return int(size) > max_bytes
-     except requests.exceptions.RequestException:
-         # layman's terms: if HEAD fails, we won't block the GET just because of that
-         pass
-     return False
-
- def _fetch_html(url: str) -> str:
      """
-     layman's terms: download the page HTML (not images/scripts), with a timeout and errors handled.
      """
-     resp = requests.get(
-         url,
-         timeout=(5, 20),  # connect, read
-         headers={
-             "User-Agent": "Mozilla/5.0",
-             "Accept": "text/html,application/xhtml+xml",
-             "Accept-Encoding": "gzip, deflate, br",
-             "Accept-Language": "en-US,en;q=0.8",
-         },
      )
-     resp.raise_for_status()

-     # Only proceed for text/html payloads
-     ctype = resp.headers.get("Content-Type", "")
-     if "text/html" not in ctype.lower():
-         # layman's terms: if it's not a web page (maybe JSON/PDF/etc), just give raw text
-         return resp.text

-     # Respect declared encoding where possible
-     resp.encoding = resp.encoding or "utf-8"
-     return resp.text

- def _extract_main_html(html: str) -> str:
      """
-     layman's terms: use reader mode (Readability) to isolate the main article/body content.
-     Falls back to stripping scripts/styles if Readability can't find a core.
      """
      try:
-         doc = Document(html)
-         main_html = doc.summary(html_partial=True)  # main content as HTML
-         # Make sure we still have something useful
-         if main_html and len(main_html) > 40:
-             return main_html
      except Exception:
-         pass

-     # Fallback: strip scripts/styles and return a body-only HTML
-     soup = BeautifulSoup(html, "html.parser")
-     for tag in soup(["script", "style", "noscript"]):
-         tag.decompose()
-     body = soup.body or soup
-     return str(body)

- def _html_to_markdown(html: str) -> str:
      """
-     layman's terms: convert the cleaned HTML into nice Markdown with links and headings.
      """
-     h = html2text.HTML2Text()
-     h.ignore_images = True  # don't inline images in Markdown
-     h.ignore_links = False  # keep links as [text](url)
-     h.body_width = 0  # don't hard-wrap lines
-     h.protect_links = True
-     h.single_line_break = True
-     md = h.handle(html)
-
-     # Tidy up excessive blank lines/whitespace
-     md = re.sub(r"\n{3,}", "\n\n", md).strip()
-     return md or "_No readable text found on this page._"
-
- # ----------------------------
- # Main callable for Gradio
- # ----------------------------
-
- def fetch_markdown(url: str) -> str:
      """
-     layman's terms: the function the UI calls.
-     Steps:
-       1) sanitize the URL
-       2) quick HEAD check to avoid massive pages
-       3) GET the HTML
-       4) isolate the main content
-       5) convert to Markdown
-       6) return Markdown
      """
-     if not url or not url.strip():
-         return "_Please enter a URL._"
-
      try:
          url = _normalize_url(url)

-         # Return cached value if available
-         cached = _cache_get(url)
-         if cached:
-             return cached

-         # Optional efficiency: skip very large pages before downloading
-         if _too_large_via_head(url):
-             return "_The page is too large to fetch efficiently (over ~2.5 MB)._"

-         html = _fetch_html(url)
-         # If server returned non-HTML (e.g., JSON), just code-fence it
-         if "text/html" not in (requests.utils.get_encoding_from_headers({"content-type": "text/html"}) or "text/html"):
-             # This condition is a no-op; we already content-typed in _fetch_html.
-             pass

-         main_html = _extract_main_html(html)
-         markdown = _html_to_markdown(main_html)

-         _cache_set(url, markdown)
-         return markdown

      except requests.exceptions.RequestException as e:
-         # layman's terms: network or HTTP error
-         return f"_Network error: {e}_"
      except Exception as e:
-         # layman's terms: any other unexpected error
-         return f"_Unexpected error: {e}_"

- # ----------------------------
- # Gradio UI
- # ----------------------------
- with gr.Blocks(theme="Nymbo/Nymbo_Theme", title="Fetch MCP — Markdown") as demo:
-     # layman's terms: a simple, centered header explaining what this tool does
-     gr.Markdown("# Fetch MCP (Markdown)\nFetch a page and show just its readable text as Markdown.")

      with gr.Row():
-         url_box = gr.Textbox(
-             label="URL",
-             placeholder="example.com or https://example.com/article",
          )
-         fetch_btn = gr.Button("Fetch")
-
-     # layman's terms: show the result as rendered Markdown (not a plain textbox)
-     output_md = gr.Markdown(label="Readable Markdown")
-
-     # layman's terms: helpful example URLs to try with one click
-     gr.Examples(
-         examples=[
-             ["https://en.wikipedia.org/wiki/Hugging_Face"],
-             ["https://huggingface.co/blog"],
-             ["https://www.bbc.com/news"],
-         ],
-         inputs=[url_box],
-     )

-     fetch_btn.click(fetch_markdown, inputs=url_box, outputs=output_md)
-     url_box.submit(fetch_markdown, inputs=url_box, outputs=output_md)

  if __name__ == "__main__":
      demo.launch(mcp_server=True)
  # File: app.py
+ # Purpose: Fetch only relevant text (not raw HTML) from a URL, with a smart extractor and a clean fallback.

  import re
  import gradio as gr
  import requests
  from urllib.parse import urlparse
+ from bs4 import BeautifulSoup
+
+ # Try to import the smart extractor; if it's unavailable for any reason, we'll gracefully fall back.
+ try:
+     import trafilatura  # Best-in-class main-content extractor
+ except Exception:
+     trafilatura = None
+
+
+ # ---------------------------
  # Helpers
+ # ---------------------------

  def _normalize_url(url: str) -> str:
      """
+     Make sure the URL has a scheme; default to https:// if missing.
+     This avoids 'Invalid URL' errors for inputs like 'example.com'.
      """
+     url = (url or "").strip()
+     if not url:
+         raise ValueError("Please enter a URL.")
      parsed = urlparse(url)
      if not parsed.scheme:
          url = "https://" + url
      return url

+
+ def _fetch(url: str, timeout: int = 15) -> requests.Response:
+     """
+     Fetch the page with a reasonable User-Agent and a timeout.
+     We allow redirects and raise on HTTP errors for clearer feedback.
      """
+     headers = {
+         "User-Agent": "Mozilla/5.0 (compatible; SmartTextFetcher/1.0; +https://huggingface.co/spaces)",
+         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+     }
+     resp = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True)
+     resp.raise_for_status()
+     return resp
+
+
+ def _extract_title_from_html(html: str) -> str | None:
+     """
+     Pull the <title> tag text, if present, for a nicer header.
      """
      try:
+         soup = BeautifulSoup(html, "lxml")
+     except Exception:
+         soup = BeautifulSoup(html, "html.parser")
+     title_tag = soup.find("title")
+     if title_tag and title_tag.string:
+         return title_tag.string.strip()
+     return None
+
+
+ def _visible_text_from_html(html: str) -> str:
      """
+     Fallback extractor: strip scripts/styles/nav/ads and return visible text.
+     This is a heuristic but works well when the smart extractor isn't available.
      """
+     try:
+         soup = BeautifulSoup(html, "lxml")
+     except Exception:
+         soup = BeautifulSoup(html, "html.parser")
+
+     # Remove obviously non-content elements (scripts, styles, nav, ads, etc.)
+     for tag in soup(["script", "style", "noscript", "svg", "path", "form",
+                      "header", "footer", "nav", "aside", "iframe"]):
+         tag.decompose()
+
+     # Also drop things that *look* like boilerplate (by id/class)
+     kill_words = (
+         "nav", "menu", "footer", "header", "cookie", "banner", "subscribe",
+         "newsletter", "sidebar", "social", "share", "comment", "promo",
+         "advert", "ad", "breadcrumbs", "breadcrumb"
      )
+     for el in soup.find_all(True):
+         meta = " ".join(el.get("class", []) + [el.get("id", "")]).lower()
+         if any(k in meta for k in kill_words):
+             el.decompose()
+
+     # Prefer the main/article region when available
+     main = soup.find("article") or soup.find("main") or soup.body or soup
+
+     # Gather block-level text for nicer spacing
+     blocks = main.find_all(["h1","h2","h3","h4","h5","h6","p","li","blockquote"])
+     lines = []
+     for b in blocks:
+         text = b.get_text(" ", strip=True)
+         if len(text) >= 3:
+             lines.append(text)
+
+     text = "\n\n".join(lines) if lines else main.get_text(" ", strip=True)

+     # Tidy whitespace a bit
+     text = re.sub(r"\n{3,}", "\n\n", text)
+     text = re.sub(r"[ \t]{2,}", " ", text)
+     return text.strip()


+ def _smart_main_text(html: str, url: str) -> str | None:
      """
+     Use Trafilatura to pull the main/article text when available.
+     Returns None if extraction fails.
      """
+     if not trafilatura:
+         return None
      try:
+         # Trafilatura works best when we give it the page content as a string.
+         extracted = trafilatura.extract(
+             html,
+             include_comments=False,
+             favor_recall=True,  # a bit more inclusive; better for varied sites
+             url=url
+         )
+         return (extracted or None)
      except Exception:
+         return None


+ def _truncate(text: str, max_chars: int) -> str:
      """
+     Optional safety guard so outputs stay small and responsive.
      """
+     if max_chars is None or max_chars <= 0:
+         return text
+     if len(text) <= max_chars:
+         return text
+     return text[:max_chars].rstrip() + "\n\n… [truncated]"
+
+
+ # ---------------------------
+ # Gradio callback
+ # ---------------------------
+
+ def fetch_relevant_text(
+     url: str,
+     mode: str = "Main article (smart)",
+     max_chars: int = 8000,
+     include_title: bool = True
+ ) -> str:
      """
+     Main entry point powered by the UI.
+     - Validates the URL
+     - Fetches the page
+     - Extracts relevant text based on the selected mode
+     - Optionally prefixes the page <title>
      """
      try:
          url = _normalize_url(url)
+         resp = _fetch(url)
+         content_type = (resp.headers.get("Content-Type") or "").lower()
+
+         # If it's plain text, just return it directly.
+         if "text/plain" in content_type and resp.text:
+             text = resp.text.strip()

+         # If it's HTML/XHTML, run extractors.
+         elif "text/html" in content_type or "application/xhtml+xml" in content_type or "<html" in resp.text.lower():
+             html = resp.text

+             if mode.startswith("Main article"):
+                 text = _smart_main_text(html, url) or _visible_text_from_html(html)
+             elif mode.startswith("Visible text"):
+                 text = _visible_text_from_html(html)
+             else:  # Raw HTML (debug) — exposed in UI but not the default
+                 text = html

+             # Prepend title if requested and available (but don't do it in Raw HTML mode)
+             if include_title and not mode.startswith("Raw HTML"):
+                 title = _extract_title_from_html(html)
+                 if title:
+                     text = f"{title}\n\n{text}".strip()

+         else:
+             # Not HTML or plain text — provide a helpful hint.
+             return f"Unsupported content type: {content_type or 'unknown'}. This tool extracts text from HTML pages."

+         # Keep response snappy by trimming overly long outputs.
+         return _truncate(text, max_chars)

      except requests.exceptions.RequestException as e:
+         return f"Network error while fetching the URL: {e}"
+     except ValueError as ve:
+         return f"{ve}"
      except Exception as e:
+         return f"Unexpected error: {e}"
+

+ # ---------------------------
+ # UI (Gradio)
+ # ---------------------------
+
+ with gr.Blocks(theme="Nymbo/Nymbo_Theme", title="Fetch MCP Smart Text") as demo:
+     # Headline & quick explainer (human-friendly)
+     gr.Markdown(
+         """
+         # Fetch MCP — Smart Text
+         Enter a URL and get the **relevant text** back (not the raw HTML).
+         Use “Main article (smart)” for best results; switch to “Visible text” if needed.
+         """
+     )

      with gr.Row():
+         url_in = gr.Textbox(label="URL", placeholder="https://example.com/some-article", scale=4)
+     with gr.Row():
+         mode_in = gr.Radio(
+             label="Extraction mode",
+             choices=[
+                 "Main article (smart)",
+                 "Visible text (fallback)",
+                 "Raw HTML (debug)"
+             ],
+             value="Main article (smart)",
+             scale=3
          )
+         include_title_in = gr.Checkbox(label="Include page title", value=True, scale=1)
+         max_chars_in = gr.Slider(
+             label="Max characters (to keep responses fast)",
+             minimum=500,
+             maximum=40000,
+             step=500,
+             value=8000,
+             scale=3
+         )
+
+     out = gr.Textbox(label="Extracted Text", lines=22)

+     go = gr.Button("Fetch")
+     go.click(fetch_relevant_text, inputs=[url_in, mode_in, max_chars_in, include_title_in], outputs=out)

+ # Keep MCP server flag for your Space
  if __name__ == "__main__":
      demo.launch(mcp_server=True)
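
A quick way to sanity-check this revision locally is a short script against the new helpers and the UI callback. This is only a sketch, not part of the commit: it assumes app.py is importable from the working directory, that its dependencies (gradio, requests, beautifulsoup4, and optionally trafilatura and lxml) are installed, and that network access is available for the final call; the example URL is just a placeholder.

    # smoke_test.py (hypothetical helper script, not included in this commit)
    from app import _normalize_url, _truncate, fetch_relevant_text

    # URL normalization: a bare domain gains an https:// scheme.
    assert _normalize_url("example.com") == "https://example.com"

    # Truncation: long outputs are clipped and marked.
    clipped = _truncate("x" * 10_000, max_chars=100)
    assert clipped.endswith("[truncated]") and len(clipped) < 10_000

    # End-to-end: fetch a page in the default mode and show the start of the result.
    text = fetch_relevant_text("https://example.com", mode="Main article (smart)", max_chars=2000)
    print(text[:400])

If trafilatura is not installed, the "Main article (smart)" mode should silently fall back to the BeautifulSoup-based visible-text extractor, so the script still runs.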