Update app.py
app.py
CHANGED
@@ -1,29 +1,228 @@
 1   import gradio as gr
 2   import requests
 5   """
 8 -     request issues.
 9   """
10   try:
14   except requests.exceptions.RequestException as e:
26 - )
28   if __name__ == "__main__":
29 -     demo.launch(mcp_server=True)
# File: app.py
# Purpose: Fetch only the readable text from a web page and return it as Markdown
# Notes: This version is more efficient and user-friendly than returning raw HTML.

import re
import time
import gradio as gr
import requests
from urllib.parse import urlparse
from bs4 import BeautifulSoup      # used as a fallback cleaner
from readability import Document   # isolates the "main content" like reader view
import html2text                   # converts HTML to Markdown
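# layman's terms: besides gradio and requests, this expects the beautifulsoup4, html2text,
# and readability-lxml packages (the latter provides "from readability import Document"),
# typically pinned in the Space's requirements.txt.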

# ----------------------------
# Simple in-memory cache (tiny LRU-ish)
# ----------------------------
# layman's terms: we remember recent results so repeated requests for the same URL are instant
_CACHE = {}
_CACHE_ORDER = []
_CACHE_MAX = 64
_CACHE_TTL_SECONDS = 10 * 60  # 10 minutes

def _cache_get(key):
    # layman's terms: give me the saved value if it's still fresh
    item = _CACHE.get(key)
    if not item:
        return None
    value, ts = item
    if time.time() - ts > _CACHE_TTL_SECONDS:
        _CACHE.pop(key, None)
        return None
    # refresh order
    if key in _CACHE_ORDER:
        _CACHE_ORDER.remove(key)
    _CACHE_ORDER.append(key)
    return value

def _cache_set(key, value):
    # layman's terms: save a result and keep the list from growing too large
    _CACHE[key] = (value, time.time())
    if key in _CACHE_ORDER:
        _CACHE_ORDER.remove(key)
    _CACHE_ORDER.append(key)
    while len(_CACHE_ORDER) > _CACHE_MAX:
        oldest = _CACHE_ORDER.pop(0)
        _CACHE.pop(oldest, None)
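
# layman's terms: a quick illustration of the two helpers above (hypothetical values):
#   _cache_set("https://example.com", "# Example heading")
#   _cache_get("https://example.com")   -> "# Example heading" until the 10-minute TTL lapses
#   once 64 different URLs are stored, the least recently used entry is dropped first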

# ----------------------------
# Helpers
# ----------------------------

def _normalize_url(url: str) -> str:
    """
    layman's terms: if the user forgot 'https://', add it.
    """
    url = url.strip()
    parsed = urlparse(url)
    if not parsed.scheme:
        url = "https://" + url
    return url
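
# layman's terms: illustrative behaviour of the helper above:
#   _normalize_url("example.com/article")  -> "https://example.com/article"
#   _normalize_url("https://example.com")  -> unchanged (a scheme is already present)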

def _too_large_via_head(url: str, max_bytes: int = 2_500_000) -> bool:
    """
    layman's terms: do a quick HEAD request; if the server says the page is huge, we skip it.
    """
    try:
        head = requests.head(
            url,
            allow_redirects=True,
            timeout=(5, 10),
            headers={
                "User-Agent": "Mozilla/5.0",
                "Accept": "text/html,application/xhtml+xml",
                "Accept-Encoding": "gzip, deflate, br",
            },
        )
        size = head.headers.get("Content-Length")
        if size and size.isdigit():
            return int(size) > max_bytes
    except requests.exceptions.RequestException:
        # layman's terms: if HEAD fails, we won't block the GET just because of that
        pass
    return False
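
# layman's terms: e.g. a response header of "Content-Length: 5000000" makes this return True
# (the page gets skipped); a missing or non-numeric header, or a failed HEAD, returns False.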

def _fetch_html(url: str) -> str:
    """
    layman's terms: download the page HTML (not images/scripts), with a timeout and errors handled.
    """
    resp = requests.get(
        url,
        timeout=(5, 20),  # connect, read
        headers={
            "User-Agent": "Mozilla/5.0",
            "Accept": "text/html,application/xhtml+xml",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "en-US,en;q=0.8",
        },
    )
    resp.raise_for_status()

    # Only proceed for text/html payloads
    ctype = resp.headers.get("Content-Type", "")
    if "text/html" not in ctype.lower():
        # layman's terms: if it's not a web page (maybe JSON/PDF/etc), just give raw text
        return resp.text

    # Respect declared encoding where possible
    resp.encoding = resp.encoding or "utf-8"
    return resp.text
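
# layman's terms: _fetch_html("https://example.com") returns the page's HTML text; network
# problems and 4xx/5xx statuses raise requests.exceptions.RequestException (raise_for_status
# raises HTTPError, a RequestException subclass), which fetch_markdown() below catches and
# turns into a friendly message.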

def _extract_main_html(html: str) -> str:
    """
    layman's terms: use reader mode (Readability) to isolate the main article/body content.
    Falls back to stripping scripts/styles if Readability can't find a core.
    """
    try:
        doc = Document(html)
        main_html = doc.summary(html_partial=True)  # main content as HTML
        # Make sure we still have something useful
        if main_html and len(main_html) > 40:
            return main_html
    except Exception:
        pass

    # Fallback: strip scripts/styles and return a body-only HTML
    soup = BeautifulSoup(html, "html.parser")
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()
    body = soup.body or soup
    return str(body)
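
# layman's terms: for a typical page like "<html><body><nav>menu</nav><article>story
# text</article></body></html>", Readability usually keeps just the <article> part, while
# the BeautifulSoup fallback returns the whole <body> minus <script>/<style>/<noscript>.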

def _html_to_markdown(html: str) -> str:
    """
    layman's terms: convert the cleaned HTML into nice Markdown with links and headings.
    """
    h = html2text.HTML2Text()
    h.ignore_images = True   # don't inline images in Markdown
    h.ignore_links = False   # keep links as [text](url)
    h.body_width = 0         # don't hard-wrap lines
    h.protect_links = True
    h.single_line_break = True
    md = h.handle(html)

    # Tidy up excessive blank lines/whitespace
    md = re.sub(r"\n{3,}", "\n\n", md).strip()
    return md or "_No readable text found on this page._"
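
# layman's terms: roughly, "<h1>Title</h1><p>See <a href='https://example.com'>this</a></p>"
# comes out as "# Title" followed by "See [this](https://example.com)"; links are kept,
# images are dropped, and lines are not hard-wrapped.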

# ----------------------------
# Main callable for Gradio
# ----------------------------

def fetch_markdown(url: str) -> str:
    """
    layman's terms: the function the UI calls.
    Steps:
      1) sanitize the URL
      2) quick HEAD check to avoid massive pages
      3) GET the HTML
      4) isolate the main content
      5) convert to Markdown
      6) return Markdown
    """
    if not url or not url.strip():
        return "_Please enter a URL._"

    try:
        url = _normalize_url(url)

        # Return cached value if available
        cached = _cache_get(url)
        if cached:
            return cached

        # Optional efficiency: skip very large pages before downloading
        if _too_large_via_head(url):
            return "_The page is too large to fetch efficiently (over ~2.5 MB)._"

        html = _fetch_html(url)
        # Non-HTML responses (e.g., JSON) are already handled in _fetch_html, which returns
        # their raw text, so no second content-type check is needed here.

        main_html = _extract_main_html(html)
        markdown = _html_to_markdown(main_html)

        _cache_set(url, markdown)
        return markdown

    except requests.exceptions.RequestException as e:
        # layman's terms: network or HTTP error
        return f"_Network error: {e}_"
    except Exception as e:
        # layman's terms: any other unexpected error
        return f"_Unexpected error: {e}_"
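
# layman's terms: fetch_markdown can also be called directly, e.g.
#   print(fetch_markdown("en.wikipedia.org/wiki/Hugging_Face"))
# which prints the article as Markdown, or an underscore-wrapped (italic) error string on failure.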

# ----------------------------
# Gradio UI
# ----------------------------
with gr.Blocks(theme="Nymbo/Nymbo_Theme", title="Fetch MCP — Markdown") as demo:
    # layman's terms: a simple, centered header explaining what this tool does
    gr.Markdown("# Fetch MCP (Markdown)\nFetch a page and show just its readable text as Markdown.")

    with gr.Row():
        url_box = gr.Textbox(
            label="URL",
            placeholder="example.com or https://example.com/article",
        )
        fetch_btn = gr.Button("Fetch")

    # layman's terms: show the result as rendered Markdown (not a plain textbox)
    output_md = gr.Markdown(label="Readable Markdown")

    # layman's terms: helpful example URLs to try with one click
    gr.Examples(
        examples=[
            ["https://en.wikipedia.org/wiki/Hugging_Face"],
            ["https://huggingface.co/blog"],
            ["https://www.bbc.com/news"],
        ],
        inputs=[url_box],
    )

    fetch_btn.click(fetch_markdown, inputs=url_box, outputs=output_md)
    url_box.submit(fetch_markdown, inputs=url_box, outputs=output_md)
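    # layman's terms: both bindings above do the same thing: clicking "Fetch" or pressing
    # Enter in the URL box runs fetch_markdown and renders its result in the Markdown area.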

if __name__ == "__main__":
    demo.launch(mcp_server=True)
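    # layman's terms: with mcp_server=True (assuming a recent Gradio release with MCP
    # support), the same fetch_markdown function is also exposed as an MCP tool, so MCP
    # clients can call it programmatically in addition to this web UI.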