Update app.py
app.py
CHANGED
@@ -1,407 +1,364 @@
 # File: main/app.py
-# Purpose: Fetch
-#
-import
-from
-# =========================
-# Helpers: small data shapes
-# =========================
-@dataclass
-class PageMetadata:
-    # Simple holder for high-level metadata we care about
-    title: str = ""
-    canonical_url: str = ""
-    description: str = ""
-    site_name: str = ""
-    og_type: str = ""
-    og_url: str = ""
-    published_time: str = ""  # ISO-ish if detected
-# =========================
-# Network: fetch raw HTML
-# =========================
-def fetch_html(url: str, timeout: int = 12) -> str:
     """
     """
     headers = {
-        "
-            "AppleWebKit/537.36 (KHTML, like Gecko) "
-            "Chrome/127.0.0.0 Safari/537.36"
-        )
     }
-    return resp.text


-#
-#
-#

-def
     """
-    We check Open Graph and Twitter tags as fallbacks too.
     """
-    title_tag = soup.find("title")
-    md.title = (title_tag.get_text(strip=True) if title_tag else "").strip()
-    # Meta helpers
-    def meta(name=None, property=None):
-        if name:
-            tag = soup.find("meta", attrs={"name": name})
-            if tag and tag.get("content"):
-                return tag["content"].strip()
-        if property:
-            tag = soup.find("meta", attrs={"property": property})
-            if tag and tag.get("content"):
-                return tag["content"].strip()
-        return ""

-        meta(property="og:description")
-        or meta(name="twitter:description")
-        or meta(name="description")
-        or ""
-    ).strip()
-    # Site name (if available)
-    md.site_name = (meta(property="og:site_name") or "").strip()
-    # OpenGraph URL + type (if available)
-    md.og_url = (meta(property="og:url") or "").strip()
-    md.og_type = (meta(property="og:type") or "").strip()
-    # Canonical URL (normalize relative -> absolute)
-    canon = soup.find("link", rel="canonical")
-    if canon and canon.get("href"):
-        md.canonical_url = urljoin(base_url, canon["href"].strip())
-    else:
-        # If no canonical, we may fallback to og:url if present
-        md.canonical_url = md.og_url or base_url
-    # Try some common publish-time signals
-    published = (
-        meta(property="article:published_time")
-        or meta(name="pubdate")
-        or meta(name="date")
-        or ""
-    ).strip()
-    md.published_time = published
-    # If no normal <title>, try OG or Twitter titles
-    if not md.title:
-        md.title = (meta(property="og:title") or meta(name="twitter:title") or "").strip()
-    return md
-def extract_main_text(html: str) -> Tuple[str, BeautifulSoup]:
     """
-    Returns a clean text string and a BeautifulSoup of the main content HTML
-    (so we can also extract links from just the relevant area).
-    If readability fails/misfires (like index pages), we gracefully fallback to empty text.
     """
     try:
-        summary_html = doc.summary()  # This is the extracted main-content HTML
-        # Parse the readability summary into a soup so we can pull out links cleanly
-        summary_soup = BeautifulSoup(summary_html, "lxml")
-        # Turn HTML to plain text: keep paragraphs and line breaks readable
-        # Remove scripts/styles etc. if any slipped through
-        for tag in summary_soup(["script", "style", "noscript"]):
-            tag.decompose()
-        text = summary_soup.get_text("\n", strip=True)
-        text = re.sub(r"\n{3,}", "\n\n", text)  # Collapse superfluous line breaks
-        return text, summary_soup
     except Exception:
-        return "", BeautifulSoup("", "lxml")


     """
     """
-    anchors = []
-    if soup and only_content_area:
-        anchors = soup.find_all("a")
-    else:
-        full = BeautifulSoup(fallback_html, "lxml")
-        anchors = full.find_all("a")
-    results = []
     seen = set()
-        if href.startswith("#") or href.startswith("javascript:") or href.startswith("mailto:"):
             continue
         absolute = urljoin(base_url, href)
-        if
             continue
-        seen.add(


-# ====================================

-    """
-    Checks if the URL is the Hacker News front page (news.ycombinator.com).
-    We'll special-handle it for a great experience listing posts.
-    """
-    p = urlparse(url)
-    if p.netloc != "news.ycombinator.com":
-        return False
-    # Treat /, /news, or /front as "front page" style
-    return p.path in ("", "/", "/news", "/front")


     """
-    Returns a Markdown overview and a list-of-dicts suitable for a table.
     """
-    #
-    if
-    "
-        src = it["Source"] or ""
-        # Example line: "1. [Cool Project](url) — 345 points • 123 comments • 5 hours ago (github.com)"
-        extras = " — ".join(filter(None, [
-            " ".join(filter(None, [pts, cmt])),
-            age,
-            f"({src})"
-        ]))
-        md_lines.append(f"{rank}. [{title}]({url}){(' — ' + extras) if extras else ''}")
-    md = "\n".join(md_lines) if items else "# Hacker News — No items found"
-    return md, items
-# ===========================
-# Public function for Gradio
-# ===========================
-def extract_page(url: str, full_text: bool, max_links: int, content_links_only: bool) -> Tuple[str, List[Dict]]:
     """
-    - If it's Hacker News front page, parse posts specially
-    - Otherwise: extract metadata, main text (optional), and links
-    - Returns Markdown (summary) + a table of links
     """
     try:
     except requests.exceptions.RequestException as e:
-    #
-    #
-    if
-        # Keep things readable; long pages can be huge—Readability already helps keep it topical
-        md_lines.append(main_text)
-        md_lines.append("\n---\n")
-    # Links brief (we also return a structured table below)
-    md_lines.append("## Links Found")
-    md_lines.append(
-        f"Showing {'content-only' if content_links_only else 'all-page'} links (up to {max_links}). "
-        "Click any to open in a new tab."
     )
-#
-#
-    gr.Markdown("# Fetch MCP — Clean Text & Links\n"
-                "Extract clean **title**, **metadata**, **main text**, and **hyperlinks** from any page.\n\n"
-                "- Special handling for **Hacker News** front page (rank, points, comments).\n"
-                "- Toggle **Full Text** if you also want the extracted article content.")
-    with gr.Row():
-        url_in = gr.Textbox(
-            label="URL",
-            placeholder="https://news.ycombinator.com/ • https://example.com/article",
-            value="https://news.ycombinator.com/",
-            scale=4
-        )
-        fetch_btn = gr.Button("Fetch / Extract", variant="primary", scale=1)
-    with gr.Row():
-        full_text_chk = gr.Checkbox(
-            label="Include main content text (Readability extract)?",
-            value=False
-        )
-        content_only_chk = gr.Checkbox(
-            label="Links from main content only (fallback: full page)?",
-            value=True
-        )
-        max_links_sld = gr.Slider(
-            label="Max links to return",
-            minimum=10, maximum=500, value=100, step=10
-        )
-    # Outputs: Markdown summary + a table of links (or HN posts table)
-    summary_md = gr.Markdown(label="Summary")
-    links_tbl = gr.Dataframe(
-        headers=["Rank/—", "Title/Text", "URL", "Source/Domain", "Points", "By", "Age", "Comments"],
-        # We won't pre-enforce headers strictly; DataFrame will adapt to dict keys provided.
-        interactive=False,
-        wrap=True,
-        row_count=(0, "dynamic"),
-        col_count=(0, "dynamic")
     )

     fetch_btn.click(
-        fn=
-        inputs=[url_in,
-        outputs=
     )

-# Keep MCP server
 if __name__ == "__main__":
     demo.launch(mcp_server=True)
 # File: main/app.py
+# Purpose: Fetch only the "relevant" page content (title, key metadata, clean body text, and hyperlinks)
+# instead of returning full HTML. Output is compact and configurable to reduce verbosity.
+
+import gradio as gr               # UI library
+import requests                   # HTTP client
+from bs4 import BeautifulSoup     # HTML parsing
+from readability import Document  # Readability algorithm to isolate main content
+from urllib.parse import urljoin, urldefrag, urlparse  # URL helpers
+import re                         # For whitespace cleanup and simple formatting
+
+
+# -------------------------------
+# HTTP fetching with sane defaults
+# -------------------------------
+def _http_get(url: str) -> requests.Response:
     """
+    Make an HTTP GET request with headers and a timeout.
+    Layman's terms: downloads the webpage safely and politely.
     """
     headers = {
+        "User-Agent": "Mozilla/5.0 (compatible; NymboFetcher/1.0; +https://example.com)",
+        "Accept-Language": "en-US,en;q=0.9",
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
     }
+    # Short timeouts so the app isn't stuck forever
+    return requests.get(url, headers=headers, timeout=15)


+# ----------------------------------------
+# Helpers: text cleanup & friendly trimming
+# ----------------------------------------
+def _normalize_whitespace(text: str) -> str:
+    """
+    Layman's terms: squash weird spacing and too many blank lines.
+    """
+    text = re.sub(r"[ \t\u00A0]+", " ", text)               # collapse runs of spaces
+    text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text.strip())   # max 1 blank line at a time
+    return text.strip()
+

+def _truncate(text: str, max_chars: int) -> tuple[str, bool]:
     """
+    Layman's terms: cut the text if it’s too long and tell the caller if we cut it.
     """
+    if max_chars is None or max_chars <= 0 or len(text) <= max_chars:
+        return text, False
+    return text[:max_chars].rstrip() + " …", True

+
+def _domain_of(url: str) -> str:
     """
+    Layman's terms: show a friendly domain like example.com.
     """
     try:
+        return urlparse(url).netloc or ""
     except Exception:
+        return ""


+# -----------------------------------
+# Metadata extraction (title, etc.)
+# -----------------------------------
+def _extract_metadata(soup: BeautifulSoup, final_url: str) -> dict:
     """
+    Layman's terms: grab useful fields like title, description, site name, and canonical link.
+    """
+    meta = {}
+
+    # Title preference: <title> > og:title > twitter:title
+    title_candidates = [
+        (soup.title.string if soup.title and soup.title.string else None),
+        _og(soup, "og:title"),
+        _meta(soup, "twitter:title"),
+    ]
+    meta["title"] = next((t.strip() for t in title_candidates if t and t.strip()), "")
+
+    # Description preference: meta[name=description] > og:description > twitter:description
+    desc_candidates = [
+        _meta(soup, "description"),
+        _og(soup, "og:description"),
+        _meta(soup, "twitter:description"),
+    ]
+    meta["description"] = next((d.strip() for d in desc_candidates if d and d.strip()), "")
+
+    # Canonical URL if provided (helps dedupe / standardize)
+    link_canonical = soup.find("link", rel=lambda v: v and "canonical" in v)
+    meta["canonical"] = (link_canonical.get("href") or "").strip() if link_canonical else ""
+
+    # Site name (nice for context)
+    meta["site_name"] = (_og(soup, "og:site_name") or "").strip()
+
+    # Language (if present)
+    html_tag = soup.find("html")
+    meta["lang"] = (html_tag.get("lang") or "").strip() if html_tag else ""
+
+    # Final resolved URL and domain
+    meta["fetched_url"] = final_url
+    meta["domain"] = _domain_of(final_url)
+
+    return meta
+
+
+def _meta(soup: BeautifulSoup, name: str) -> str | None:
+    tag = soup.find("meta", attrs={"name": name})
+    return tag.get("content") if tag and tag.has_attr("content") else None
+
+
+def _og(soup: BeautifulSoup, prop: str) -> str | None:
+    tag = soup.find("meta", attrs={"property": prop})
+    return tag.get("content") if tag and tag.has_attr("content") else None
+
+
+# ---------------------------------------------------------
+# Main content extraction with Readability + gentle cleanup
+# ---------------------------------------------------------
+def _extract_main_text(html: str) -> tuple[str, BeautifulSoup]:
+    """
+    Layman's terms: use Readability to find the article body, then clean it to plain text.
+    Returns (clean_text, soup_of_readable_html) for link scraping.
+    """
+    # Readability gives us a simplified article HTML
+    doc = Document(html)
+    readable_html = doc.summary(html_partial=True)
+
+    # Parse the simplified HTML so we can clean it up further
+    s = BeautifulSoup(readable_html, "lxml")
+
+    # Remove obviously noisy elements if present
+    for sel in ["script", "style", "noscript", "iframe", "svg"]:
+        for tag in s.select(sel):
+            tag.decompose()
+
+    # Extract text with paragraphs preserved, then normalize whitespace
+    text_parts = []
+    for p in s.find_all(["p", "li", "h2", "h3", "h4", "blockquote"]):
+        # Keep list items and headers to retain structure without being too verbose
+        chunk = p.get_text(" ", strip=True)
+        if chunk:
+            text_parts.append(chunk)
+
+    clean_text = _normalize_whitespace("\n\n".join(text_parts))
+    return clean_text, s
+
+
+# ------------------------------------------
+# Link extraction from the simplified content
+# ------------------------------------------
+def _extract_links(readable_soup: BeautifulSoup, base_url: str, max_links: int) -> list[tuple[str, str]]:
+    """
+    Layman's terms: pull out clickable links from the article content only,
+    turn them into absolute URLs, drop junk, dedupe, and cap the list.
     """
     seen = set()
+    links: list[tuple[str, str]] = []
+
+    for a in readable_soup.find_all("a", href=True):
+        href = a.get("href").strip()
+        # Ignore anchors, mailto, javascript, and empty
+        if not href or href.startswith("#") or href.startswith("mailto:") or href.startswith("javascript:"):
             continue
+
+        # Resolve relative URLs and strip URL fragments (#section)
         absolute = urljoin(base_url, href)
+        absolute, _ = urldefrag(absolute)
+
+        if absolute in seen:
             continue
+        seen.add(absolute)
+
+        text = a.get_text(" ", strip=True)
+        # Keep link text concise
+        if len(text) > 120:
+            text = text[:117] + "…"

+        links.append((text or absolute, absolute))

+        if len(links) >= max_links > 0:
+            break

+    return links


+# -------------------------
+# Formatter: compact output
+# -------------------------
+def _format_markdown(meta: dict, body: str, body_truncated: bool, links: list[tuple[str, str]],
+                     include_text: bool, include_metadata: bool, include_links: bool, verbosity: str) -> str:
     """
+    Layman's terms: turn the pieces into a neat, compact Markdown string.
     """
+    lines = []
+
+    # Title header
+    title = meta.get("title") or meta.get("domain") or "Untitled"
+    lines.append(f"# {title}")
+
+    # Metadata (compact)
+    if include_metadata:
+        md = []
+        # Only show fields that exist to keep things tight
+        if meta.get("description"):
+            md.append(f"- **Description:** {meta['description']}")
+        if meta.get("site_name"):
+            md.append(f"- **Site:** {meta['site_name']}")
+        if meta.get("canonical"):
+            md.append(f"- **Canonical:** {meta['canonical']}")
+        if meta.get("lang"):
+            md.append(f"- **Language:** {meta['lang']}")
+        if meta.get("fetched_url"):
+            md.append(f"- **Fetched From:** {meta['fetched_url']}")
+
+        if md:
+            lines.append("## Metadata")
+            lines.extend(md)
+
+    # Body text
+    if include_text and body:
+        # For "Brief", show a very small excerpt even after truncation
+        if verbosity == "Brief":
+            brief, was_more = _truncate(body, 800)
+            lines.append("## Text")
+            lines.append(brief)
+            if was_more or body_truncated:
+                lines.append("\n> (Trimmed for brevity)")
+        else:
+            lines.append("## Text")
+            lines.append(body)
+            if body_truncated:
+                lines.append("\n> (Trimmed for brevity)")
+
+    # Links
+    if include_links and links:
+        lines.append(f"## Links ({len(links)})")
+        for text, url in links:
+            lines.append(f"- [{text}]({url})")
+
+    return "\n\n".join(lines).strip()
+
+
+# --------------------------------
+# Gradio-facing function (the app)
+# --------------------------------
+def extract_relevant(
+    url: str,
+    verbosity: str = "Standard",
+    include_metadata: bool = True,
+    include_text: bool = True,
+    include_links: bool = True,
+    max_chars: int = 3000,
+    max_links: int = 20
+) -> str:
     """
+    Layman's terms: the main button action.
+    Given a URL, fetch the page, extract just the good stuff, and return a compact Markdown summary.
     """
+    if not url or not url.strip():
+        return "Please enter a valid URL."
+
     try:
+        resp = _http_get(url)
+        resp.raise_for_status()
     except requests.exceptions.RequestException as e:
+        return f"An error occurred: {e}"
+
+    # Respect the final resolved URL (after redirects)
+    final_url = str(resp.url)
+
+    # Only process HTML-ish responses
+    ctype = resp.headers.get("Content-Type", "")
+    if "html" not in ctype.lower():
+        return f"Unsupported content type for extraction: {ctype or 'unknown'}"
+
+    # Decode as text (requests usually sets encoding; otherwise guess)
+    resp.encoding = resp.encoding or resp.apparent_encoding
+    html = resp.text
+
+    # Full page soup (to extract metadata accurately)
+    full_soup = BeautifulSoup(html, "lxml")
+    meta = _extract_metadata(full_soup, final_url)
+
+    # Extract main body text using Readability
+    body_text, readable_soup = _extract_main_text(html)
+
+    # If the body is suspiciously empty, fall back to a simpler text strategy
+    if not body_text:
+        fallback_text = full_soup.get_text(" ", strip=True)
+        body_text = _normalize_whitespace(fallback_text)
+
+    # Enforce verbosity presets unless user overrides via slider
+    preset_caps = {"Brief": 1200, "Standard": 3000, "Full": 999999}
+    target_cap = preset_caps.get(verbosity, 3000)
+    # Use the *smaller* of user cap and preset to keep things tidy
+    cap = min(max_chars if max_chars > 0 else target_cap, target_cap)
+    body_text, truncated = _truncate(body_text, cap) if include_text else ("", False)
+
+    # Extract links from the readable portion only (cleaner than whole DOM)
+    links = _extract_links(readable_soup, final_url, max_links=max_links if include_links else 0)
+
+    # Build compact Markdown
+    md = _format_markdown(
+        meta=meta,
+        body=body_text,
+        body_truncated=truncated,
+        links=links,
+        include_text=include_text,
+        include_metadata=include_metadata,
+        include_links=include_links,
+        verbosity=verbosity
     )
+    return md or "No content could be extracted."
+
+
+# -----------------
+# Gradio UI (Blocks)
+# -----------------
+with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
+    # Title & subtitle for clarity
+    gr.Markdown("# Fetch MCP — Clean Extract")
+    gr.Markdown(
+        "Extract **title**, **metadata**, **clean text**, and **links** — without the noisy HTML. "
+        "Use Verbosity and caps to keep it tight."
     )

+    with gr.Row():
+        url_in = gr.Textbox(label="URL", placeholder="https://example.com/article")
+        fetch_btn = gr.Button("Fetch Clean Content")
+
+    with gr.Accordion("Options", open=False):
+        with gr.Row():
+            verbosity = gr.Dropdown(
+                label="Verbosity",
+                choices=["Brief", "Standard", "Full"],
+                value="Standard",
+                info="Controls how much text you get back."
+            )
+            max_chars = gr.Slider(
+                400, 12000, value=3000, step=100,
+                label="Max Characters (body text)",
+                info="Hard cap for body text. Lower = less verbose."
+            )
+            max_links = gr.Slider(
+                0, 100, value=20, step=1,
+                label="Max Links",
+                info="Limit how many hyperlinks we include."
+            )
+        with gr.Row():
+            include_metadata = gr.Checkbox(value=True, label="Include Metadata")
+            include_text = gr.Checkbox(value=True, label="Include Main Text")
+            include_links = gr.Checkbox(value=True, label="Include Links")
+
+    # Output as Markdown (compact and readable)
+    out = gr.Markdown(label="Result")
+
+    # Wire up the click
     fetch_btn.click(
+        fn=extract_relevant,
+        inputs=[url_in, verbosity, include_metadata, include_text, include_links, max_chars, max_links],
+        outputs=out
     )

+# Keep MCP server enabled
 if __name__ == "__main__":
     demo.launch(mcp_server=True)
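For reference, a minimal sketch of calling the new `extract_relevant` function directly, outside the Gradio UI. This is illustrative only and not part of the commit: the import path (`app`) and the sample URL are assumptions, and the behavior notes follow from the code added above.

```python
# Illustrative sketch — assumes the updated main/app.py is importable as `app`
# and that the example URL is reachable; both are hypothetical.
from app import extract_relevant

md = extract_relevant(
    "https://example.com/article",  # hypothetical URL, replace with a real page
    verbosity="Brief",              # "Brief" preset caps the body at 1200 chars, then shows an ~800-char excerpt
    include_metadata=True,
    include_text=True,
    include_links=True,
    max_chars=2000,                 # the smaller of this and the preset cap is applied
    max_links=5,
)
print(md)  # compact Markdown: title, metadata bullets, trimmed body text, link list
```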