Nymbo committed
Commit 60bdd74 · verified · 1 Parent(s): ac9f3b0

Update app.py

Files changed (1)
app.py +89 -351
app.py CHANGED
@@ -1,364 +1,102 @@
-# File: main/app.py
-# Purpose: Fetch only the "relevant" page content (title, key metadata, clean body text, and hyperlinks)
-# instead of returning full HTML. Output is compact and configurable to reduce verbosity.
-
-import gradio as gr  # UI library
-import requests  # HTTP client
-from bs4 import BeautifulSoup  # HTML parsing
-from readability import Document  # Readability algorithm to isolate main content
-from urllib.parse import urljoin, urldefrag, urlparse  # URL helpers
-import re  # For whitespace cleanup and simple formatting
-
-
-# -------------------------------
-# HTTP fetching with sane defaults
-# -------------------------------
-def _http_get(url: str) -> requests.Response:
-    """
-    Make an HTTP GET request with headers and a timeout.
-    Layman's terms: downloads the webpage safely and politely.
-    """
-    headers = {
-        "User-Agent": "Mozilla/5.0 (compatible; NymboFetcher/1.0; +https://example.com)",
-        "Accept-Language": "en-US,en;q=0.9",
-        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
-    }
-    # Short timeouts so the app isn't stuck forever
-    return requests.get(url, headers=headers, timeout=15)
-
-
-# ----------------------------------------
-# Helpers: text cleanup & friendly trimming
-# ----------------------------------------
-def _normalize_whitespace(text: str) -> str:
-    """
-    Layman's terms: squash weird spacing and too many blank lines.
-    """
-    text = re.sub(r"[ \t\u00A0]+", " ", text)  # collapse runs of spaces
-    text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text.strip())  # max 1 blank line at a time
-    return text.strip()
-
-
-def _truncate(text: str, max_chars: int) -> tuple[str, bool]:
-    """
-    Layman's terms: cut the text if it’s too long and tell the caller if we cut it.
-    """
-    if max_chars is None or max_chars <= 0 or len(text) <= max_chars:
-        return text, False
-    return text[:max_chars].rstrip() + " …", True
-
-
-def _domain_of(url: str) -> str:
-    """
-    Layman's terms: show a friendly domain like example.com.
-    """
-    try:
-        return urlparse(url).netloc or ""
-    except Exception:
-        return ""
-

-# -----------------------------------
-# Metadata extraction (title, etc.)
-# -----------------------------------
-def _extract_metadata(soup: BeautifulSoup, final_url: str) -> dict:
-    """
-    Layman's terms: grab useful fields like title, description, site name, and canonical link.
-    """
-    meta = {}

-    # Title preference: <title> > og:title > twitter:title
-    title_candidates = [
-        (soup.title.string if soup.title and soup.title.string else None),
-        _og(soup, "og:title"),
-        _meta(soup, "twitter:title"),
     ]
-    meta["title"] = next((t.strip() for t in title_candidates if t and t.strip()), "")

-    # Description preference: meta[name=description] > og:description > twitter:description
-    desc_candidates = [
-        _meta(soup, "description"),
-        _og(soup, "og:description"),
-        _meta(soup, "twitter:description"),
     ]
-    meta["description"] = next((d.strip() for d in desc_candidates if d and d.strip()), "")
-
-    # Canonical URL if provided (helps dedupe / standardize)
-    link_canonical = soup.find("link", rel=lambda v: v and "canonical" in v)
-    meta["canonical"] = (link_canonical.get("href") or "").strip() if link_canonical else ""
-
-    # Site name (nice for context)
-    meta["site_name"] = (_og(soup, "og:site_name") or "").strip()
-
-    # Language (if present)
-    html_tag = soup.find("html")
-    meta["lang"] = (html_tag.get("lang") or "").strip() if html_tag else ""
-
-    # Final resolved URL and domain
-    meta["fetched_url"] = final_url
-    meta["domain"] = _domain_of(final_url)
-
-    return meta
-
-
-def _meta(soup: BeautifulSoup, name: str) -> str | None:
-    tag = soup.find("meta", attrs={"name": name})
-    return tag.get("content") if tag and tag.has_attr("content") else None


-def _og(soup: BeautifulSoup, prop: str) -> str | None:
-    tag = soup.find("meta", attrs={"property": prop})
-    return tag.get("content") if tag and tag.has_attr("content") else None
-
-
-# ---------------------------------------------------------
-# Main content extraction with Readability + gentle cleanup
-# ---------------------------------------------------------
-def _extract_main_text(html: str) -> tuple[str, BeautifulSoup]:
-    """
-    Layman's terms: use Readability to find the article body, then clean it to plain text.
-    Returns (clean_text, soup_of_readable_html) for link scraping.
     """
-    # Readability gives us a simplified article HTML
-    doc = Document(html)
-    readable_html = doc.summary(html_partial=True)
-
-    # Parse the simplified HTML so we can clean it up further
-    s = BeautifulSoup(readable_html, "lxml")
-
-    # Remove obviously noisy elements if present
-    for sel in ["script", "style", "noscript", "iframe", "svg"]:
-        for tag in s.select(sel):
-            tag.decompose()
-
-    # Extract text with paragraphs preserved, then normalize whitespace
-    text_parts = []
-    for p in s.find_all(["p", "li", "h2", "h3", "h4", "blockquote"]):
-        # Keep list items and headers to retain structure without being too verbose
-        chunk = p.get_text(" ", strip=True)
-        if chunk:
-            text_parts.append(chunk)
-
-    clean_text = _normalize_whitespace("\n\n".join(text_parts))
-    return clean_text, s
-
-
-# ------------------------------------------
-# Link extraction from the simplified content
-# ------------------------------------------
-def _extract_links(readable_soup: BeautifulSoup, base_url: str, max_links: int) -> list[tuple[str, str]]:
     """
-    Layman's terms: pull out clickable links from the article content only,
-    turn them into absolute URLs, drop junk, dedupe, and cap the list.
-    """
-    seen = set()
-    links: list[tuple[str, str]] = []
-
-    for a in readable_soup.find_all("a", href=True):
-        href = a.get("href").strip()
-        # Ignore anchors, mailto, javascript, and empty
-        if not href or href.startswith("#") or href.startswith("mailto:") or href.startswith("javascript:"):
-            continue
-
-        # Resolve relative URLs and strip URL fragments (#section)
-        absolute = urljoin(base_url, href)
-        absolute, _ = urldefrag(absolute)
-
-        if absolute in seen:
-            continue
-        seen.add(absolute)
-
-        text = a.get_text(" ", strip=True)
-        # Keep link text concise
-        if len(text) > 120:
-            text = text[:117] + "…"
-
-        links.append((text or absolute, absolute))
-
-        if len(links) >= max_links > 0:
-            break
-
-    return links
-
-
-# -------------------------
-# Formatter: compact output
-# -------------------------
-def _format_markdown(meta: dict, body: str, body_truncated: bool, links: list[tuple[str, str]],
-                     include_text: bool, include_metadata: bool, include_links: bool, verbosity: str) -> str:
-    """
-    Layman's terms: turn the pieces into a neat, compact Markdown string.
-    """
-    lines = []
-
-    # Title header
-    title = meta.get("title") or meta.get("domain") or "Untitled"
-    lines.append(f"# {title}")
-
-    # Metadata (compact)
-    if include_metadata:
-        md = []
-        # Only show fields that exist to keep things tight
-        if meta.get("description"):
-            md.append(f"- **Description:** {meta['description']}")
-        if meta.get("site_name"):
-            md.append(f"- **Site:** {meta['site_name']}")
-        if meta.get("canonical"):
-            md.append(f"- **Canonical:** {meta['canonical']}")
-        if meta.get("lang"):
-            md.append(f"- **Language:** {meta['lang']}")
-        if meta.get("fetched_url"):
-            md.append(f"- **Fetched From:** {meta['fetched_url']}")
-
-        if md:
-            lines.append("## Metadata")
-            lines.extend(md)
-
-    # Body text
-    if include_text and body:
-        # For "Brief", show a very small excerpt even after truncation
-        if verbosity == "Brief":
-            brief, was_more = _truncate(body, 800)
-            lines.append("## Text")
-            lines.append(brief)
-            if was_more or body_truncated:
-                lines.append("\n> (Trimmed for brevity)")
-        else:
-            lines.append("## Text")
-            lines.append(body)
-            if body_truncated:
-                lines.append("\n> (Trimmed for brevity)")
-
-    # Links
-    if include_links and links:
-        lines.append(f"## Links ({len(links)})")
-        for text, url in links:
-            lines.append(f"- [{text}]({url})")
-
-    return "\n\n".join(lines).strip()
-
-
-# --------------------------------
-# Gradio-facing function (the app)
-# --------------------------------
-def extract_relevant(
-    url: str,
-    verbosity: str = "Standard",
-    include_metadata: bool = True,
-    include_text: bool = True,
-    include_links: bool = True,
-    max_chars: int = 3000,
-    max_links: int = 20
-) -> str:
-    """
-    Layman's terms: the main button action.
-    Given a URL, fetch the page, extract just the good stuff, and return a compact Markdown summary.
-    """
-    if not url or not url.strip():
-        return "Please enter a valid URL."
-
     try:
-        resp = _http_get(url)
-        resp.raise_for_status()
-    except requests.exceptions.RequestException as e:
-        return f"An error occurred: {e}"
-
-    # Respect the final resolved URL (after redirects)
-    final_url = str(resp.url)
-
-    # Only process HTML-ish responses
-    ctype = resp.headers.get("Content-Type", "")
-    if "html" not in ctype.lower():
-        return f"Unsupported content type for extraction: {ctype or 'unknown'}"
-
-    # Decode as text (requests usually sets encoding; otherwise guess)
-    resp.encoding = resp.encoding or resp.apparent_encoding
-    html = resp.text
-
-    # Full page soup (to extract metadata accurately)
-    full_soup = BeautifulSoup(html, "lxml")
-    meta = _extract_metadata(full_soup, final_url)
-
-    # Extract main body text using Readability
-    body_text, readable_soup = _extract_main_text(html)
-
-    # If the body is suspiciously empty, fall back to a simpler text strategy
-    if not body_text:
-        fallback_text = full_soup.get_text(" ", strip=True)
-        body_text = _normalize_whitespace(fallback_text)
-
-    # Enforce verbosity presets unless user overrides via slider
-    preset_caps = {"Brief": 1200, "Standard": 3000, "Full": 999999}
-    target_cap = preset_caps.get(verbosity, 3000)
-    # Use the *smaller* of user cap and preset to keep things tidy
-    cap = min(max_chars if max_chars > 0 else target_cap, target_cap)
-    body_text, truncated = _truncate(body_text, cap) if include_text else ("", False)
-
-    # Extract links from the readable portion only (cleaner than whole DOM)
-    links = _extract_links(readable_soup, final_url, max_links=max_links if include_links else 0)
-
-    # Build compact Markdown
-    md = _format_markdown(
-        meta=meta,
-        body=body_text,
-        body_truncated=truncated,
-        links=links,
-        include_text=include_text,
-        include_metadata=include_metadata,
-        include_links=include_links,
-        verbosity=verbosity
-    )
-    return md or "No content could be extracted."
-
-
-# -----------------
-# Gradio UI (Blocks)
-# -----------------
-with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
-    # Title & subtitle for clarity
-    gr.Markdown("# Fetch MCP — Clean Extract")
-    gr.Markdown(
-        "Extract **title**, **metadata**, **clean text**, and **links** — without the noisy HTML. "
-        "Use Verbosity and caps to keep it tight."
-    )
-
-    with gr.Row():
-        url_in = gr.Textbox(label="URL", placeholder="https://example.com/article")
-        fetch_btn = gr.Button("Fetch Clean Content")
-
-    with gr.Accordion("Options", open=False):
-        with gr.Row():
-            verbosity = gr.Dropdown(
-                label="Verbosity",
-                choices=["Brief", "Standard", "Full"],
-                value="Standard",
-                info="Controls how much text you get back."
-            )
-            max_chars = gr.Slider(
-                400, 12000, value=3000, step=100,
-                label="Max Characters (body text)",
-                info="Hard cap for body text. Lower = less verbose."
-            )
-            max_links = gr.Slider(
-                0, 100, value=20, step=1,
-                label="Max Links",
-                info="Limit how many hyperlinks we include."
-            )
-        with gr.Row():
-            include_metadata = gr.Checkbox(value=True, label="Include Metadata")
-            include_text = gr.Checkbox(value=True, label="Include Main Text")
-            include_links = gr.Checkbox(value=True, label="Include Links")
-
-    # Output as Markdown (compact and readable)
-    out = gr.Markdown(label="Result")
-
-    # Wire up the click
-    fetch_btn.click(
-        fn=extract_relevant,
-        inputs=[url_in, verbosity, include_metadata, include_text, include_links, max_chars, max_links],
-        outputs=out
-    )

-# Keep MCP server enabled
 if __name__ == "__main__":
     demo.launch(mcp_server=True)
 
+# app.py
+# Hugging Face Space: Cleaner web-page fetcher
+# -------------------------------------------------------------
+# Fetches a URL and returns a concise, human-readable snapshot:
+#   • Title
+#   • Meta description
+#   • Main text (readability-extracted)
+#   • Hyperlinks (anchor text → absolute URL)
+# -------------------------------------------------------------
+
+import requests                   # HTTP client
+from bs4 import BeautifulSoup     # HTML parsing
+from readability import Document  # Boiler-plate removal
+from urllib.parse import urljoin  # Build absolute link URLs
+import gradio as gr               # UI framework
+
+def extract_relevant_text(html: str, base_url: str) -> str:
+    """
+    Convert raw HTML into a clean, plain-text summary.
+    - html:     the page's HTML source
+    - base_url: needed for resolving relative <a href="">
+    Returns a formatted string ready for display.
+    """
+    # 1) Let readability isolate the primary article/content
+    doc = Document(html)
+    title = doc.short_title()

+    summary_html = doc.summary()  # cleaned, minimal HTML
+    summary_soup = BeautifulSoup(summary_html, "lxml")

+    # 2) Grab visible paragraph & list text
+    body_parts = [
+        tag.get_text(" ", strip=True)
+        for tag in summary_soup.find_all(["p", "li"])
+        if tag.get_text(strip=True)
     ]
+    main_text = "\n\n".join(body_parts) or "[No main text extracted]"

+    # 3) Extract meta description from the *full* document
+    full_soup = BeautifulSoup(html, "lxml")
+    meta_desc = ""
+    meta_tag = full_soup.find("meta", attrs={"name": "description"})
+    if meta_tag and meta_tag.get("content"):
+        meta_desc = meta_tag["content"].strip()
+    else:  # Fallback to Open Graph description
+        og_tag = full_soup.find("meta", attrs={"property": "og:description"})
+        if og_tag and og_tag.get("content"):
+            meta_desc = og_tag["content"].strip()
+
+    # 4) Build a neat list of hyperlinks (anchor text → absolute URL)
+    links = []
+    for a in summary_soup.find_all("a", href=True):
+        href_abs = urljoin(base_url, a["href"])
+        text = a.get_text(" ", strip=True) or "[link]"
+        links.append(f"• {text} → {href_abs}")
+
+    # 5) Compose the final plaintext output
+    sections = [
+        f"Title: {title}",
+        f"Description: {meta_desc or '[None]'}",
+        f"Body:\n{main_text}",
+        "Links:\n" + ("\n".join(links) if links else "[No links]")
     ]
+    return "\n\n".join(sections)


+def fetch_content(url: str) -> str:
     """
+    Fetch the URL and return a concise summary.
+    Includes basic error handling for network issues.
     """
     try:
+        # Friendly user-agent prevents some 403s
+        headers = {"User-Agent": "Mozilla/5.0 (compatible; CleanFetcher/1.0)"}
+        response = requests.get(url, headers=headers, timeout=15)
+        response.raise_for_status()  # 4xx/5xx exception
+
+        return extract_relevant_text(response.text, url)
+
+    except requests.exceptions.RequestException as err:
+        # Any network or HTTP error bubbles up here
+        return f"[Error] {err}"
+
+
+# -------------------------- Gradio UI --------------------------
+demo = gr.Interface(
+    fn=fetch_content,
+    inputs=gr.Textbox(label="URL", placeholder="https://example.com"),
+    outputs=gr.Textbox(
+        label="Clean Page Snapshot",
+        interactive=False,
+        lines=25,  # taller box for readability
+    ),
+    title="Clean Web Snapshot",
+    description="Enter a URL to retrieve a tidy text summary (title, description, main content, and links).",
+    allow_flagging="never",
+    theme="Nymbo/Nymbo_Theme",
+)

 if __name__ == "__main__":
+    # Expose as an MCP server so you can chain it with other Spaces
     demo.launch(mcp_server=True)
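
For a quick local check of the new fetch_content helper, a minimal sketch such as the one below should work. It is an illustration rather than part of the commit: the script name check_fetch.py and the target URL are assumptions, and it presumes app.py from this commit is importable alongside the requests, beautifulsoup4, readability-lxml, lxml, and gradio packages.

# check_fetch.py (hypothetical helper script, not part of this commit)
# Calls the fetch_content() function defined in the new app.py above
# and prints the plain-text snapshot (Title, Description, Body, Links).
from app import fetch_content

if __name__ == "__main__":
    # example.com is only a placeholder target URL
    print(fetch_content("https://example.com"))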