Nymbo committed

Commit b708e4c · verified · 1 Parent(s): 60bdd74

Update app.py

Files changed (1)
  1. app.py +351 -89
app.py CHANGED
@@ -1,102 +1,364 @@
- # app.py
- # Hugging Face Space: Cleaner web-page fetcher
- # -------------------------------------------------------------
- # Fetches a URL and returns a concise, human-readable snapshot:
- #   Title
- #   Meta description
- #   Main text (readability-extracted)
- #   Hyperlinks (anchor text → absolute URL)
- # -------------------------------------------------------------
-
- import requests                   # HTTP client
- from bs4 import BeautifulSoup     # HTML parsing
- from readability import Document  # Boiler-plate removal
- from urllib.parse import urljoin  # Build absolute link URLs
- import gradio as gr               # UI framework
-
- def extract_relevant_text(html: str, base_url: str) -> str:
-     """
-     Convert raw HTML into a clean, plain-text summary.
-     - html: the page's HTML source
-     - base_url: needed for resolving relative <a href="">
-     Returns a formatted string ready for display.
-     """
-     # 1) Let readability isolate the primary article/content
-     doc = Document(html)
-     title = doc.short_title()
-
-     summary_html = doc.summary()  # cleaned, minimal HTML
-     summary_soup = BeautifulSoup(summary_html, "lxml")
-
-     # 2) Grab visible paragraph & list text
-     body_parts = [
-         tag.get_text(" ", strip=True)
-         for tag in summary_soup.find_all(["p", "li"])
-         if tag.get_text(strip=True)
      ]
-     main_text = "\n\n".join(body_parts) or "[No main text extracted]"

-     # 3) Extract meta description from the *full* document
-     full_soup = BeautifulSoup(html, "lxml")
-     meta_desc = ""
-     meta_tag = full_soup.find("meta", attrs={"name": "description"})
-     if meta_tag and meta_tag.get("content"):
-         meta_desc = meta_tag["content"].strip()
-     else:  # Fallback to Open Graph description
-         og_tag = full_soup.find("meta", attrs={"property": "og:description"})
-         if og_tag and og_tag.get("content"):
-             meta_desc = og_tag["content"].strip()
-
-     # 4) Build a neat list of hyperlinks (anchor text → absolute URL)
-     links = []
-     for a in summary_soup.find_all("a", href=True):
-         href_abs = urljoin(base_url, a["href"])
-         text = a.get_text(" ", strip=True) or "[link]"
-         links.append(f"• {text} → {href_abs}")
-
-     # 5) Compose the final plaintext output
-     sections = [
-         f"Title: {title}",
-         f"Description: {meta_desc or '[None]'}",
-         f"Body:\n{main_text}",
-         "Links:\n" + ("\n".join(links) if links else "[No links]")
      ]
-     return "\n\n".join(sections)


- def fetch_content(url: str) -> str:
      """
-     Fetch the URL and return a concise summary.
-     Includes basic error handling for network issues.
      """
      try:
-         # Friendly user-agent prevents some 403s
-         headers = {"User-Agent": "Mozilla/5.0 (compatible; CleanFetcher/1.0)"}
-         response = requests.get(url, headers=headers, timeout=15)
-         response.raise_for_status()  # 4xx/5xx exception
-
-         return extract_relevant_text(response.text, url)
-
-     except requests.exceptions.RequestException as err:
-         # Any network or HTTP error bubbles up here
-         return f"[Error] {err}"
-
-
- # -------------------------- Gradio UI --------------------------
- demo = gr.Interface(
-     fn=fetch_content,
-     inputs=gr.Textbox(label="URL", placeholder="https://example.com"),
-     outputs=gr.Textbox(
-         label="Clean Page Snapshot",
-         interactive=False,
-         lines=25,  # taller box for readability
-     ),
-     title="Clean Web Snapshot",
-     description="Enter a URL to retrieve a tidy text summary (title, description, main content, and links).",
-     allow_flagging="never",
-     theme="Nymbo/Nymbo_Theme",
- )

  if __name__ == "__main__":
-     # Expose as an MCP server so you can chain it with other Spaces
  demo.launch(mcp_server=True)
 
+ # File: main/app.py
+ # Purpose: Fetch only the "relevant" page content (title, key metadata, clean body text, and hyperlinks)
+ #          instead of returning full HTML. Output is compact and configurable to reduce verbosity.
+
+ import gradio as gr                    # UI library
+ import requests                        # HTTP client
+ from bs4 import BeautifulSoup          # HTML parsing
+ from readability import Document       # Readability algorithm to isolate main content
+ from urllib.parse import urljoin, urldefrag, urlparse  # URL helpers
+ import re                              # For whitespace cleanup and simple formatting
+
+
+ # -------------------------------
+ # HTTP fetching with sane defaults
+ # -------------------------------
+ def _http_get(url: str) -> requests.Response:
+     """
+     Make an HTTP GET request with headers and a timeout.
+     Layman's terms: downloads the webpage safely and politely.
+     """
+     headers = {
+         "User-Agent": "Mozilla/5.0 (compatible; NymboFetcher/1.0; +https://example.com)",
+         "Accept-Language": "en-US,en;q=0.9",
+         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+     }
+     # Short timeouts so the app isn't stuck forever
+     return requests.get(url, headers=headers, timeout=15)
+
+
+ # ----------------------------------------
+ # Helpers: text cleanup & friendly trimming
+ # ----------------------------------------
+ def _normalize_whitespace(text: str) -> str:
+     """
+     Layman's terms: squash weird spacing and too many blank lines.
+     """
+     text = re.sub(r"[ \t\u00A0]+", " ", text)              # collapse runs of spaces
+     text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text.strip())  # max 1 blank line at a time
+     return text.strip()
+
+
+ def _truncate(text: str, max_chars: int) -> tuple[str, bool]:
+     """
+     Layman's terms: cut the text if it’s too long and tell the caller if we cut it.
+     """
+     if max_chars is None or max_chars <= 0 or len(text) <= max_chars:
+         return text, False
+     return text[:max_chars].rstrip() + " …", True
+
+
+ def _domain_of(url: str) -> str:
+     """
+     Layman's terms: show a friendly domain like example.com.
+     """
+     try:
+         return urlparse(url).netloc or ""
+     except Exception:
+         return ""
+

+ # -----------------------------------
+ # Metadata extraction (title, etc.)
+ # -----------------------------------
+ def _extract_metadata(soup: BeautifulSoup, final_url: str) -> dict:
+     """
+     Layman's terms: grab useful fields like title, description, site name, and canonical link.
+     """
+     meta = {}

+     # Title preference: <title> > og:title > twitter:title
+     title_candidates = [
+         (soup.title.string if soup.title and soup.title.string else None),
+         _og(soup, "og:title"),
+         _meta(soup, "twitter:title"),
      ]
+     meta["title"] = next((t.strip() for t in title_candidates if t and t.strip()), "")

+     # Description preference: meta[name=description] > og:description > twitter:description
+     desc_candidates = [
+         _meta(soup, "description"),
+         _og(soup, "og:description"),
+         _meta(soup, "twitter:description"),
      ]
+     meta["description"] = next((d.strip() for d in desc_candidates if d and d.strip()), "")
+
+     # Canonical URL if provided (helps dedupe / standardize)
+     link_canonical = soup.find("link", rel=lambda v: v and "canonical" in v)
+     meta["canonical"] = (link_canonical.get("href") or "").strip() if link_canonical else ""
+
+     # Site name (nice for context)
+     meta["site_name"] = (_og(soup, "og:site_name") or "").strip()
+
+     # Language (if present)
+     html_tag = soup.find("html")
+     meta["lang"] = (html_tag.get("lang") or "").strip() if html_tag else ""
+
+     # Final resolved URL and domain
+     meta["fetched_url"] = final_url
+     meta["domain"] = _domain_of(final_url)
+
+     return meta
+
+
+ def _meta(soup: BeautifulSoup, name: str) -> str | None:
+     tag = soup.find("meta", attrs={"name": name})
+     return tag.get("content") if tag and tag.has_attr("content") else None


+ def _og(soup: BeautifulSoup, prop: str) -> str | None:
+     tag = soup.find("meta", attrs={"property": prop})
+     return tag.get("content") if tag and tag.has_attr("content") else None
+
+
+ # ---------------------------------------------------------
+ # Main content extraction with Readability + gentle cleanup
+ # ---------------------------------------------------------
+ def _extract_main_text(html: str) -> tuple[str, BeautifulSoup]:
+     """
+     Layman's terms: use Readability to find the article body, then clean it to plain text.
+     Returns (clean_text, soup_of_readable_html) for link scraping.
      """
+     # Readability gives us a simplified article HTML
+     doc = Document(html)
+     readable_html = doc.summary(html_partial=True)
+
+     # Parse the simplified HTML so we can clean it up further
+     s = BeautifulSoup(readable_html, "lxml")
+
+     # Remove obviously noisy elements if present
+     for sel in ["script", "style", "noscript", "iframe", "svg"]:
+         for tag in s.select(sel):
+             tag.decompose()
+
+     # Extract text with paragraphs preserved, then normalize whitespace
+     text_parts = []
+     for p in s.find_all(["p", "li", "h2", "h3", "h4", "blockquote"]):
+         # Keep list items and headers to retain structure without being too verbose
+         chunk = p.get_text(" ", strip=True)
+         if chunk:
+             text_parts.append(chunk)
+
+     clean_text = _normalize_whitespace("\n\n".join(text_parts))
+     return clean_text, s
+
+
+ # ------------------------------------------
+ # Link extraction from the simplified content
+ # ------------------------------------------
+ def _extract_links(readable_soup: BeautifulSoup, base_url: str, max_links: int) -> list[tuple[str, str]]:
      """
+     Layman's terms: pull out clickable links from the article content only,
+     turn them into absolute URLs, drop junk, dedupe, and cap the list.
+     """
+     seen = set()
+     links: list[tuple[str, str]] = []
+
+     for a in readable_soup.find_all("a", href=True):
+         href = a.get("href").strip()
+         # Ignore anchors, mailto, javascript, and empty
+         if not href or href.startswith("#") or href.startswith("mailto:") or href.startswith("javascript:"):
+             continue
+
+         # Resolve relative URLs and strip URL fragments (#section)
+         absolute = urljoin(base_url, href)
+         absolute, _ = urldefrag(absolute)
+
+         if absolute in seen:
+             continue
+         seen.add(absolute)
+
+         text = a.get_text(" ", strip=True)
+         # Keep link text concise
+         if len(text) > 120:
+             text = text[:117] + "…"
+
+         links.append((text or absolute, absolute))
+
+         if len(links) >= max_links > 0:
+             break
+
+     return links
+
+
+ # -------------------------
+ # Formatter: compact output
+ # -------------------------
+ def _format_markdown(meta: dict, body: str, body_truncated: bool, links: list[tuple[str, str]],
+                      include_text: bool, include_metadata: bool, include_links: bool, verbosity: str) -> str:
+     """
+     Layman's terms: turn the pieces into a neat, compact Markdown string.
+     """
+     lines = []
+
+     # Title header
+     title = meta.get("title") or meta.get("domain") or "Untitled"
+     lines.append(f"# {title}")
+
+     # Metadata (compact)
+     if include_metadata:
+         md = []
+         # Only show fields that exist to keep things tight
+         if meta.get("description"):
+             md.append(f"- **Description:** {meta['description']}")
+         if meta.get("site_name"):
+             md.append(f"- **Site:** {meta['site_name']}")
+         if meta.get("canonical"):
+             md.append(f"- **Canonical:** {meta['canonical']}")
+         if meta.get("lang"):
+             md.append(f"- **Language:** {meta['lang']}")
+         if meta.get("fetched_url"):
+             md.append(f"- **Fetched From:** {meta['fetched_url']}")
+
+         if md:
+             lines.append("## Metadata")
+             lines.extend(md)
+
+     # Body text
+     if include_text and body:
+         # For "Brief", show a very small excerpt even after truncation
+         if verbosity == "Brief":
+             brief, was_more = _truncate(body, 800)
+             lines.append("## Text")
+             lines.append(brief)
+             if was_more or body_truncated:
+                 lines.append("\n> (Trimmed for brevity)")
+         else:
+             lines.append("## Text")
+             lines.append(body)
+             if body_truncated:
+                 lines.append("\n> (Trimmed for brevity)")
+
+     # Links
+     if include_links and links:
+         lines.append(f"## Links ({len(links)})")
+         for text, url in links:
+             lines.append(f"- [{text}]({url})")
+
+     return "\n\n".join(lines).strip()
+
+
+ # --------------------------------
+ # Gradio-facing function (the app)
+ # --------------------------------
+ def extract_relevant(
+     url: str,
+     verbosity: str = "Standard",
+     include_metadata: bool = True,
+     include_text: bool = True,
+     include_links: bool = True,
+     max_chars: int = 3000,
+     max_links: int = 20
+ ) -> str:
+     """
+     Layman's terms: the main button action.
+     Given a URL, fetch the page, extract just the good stuff, and return a compact Markdown summary.
+     """
+     if not url or not url.strip():
+         return "Please enter a valid URL."
+
      try:
+         resp = _http_get(url)
+         resp.raise_for_status()
+     except requests.exceptions.RequestException as e:
+         return f"An error occurred: {e}"
+
+     # Respect the final resolved URL (after redirects)
+     final_url = str(resp.url)
+
+     # Only process HTML-ish responses
+     ctype = resp.headers.get("Content-Type", "")
+     if "html" not in ctype.lower():
+         return f"Unsupported content type for extraction: {ctype or 'unknown'}"
+
+     # Decode as text (requests usually sets encoding; otherwise guess)
+     resp.encoding = resp.encoding or resp.apparent_encoding
+     html = resp.text
+
+     # Full page soup (to extract metadata accurately)
+     full_soup = BeautifulSoup(html, "lxml")
+     meta = _extract_metadata(full_soup, final_url)
+
+     # Extract main body text using Readability
+     body_text, readable_soup = _extract_main_text(html)
+
+     # If the body is suspiciously empty, fall back to a simpler text strategy
+     if not body_text:
+         fallback_text = full_soup.get_text(" ", strip=True)
+         body_text = _normalize_whitespace(fallback_text)
+
+     # Enforce verbosity presets unless user overrides via slider
+     preset_caps = {"Brief": 1200, "Standard": 3000, "Full": 999999}
+     target_cap = preset_caps.get(verbosity, 3000)
+     # Use the *smaller* of user cap and preset to keep things tidy
+     cap = min(max_chars if max_chars > 0 else target_cap, target_cap)
+     body_text, truncated = _truncate(body_text, cap) if include_text else ("", False)
+
+     # Extract links from the readable portion only (cleaner than whole DOM)
+     links = _extract_links(readable_soup, final_url, max_links=max_links if include_links else 0)
+
+     # Build compact Markdown
+     md = _format_markdown(
+         meta=meta,
+         body=body_text,
+         body_truncated=truncated,
+         links=links,
+         include_text=include_text,
+         include_metadata=include_metadata,
+         include_links=include_links,
+         verbosity=verbosity
+     )
+     return md or "No content could be extracted."
+
+
+ # -----------------
+ # Gradio UI (Blocks)
+ # -----------------
+ with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
+     # Title & subtitle for clarity
+     gr.Markdown("# Fetch MCP — Clean Extract")
+     gr.Markdown(
+         "Extract **title**, **metadata**, **clean text**, and **links** — without the noisy HTML. "
+         "Use Verbosity and caps to keep it tight."
+     )
+
+     with gr.Row():
+         url_in = gr.Textbox(label="URL", placeholder="https://example.com/article")
+         fetch_btn = gr.Button("Fetch Clean Content")
+
+     with gr.Accordion("Options", open=False):
+         with gr.Row():
+             verbosity = gr.Dropdown(
+                 label="Verbosity",
+                 choices=["Brief", "Standard", "Full"],
+                 value="Standard",
+                 info="Controls how much text you get back."
+             )
+             max_chars = gr.Slider(
+                 400, 12000, value=3000, step=100,
+                 label="Max Characters (body text)",
+                 info="Hard cap for body text. Lower = less verbose."
+             )
+             max_links = gr.Slider(
+                 0, 100, value=20, step=1,
+                 label="Max Links",
+                 info="Limit how many hyperlinks we include."
+             )
+         with gr.Row():
+             include_metadata = gr.Checkbox(value=True, label="Include Metadata")
+             include_text = gr.Checkbox(value=True, label="Include Main Text")
+             include_links = gr.Checkbox(value=True, label="Include Links")
+
+     # Output as Markdown (compact and readable)
+     out = gr.Markdown(label="Result")
+
+     # Wire up the click
+     fetch_btn.click(
+         fn=extract_relevant,
+         inputs=[url_in, verbosity, include_metadata, include_text, include_links, max_chars, max_links],
+         outputs=out
+     )

+ # Keep MCP server enabled
  if __name__ == "__main__":
  demo.launch(mcp_server=True)
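
For reference, here is a minimal, untested sketch of calling the new `extract_relevant` entry point directly, outside the Gradio UI. It assumes app.py's dependencies (requests, beautifulsoup4, readability-lxml, lxml, gradio) are installed, that the snippet sits next to app.py, and that network access is available; importing `app` builds the Blocks UI but does not launch it because of the `__main__` guard. The URL and argument values are illustrative only.

# Hypothetical smoke test (not part of this commit).
from app import extract_relevant

# Ask for a brief, link-capped summary of a single page.
summary_md = extract_relevant(
    "https://example.com",
    verbosity="Brief",          # "Brief" | "Standard" | "Full"
    include_metadata=True,
    include_text=True,
    include_links=True,
    max_chars=1200,             # body-text cap; the Brief preset also caps at 1200
    max_links=5,
)
print(summary_md)               # compact Markdown: title, metadata bullets, text, links

When a cap is hit, `_truncate` appends " …" and `_format_markdown` adds a "> (Trimmed for brevity)" note, so callers should not treat the returned body as the complete page text.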