Nymbo committed on
Commit
32db98e
·
verified ·
1 Parent(s): 598ab39

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +128 -39
app.py CHANGED
@@ -1,46 +1,135 @@
1
- import gradio as gr
2
- import requests
3
- from bs4 import BeautifulSoup
 
4
 
5
- def fetch_content(url):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  """
7
- This function takes a URL as input, fetches its HTML content,
8
- extracts only the relevant text content, and returns it as a clean string.
9
- It includes error handling for common request issues.
10
  """
11
  try:
12
- response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
13
- response.raise_for_status() # Raises an HTTPError for bad responses (4xx or 5xx)
14
-
15
- # Parse the HTML content
16
- soup = BeautifulSoup(response.text, 'html.parser')
17
-
18
- # Remove script, style, and other non-content elements
19
- for element in soup(["script", "style", "header", "footer", "nav", "aside"]):
20
- element.extract()
21
-
22
- # Get the text content
23
- text = soup.get_text()
24
-
25
- # Clean up the text: remove extra whitespace, empty lines, etc.
26
- lines = (line.strip() for line in text.splitlines())
27
- chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
28
- text = '\n'.join(chunk for chunk in chunks if chunk)
29
-
30
- return text
31
  except requests.exceptions.RequestException as e:
32
- return f"An error occurred: {e}"
33
-
34
- # Define the Gradio interface
35
- demo = gr.Interface(
36
- fn=fetch_content,
37
- inputs=gr.Textbox(label="URL", placeholder="https://www.example.com"),
38
- outputs=gr.Textbox(label="Page Content"),
39
- title="Web Page Text Extractor",
40
- description="Enter a URL to extract and display only the text content of the web page.",
41
- allow_flagging="never",
42
- theme="Nymbo/Nymbo_Theme"
43
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
 
45
  if __name__ == "__main__":
46
- demo.launch(mcp_server=True)
 
1
+ # File: app.py
2
+ # Purpose: Provide a Gradio UI that fetches a URL and (by default) returns only the
3
+ # relevant human-readable text instead of the entire HTML.
4
+ # Includes robust error handling, timeouts, and fallbacks.
5
 
6
+ import gradio as gr # UI framework
7
+ import requests # makes the web request
8
+ from bs4 import BeautifulSoup # parses HTML so we can work with it
9
+ from readability import Document # distills a page down to its "main article" content
10
+ import html # unescapes HTML entities like &amp; → &
11
+ import re # simple cleanup with regex
12
+
13
+ # ---- helper: clean up text nicely -------------------------------------------
14
+ def _normalize_text(text: str) -> str:
15
+ """
16
+ Layman's terms: This tidies up the text we extracted so it looks nice.
17
+ - Converts & things back to normal characters
18
+ - Collapses too many blank lines
19
+ - Trims leading/trailing whitespace
20
+ """
21
+ text = html.unescape(text)
22
+ # Replace Windows/Mac line endings with Unix and normalize spaces
23
+ text = text.replace("\r\n", "\n").replace("\r", "\n")
24
+ # Collapse 3+ newlines down to 2
25
+ text = re.sub(r"\n{3,}", "\n\n", text)
26
+ return text.strip()
27
+
28
# ---- core fetcher: return main text or raw HTML ------------------------------
def fetch_page(url: str, extract_text: bool = True) -> str:
    """
    Download a web page and return its content as a string.

    Args:
        url: Address to fetch (e.g. "https://example.com/article").
        extract_text: When True (default), return only the main readable
            text of the page. When False, return the raw HTML unchanged
            (the original behavior of this app).

    Returns:
        The extracted text, the raw HTML, or a human-readable error string
        if the request or extraction fails. This function never raises.
    """
    try:
        # Browser-like User-Agent avoids some naive bot blocks; the timeout
        # keeps a dead server from hanging the UI forever.
        resp = requests.get(
            url,
            headers={"User-Agent": "Mozilla/5.0 (compatible; FetchMCP/1.0)"},
            timeout=15,
            allow_redirects=True,
        )
        resp.raise_for_status()  # surface 4xx/5xx as an exception
    except requests.exceptions.RequestException as e:
        # Any network/HTTP problem is reported as text rather than raised.
        return f"Request error: {e}"

    # If the user wants full HTML, behave like the original version.
    if not extract_text:
        return resp.text

    # Try readability first (usually best for articles/blog posts).
    try:
        # readability distills the page down to its "main article" HTML.
        doc = Document(resp.text)
        main_html = doc.summary(html_partial=True)

        # Parse the article-only HTML and keep just the visible text.
        soup = _make_soup(main_html)
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()

        main_text = _normalize_text(soup.get_text(separator="\n"))

        # If extraction produced almost nothing, fall back to the simpler path.
        if len(main_text.split()) < 40:
            raise ValueError("Readability extraction too short; falling back")

        return main_text

    except Exception:
        # Simpler fallback: strip tags from the whole page, minus obvious noise.
        try:
            soup = _make_soup(resp.text)

            # Remove common non-content elements.
            for tag in soup(["script", "style", "noscript", "header", "footer", "nav", "form", "aside"]):
                tag.decompose()

            # Prefer an explicit main-content container when one exists.
            candidate = soup.find("main") or soup.find("article") or soup.find("div", attrs={"role": "main"})
            text = candidate.get_text(separator="\n") if candidate else soup.get_text(separator="\n")

            return _normalize_text(text)

        except Exception as e:
            # Last resort: give raw HTML if even fallback parsing fails.
            return f"Extraction fallback failed: {e}\n\n--- Raw HTML below ---\n{resp.text}"


def _make_soup(markup: str):
    """Parse HTML with lxml when available, else the stdlib parser.

    Fix: the original hard-coded "lxml", so a missing lxml install made
    BeautifulSoup raise FeatureNotFound on every call — both extraction
    paths failed and the function always dumped raw HTML with an error.
    """
    try:
        return BeautifulSoup(markup, "lxml")
    except Exception:
        # bs4 raises FeatureNotFound when lxml is not installed; the built-in
        # html.parser is slower but always present.
        return BeautifulSoup(markup, "html.parser")
95
+
96
# ---- Gradio UI ---------------------------------------------------------------
# The app window: paste a URL, choose readable-text vs. full HTML, click Fetch.
with gr.Blocks(theme="Nymbo/Nymbo_Theme", title="Fetch MCP") as demo:
    gr.Markdown(
        """
        # Fetch MCP
        Small utility that fetches a web page and returns **just the readable text** by default
        *(toggle off to get the full HTML like before)*.
        """
    )

    with gr.Row():
        url_box = gr.Textbox(label="URL", placeholder="https://example.com/article", lines=1)
    with gr.Row():
        text_only = gr.Checkbox(value=True, label="Extract only the main readable text (recommended)")

    run_button = gr.Button("Fetch", variant="primary")

    # Plain-text output so it’s easy to copy or pipe into other tools.
    result_box = gr.Textbox(
        label="Output",
        lines=20,
        interactive=False,
        placeholder="Fetched content will appear here…",
    )

    # Clicking the button runs the fetcher with the URL and toggle state.
    run_button.click(fn=fetch_page, inputs=[url_box, text_only], outputs=result_box)
132
 
133
# Script entry point: launch the Gradio app.
# mcp_server=True additionally exposes the app's functions as MCP tools,
# so fetch_page can be called programmatically by MCP clients.
if __name__ == "__main__":
    demo.launch(mcp_server=True)