Nymbo committed on
Commit 8e5c5da · verified · 1 Parent(s): 65add20

Update app.py

Files changed (1)
  1. app.py +104 -128
app.py CHANGED
@@ -1,135 +1,111 @@
- # File: app.py
- # Purpose: Provide a Gradio UI that fetches a URL and (by default) returns only the
- # relevant human-readable text instead of the entire HTML.
- # Includes robust error handling, timeouts, and fallbacks.
-
- import gradio as gr  # UI framework
- import requests  # makes the web request
- from bs4 import BeautifulSoup  # parses HTML so we can work with it
- from readability import Document  # distills a page down to its "main article" content
- import html  # unescapes HTML entities like &amp; → &
- import re  # simple cleanup with regex
-
- # ---- helper: clean up text nicely -------------------------------------------
- def _normalize_text(text: str) -> str:
-     """
-     Layman's terms: This tidies up the text we extracted so it looks nice.
-     - Converts &amp;-style entities back to normal characters
-     - Collapses too many blank lines
-     - Trims leading/trailing whitespace
-     """
-     text = html.unescape(text)
-     # Replace Windows/Mac line endings with Unix and normalize spaces
-     text = text.replace("\r\n", "\n").replace("\r", "\n")
-     # Collapse 3+ newlines down to 2
-     text = re.sub(r"\n{3,}", "\n\n", text)
-     return text.strip()
-
- # ---- core fetcher: return main text or raw HTML ------------------------------
- def fetch_page(url: str, extract_text: bool = True) -> str:
      """
-     Layman's terms: We download the web page. If 'extract_text' is True,
-     we try to grab only the main article/important text. Otherwise we
-     return the raw HTML (like your original app).
      """
      try:
-         # Make the request with a friendly browser-like header and a timeout
-         resp = requests.get(
-             url,
-             headers={"User-Agent": "Mozilla/5.0 (compatible; FetchMCP/1.0)"},
-             timeout=15,
-             allow_redirects=True,
-         )
-         resp.raise_for_status()  # If site returns 4xx/5xx, this will raise an error

      except requests.exceptions.RequestException as e:
-         # Layman's terms: If anything goes wrong with the request, report it nicely.
-         return f"Request error: {e}"

-     # If the user wants full HTML, behave like the original version
-     if not extract_text:
-         return resp.text
-
-     # Try readability first (usually best for articles/blog posts)
-     try:
-         # readability extracts the "main" content and returns HTML of just that part
-         doc = Document(resp.text)
-         main_html = doc.summary(html_partial=True)
-
-         # Parse the article-only HTML and get just the visible text
-         soup = BeautifulSoup(main_html, "lxml")
-         # Remove script/style just in case
-         for tag in soup(["script", "style", "noscript"]):
-             tag.decompose()
-
-         main_text = soup.get_text(separator="\n")
-         main_text = _normalize_text(main_text)
-
-         # Fallback: if extraction produced nearly nothing, try a simpler approach
-         if len(main_text.split()) < 40:
-             raise ValueError("Readability extraction too short; falling back")
-
-         return main_text
-
-     except Exception:
-         # Simpler fallback: strip tags from the whole page but ignore obviously noisy areas
-         try:
-             soup = BeautifulSoup(resp.text, "lxml")
-
-             # Remove common noise: scripts, styles, nav, footer, header, forms
-             for tag in soup(["script", "style", "noscript", "header", "footer", "nav", "form", "aside"]):
-                 tag.decompose()
-
-             # If there's a <main> or an article-like block, prefer that
-             candidate = soup.find("main") or soup.find("article") or soup.find("div", attrs={"role": "main"})
-             if candidate:
-                 text = candidate.get_text(separator="\n")
-             else:
-                 text = soup.get_text(separator="\n")
-
-             return _normalize_text(text)
-
-         except Exception as e:
-             # Last resort: give raw HTML if even fallback parsing fails
-             return f"Extraction fallback failed: {e}\n\n--- Raw HTML below ---\n{resp.text}"
-
- # ---- Gradio UI ---------------------------------------------------------------
- # Layman's terms: This is the app window. You paste a URL and choose whether to
- # extract readable text or keep full HTML. Then click "Fetch".
- with gr.Blocks(theme="Nymbo/Nymbo_Theme", title="Fetch MCP") as demo:
-     gr.Markdown(
-         """
-         # Fetch MCP
-         Small utility that fetches a web page and returns **just the readable text** by default
-         *(toggle off to get the full HTML like before)*.
-         """
-     )
-
-     with gr.Row():
-         url_input = gr.Textbox(
-             label="URL",
-             placeholder="https://example.com/article",
-             lines=1,
-         )
-     with gr.Row():
-         extract_toggle = gr.Checkbox(
-             value=True,
-             label="Extract only the main readable text (recommended)",
-         )
-
-     fetch_btn = gr.Button("Fetch", variant="primary")
-
-     # Output as plain text so it’s easy to copy or pipe into other tools
-     output = gr.Textbox(
-         label="Output",
-         lines=20,
-         interactive=False,
-         placeholder="Fetched content will appear here…",
-     )
-
-     # Wire the button to our function
-     fetch_btn.click(fn=fetch_page, inputs=[url_input, extract_toggle], outputs=output)
-
- # Run as normal, keeping MCP server enabled
  if __name__ == "__main__":
-     demo.launch(mcp_server=True)
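For context, the removed fetch_page exposed a single extract_text toggle, as its docstring describes. A minimal usage sketch (not part of this commit; it assumes the old module is importable and the target URL is reachable):

    text_only = fetch_page("https://example.com/article", extract_text=True)   # main readable text via readability + BeautifulSoup
    raw_html = fetch_page("https://example.com/article", extract_text=False)   # full HTML, like the original app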
 
+ import gradio as gr
+ import requests
+ from bs4 import BeautifulSoup
+ import urllib.parse
+
+ def fetch_and_parse_hn(url):
  """
8
+ This function takes a Hacker News URL, fetches its content, parses it,
9
+ and returns a formatted Markdown string with titles, metadata, and hyperlinks.
 
10
  """
11
+ if not url.strip():
12
+ return "Please enter a URL."
13
+
14
  try:
15
+ headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
16
+ response = requests.get(url, headers=headers)
17
+ response.raise_for_status() # Raises an HTTPError for bad responses
18
+
19
+ soup = BeautifulSoup(response.text, 'html.parser')
20
+
21
+ # Extract page title
22
+ page_title = soup.title.string if soup.title else "Hacker News"
23
+ output_md = [f"# {page_title}\n"]
24
+
25
+ # HN stories are in 'tr' tags with class 'athing'
26
+ story_rows = soup.find_all('tr', class_='athing')
27
+
28
+ if not story_rows:
29
+ return "Could not find any stories on this page. Please make sure it's a valid Hacker News URL."
30
+
31
+ for story_row in story_rows:
32
+ # --- Story Details (title, link, rank) ---
33
+ title_span = story_row.find('span', class_='titleline')
34
+ if not title_span:
35
+ continue
36
+
37
+ rank_span = story_row.find('span', class_='rank')
38
+ rank = rank_span.text.strip() if rank_span else ""
39
+
40
+ link_tag = title_span.find('a')
41
+ title = link_tag.text if link_tag else "No Title"
42
+ article_url = link_tag.get('href', '#')
43
+
44
+ # Handle relative URLs for internal posts (e.g., "Ask HN:")
45
+ if not article_url.startswith('http'):
46
+ article_url = urllib.parse.urljoin(url, article_url)
47
+
48
+ site_span = title_span.find('span', class_='sitebit')
49
+ site = f"({site_span.text})" if site_span else ""
50
+
51
+ # --- Metadata (points, user, comments) ---
52
+ # Metadata is in the next 'tr' sibling
53
+ metadata_row = story_row.find_next_sibling('tr')
54
+ if not metadata_row:
55
+ output_md.append(f"{rank} **[{title}]({article_url})** {site}\n")
56
+ continue
57
+
58
+ subtext = metadata_row.find('td', class_='subtext')
59
+ if not subtext:
60
+ output_md.append(f"{rank} **[{title}]({article_url})** {site}\n")
61
+ continue
62
+
63
+ score = subtext.find('span', class_='score')
64
+ user = subtext.find('a', class_='hnuser')
65
+
66
+ # The comments link is usually the last link in the subtext
67
+ comments_link = subtext.find_all('a')[-1]
68
+
69
+ # Build metadata string
70
+ meta_parts = []
71
+ if score:
72
+ meta_parts.append(score.text)
73
+ if user:
74
+ meta_parts.append(f"by {user.text}")
75
+ if comments_link and 'item?id=' in comments_link.get('href', ''):
76
+ comments_text = comments_link.text.replace('\xa0', ' ') # Handle non-breaking space
77
+ comments_url = urllib.parse.urljoin(url, comments_link['href'])
78
+ meta_parts.append(f"[{comments_text}]({comments_url})")
79
+
80
+ metadata_str = " | ".join(meta_parts)
81
+
82
+ # Assemble the final markdown for the item
83
+ output_md.append(f"{rank} **[{title}]({article_url})** {site}")
84
+ if metadata_str:
85
+ output_md.append(f" - *{metadata_str}*\n")
86
+
87
+ return "\n".join(output_md)
88
 
89
  except requests.exceptions.RequestException as e:
90
+ return f"An error occurred: {e}"
91
+ except Exception as e:
92
+ return f"An unexpected error occurred during parsing: {e}"
93
+
94
+ # Define the Gradio interface
95
+ demo = gr.Interface(
96
+ fn=fetch_and_parse_hn,
97
+ inputs=gr.Textbox(
98
+ label="Hacker News URL",
99
+ placeholder="e.g., https://news.ycombinator.com",
100
+ value="https://news.ycombinator.com"
101
+ ),
102
+ outputs=gr.Markdown(label="Hacker News Digest"),
103
+ title="Hacker News Digest Fetcher",
104
+ description="Enter a Hacker News URL (like the front page, 'new', or 'ask') to get a clean, readable digest. You can click on the story titles to go to the articles and on the comment links to see the discussions.",
105
+ allow_flagging="never",
106
+ theme="Nymbo/Nymbo_Theme",
107
+ examples=[["https://news.ycombinator.com"], ["https://news.ycombinator.com/news?p=2"], ["https://news.ycombinator.com/ask"]]
108
+ )

  if __name__ == "__main__":
+     demo.launch()
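The new fetch_and_parse_hn can also be exercised outside the Gradio UI. A minimal sketch (not part of this commit; it assumes app.py is importable, gradio/requests/beautifulsoup4 are installed, and news.ycombinator.com is reachable):

    from app import fetch_and_parse_hn

    digest = fetch_and_parse_hn("https://news.ycombinator.com")
    print(digest)  # "# Hacker News" heading, then one rank-prefixed Markdown line per story with its metadata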