Nymbo committed · Commit f890eb5 · verified · 1 Parent(s): 8e5c5da

Update app.py

Files changed (1):
  1. app.py +399 -103
app.py CHANGED
@@ -1,111 +1,407 @@
- import gradio as gr
- import requests
- from bs4 import BeautifulSoup
- import urllib.parse

- def fetch_and_parse_hn(url):
      """
-     This function takes a Hacker News URL, fetches its content, parses it,
-     and returns a formatted Markdown string with titles, metadata, and hyperlinks.
      """
-     if not url.strip():
-         return "Please enter a URL."
-
      try:
-         headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
-         response = requests.get(url, headers=headers)
-         response.raise_for_status() # Raises an HTTPError for bad responses
-
-         soup = BeautifulSoup(response.text, 'html.parser')
-
-         # Extract page title
-         page_title = soup.title.string if soup.title else "Hacker News"
-         output_md = [f"# {page_title}\n"]
-
-         # HN stories are in 'tr' tags with class 'athing'
-         story_rows = soup.find_all('tr', class_='athing')
-
-         if not story_rows:
-             return "Could not find any stories on this page. Please make sure it's a valid Hacker News URL."
-
-         for story_row in story_rows:
-             # --- Story Details (title, link, rank) ---
-             title_span = story_row.find('span', class_='titleline')
-             if not title_span:
-                 continue
-
-             rank_span = story_row.find('span', class_='rank')
-             rank = rank_span.text.strip() if rank_span else ""
-
-             link_tag = title_span.find('a')
-             title = link_tag.text if link_tag else "No Title"
-             article_url = link_tag.get('href', '#')
-
-             # Handle relative URLs for internal posts (e.g., "Ask HN:")
-             if not article_url.startswith('http'):
-                 article_url = urllib.parse.urljoin(url, article_url)
-
-             site_span = title_span.find('span', class_='sitebit')
-             site = f"({site_span.text})" if site_span else ""
-
-             # --- Metadata (points, user, comments) ---
-             # Metadata is in the next 'tr' sibling
-             metadata_row = story_row.find_next_sibling('tr')
-             if not metadata_row:
-                 output_md.append(f"{rank} **[{title}]({article_url})** {site}\n")
-                 continue
-
-             subtext = metadata_row.find('td', class_='subtext')
-             if not subtext:
-                 output_md.append(f"{rank} **[{title}]({article_url})** {site}\n")
-                 continue
-
-             score = subtext.find('span', class_='score')
-             user = subtext.find('a', class_='hnuser')
-
-             # The comments link is usually the last link in the subtext
-             comments_link = subtext.find_all('a')[-1]
-
-             # Build metadata string
-             meta_parts = []
-             if score:
-                 meta_parts.append(score.text)
-             if user:
-                 meta_parts.append(f"by {user.text}")
-             if comments_link and 'item?id=' in comments_link.get('href', ''):
-                 comments_text = comments_link.text.replace('\xa0', ' ') # Handle non-breaking space
-                 comments_url = urllib.parse.urljoin(url, comments_link['href'])
-                 meta_parts.append(f"[{comments_text}]({comments_url})")
-
-             metadata_str = " | ".join(meta_parts)
-
-             # Assemble the final markdown for the item
-             output_md.append(f"{rank} **[{title}]({article_url})** {site}")
-             if metadata_str:
-                 output_md.append(f" - *{metadata_str}*\n")
-
-         return "\n".join(output_md)

      except requests.exceptions.RequestException as e:
-         return f"An error occurred: {e}"
-     except Exception as e:
-         return f"An unexpected error occurred during parsing: {e}"
-
- # Define the Gradio interface
- demo = gr.Interface(
-     fn=fetch_and_parse_hn,
-     inputs=gr.Textbox(
-         label="Hacker News URL",
-         placeholder="e.g., https://news.ycombinator.com",
-         value="https://news.ycombinator.com"
-     ),
-     outputs=gr.Markdown(label="Hacker News Digest"),
-     title="Hacker News Digest Fetcher",
-     description="Enter a Hacker News URL (like the front page, 'new', or 'ask') to get a clean, readable digest. You can click on the story titles to go to the articles and on the comment links to see the discussions.",
-     allow_flagging="never",
-     theme="Nymbo/Nymbo_Theme",
-     examples=[["https://news.ycombinator.com"], ["https://news.ycombinator.com/news?p=2"], ["https://news.ycombinator.com/ask"]]
- )

  if __name__ == "__main__":
-     demo.launch()
+ # File: main/app.py
+ # Purpose: Fetch a URL and return only relevant text (title, metadata, clean main text) and hyperlinks.
+ # Bonus: Special handling for Hacker News front page to list posts (rank, title, points, comments).
+ # UI: Gradio Blocks with Markdown + DataFrame outputs, suitable for MCP usage.
+ # Notes: Comments are in layman's terms to explain each section.

+ import gradio as gr # UI framework for the web app
+ import requests # HTTP client to fetch web pages
+ from bs4 import BeautifulSoup # HTML parser to extract tags and text
+ from readability import Document # Readability algorithm to find main content
+ from urllib.parse import urljoin, urlparse # Tools to resolve relative/absolute URLs
+ from dataclasses import dataclass # For neat, typed containers
+ from typing import List, Dict, Tuple
+ import re # Regular expressions for cleanup
+ from datetime import datetime # For formatting dates in metadata safely
+
+
+ # =========================
+ # Helpers: small data shapes
+ # =========================
+
+ @dataclass
+ class PageMetadata:
+     # Simple holder for high-level metadata we care about
+     title: str = ""
+     canonical_url: str = ""
+     description: str = ""
+     site_name: str = ""
+     og_type: str = ""
+     og_url: str = ""
+     published_time: str = "" # ISO-ish if detected
+
+
+ # =========================
+ # Network: fetch raw HTML
+ # =========================
+
+ def fetch_html(url: str, timeout: int = 12) -> str:
+     """
+     Downloads the HTML for a given URL using a browser-like User-Agent.
+     Returns text or raises an HTTP/Request error if something fails.
+     """
+     headers = {
+         # Pretend to be a modern desktop browser so we don't get blocked
+         "User-Agent": (
+             "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+             "AppleWebKit/537.36 (KHTML, like Gecko) "
+             "Chrome/127.0.0.0 Safari/537.36"
+         )
+     }
+     resp = requests.get(url, headers=headers, timeout=timeout)
+     resp.raise_for_status() # If it's 4xx/5xx, this throws; we catch it above in the Gradio fn
+     return resp.text
+
+
+ # ===================================
+ # Generic extraction: metadata + text
+ # ===================================
+
+ def extract_metadata(soup: BeautifulSoup, base_url: str) -> PageMetadata:
+     """
+     Pulls common metadata from <title>, <meta>, and <link rel="canonical">.
+     We check Open Graph and Twitter tags as fallbacks too.
+     """
+     md = PageMetadata()
+
+     # Title from <title> or og:title/twitter:title
+     title_tag = soup.find("title")
+     md.title = (title_tag.get_text(strip=True) if title_tag else "").strip()
+
+     # Meta helpers
+     def meta(name=None, property=None):
+         if name:
+             tag = soup.find("meta", attrs={"name": name})
+             if tag and tag.get("content"):
+                 return tag["content"].strip()
+         if property:
+             tag = soup.find("meta", attrs={"property": property})
+             if tag and tag.get("content"):
+                 return tag["content"].strip()
+         return ""
+
+     # Description (prefer og:description > twitter:description > meta description)
+     md.description = (
+         meta(property="og:description")
+         or meta(name="twitter:description")
+         or meta(name="description")
+         or ""
+     ).strip()
+
+     # Site name (if available)
+     md.site_name = (meta(property="og:site_name") or "").strip()
+
+     # OpenGraph URL + type (if available)
+     md.og_url = (meta(property="og:url") or "").strip()
+     md.og_type = (meta(property="og:type") or "").strip()
+
+     # Canonical URL (normalize relative -> absolute)
+     canon = soup.find("link", rel="canonical")
+     if canon and canon.get("href"):
+         md.canonical_url = urljoin(base_url, canon["href"].strip())
+     else:
+         # If no canonical, we may fallback to og:url if present
+         md.canonical_url = md.og_url or base_url
+
+     # Try some common publish-time signals
+     published = (
+         meta(property="article:published_time")
+         or meta(name="pubdate")
+         or meta(name="date")
+         or ""
+     ).strip()
+     md.published_time = published
+
+     # If no normal <title>, try OG or Twitter titles
+     if not md.title:
+         md.title = (meta(property="og:title") or meta(name="twitter:title") or "").strip()
+
+     return md
+
+
+ def extract_main_text(html: str) -> Tuple[str, BeautifulSoup]:
      """
+     Uses the readability library to find the 'main content' of an article-like page.
+     Returns a clean text string and a BeautifulSoup of the main content HTML
+     (so we can also extract links from just the relevant area).
+     If readability fails/misfires (like index pages), we gracefully fallback to empty text.
      """
      try:
+         doc = Document(html) # Run Readability on the HTML
+         summary_html = doc.summary() # This is the extracted main-content HTML
+         # Parse the readability summary into a soup so we can pull out links cleanly
+         summary_soup = BeautifulSoup(summary_html, "lxml")
+         # Turn HTML to plain text: keep paragraphs and line breaks readable
+         # Remove scripts/styles etc. if any slipped through
+         for tag in summary_soup(["script", "style", "noscript"]):
+             tag.decompose()
+         text = summary_soup.get_text("\n", strip=True)
+         text = re.sub(r"\n{3,}", "\n\n", text) # Collapse superfluous line breaks
+         return text, summary_soup
+     except Exception:
+         # If something goes wrong (e.g., not article-shaped), return empty content
+         return "", BeautifulSoup("", "lxml")
+
+
+ def collect_links(soup: BeautifulSoup, base_url: str, only_content_area: bool, fallback_html: str) -> List[Dict]:
+     """
+     Finds hyperlinks. If we have a 'main content' soup and the user asked for
+     content-only links, we grab links from there; otherwise, fall back to the whole page.
+     We resolve relative URLs to absolute and skip junk (javascript:, #, mailto:).
+     """
+     anchors = []
+     if soup and only_content_area:
+         anchors = soup.find_all("a")
+     else:
+         full = BeautifulSoup(fallback_html, "lxml")
+         anchors = full.find_all("a")
+
+     results = []
+     seen = set()
+     for a in anchors:
+         href = (a.get("href") or "").strip()
+         text = a.get_text(" ", strip=True)
+         if not href:
+             continue
+         # Skip empty, anchors, JS, and non-http links
+         if href.startswith("#") or href.startswith("javascript:") or href.startswith("mailto:"):
+             continue
+         # Make absolute
+         absolute = urljoin(base_url, href)
+         # Deduplicate by absolute URL + link text combo
+         key = (absolute, text)
+         if key in seen:
+             continue
+         seen.add(key)
+         domain = urlparse(absolute).netloc
+         results.append({"Text": text or "(no text)", "URL": absolute, "Domain": domain})
+     return results
+
+
+ # ====================================
+ # Special-case: Hacker News front page
+ # ====================================
+
+ def is_hn_front(url: str) -> bool:
+     """
+     Checks if the URL is the Hacker News front page (news.ycombinator.com).
+     We'll special-handle it for a great experience listing posts.
+     """
+     p = urlparse(url)
+     if p.netloc != "news.ycombinator.com":
+         return False
+     # Treat /, /news, or /front as "front page" style
+     return p.path in ("", "/", "/news", "/front")

+
+ def parse_hn_front(html: str, base_url: str) -> Tuple[str, List[Dict]]:
+     """
+     Parses the Hacker News front page HTML to extract ranked items with points and comments.
+     Returns a Markdown overview and a list-of-dicts suitable for a table.
+     """
+     soup = BeautifulSoup(html, "lxml")
+     items = []
+
+     # Each story is a <tr class="athing">; subtext is in the immediate next <tr>
+     for story in soup.select("tr.athing"):
+         # Rank (e.g., "1.") is usually in a sibling cell, but sometimes inside
+         rank_tag = story.select_one("span.rank")
+         rank = (rank_tag.get_text(strip=True).replace(".", "") if rank_tag else "")
+
+         # Title + URL (HN changed markup: 'span.titleline a' is current)
+         title_a = story.select_one("span.titleline > a") or story.select_one("a.titlelink") or story.select_one("a.storylink")
+         title = title_a.get_text(strip=True) if title_a else "(no title)"
+         url = urljoin(base_url, title_a["href"]) if (title_a and title_a.get("href")) else base_url
+
+         # Source domain (e.g., (github.com))
+         site = story.select_one("span.sitestr")
+         source = site.get_text(strip=True) if site else urlparse(url).netloc
+
+         # Subtext row comes right after the 'athing' row
+         subtext_row = story.find_next_sibling("tr")
+         points, comments, age, by = "", "", "", ""
+         if subtext_row:
+             # Points like "123 points"
+             score = subtext_row.select_one("span.score")
+             points = score.get_text(strip=True) if score else ""
+             # Byline: "by username"
+             user_a = subtext_row.select_one("a.hnuser")
+             by = user_a.get_text(strip=True) if user_a else ""
+             # Age: "5 hours ago"
+             age_tag = subtext_row.select_one("span.age")
+             age = age_tag.get_text(strip=True) if age_tag else ""
+             # Comments link: last <a> typically ends with "comments" or "discuss"
+             comment_a = None
+             links = subtext_row.select("a")
+             if links:
+                 comment_a = links[-1]
+             comments = (comment_a.get_text(strip=True) if comment_a else "").lower()
+
+         items.append({
+             "Rank": rank,
+             "Title": title,
+             "URL": url,
+             "Source": source,
+             "Points": points,
+             "By": by,
+             "Age": age,
+             "Comments": comments,
+         })
+
+     # Build a tight Markdown digest so you can "use" HN inside the tool
+     md_lines = ["# Hacker News — Front Page",
+                 "",
+                 "Here are the current front-page posts (click to open):",
+                 ""]
+     for it in items:
+         rank = it["Rank"] or "•"
+         title = it["Title"]
+         url = it["URL"]
+         pts = it["Points"] or ""
+         cmt = it["Comments"] or ""
+         age = it["Age"] or ""
+         src = it["Source"] or ""
+         # Example line: "1. [Cool Project](url) — 345 points • 123 comments • 5 hours ago (github.com)"
+         extras = " — ".join(filter(None, [
+             " ".join(filter(None, [pts, cmt])),
+             age,
+             f"({src})"
+         ]))
+         md_lines.append(f"{rank}. [{title}]({url}){(' — ' + extras) if extras else ''}")
+     md = "\n".join(md_lines) if items else "# Hacker News — No items found"
+
+     return md, items
+
+
+ # ===========================
+ # Public function for Gradio
+ # ===========================
+
+ def extract_page(url: str, full_text: bool, max_links: int, content_links_only: bool) -> Tuple[str, List[Dict]]:
+     """
+     Main function wired to the UI.
+     - Fetches the page
+     - If it's Hacker News front page, parse posts specially
+     - Otherwise: extract metadata, main text (optional), and links
+     - Returns Markdown (summary) + a table of links
+     """
+     try:
+         html = fetch_html(url)
      except requests.exceptions.RequestException as e:
+         # Friendly error message for the UI textbox
+         return f"## Error\nUnable to fetch the page.\n\n**Details:** {e}", []
+
+     # Hacker News special handling for top-notch usability
+     if is_hn_front(url):
+         md, items = parse_hn_front(html, url)
+         return md, items # For HN, the table is the rich story list
+
+     # Generic page pipeline
+     soup_full = BeautifulSoup(html, "lxml") # Full page soup for metadata and optional link fallback
+     metadata = extract_metadata(soup_full, url) # Title, canonical, description, etc.
+     main_text, summary_soup = extract_main_text(html) # Readability content (may be empty on index pages)
+
+     # Choose where we harvest links from
+     links = collect_links(summary_soup, url, content_links_only, html)
+     if max_links and max_links > 0:
+         links = links[:max_links]
+
+     # Build a readable Markdown summary
+     md_lines = []
+
+     # Title line (prefer metadata title)
+     title_to_show = metadata.title or "(Untitled)"
+     md_lines.append(f"# {title_to_show}")
+
+     # Canonical + URL info
+     if metadata.canonical_url and metadata.canonical_url != url:
+         md_lines.append(f"- **Canonical:** {metadata.canonical_url}")
+     md_lines.append(f"- **URL:** {url}")
+
+     # Optional metadata lines
+     if metadata.site_name:
+         md_lines.append(f"- **Site:** {metadata.site_name}")
+     if metadata.description:
+         md_lines.append(f"- **Description:** {metadata.description}")
+     if metadata.published_time:
+         md_lines.append(f"- **Published:** {metadata.published_time}")
+     if metadata.og_type:
+         md_lines.append(f"- **OG Type:** {metadata.og_type}")
+
+     # Spacer
+     md_lines.append("\n---\n")
+
+     # Main content (optional, controlled by checkbox)
+     if full_text and main_text:
+         md_lines.append("## Main Content")
+         # Keep things readable; long pages can be huge—Readability already helps keep it topical
+         md_lines.append(main_text)
+         md_lines.append("\n---\n")
+
+     # Links brief (we also return a structured table below)
+     md_lines.append("## Links Found")
+     md_lines.append(
+         f"Showing {'content-only' if content_links_only else 'all-page'} links (up to {max_links}). "
+         "Click any to open in a new tab."
+     )
+
+     md = "\n".join(md_lines)
+     return md, links
+
+
+ # ===========
+ # Gradio UI
+ # ===========
+
+ # Build a Blocks UI so we can have multiple outputs (Markdown + DataFrame) nicely arranged
+ with gr.Blocks(theme="Nymbo/Nymbo_Theme", title="Fetch MCP — Clean Text & Links") as demo:
+     # --- Header area: title + quick helper buttons
+     gr.Markdown("# Fetch MCP — Clean Text & Links\n"
+                 "Extract clean **title**, **metadata**, **main text**, and **hyperlinks** from any page.\n\n"
+                 "- Special handling for **Hacker News** front page (rank, points, comments).\n"
+                 "- Toggle **Full Text** if you also want the extracted article content.")
+
+     with gr.Row():
+         url_in = gr.Textbox(
+             label="URL",
+             placeholder="https://news.ycombinator.com/ • https://example.com/article",
+             value="https://news.ycombinator.com/",
+             scale=4
+         )
+         fetch_btn = gr.Button("Fetch / Extract", variant="primary", scale=1)
+
+     with gr.Row():
+         full_text_chk = gr.Checkbox(
+             label="Include main content text (Readability extract)?",
+             value=False
+         )
+         content_only_chk = gr.Checkbox(
+             label="Links from main content only (fallback: full page)?",
+             value=True
+         )
+         max_links_sld = gr.Slider(
+             label="Max links to return",
+             minimum=10, maximum=500, value=100, step=10
+         )
+
+     # Outputs: Markdown summary + a table of links (or HN posts table)
+     summary_md = gr.Markdown(label="Summary")
+     links_tbl = gr.Dataframe(
+         headers=["Rank/—", "Title/Text", "URL", "Source/Domain", "Points", "By", "Age", "Comments"],
+         # We won't pre-enforce headers strictly; DataFrame will adapt to dict keys provided.
+         interactive=False,
+         wrap=True,
+         row_count=(0, "dynamic"),
+         col_count=(0, "dynamic")
+     )
+
+     # Wire up the action: clicking the button runs extract_page and shows results
+     fetch_btn.click(
+         fn=extract_page,
+         inputs=[url_in, full_text_chk, max_links_sld, content_only_chk],
+         outputs=[summary_md, links_tbl]
+     )

+ # Keep MCP server behavior enabled for your setup
  if __name__ == "__main__":
+     demo.launch(mcp_server=True)
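
Editor's note, not part of the commit: a minimal sketch of exercising the new entry point directly, bypassing the Gradio UI. It assumes the updated file is importable as `app` and that the imported dependencies (gradio, requests, beautifulsoup4, readability-lxml, lxml) are installed; the argument order follows the click() wiring in the diff above.

# Hypothetical smoke test (assumption: module importable as `app`)
from app import extract_page

summary_md, rows = extract_page(
    "https://news.ycombinator.com/",  # front page, so the HN-specific parser runs
    False,                            # full_text: skip the Readability body text
    20,                               # max_links: unused on the HN branch
    True,                             # content_links_only: also unused on the HN branch
)
print(summary_md.splitlines()[0])     # first line of the Markdown digest
print(rows[0]["Title"] if rows else "no rows")  # HN branch returns story dicts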