Nymbo committed · Commit f890eb5 · verified · 1 Parent(s): 8e5c5da

Update app.py

Files changed (1):
  1. app.py +399 -103
app.py CHANGED
@@ -1,111 +1,407 @@
- import gradio as gr
- import requests
- from bs4 import BeautifulSoup
- import urllib.parse

- def fetch_and_parse_hn(url):
      """
-     This function takes a Hacker News URL, fetches its content, parses it,
-     and returns a formatted Markdown string with titles, metadata, and hyperlinks.
      """
-     if not url.strip():
-         return "Please enter a URL."
-
      try:
-         headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
-         response = requests.get(url, headers=headers)
-         response.raise_for_status() # Raises an HTTPError for bad responses
-
-         soup = BeautifulSoup(response.text, 'html.parser')
-
-         # Extract page title
-         page_title = soup.title.string if soup.title else "Hacker News"
-         output_md = [f"# {page_title}\n"]
-
-         # HN stories are in 'tr' tags with class 'athing'
-         story_rows = soup.find_all('tr', class_='athing')
-
-         if not story_rows:
-             return "Could not find any stories on this page. Please make sure it's a valid Hacker News URL."
-
-         for story_row in story_rows:
-             # --- Story Details (title, link, rank) ---
-             title_span = story_row.find('span', class_='titleline')
-             if not title_span:
-                 continue
-
-             rank_span = story_row.find('span', class_='rank')
-             rank = rank_span.text.strip() if rank_span else ""
-
-             link_tag = title_span.find('a')
-             title = link_tag.text if link_tag else "No Title"
-             article_url = link_tag.get('href', '#')
-
-             # Handle relative URLs for internal posts (e.g., "Ask HN:")
-             if not article_url.startswith('http'):
-                 article_url = urllib.parse.urljoin(url, article_url)
-
-             site_span = title_span.find('span', class_='sitebit')
-             site = f"({site_span.text})" if site_span else ""
-
-             # --- Metadata (points, user, comments) ---
-             # Metadata is in the next 'tr' sibling
-             metadata_row = story_row.find_next_sibling('tr')
-             if not metadata_row:
-                 output_md.append(f"{rank} **[{title}]({article_url})** {site}\n")
-                 continue
-
-             subtext = metadata_row.find('td', class_='subtext')
-             if not subtext:
-                 output_md.append(f"{rank} **[{title}]({article_url})** {site}\n")
-                 continue
-
-             score = subtext.find('span', class_='score')
-             user = subtext.find('a', class_='hnuser')
-
-             # The comments link is usually the last link in the subtext
-             comments_link = subtext.find_all('a')[-1]
-
-             # Build metadata string
-             meta_parts = []
-             if score:
-                 meta_parts.append(score.text)
-             if user:
-                 meta_parts.append(f"by {user.text}")
-             if comments_link and 'item?id=' in comments_link.get('href', ''):
-                 comments_text = comments_link.text.replace('\xa0', ' ') # Handle non-breaking space
-                 comments_url = urllib.parse.urljoin(url, comments_link['href'])
-                 meta_parts.append(f"[{comments_text}]({comments_url})")
-
-             metadata_str = " | ".join(meta_parts)
-
-             # Assemble the final markdown for the item
-             output_md.append(f"{rank} **[{title}]({article_url})** {site}")
-             if metadata_str:
-                 output_md.append(f" - *{metadata_str}*\n")
-
-         return "\n".join(output_md)

      except requests.exceptions.RequestException as e:
-         return f"An error occurred: {e}"
-     except Exception as e:
-         return f"An unexpected error occurred during parsing: {e}"
-
- # Define the Gradio interface
- demo = gr.Interface(
-     fn=fetch_and_parse_hn,
-     inputs=gr.Textbox(
-         label="Hacker News URL",
-         placeholder="e.g., https://news.ycombinator.com",
-         value="https://news.ycombinator.com"
-     ),
-     outputs=gr.Markdown(label="Hacker News Digest"),
-     title="Hacker News Digest Fetcher",
-     description="Enter a Hacker News URL (like the front page, 'new', or 'ask') to get a clean, readable digest. You can click on the story titles to go to the articles and on the comment links to see the discussions.",
-     allow_flagging="never",
-     theme="Nymbo/Nymbo_Theme",
-     examples=[["https://news.ycombinator.com"], ["https://news.ycombinator.com/news?p=2"], ["https://news.ycombinator.com/ask"]]
- )

  if __name__ == "__main__":
-     demo.launch()
+ # File: main/app.py
+ # Purpose: Fetch a URL and return only relevant text (title, metadata, clean main text) and hyperlinks.
+ # Bonus: Special handling for Hacker News front page to list posts (rank, title, points, comments).
+ # UI: Gradio Blocks with Markdown + DataFrame outputs, suitable for MCP usage.
+ # Notes: Comments are in layman's terms to explain each section.

+ import gradio as gr # UI framework for the web app
+ import requests # HTTP client to fetch web pages
+ from bs4 import BeautifulSoup # HTML parser to extract tags and text
+ from readability import Document # Readability algorithm to find main content
+ from urllib.parse import urljoin, urlparse # Tools to resolve relative/absolute URLs
+ from dataclasses import dataclass # For neat, typed containers
+ from typing import List, Dict, Tuple
+ import re # Regular expressions for cleanup
+ from datetime import datetime # For formatting dates in metadata safely
+
+
+ # =========================
+ # Helpers: small data shapes
+ # =========================
+
+ @dataclass
+ class PageMetadata:
+     # Simple holder for high-level metadata we care about
+     title: str = ""
+     canonical_url: str = ""
+     description: str = ""
+     site_name: str = ""
+     og_type: str = ""
+     og_url: str = ""
+     published_time: str = "" # ISO-ish if detected
+
+
+ # =========================
+ # Network: fetch raw HTML
+ # =========================
+
+ def fetch_html(url: str, timeout: int = 12) -> str:
+     """
+     Downloads the HTML for a given URL using a browser-like User-Agent.
+     Returns text or raises an HTTP/Request error if something fails.
+     """
+     headers = {
+         # Pretend to be a modern desktop browser so we don't get blocked
+         "User-Agent": (
+             "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+             "AppleWebKit/537.36 (KHTML, like Gecko) "
+             "Chrome/127.0.0.0 Safari/537.36"
+         )
+     }
+     resp = requests.get(url, headers=headers, timeout=timeout)
+     resp.raise_for_status() # If it's 4xx/5xx, this throws; we catch it above in the Gradio fn
+     return resp.text
+
+
+ # ===================================
+ # Generic extraction: metadata + text
+ # ===================================
+
+ def extract_metadata(soup: BeautifulSoup, base_url: str) -> PageMetadata:
+     """
+     Pulls common metadata from <title>, <meta>, and <link rel="canonical">.
+     We check Open Graph and Twitter tags as fallbacks too.
+     """
+     md = PageMetadata()
+
+     # Title from <title> or og:title/twitter:title
+     title_tag = soup.find("title")
+     md.title = (title_tag.get_text(strip=True) if title_tag else "").strip()
+
+     # Meta helpers
+     def meta(name=None, property=None):
+         if name:
+             tag = soup.find("meta", attrs={"name": name})
+             if tag and tag.get("content"):
+                 return tag["content"].strip()
+         if property:
+             tag = soup.find("meta", attrs={"property": property})
+             if tag and tag.get("content"):
+                 return tag["content"].strip()
+         return ""
+
+     # Description (prefer og:description > twitter:description > meta description)
+     md.description = (
+         meta(property="og:description")
+         or meta(name="twitter:description")
+         or meta(name="description")
+         or ""
+     ).strip()
+
+     # Site name (if available)
+     md.site_name = (meta(property="og:site_name") or "").strip()
+
+     # OpenGraph URL + type (if available)
+     md.og_url = (meta(property="og:url") or "").strip()
+     md.og_type = (meta(property="og:type") or "").strip()
+
+     # Canonical URL (normalize relative -> absolute)
+     canon = soup.find("link", rel="canonical")
+     if canon and canon.get("href"):
+         md.canonical_url = urljoin(base_url, canon["href"].strip())
+     else:
+         # If no canonical, we may fallback to og:url if present
+         md.canonical_url = md.og_url or base_url
+
+     # Try some common publish-time signals
+     published = (
+         meta(property="article:published_time")
+         or meta(name="pubdate")
+         or meta(name="date")
+         or ""
+     ).strip()
+     md.published_time = published
+
+     # If no normal <title>, try OG or Twitter titles
+     if not md.title:
+         md.title = (meta(property="og:title") or meta(name="twitter:title") or "").strip()
+
+     return md
+
+
+ def extract_main_text(html: str) -> Tuple[str, BeautifulSoup]:
      """
+     Uses the readability library to find the 'main content' of an article-like page.
+     Returns a clean text string and a BeautifulSoup of the main content HTML
+     (so we can also extract links from just the relevant area).
+     If readability fails/misfires (like index pages), we gracefully fallback to empty text.
      """
      try:
+         doc = Document(html) # Run Readability on the HTML
+         summary_html = doc.summary() # This is the extracted main-content HTML
+         # Parse the readability summary into a soup so we can pull out links cleanly
+         summary_soup = BeautifulSoup(summary_html, "lxml")
+         # Turn HTML to plain text: keep paragraphs and line breaks readable
+         # Remove scripts/styles etc. if any slipped through
+         for tag in summary_soup(["script", "style", "noscript"]):
+             tag.decompose()
+         text = summary_soup.get_text("\n", strip=True)
+         text = re.sub(r"\n{3,}", "\n\n", text) # Collapse superfluous line breaks
+         return text, summary_soup
+     except Exception:
+         # If something goes wrong (e.g., not article-shaped), return empty content
+         return "", BeautifulSoup("", "lxml")
+
+
+ def collect_links(soup: BeautifulSoup, base_url: str, only_content_area: bool, fallback_html: str) -> List[Dict]:
+     """
+     Finds hyperlinks. If we have a 'main content' soup and the user asked for
+     content-only links, we grab links from there; otherwise, fall back to the whole page.
+     We resolve relative URLs to absolute and skip junk (javascript:, #, mailto:).
+     """
+     anchors = []
+     if soup and only_content_area:
+         anchors = soup.find_all("a")
+     else:
+         full = BeautifulSoup(fallback_html, "lxml")
+         anchors = full.find_all("a")
+
+     results = []
+     seen = set()
+     for a in anchors:
+         href = (a.get("href") or "").strip()
+         text = a.get_text(" ", strip=True)
+         if not href:
+             continue
+         # Skip empty, anchors, JS, and non-http links
+         if href.startswith("#") or href.startswith("javascript:") or href.startswith("mailto:"):
+             continue
+         # Make absolute
+         absolute = urljoin(base_url, href)
+         # Deduplicate by absolute URL + link text combo
+         key = (absolute, text)
+         if key in seen:
+             continue
+         seen.add(key)
+         domain = urlparse(absolute).netloc
+         results.append({"Text": text or "(no text)", "URL": absolute, "Domain": domain})
+     return results
+
+
+ # ====================================
+ # Special-case: Hacker News front page
+ # ====================================
+
+ def is_hn_front(url: str) -> bool:
+     """
+     Checks if the URL is the Hacker News front page (news.ycombinator.com).
+     We'll special-handle it for a great experience listing posts.
+     """
+     p = urlparse(url)
+     if p.netloc != "news.ycombinator.com":
+         return False
+     # Treat /, /news, or /front as "front page" style
+     return p.path in ("", "/", "/news", "/front")

+
+ def parse_hn_front(html: str, base_url: str) -> Tuple[str, List[Dict]]:
+     """
+     Parses the Hacker News front page HTML to extract ranked items with points and comments.
+     Returns a Markdown overview and a list-of-dicts suitable for a table.
+     """
+     soup = BeautifulSoup(html, "lxml")
+     items = []
+
+     # Each story is a <tr class="athing">; subtext is in the immediate next <tr>
+     for story in soup.select("tr.athing"):
+         # Rank (e.g., "1.") is usually in a sibling cell, but sometimes inside
+         rank_tag = story.select_one("span.rank")
+         rank = (rank_tag.get_text(strip=True).replace(".", "") if rank_tag else "")
+
+         # Title + URL (HN changed markup: 'span.titleline a' is current)
+         title_a = story.select_one("span.titleline > a") or story.select_one("a.titlelink") or story.select_one("a.storylink")
+         title = title_a.get_text(strip=True) if title_a else "(no title)"
+         url = urljoin(base_url, title_a["href"]) if (title_a and title_a.get("href")) else base_url
+
+         # Source domain (e.g., (github.com))
+         site = story.select_one("span.sitestr")
+         source = site.get_text(strip=True) if site else urlparse(url).netloc
+
+         # Subtext row comes right after the 'athing' row
+         subtext_row = story.find_next_sibling("tr")
+         points, comments, age, by = "", "", "", ""
+         if subtext_row:
+             # Points like "123 points"
+             score = subtext_row.select_one("span.score")
+             points = score.get_text(strip=True) if score else ""
+             # Byline: "by username"
+             user_a = subtext_row.select_one("a.hnuser")
+             by = user_a.get_text(strip=True) if user_a else ""
+             # Age: "5 hours ago"
+             age_tag = subtext_row.select_one("span.age")
+             age = age_tag.get_text(strip=True) if age_tag else ""
+             # Comments link: last <a> typically ends with "comments" or "discuss"
+             comment_a = None
+             links = subtext_row.select("a")
+             if links:
+                 comment_a = links[-1]
+             comments = (comment_a.get_text(strip=True) if comment_a else "").lower()
+
+         items.append({
+             "Rank": rank,
+             "Title": title,
+             "URL": url,
+             "Source": source,
+             "Points": points,
+             "By": by,
+             "Age": age,
+             "Comments": comments,
+         })
+
+     # Build a tight Markdown digest so you can "use" HN inside the tool
+     md_lines = ["# Hacker News — Front Page",
+                 "",
+                 "Here are the current front-page posts (click to open):",
+                 ""]
+     for it in items:
+         rank = it["Rank"] or "•"
+         title = it["Title"]
+         url = it["URL"]
+         pts = it["Points"] or ""
+         cmt = it["Comments"] or ""
+         age = it["Age"] or ""
+         src = it["Source"] or ""
+         # Example line: "1. [Cool Project](url) — 345 points • 123 comments • 5 hours ago (github.com)"
+         extras = " — ".join(filter(None, [
+             " ".join(filter(None, [pts, cmt])),
+             age,
+             f"({src})"
+         ]))
+         md_lines.append(f"{rank}. [{title}]({url}){(' — ' + extras) if extras else ''}")
+     md = "\n".join(md_lines) if items else "# Hacker News — No items found"
+
+     return md, items
+
+
+ # ===========================
+ # Public function for Gradio
+ # ===========================
+
+ def extract_page(url: str, full_text: bool, max_links: int, content_links_only: bool) -> Tuple[str, List[Dict]]:
+     """
+     Main function wired to the UI.
+     - Fetches the page
+     - If it's Hacker News front page, parse posts specially
+     - Otherwise: extract metadata, main text (optional), and links
+     - Returns Markdown (summary) + a table of links
+     """
+     try:
+         html = fetch_html(url)
      except requests.exceptions.RequestException as e:
+         # Friendly error message for the UI textbox
+         return f"## Error\nUnable to fetch the page.\n\n**Details:** {e}", []
+
+     # Hacker News special handling for top-notch usability
+     if is_hn_front(url):
+         md, items = parse_hn_front(html, url)
+         return md, items # For HN, the table is the rich story list
+
+     # Generic page pipeline
+     soup_full = BeautifulSoup(html, "lxml") # Full page soup for metadata and optional link fallback
+     metadata = extract_metadata(soup_full, url) # Title, canonical, description, etc.
+     main_text, summary_soup = extract_main_text(html) # Readability content (may be empty on index pages)
+
+     # Choose where we harvest links from
+     links = collect_links(summary_soup, url, content_links_only, html)
+     if max_links and max_links > 0:
+         links = links[:max_links]
+
+     # Build a readable Markdown summary
+     md_lines = []
+
+     # Title line (prefer metadata title)
+     title_to_show = metadata.title or "(Untitled)"
+     md_lines.append(f"# {title_to_show}")
+
+     # Canonical + URL info
+     if metadata.canonical_url and metadata.canonical_url != url:
+         md_lines.append(f"- **Canonical:** {metadata.canonical_url}")
+     md_lines.append(f"- **URL:** {url}")
+
+     # Optional metadata lines
+     if metadata.site_name:
+         md_lines.append(f"- **Site:** {metadata.site_name}")
+     if metadata.description:
+         md_lines.append(f"- **Description:** {metadata.description}")
+     if metadata.published_time:
+         md_lines.append(f"- **Published:** {metadata.published_time}")
+     if metadata.og_type:
+         md_lines.append(f"- **OG Type:** {metadata.og_type}")
+
+     # Spacer
+     md_lines.append("\n---\n")
+
+     # Main content (optional, controlled by checkbox)
+     if full_text and main_text:
+         md_lines.append("## Main Content")
+         # Keep things readable; long pages can be huge—Readability already helps keep it topical
+         md_lines.append(main_text)
+         md_lines.append("\n---\n")
+
+     # Links brief (we also return a structured table below)
+     md_lines.append("## Links Found")
+     md_lines.append(
+         f"Showing {'content-only' if content_links_only else 'all-page'} links (up to {max_links}). "
+         "Click any to open in a new tab."
+     )
+
+     md = "\n".join(md_lines)
+     return md, links
+
+
+ # ===========
+ # Gradio UI
+ # ===========
+
+ # Build a Blocks UI so we can have multiple outputs (Markdown + DataFrame) nicely arranged
+ with gr.Blocks(theme="Nymbo/Nymbo_Theme", title="Fetch MCP — Clean Text & Links") as demo:
+     # --- Header area: title + quick helper buttons
+     gr.Markdown("# Fetch MCP — Clean Text & Links\n"
+                 "Extract clean **title**, **metadata**, **main text**, and **hyperlinks** from any page.\n\n"
+                 "- Special handling for **Hacker News** front page (rank, points, comments).\n"
+                 "- Toggle **Full Text** if you also want the extracted article content.")
+
+     with gr.Row():
+         url_in = gr.Textbox(
+             label="URL",
+             placeholder="https://news.ycombinator.com/ • https://example.com/article",
+             value="https://news.ycombinator.com/",
+             scale=4
+         )
+         fetch_btn = gr.Button("Fetch / Extract", variant="primary", scale=1)
+
+     with gr.Row():
+         full_text_chk = gr.Checkbox(
+             label="Include main content text (Readability extract)?",
+             value=False
+         )
+         content_only_chk = gr.Checkbox(
+             label="Links from main content only (fallback: full page)?",
+             value=True
+         )
+         max_links_sld = gr.Slider(
+             label="Max links to return",
+             minimum=10, maximum=500, value=100, step=10
+         )
+
+     # Outputs: Markdown summary + a table of links (or HN posts table)
+     summary_md = gr.Markdown(label="Summary")
+     links_tbl = gr.Dataframe(
+         headers=["Rank/—", "Title/Text", "URL", "Source/Domain", "Points", "By", "Age", "Comments"],
+         # We won't pre-enforce headers strictly; DataFrame will adapt to dict keys provided.
+         interactive=False,
+         wrap=True,
+         row_count=(0, "dynamic"),
+         col_count=(0, "dynamic")
+     )
+
+     # Wire up the action: clicking the button runs extract_page and shows results
+     fetch_btn.click(
+         fn=extract_page,
+         inputs=[url_in, full_text_chk, max_links_sld, content_only_chk],
+         outputs=[summary_md, links_tbl]
+     )

+ # Keep MCP server behavior enabled for your setup
  if __name__ == "__main__":
+     demo.launch(mcp_server=True)
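
Editor's note, not part of the commit: a minimal sketch of exercising the new entry point directly, bypassing the Gradio UI. It assumes the updated file is importable as `app` and that the imported dependencies (gradio, requests, beautifulsoup4, readability-lxml, lxml) are installed; the argument order follows the click() wiring in the diff above.

# Hypothetical smoke test (assumption: module importable as `app`)
from app import extract_page

summary_md, rows = extract_page(
    "https://news.ycombinator.com/",  # front page, so the HN-specific parser runs
    False,                            # full_text: skip the Readability body text
    20,                               # max_links: unused on the HN branch
    True,                             # content_links_only: also unused on the HN branch
)
print(summary_md.splitlines()[0])     # first line of the Markdown digest
print(rows[0]["Title"] if rows else "no rows")  # HN branch returns story dicts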