Nymbo committed on
Commit ac9f3b0 · verified · 1 Parent(s): f890eb5

Update app.py

Files changed (1)
  1. app.py +322 -365
app.py CHANGED
@@ -1,407 +1,364 @@
1
  # File: main/app.py
2
- # Purpose: Fetch a URL and return only relevant text (title, metadata, clean main text) and hyperlinks.
3
- # Bonus: Special handling for Hacker News front page to list posts (rank, title, points, comments).
4
- # UI: Gradio Blocks with Markdown + DataFrame outputs, suitable for MCP usage.
5
- # Notes: Comments are in layman's terms to explain each section.
6
-
7
- import gradio as gr # UI framework for the web app
8
- import requests # HTTP client to fetch web pages
9
- from bs4 import BeautifulSoup # HTML parser to extract tags and text
10
- from readability import Document # Readability algorithm to find main content
11
- from urllib.parse import urljoin, urlparse # Tools to resolve relative/absolute URLs
12
- from dataclasses import dataclass # For neat, typed containers
13
- from typing import List, Dict, Tuple
14
- import re # Regular expressions for cleanup
15
- from datetime import datetime # For formatting dates in metadata safely
16
-
17
-
18
- # =========================
19
- # Helpers: small data shapes
20
- # =========================
21
-
22
- @dataclass
23
- class PageMetadata:
24
- # Simple holder for high-level metadata we care about
25
- title: str = ""
26
- canonical_url: str = ""
27
- description: str = ""
28
- site_name: str = ""
29
- og_type: str = ""
30
- og_url: str = ""
31
- published_time: str = "" # ISO-ish if detected
32
-
33
-
34
- # =========================
35
- # Network: fetch raw HTML
36
- # =========================
37
-
38
- def fetch_html(url: str, timeout: int = 12) -> str:
39
  """
40
- Downloads the HTML for a given URL using a browser-like User-Agent.
41
- Returns text or raises an HTTP/Request error if something fails.
42
  """
43
  headers = {
44
- # Pretend to be a modern desktop browser so we don't get blocked
45
- "User-Agent": (
46
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
47
- "AppleWebKit/537.36 (KHTML, like Gecko) "
48
- "Chrome/127.0.0.0 Safari/537.36"
49
- )
50
  }
51
- resp = requests.get(url, headers=headers, timeout=timeout)
52
- resp.raise_for_status() # If it's 4xx/5xx, this throws; we catch it above in the Gradio fn
53
- return resp.text
54
 
55
 
56
- # ===================================
57
- # Generic extraction: metadata + text
58
- # ===================================
59
 
60
- def extract_metadata(soup: BeautifulSoup, base_url: str) -> PageMetadata:
61
  """
62
- Pulls common metadata from <title>, <meta>, and <link rel="canonical">.
63
- We check Open Graph and Twitter tags as fallbacks too.
64
  """
65
- md = PageMetadata()
66
-
67
- # Title from <title> or og:title/twitter:title
68
- title_tag = soup.find("title")
69
- md.title = (title_tag.get_text(strip=True) if title_tag else "").strip()
70
-
71
- # Meta helpers
72
- def meta(name=None, property=None):
73
- if name:
74
- tag = soup.find("meta", attrs={"name": name})
75
- if tag and tag.get("content"):
76
- return tag["content"].strip()
77
- if property:
78
- tag = soup.find("meta", attrs={"property": property})
79
- if tag and tag.get("content"):
80
- return tag["content"].strip()
81
- return ""
82
 
83
- # Description (prefer og:description > twitter:description > meta description)
84
- md.description = (
85
- meta(property="og:description")
86
- or meta(name="twitter:description")
87
- or meta(name="description")
88
- or ""
89
- ).strip()
90
-
91
- # Site name (if available)
92
- md.site_name = (meta(property="og:site_name") or "").strip()
93
-
94
- # OpenGraph URL + type (if available)
95
- md.og_url = (meta(property="og:url") or "").strip()
96
- md.og_type = (meta(property="og:type") or "").strip()
97
-
98
- # Canonical URL (normalize relative -> absolute)
99
- canon = soup.find("link", rel="canonical")
100
- if canon and canon.get("href"):
101
- md.canonical_url = urljoin(base_url, canon["href"].strip())
102
- else:
103
- # If no canonical, we may fallback to og:url if present
104
- md.canonical_url = md.og_url or base_url
105
-
106
- # Try some common publish-time signals
107
- published = (
108
- meta(property="article:published_time")
109
- or meta(name="pubdate")
110
- or meta(name="date")
111
- or ""
112
- ).strip()
113
- md.published_time = published
114
-
115
- # If no normal <title>, try OG or Twitter titles
116
- if not md.title:
117
- md.title = (meta(property="og:title") or meta(name="twitter:title") or "").strip()
118
-
119
- return md
120
-
121
-
122
- def extract_main_text(html: str) -> Tuple[str, BeautifulSoup]:
123
  """
124
- Uses the readability library to find the 'main content' of an article-like page.
125
- Returns a clean text string and a BeautifulSoup of the main content HTML
126
- (so we can also extract links from just the relevant area).
127
- If readability fails/misfires (like index pages), we gracefully fallback to empty text.
128
  """
129
  try:
130
- doc = Document(html) # Run Readability on the HTML
131
- summary_html = doc.summary() # This is the extracted main-content HTML
132
- # Parse the readability summary into a soup so we can pull out links cleanly
133
- summary_soup = BeautifulSoup(summary_html, "lxml")
134
- # Turn HTML to plain text: keep paragraphs and line breaks readable
135
- # Remove scripts/styles etc. if any slipped through
136
- for tag in summary_soup(["script", "style", "noscript"]):
137
- tag.decompose()
138
- text = summary_soup.get_text("\n", strip=True)
139
- text = re.sub(r"\n{3,}", "\n\n", text) # Collapse superfluous line breaks
140
- return text, summary_soup
141
  except Exception:
142
- # If something goes wrong (e.g., not article-shaped), return empty content
143
- return "", BeautifulSoup("", "lxml")
144
 
145
 
146
- def collect_links(soup: BeautifulSoup, base_url: str, only_content_area: bool, fallback_html: str) -> List[Dict]:
147
  """
148
- Finds hyperlinks. If we have a 'main content' soup and the user asked for
149
- content-only links, we grab links from there; otherwise, fall back to the whole page.
150
- We resolve relative URLs to absolute and skip junk (javascript:, #, mailto:).
151
  """
152
- anchors = []
153
- if soup and only_content_area:
154
- anchors = soup.find_all("a")
155
- else:
156
- full = BeautifulSoup(fallback_html, "lxml")
157
- anchors = full.find_all("a")
158
-
159
- results = []
160
  seen = set()
161
- for a in anchors:
162
- href = (a.get("href") or "").strip()
163
- text = a.get_text(" ", strip=True)
164
- if not href:
165
- continue
166
- # Skip empty, anchors, JS, and non-http links
167
- if href.startswith("#") or href.startswith("javascript:") or href.startswith("mailto:"):
168
  continue
169
- # Make absolute
 
170
  absolute = urljoin(base_url, href)
171
- # Deduplicate by absolute URL + link text combo
172
- key = (absolute, text)
173
- if key in seen:
174
  continue
175
- seen.add(key)
176
- domain = urlparse(absolute).netloc
177
- results.append({"Text": text or "(no text)", "URL": absolute, "Domain": domain})
178
- return results
 
 
179
 
 
180
 
181
- # ====================================
182
- # Special-case: Hacker News front page
183
- # ====================================
184
 
185
- def is_hn_front(url: str) -> bool:
186
- """
187
- Checks if the URL is the Hacker News front page (news.ycombinator.com).
188
- We'll special-handle it for a great experience listing posts.
189
- """
190
- p = urlparse(url)
191
- if p.netloc != "news.ycombinator.com":
192
- return False
193
- # Treat /, /news, or /front as "front page" style
194
- return p.path in ("", "/", "/news", "/front")
195
 
196
 
197
- def parse_hn_front(html: str, base_url: str) -> Tuple[str, List[Dict]]:
198
  """
199
- Parses the Hacker News front page HTML to extract ranked items with points and comments.
200
- Returns a Markdown overview and a list-of-dicts suitable for a table.
201
  """
202
- soup = BeautifulSoup(html, "lxml")
203
- items = []
204
-
205
- # Each story is a <tr class="athing">; subtext is in the immediate next <tr>
206
- for story in soup.select("tr.athing"):
207
- # Rank (e.g., "1.") is usually in a sibling cell, but sometimes inside
208
- rank_tag = story.select_one("span.rank")
209
- rank = (rank_tag.get_text(strip=True).replace(".", "") if rank_tag else "")
210
-
211
- # Title + URL (HN changed markup: 'span.titleline a' is current)
212
- title_a = story.select_one("span.titleline > a") or story.select_one("a.titlelink") or story.select_one("a.storylink")
213
- title = title_a.get_text(strip=True) if title_a else "(no title)"
214
- url = urljoin(base_url, title_a["href"]) if (title_a and title_a.get("href")) else base_url
215
-
216
- # Source domain (e.g., (github.com))
217
- site = story.select_one("span.sitestr")
218
- source = site.get_text(strip=True) if site else urlparse(url).netloc
219
-
220
- # Subtext row comes right after the 'athing' row
221
- subtext_row = story.find_next_sibling("tr")
222
- points, comments, age, by = "", "", "", ""
223
- if subtext_row:
224
- # Points like "123 points"
225
- score = subtext_row.select_one("span.score")
226
- points = score.get_text(strip=True) if score else ""
227
- # Byline: "by username"
228
- user_a = subtext_row.select_one("a.hnuser")
229
- by = user_a.get_text(strip=True) if user_a else ""
230
- # Age: "5 hours ago"
231
- age_tag = subtext_row.select_one("span.age")
232
- age = age_tag.get_text(strip=True) if age_tag else ""
233
- # Comments link: last <a> typically ends with "comments" or "discuss"
234
- comment_a = None
235
- links = subtext_row.select("a")
236
- if links:
237
- comment_a = links[-1]
238
- comments = (comment_a.get_text(strip=True) if comment_a else "").lower()
239
-
240
- items.append({
241
- "Rank": rank,
242
- "Title": title,
243
- "URL": url,
244
- "Source": source,
245
- "Points": points,
246
- "By": by,
247
- "Age": age,
248
- "Comments": comments,
249
- })
250
-
251
- # Build a tight Markdown digest so you can "use" HN inside the tool
252
- md_lines = ["# Hacker News Front Page",
253
- "",
254
- "Here are the current front-page posts (click to open):",
255
- ""]
256
- for it in items:
257
- rank = it["Rank"] or "•"
258
- title = it["Title"]
259
- url = it["URL"]
260
- pts = it["Points"] or ""
261
- cmt = it["Comments"] or ""
262
- age = it["Age"] or ""
263
- src = it["Source"] or ""
264
- # Example line: "1. [Cool Project](url) — 345 points • 123 comments • 5 hours ago (github.com)"
265
- extras = " — ".join(filter(None, [
266
- " ".join(filter(None, [pts, cmt])),
267
- age,
268
- f"({src})"
269
- ]))
270
- md_lines.append(f"{rank}. [{title}]({url}){(' — ' + extras) if extras else ''}")
271
- md = "\n".join(md_lines) if items else "# Hacker News — No items found"
272
-
273
- return md, items
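As a side note, the row-pairing trick this parser relies on (each `tr.athing` story row followed by a sibling row carrying points, byline, and comments) can be seen in isolation on a tiny snippet; the markup below is invented for illustration, not real Hacker News HTML.

```python
# Minimal sketch of the athing/subtext row pairing used above, on invented markup.
from bs4 import BeautifulSoup

sample = """
<table>
  <tr class="athing">
    <td><span class="rank">1.</span></td>
    <td><span class="titleline"><a href="https://example.com">Example post</a>
        <span class="sitestr">example.com</span></span></td>
  </tr>
  <tr>
    <td class="subtext"><span class="score">123 points</span>
        by <a class="hnuser">alice</a> <span class="age">2 hours ago</span>
        | <a href="item?id=1">45 comments</a></td>
  </tr>
</table>
"""

soup = BeautifulSoup(sample, "lxml")
for story in soup.select("tr.athing"):
    title_a = story.select_one("span.titleline > a")
    subtext_row = story.find_next_sibling("tr")   # points/comments live in the next row
    score = subtext_row.select_one("span.score")
    print(title_a.get_text(strip=True), "|", score.get_text(strip=True))
# Prints: Example post | 123 points
```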
274
-
275
-
276
- # ===========================
277
- # Public function for Gradio
278
- # ===========================
279
-
280
- def extract_page(url: str, full_text: bool, max_links: int, content_links_only: bool) -> Tuple[str, List[Dict]]:
281
  """
282
- Main function wired to the UI.
283
- - Fetches the page
284
- - If it's Hacker News front page, parse posts specially
285
- - Otherwise: extract metadata, main text (optional), and links
286
- - Returns Markdown (summary) + a table of links
287
  """
288
  try:
289
- html = fetch_html(url)
 
290
  except requests.exceptions.RequestException as e:
291
- # Friendly error message for the UI textbox
292
- return f"## Error\nUnable to fetch the page.\n\n**Details:** {e}", []
293
-
294
- # Hacker News special handling for top-notch usability
295
- if is_hn_front(url):
296
- md, items = parse_hn_front(html, url)
297
- return md, items # For HN, the table is the rich story list
298
-
299
- # Generic page pipeline
300
- soup_full = BeautifulSoup(html, "lxml") # Full page soup for metadata and optional link fallback
301
- metadata = extract_metadata(soup_full, url) # Title, canonical, description, etc.
302
- main_text, summary_soup = extract_main_text(html) # Readability content (may be empty on index pages)
303
-
304
- # Choose where we harvest links from
305
- links = collect_links(summary_soup, url, content_links_only, html)
306
- if max_links and max_links > 0:
307
- links = links[:max_links]
308
-
309
- # Build a readable Markdown summary
310
- md_lines = []
311
-
312
- # Title line (prefer metadata title)
313
- title_to_show = metadata.title or "(Untitled)"
314
- md_lines.append(f"# {title_to_show}")
315
-
316
- # Canonical + URL info
317
- if metadata.canonical_url and metadata.canonical_url != url:
318
- md_lines.append(f"- **Canonical:** {metadata.canonical_url}")
319
- md_lines.append(f"- **URL:** {url}")
320
-
321
- # Optional metadata lines
322
- if metadata.site_name:
323
- md_lines.append(f"- **Site:** {metadata.site_name}")
324
- if metadata.description:
325
- md_lines.append(f"- **Description:** {metadata.description}")
326
- if metadata.published_time:
327
- md_lines.append(f"- **Published:** {metadata.published_time}")
328
- if metadata.og_type:
329
- md_lines.append(f"- **OG Type:** {metadata.og_type}")
330
-
331
- # Spacer
332
- md_lines.append("\n---\n")
333
-
334
- # Main content (optional, controlled by checkbox)
335
- if full_text and main_text:
336
- md_lines.append("## Main Content")
337
- # Keep things readable; long pages can be huge—Readability already helps keep it topical
338
- md_lines.append(main_text)
339
- md_lines.append("\n---\n")
340
-
341
- # Links brief (we also return a structured table below)
342
- md_lines.append("## Links Found")
343
- md_lines.append(
344
- f"Showing {'content-only' if content_links_only else 'all-page'} links (up to {max_links}). "
345
- "Click any to open in a new tab."
346
  )
347
-
348
- md = "\n".join(md_lines)
349
- return md, links
350
-
351
-
352
- # ===========
353
- # Gradio UI
354
- # ===========
355
-
356
- # Build a Blocks UI so we can have multiple outputs (Markdown + DataFrame) nicely arranged
357
- with gr.Blocks(theme="Nymbo/Nymbo_Theme", title="Fetch MCP — Clean Text & Links") as demo:
358
- # --- Header area: title + quick helper buttons
359
- gr.Markdown("# Fetch MCP — Clean Text & Links\n"
360
- "Extract clean **title**, **metadata**, **main text**, and **hyperlinks** from any page.\n\n"
361
- "- Special handling for **Hacker News** front page (rank, points, comments).\n"
362
- "- Toggle **Full Text** if you also want the extracted article content.")
363
-
364
- with gr.Row():
365
- url_in = gr.Textbox(
366
- label="URL",
367
- placeholder="https://news.ycombinator.com/ • https://example.com/article",
368
- value="https://news.ycombinator.com/",
369
- scale=4
370
- )
371
- fetch_btn = gr.Button("Fetch / Extract", variant="primary", scale=1)
372
-
373
- with gr.Row():
374
- full_text_chk = gr.Checkbox(
375
- label="Include main content text (Readability extract)?",
376
- value=False
377
- )
378
- content_only_chk = gr.Checkbox(
379
- label="Links from main content only (fallback: full page)?",
380
- value=True
381
- )
382
- max_links_sld = gr.Slider(
383
- label="Max links to return",
384
- minimum=10, maximum=500, value=100, step=10
385
- )
386
-
387
- # Outputs: Markdown summary + a table of links (or HN posts table)
388
- summary_md = gr.Markdown(label="Summary")
389
- links_tbl = gr.Dataframe(
390
- headers=["Rank/—", "Title/Text", "URL", "Source/Domain", "Points", "By", "Age", "Comments"],
391
- # We won't pre-enforce headers strictly; DataFrame will adapt to dict keys provided.
392
- interactive=False,
393
- wrap=True,
394
- row_count=(0, "dynamic"),
395
- col_count=(0, "dynamic")
396
  )
397
 
398
- # Wire up the action: clicking the button runs extract_page and shows results
399
  fetch_btn.click(
400
- fn=extract_page,
401
- inputs=[url_in, full_text_chk, max_links_sld, content_only_chk],
402
- outputs=[summary_md, links_tbl]
403
  )
404
 
405
- # Keep MCP server behavior enabled for your setup
406
  if __name__ == "__main__":
407
  demo.launch(mcp_server=True)
 
1
  # File: main/app.py
2
+ # Purpose: Fetch only the "relevant" page content (title, key metadata, clean body text, and hyperlinks)
3
+ # instead of returning full HTML. Output is compact and configurable to reduce verbosity.
4
+
5
+ import gradio as gr # UI library
6
+ import requests # HTTP client
7
+ from bs4 import BeautifulSoup # HTML parsing
8
+ from readability import Document # Readability algorithm to isolate main content
9
+ from urllib.parse import urljoin, urldefrag, urlparse # URL helpers
10
+ import re # For whitespace cleanup and simple formatting
11
+
12
+
13
+ # -------------------------------
14
+ # HTTP fetching with sane defaults
15
+ # -------------------------------
16
+ def _http_get(url: str) -> requests.Response:
17
  """
18
+ Make an HTTP GET request with headers and a timeout.
19
+ Layman's terms: downloads the webpage safely and politely.
20
  """
21
  headers = {
22
+ "User-Agent": "Mozilla/5.0 (compatible; NymboFetcher/1.0; +https://example.com)",
23
+ "Accept-Language": "en-US,en;q=0.9",
24
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
25
  }
26
+ # Short timeouts so the app isn't stuck forever
27
+ return requests.get(url, headers=headers, timeout=15)
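As a quick sanity check, the fetch helper can be exercised on its own; the URL here is a placeholder, and the status/content-type checks mirror what `extract_relevant` does further down.

```python
# Hypothetical smoke test for _http_get (placeholder URL).
resp = _http_get("https://example.com/")
resp.raise_for_status()                                    # surfaces 4xx/5xx as exceptions
print(resp.status_code, resp.headers.get("Content-Type"))  # e.g. 200 text/html; charset=UTF-8
print(len(resp.text), "characters of HTML fetched")
```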
 
28
 
29
 
30
+ # ----------------------------------------
31
+ # Helpers: text cleanup & friendly trimming
32
+ # ----------------------------------------
33
+ def _normalize_whitespace(text: str) -> str:
34
+ """
35
+ Layman's terms: squash weird spacing and too many blank lines.
36
+ """
37
+ text = re.sub(r"[ \t\u00A0]+", " ", text) # collapse runs of spaces
38
+ text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text.strip()) # max 1 blank line at a time
39
+ return text.strip()
40
+
41
 
42
+ def _truncate(text: str, max_chars: int) -> tuple[str, bool]:
43
  """
44
+ Layman's terms: cut the text if it’s too long and tell the caller if we cut it.
 
45
  """
46
+ if max_chars is None or max_chars <= 0 or len(text) <= max_chars:
47
+ return text, False
48
+ return text[:max_chars].rstrip() + " …", True
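A tiny illustration of how these two helpers behave together; the input string is made up for the example.

```python
# Illustrative behavior of _normalize_whitespace and _truncate (made-up input).
messy = "Hello   world\n\n\n\nSecond  paragraph\t here"
clean = _normalize_whitespace(messy)
print(repr(clean))             # 'Hello world\n\nSecond paragraph here'
print(_truncate(clean, 12))    # ('Hello world …', True)  -> cut and flagged
print(_truncate(clean, 0))     # (clean, False)           -> 0 disables the cap
```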
49
 
50
+
51
+ def _domain_of(url: str) -> str:
52
  """
53
+ Layman's terms: show a friendly domain like example.com.
54
  """
55
  try:
56
+ return urlparse(url).netloc or ""
57
  except Exception:
58
+ return ""
 
59
 
60
 
61
+ # -----------------------------------
62
+ # Metadata extraction (title, etc.)
63
+ # -----------------------------------
64
+ def _extract_metadata(soup: BeautifulSoup, final_url: str) -> dict:
65
  """
66
+ Layman's terms: grab useful fields like title, description, site name, and canonical link.
67
+ """
68
+ meta = {}
69
+
70
+ # Title preference: <title> > og:title > twitter:title
71
+ title_candidates = [
72
+ (soup.title.string if soup.title and soup.title.string else None),
73
+ _og(soup, "og:title"),
74
+ _meta(soup, "twitter:title"),
75
+ ]
76
+ meta["title"] = next((t.strip() for t in title_candidates if t and t.strip()), "")
77
+
78
+ # Description preference: meta[name=description] > og:description > twitter:description
79
+ desc_candidates = [
80
+ _meta(soup, "description"),
81
+ _og(soup, "og:description"),
82
+ _meta(soup, "twitter:description"),
83
+ ]
84
+ meta["description"] = next((d.strip() for d in desc_candidates if d and d.strip()), "")
85
+
86
+ # Canonical URL if provided (helps dedupe / standardize)
87
+ link_canonical = soup.find("link", rel=lambda v: v and "canonical" in v)
88
+ meta["canonical"] = (link_canonical.get("href") or "").strip() if link_canonical else ""
89
+
90
+ # Site name (nice for context)
91
+ meta["site_name"] = (_og(soup, "og:site_name") or "").strip()
92
+
93
+ # Language (if present)
94
+ html_tag = soup.find("html")
95
+ meta["lang"] = (html_tag.get("lang") or "").strip() if html_tag else ""
96
+
97
+ # Final resolved URL and domain
98
+ meta["fetched_url"] = final_url
99
+ meta["domain"] = _domain_of(final_url)
100
+
101
+ return meta
102
+
103
+
104
+ def _meta(soup: BeautifulSoup, name: str) -> str | None:
105
+ tag = soup.find("meta", attrs={"name": name})
106
+ return tag.get("content") if tag and tag.has_attr("content") else None
107
+
108
+
109
+ def _og(soup: BeautifulSoup, prop: str) -> str | None:
110
+ tag = soup.find("meta", attrs={"property": prop})
111
+ return tag.get("content") if tag and tag.has_attr("content") else None
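For illustration, here is how the metadata helpers above resolve the preference order on a toy document; the HTML snippet and URLs are invented for the example.

```python
# Toy document showing the preference order implemented by _extract_metadata (invented markup).
sample_html = """
<html lang="en"><head>
  <title>Page Title</title>
  <meta name="description" content="Plain description">
  <meta property="og:description" content="OG description">
  <meta property="og:site_name" content="Example Site">
  <link rel="canonical" href="https://example.com/page">
</head><body><p>Hi</p></body></html>
"""
toy_soup = BeautifulSoup(sample_html, "lxml")
meta = _extract_metadata(toy_soup, "https://example.com/page?utm=1")
print(meta["title"])        # Page Title         (<title> wins over og:/twitter:)
print(meta["description"])  # Plain description  (meta[name=description] is checked first)
print(meta["canonical"])    # https://example.com/page
print(meta["domain"])       # example.com
```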
112
+
113
+
114
+ # ---------------------------------------------------------
115
+ # Main content extraction with Readability + gentle cleanup
116
+ # ---------------------------------------------------------
117
+ def _extract_main_text(html: str) -> tuple[str, BeautifulSoup]:
118
+ """
119
+ Layman's terms: use Readability to find the article body, then clean it to plain text.
120
+ Returns (clean_text, soup_of_readable_html) for link scraping.
121
+ """
122
+ # Readability gives us a simplified article HTML
123
+ doc = Document(html)
124
+ readable_html = doc.summary(html_partial=True)
125
+
126
+ # Parse the simplified HTML so we can clean it up further
127
+ s = BeautifulSoup(readable_html, "lxml")
128
+
129
+ # Remove obviously noisy elements if present
130
+ for sel in ["script", "style", "noscript", "iframe", "svg"]:
131
+ for tag in s.select(sel):
132
+ tag.decompose()
133
+
134
+ # Extract text with paragraphs preserved, then normalize whitespace
135
+ text_parts = []
136
+ for p in s.find_all(["p", "li", "h2", "h3", "h4", "blockquote"]):
137
+ # Keep list items and headers to retain structure without being too verbose
138
+ chunk = p.get_text(" ", strip=True)
139
+ if chunk:
140
+ text_parts.append(chunk)
141
+
142
+ clean_text = _normalize_whitespace("\n\n".join(text_parts))
143
+ return clean_text, s
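As a rough, standalone illustration of the Readability pass (the HTML below is invented, and what Readability keeps depends on its scoring heuristics, so treat the expected output as approximate):

```python
# Rough illustration of _extract_main_text on invented HTML; Readability's heuristics vary by page.
article_html = """
<html><body>
  <nav>Home | About | Login</nav>
  <div id="content">
    <h2>Heading</h2>
    <p>First paragraph of the article body with enough words to look like real content.</p>
    <p>Second paragraph, also reasonably long so the scoring keeps it around.</p>
  </div>
  <footer>Copyright</footer>
</body></html>
"""
text, readable = _extract_main_text(article_html)
print(text[:60])                    # typically starts with "Heading" followed by the paragraphs
print(len(readable.find_all("p")))  # paragraphs that survived the cleanup
```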
144
+
145
+
146
+ # ------------------------------------------
147
+ # Link extraction from the simplified content
148
+ # ------------------------------------------
149
+ def _extract_links(readable_soup: BeautifulSoup, base_url: str, max_links: int) -> list[tuple[str, str]]:
150
+ """
151
+ Layman's terms: pull out clickable links from the article content only,
152
+ turn them into absolute URLs, drop junk, dedupe, and cap the list.
153
  """
154
  seen = set()
155
+ links: list[tuple[str, str]] = []
156
+
157
+ for a in readable_soup.find_all("a", href=True):
158
+ href = a.get("href").strip()
159
+ # Ignore anchors, mailto, javascript, and empty
160
+ if not href or href.startswith("#") or href.startswith("mailto:") or href.startswith("javascript:"):
 
161
  continue
162
+
163
+ # Resolve relative URLs and strip URL fragments (#section)
164
  absolute = urljoin(base_url, href)
165
+ absolute, _ = urldefrag(absolute)
166
+
167
+ if absolute in seen:
168
  continue
169
+ seen.add(absolute)
170
+
171
+ text = a.get_text(" ", strip=True)
172
+ # Keep link text concise
173
+ if len(text) > 120:
174
+ text = text[:117] + "…"
175
 
176
+ links.append((text or absolute, absolute))
177
 
178
+ if len(links) >= max_links > 0:
179
+ break
 
180
 
181
+ return links
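A small demonstration of the cleanup rules above (relative resolution, fragment stripping, dedupe, junk filtering), using invented markup:

```python
# Demonstrates the link-cleanup rules on invented markup.
snippet = BeautifulSoup(
    '<p><a href="/docs#intro">Docs</a> <a href="/docs">Docs again</a> '
    '<a href="mailto:hi@example.com">Mail</a> <a href="https://other.org/a">Other</a></p>',
    "lxml",
)
print(_extract_links(snippet, "https://example.com/page", max_links=10))
# [('Docs', 'https://example.com/docs'), ('Other', 'https://other.org/a')]
# "/docs#intro" and "/docs" collapse into one entry; the mailto: link is skipped.
```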
182
 
183
 
184
+ # -------------------------
185
+ # Formatter: compact output
186
+ # -------------------------
187
+ def _format_markdown(meta: dict, body: str, body_truncated: bool, links: list[tuple[str, str]],
188
+ include_text: bool, include_metadata: bool, include_links: bool, verbosity: str) -> str:
189
  """
190
+ Layman's terms: turn the pieces into a neat, compact Markdown string.
 
191
  """
192
+ lines = []
193
+
194
+ # Title header
195
+ title = meta.get("title") or meta.get("domain") or "Untitled"
196
+ lines.append(f"# {title}")
197
+
198
+ # Metadata (compact)
199
+ if include_metadata:
200
+ md = []
201
+ # Only show fields that exist to keep things tight
202
+ if meta.get("description"):
203
+ md.append(f"- **Description:** {meta['description']}")
204
+ if meta.get("site_name"):
205
+ md.append(f"- **Site:** {meta['site_name']}")
206
+ if meta.get("canonical"):
207
+ md.append(f"- **Canonical:** {meta['canonical']}")
208
+ if meta.get("lang"):
209
+ md.append(f"- **Language:** {meta['lang']}")
210
+ if meta.get("fetched_url"):
211
+ md.append(f"- **Fetched From:** {meta['fetched_url']}")
212
+
213
+ if md:
214
+ lines.append("## Metadata")
215
+ lines.extend(md)
216
+
217
+ # Body text
218
+ if include_text and body:
219
+ # For "Brief", show a very small excerpt even after truncation
220
+ if verbosity == "Brief":
221
+ brief, was_more = _truncate(body, 800)
222
+ lines.append("## Text")
223
+ lines.append(brief)
224
+ if was_more or body_truncated:
225
+ lines.append("\n> (Trimmed for brevity)")
226
+ else:
227
+ lines.append("## Text")
228
+ lines.append(body)
229
+ if body_truncated:
230
+ lines.append("\n> (Trimmed for brevity)")
231
+
232
+ # Links
233
+ if include_links and links:
234
+ lines.append(f"## Links ({len(links)})")
235
+ for text, url in links:
236
+ lines.append(f"- [{text}]({url})")
237
+
238
+ return "\n\n".join(lines).strip()
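For a sense of the shape this formatter returns, a call with minimal made-up inputs produces Markdown along these lines:

```python
# Minimal made-up call showing the shape of the Markdown output.
demo_md = _format_markdown(
    meta={"title": "Example", "description": "A demo page", "domain": "example.com"},
    body="Some extracted body text.",
    body_truncated=False,
    links=[("Docs", "https://example.com/docs")],
    include_text=True, include_metadata=True, include_links=True,
    verbosity="Standard",
)
print(demo_md)
# # Example
#
# ## Metadata
#
# - **Description:** A demo page
#
# ## Text
#
# Some extracted body text.
#
# ## Links (1)
#
# - [Docs](https://example.com/docs)
```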
239
+
240
+
241
+ # --------------------------------
242
+ # Gradio-facing function (the app)
243
+ # --------------------------------
244
+ def extract_relevant(
245
+ url: str,
246
+ verbosity: str = "Standard",
247
+ include_metadata: bool = True,
248
+ include_text: bool = True,
249
+ include_links: bool = True,
250
+ max_chars: int = 3000,
251
+ max_links: int = 20
252
+ ) -> str:
253
  """
254
+ Layman's terms: the main button action.
255
+ Given a URL, fetch the page, extract just the good stuff, and return a compact Markdown summary.
256
  """
257
+ if not url or not url.strip():
258
+ return "Please enter a valid URL."
259
+
260
  try:
261
+ resp = _http_get(url)
262
+ resp.raise_for_status()
263
  except requests.exceptions.RequestException as e:
264
+ return f"An error occurred: {e}"
265
+
266
+ # Respect the final resolved URL (after redirects)
267
+ final_url = str(resp.url)
268
+
269
+ # Only process HTML-ish responses
270
+ ctype = resp.headers.get("Content-Type", "")
271
+ if "html" not in ctype.lower():
272
+ return f"Unsupported content type for extraction: {ctype or 'unknown'}"
273
+
274
+ # Decode as text (requests usually sets encoding; otherwise guess)
275
+ resp.encoding = resp.encoding or resp.apparent_encoding
276
+ html = resp.text
277
+
278
+ # Full page soup (to extract metadata accurately)
279
+ full_soup = BeautifulSoup(html, "lxml")
280
+ meta = _extract_metadata(full_soup, final_url)
281
+
282
+ # Extract main body text using Readability
283
+ body_text, readable_soup = _extract_main_text(html)
284
+
285
+ # If the body is suspiciously empty, fall back to a simpler text strategy
286
+ if not body_text:
287
+ fallback_text = full_soup.get_text(" ", strip=True)
288
+ body_text = _normalize_whitespace(fallback_text)
289
+
290
+ # Enforce verbosity presets unless user overrides via slider
291
+ preset_caps = {"Brief": 1200, "Standard": 3000, "Full": 999999}
292
+ target_cap = preset_caps.get(verbosity, 3000)
293
+ # Use the *smaller* of user cap and preset to keep things tidy
294
+ cap = min(max_chars if max_chars > 0 else target_cap, target_cap)
295
+ body_text, truncated = _truncate(body_text, cap) if include_text else ("", False)
296
+
297
+ # Extract links from the readable portion only (cleaner than whole DOM)
298
+ links = _extract_links(readable_soup, final_url, max_links=max_links if include_links else 0)
299
+
300
+ # Build compact Markdown
301
+ md = _format_markdown(
302
+ meta=meta,
303
+ body=body_text,
304
+ body_truncated=truncated,
305
+ links=links,
306
+ include_text=include_text,
307
+ include_metadata=include_metadata,
308
+ include_links=include_links,
309
+ verbosity=verbosity
310
  )
311
+ return md or "No content could be extracted."
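Because this same function backs both the UI button and the MCP tool, it can also be called directly in Python; the URL below is a placeholder.

```python
# Direct call (placeholder URL); the UI button and the MCP tool invoke this same function.
result = extract_relevant(
    "https://example.com/article",   # placeholder URL
    verbosity="Brief",               # tighter cap on the body text
    include_links=False,             # metadata + text only
    max_chars=1000,
)
print(result[:500])
```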
312
+
313
+
314
+ # -----------------
315
+ # Gradio UI (Blocks)
316
+ # -----------------
317
+ with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
318
+ # Title & subtitle for clarity
319
+ gr.Markdown("# Fetch MCP — Clean Extract")
320
+ gr.Markdown(
321
+ "Extract **title**, **metadata**, **clean text**, and **links** without the noisy HTML. "
322
+ "Use Verbosity and caps to keep it tight."
323
  )
324
 
325
+ with gr.Row():
326
+ url_in = gr.Textbox(label="URL", placeholder="https://example.com/article")
327
+ fetch_btn = gr.Button("Fetch Clean Content")
328
+
329
+ with gr.Accordion("Options", open=False):
330
+ with gr.Row():
331
+ verbosity = gr.Dropdown(
332
+ label="Verbosity",
333
+ choices=["Brief", "Standard", "Full"],
334
+ value="Standard",
335
+ info="Controls how much text you get back."
336
+ )
337
+ max_chars = gr.Slider(
338
+ 400, 12000, value=3000, step=100,
339
+ label="Max Characters (body text)",
340
+ info="Hard cap for body text. Lower = less verbose."
341
+ )
342
+ max_links = gr.Slider(
343
+ 0, 100, value=20, step=1,
344
+ label="Max Links",
345
+ info="Limit how many hyperlinks we include."
346
+ )
347
+ with gr.Row():
348
+ include_metadata = gr.Checkbox(value=True, label="Include Metadata")
349
+ include_text = gr.Checkbox(value=True, label="Include Main Text")
350
+ include_links = gr.Checkbox(value=True, label="Include Links")
351
+
352
+ # Output as Markdown (compact and readable)
353
+ out = gr.Markdown(label="Result")
354
+
355
+ # Wire up the click
356
  fetch_btn.click(
357
+ fn=extract_relevant,
358
+ inputs=[url_in, verbosity, include_metadata, include_text, include_links, max_chars, max_links],
359
+ outputs=out
360
  )
361
 
362
+ # Keep MCP server enabled
363
  if __name__ == "__main__":
364
  demo.launch(mcp_server=True)