Nymbo committed

Commit b708e4c · verified · 1 Parent(s): 60bdd74

Update app.py

Files changed (1)
  1. app.py +351 -89
app.py CHANGED
@@ -1,102 +1,364 @@
- # app.py
- # Hugging Face Space: Cleaner web-page fetcher
- # -------------------------------------------------------------
- # Fetches a URL and returns a concise, human-readable snapshot:
- #   Title
- #   Meta description
- #   Main text (readability-extracted)
- #   Hyperlinks (anchor text → absolute URL)
- # -------------------------------------------------------------
-
- import requests                   # HTTP client
- from bs4 import BeautifulSoup     # HTML parsing
- from readability import Document  # Boiler-plate removal
- from urllib.parse import urljoin  # Build absolute link URLs
- import gradio as gr               # UI framework
-
- def extract_relevant_text(html: str, base_url: str) -> str:
-     """
-     Convert raw HTML into a clean, plain-text summary.
-     - html: the page's HTML source
-     - base_url: needed for resolving relative <a href="">
-     Returns a formatted string ready for display.
-     """
-     # 1) Let readability isolate the primary article/content
-     doc = Document(html)
-     title = doc.short_title()
-
-     summary_html = doc.summary()  # cleaned, minimal HTML
-     summary_soup = BeautifulSoup(summary_html, "lxml")
-
-     # 2) Grab visible paragraph & list text
-     body_parts = [
-         tag.get_text(" ", strip=True)
-         for tag in summary_soup.find_all(["p", "li"])
-         if tag.get_text(strip=True)
      ]
-     main_text = "\n\n".join(body_parts) or "[No main text extracted]"

-     # 3) Extract meta description from the *full* document
-     full_soup = BeautifulSoup(html, "lxml")
-     meta_desc = ""
-     meta_tag = full_soup.find("meta", attrs={"name": "description"})
-     if meta_tag and meta_tag.get("content"):
-         meta_desc = meta_tag["content"].strip()
-     else:  # Fallback to Open Graph description
-         og_tag = full_soup.find("meta", attrs={"property": "og:description"})
-         if og_tag and og_tag.get("content"):
-             meta_desc = og_tag["content"].strip()
-
-     # 4) Build a neat list of hyperlinks (anchor text → absolute URL)
-     links = []
-     for a in summary_soup.find_all("a", href=True):
-         href_abs = urljoin(base_url, a["href"])
-         text = a.get_text(" ", strip=True) or "[link]"
-         links.append(f"• {text} → {href_abs}")
-
-     # 5) Compose the final plaintext output
-     sections = [
-         f"Title: {title}",
-         f"Description: {meta_desc or '[None]'}",
-         f"Body:\n{main_text}",
-         "Links:\n" + ("\n".join(links) if links else "[No links]")
      ]
-     return "\n\n".join(sections)


- def fetch_content(url: str) -> str:
      """
-     Fetch the URL and return a concise summary.
-     Includes basic error handling for network issues.
      """
      try:
-         # Friendly user-agent prevents some 403s
-         headers = {"User-Agent": "Mozilla/5.0 (compatible; CleanFetcher/1.0)"}
-         response = requests.get(url, headers=headers, timeout=15)
-         response.raise_for_status()  # 4xx/5xx exception
-
-         return extract_relevant_text(response.text, url)
-
-     except requests.exceptions.RequestException as err:
-         # Any network or HTTP error bubbles up here
-         return f"[Error] {err}"
-
-
- # -------------------------- Gradio UI --------------------------
- demo = gr.Interface(
-     fn=fetch_content,
-     inputs=gr.Textbox(label="URL", placeholder="https://example.com"),
-     outputs=gr.Textbox(
-         label="Clean Page Snapshot",
-         interactive=False,
-         lines=25,  # taller box for readability
-     ),
-     title="Clean Web Snapshot",
-     description="Enter a URL to retrieve a tidy text summary (title, description, main content, and links).",
-     allow_flagging="never",
-     theme="Nymbo/Nymbo_Theme",
- )

  if __name__ == "__main__":
-     # Expose as an MCP server so you can chain it with other Spaces
  demo.launch(mcp_server=True)
 
+ # File: main/app.py
+ # Purpose: Fetch only the "relevant" page content (title, key metadata, clean body text, and hyperlinks)
+ #          instead of returning full HTML. Output is compact and configurable to reduce verbosity.
+
+ import gradio as gr                    # UI library
+ import requests                        # HTTP client
+ from bs4 import BeautifulSoup          # HTML parsing
+ from readability import Document       # Readability algorithm to isolate main content
+ from urllib.parse import urljoin, urldefrag, urlparse  # URL helpers
+ import re                              # For whitespace cleanup and simple formatting
+
+
+ # -------------------------------
+ # HTTP fetching with sane defaults
+ # -------------------------------
+ def _http_get(url: str) -> requests.Response:
+     """
+     Make an HTTP GET request with headers and a timeout.
+     Layman's terms: downloads the webpage safely and politely.
+     """
+     headers = {
+         "User-Agent": "Mozilla/5.0 (compatible; NymboFetcher/1.0; +https://example.com)",
+         "Accept-Language": "en-US,en;q=0.9",
+         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+     }
+     # Short timeouts so the app isn't stuck forever
+     return requests.get(url, headers=headers, timeout=15)
+
+
+ # ----------------------------------------
+ # Helpers: text cleanup & friendly trimming
+ # ----------------------------------------
+ def _normalize_whitespace(text: str) -> str:
+     """
+     Layman's terms: squash weird spacing and too many blank lines.
+     """
+     text = re.sub(r"[ \t\u00A0]+", " ", text)              # collapse runs of spaces
+     text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text.strip())  # max 1 blank line at a time
+     return text.strip()
+
+
+ def _truncate(text: str, max_chars: int) -> tuple[str, bool]:
+     """
+     Layman's terms: cut the text if it’s too long and tell the caller if we cut it.
+     """
+     if max_chars is None or max_chars <= 0 or len(text) <= max_chars:
+         return text, False
+     return text[:max_chars].rstrip() + " …", True
+
+
+ def _domain_of(url: str) -> str:
+     """
+     Layman's terms: show a friendly domain like example.com.
+     """
+     try:
+         return urlparse(url).netloc or ""
+     except Exception:
+         return ""
+

+ # -----------------------------------
+ # Metadata extraction (title, etc.)
+ # -----------------------------------
+ def _extract_metadata(soup: BeautifulSoup, final_url: str) -> dict:
+     """
+     Layman's terms: grab useful fields like title, description, site name, and canonical link.
+     """
+     meta = {}

+     # Title preference: <title> > og:title > twitter:title
+     title_candidates = [
+         (soup.title.string if soup.title and soup.title.string else None),
+         _og(soup, "og:title"),
+         _meta(soup, "twitter:title"),
      ]
+     meta["title"] = next((t.strip() for t in title_candidates if t and t.strip()), "")

+     # Description preference: meta[name=description] > og:description > twitter:description
+     desc_candidates = [
+         _meta(soup, "description"),
+         _og(soup, "og:description"),
+         _meta(soup, "twitter:description"),
      ]
+     meta["description"] = next((d.strip() for d in desc_candidates if d and d.strip()), "")
+
+     # Canonical URL if provided (helps dedupe / standardize)
+     link_canonical = soup.find("link", rel=lambda v: v and "canonical" in v)
+     meta["canonical"] = (link_canonical.get("href") or "").strip() if link_canonical else ""
+
+     # Site name (nice for context)
+     meta["site_name"] = (_og(soup, "og:site_name") or "").strip()
+
+     # Language (if present)
+     html_tag = soup.find("html")
+     meta["lang"] = (html_tag.get("lang") or "").strip() if html_tag else ""
+
+     # Final resolved URL and domain
+     meta["fetched_url"] = final_url
+     meta["domain"] = _domain_of(final_url)
+
+     return meta
+
+
+ def _meta(soup: BeautifulSoup, name: str) -> str | None:
+     tag = soup.find("meta", attrs={"name": name})
+     return tag.get("content") if tag and tag.has_attr("content") else None


+ def _og(soup: BeautifulSoup, prop: str) -> str | None:
+     tag = soup.find("meta", attrs={"property": prop})
+     return tag.get("content") if tag and tag.has_attr("content") else None
+
+
+ # ---------------------------------------------------------
+ # Main content extraction with Readability + gentle cleanup
+ # ---------------------------------------------------------
+ def _extract_main_text(html: str) -> tuple[str, BeautifulSoup]:
+     """
+     Layman's terms: use Readability to find the article body, then clean it to plain text.
+     Returns (clean_text, soup_of_readable_html) for link scraping.
      """
+     # Readability gives us a simplified article HTML
+     doc = Document(html)
+     readable_html = doc.summary(html_partial=True)
+
+     # Parse the simplified HTML so we can clean it up further
+     s = BeautifulSoup(readable_html, "lxml")
+
+     # Remove obviously noisy elements if present
+     for sel in ["script", "style", "noscript", "iframe", "svg"]:
+         for tag in s.select(sel):
+             tag.decompose()
+
+     # Extract text with paragraphs preserved, then normalize whitespace
+     text_parts = []
+     for p in s.find_all(["p", "li", "h2", "h3", "h4", "blockquote"]):
+         # Keep list items and headers to retain structure without being too verbose
+         chunk = p.get_text(" ", strip=True)
+         if chunk:
+             text_parts.append(chunk)
+
+     clean_text = _normalize_whitespace("\n\n".join(text_parts))
+     return clean_text, s
+
+
+ # ------------------------------------------
+ # Link extraction from the simplified content
+ # ------------------------------------------
+ def _extract_links(readable_soup: BeautifulSoup, base_url: str, max_links: int) -> list[tuple[str, str]]:
      """
+     Layman's terms: pull out clickable links from the article content only,
+     turn them into absolute URLs, drop junk, dedupe, and cap the list.
+     """
+     seen = set()
+     links: list[tuple[str, str]] = []
+
+     for a in readable_soup.find_all("a", href=True):
+         href = a.get("href").strip()
+         # Ignore anchors, mailto, javascript, and empty
+         if not href or href.startswith("#") or href.startswith("mailto:") or href.startswith("javascript:"):
+             continue
+
+         # Resolve relative URLs and strip URL fragments (#section)
+         absolute = urljoin(base_url, href)
+         absolute, _ = urldefrag(absolute)
+
+         if absolute in seen:
+             continue
+         seen.add(absolute)
+
+         text = a.get_text(" ", strip=True)
+         # Keep link text concise
+         if len(text) > 120:
+             text = text[:117] + "…"
+
+         links.append((text or absolute, absolute))
+
+         if len(links) >= max_links > 0:
+             break
+
+     return links
+
+
+ # -------------------------
+ # Formatter: compact output
+ # -------------------------
+ def _format_markdown(meta: dict, body: str, body_truncated: bool, links: list[tuple[str, str]],
+                      include_text: bool, include_metadata: bool, include_links: bool, verbosity: str) -> str:
+     """
+     Layman's terms: turn the pieces into a neat, compact Markdown string.
+     """
+     lines = []
+
+     # Title header
+     title = meta.get("title") or meta.get("domain") or "Untitled"
+     lines.append(f"# {title}")
+
+     # Metadata (compact)
+     if include_metadata:
+         md = []
+         # Only show fields that exist to keep things tight
+         if meta.get("description"):
+             md.append(f"- **Description:** {meta['description']}")
+         if meta.get("site_name"):
+             md.append(f"- **Site:** {meta['site_name']}")
+         if meta.get("canonical"):
+             md.append(f"- **Canonical:** {meta['canonical']}")
+         if meta.get("lang"):
+             md.append(f"- **Language:** {meta['lang']}")
+         if meta.get("fetched_url"):
+             md.append(f"- **Fetched From:** {meta['fetched_url']}")
+
+         if md:
+             lines.append("## Metadata")
+             lines.extend(md)
+
+     # Body text
+     if include_text and body:
+         # For "Brief", show a very small excerpt even after truncation
+         if verbosity == "Brief":
+             brief, was_more = _truncate(body, 800)
+             lines.append("## Text")
+             lines.append(brief)
+             if was_more or body_truncated:
+                 lines.append("\n> (Trimmed for brevity)")
+         else:
+             lines.append("## Text")
+             lines.append(body)
+             if body_truncated:
+                 lines.append("\n> (Trimmed for brevity)")
+
+     # Links
+     if include_links and links:
+         lines.append(f"## Links ({len(links)})")
+         for text, url in links:
+             lines.append(f"- [{text}]({url})")
+
+     return "\n\n".join(lines).strip()
+
+
+ # --------------------------------
+ # Gradio-facing function (the app)
+ # --------------------------------
+ def extract_relevant(
+     url: str,
+     verbosity: str = "Standard",
+     include_metadata: bool = True,
+     include_text: bool = True,
+     include_links: bool = True,
+     max_chars: int = 3000,
+     max_links: int = 20
+ ) -> str:
+     """
+     Layman's terms: the main button action.
+     Given a URL, fetch the page, extract just the good stuff, and return a compact Markdown summary.
+     """
+     if not url or not url.strip():
+         return "Please enter a valid URL."
+
      try:
+         resp = _http_get(url)
+         resp.raise_for_status()
+     except requests.exceptions.RequestException as e:
+         return f"An error occurred: {e}"
+
+     # Respect the final resolved URL (after redirects)
+     final_url = str(resp.url)
+
+     # Only process HTML-ish responses
+     ctype = resp.headers.get("Content-Type", "")
+     if "html" not in ctype.lower():
+         return f"Unsupported content type for extraction: {ctype or 'unknown'}"
+
+     # Decode as text (requests usually sets encoding; otherwise guess)
+     resp.encoding = resp.encoding or resp.apparent_encoding
+     html = resp.text
+
+     # Full page soup (to extract metadata accurately)
+     full_soup = BeautifulSoup(html, "lxml")
+     meta = _extract_metadata(full_soup, final_url)
+
+     # Extract main body text using Readability
+     body_text, readable_soup = _extract_main_text(html)
+
+     # If the body is suspiciously empty, fall back to a simpler text strategy
+     if not body_text:
+         fallback_text = full_soup.get_text(" ", strip=True)
+         body_text = _normalize_whitespace(fallback_text)
+
+     # Enforce verbosity presets unless user overrides via slider
+     preset_caps = {"Brief": 1200, "Standard": 3000, "Full": 999999}
+     target_cap = preset_caps.get(verbosity, 3000)
+     # Use the *smaller* of user cap and preset to keep things tidy
+     cap = min(max_chars if max_chars > 0 else target_cap, target_cap)
+     body_text, truncated = _truncate(body_text, cap) if include_text else ("", False)
+
+     # Extract links from the readable portion only (cleaner than whole DOM)
+     links = _extract_links(readable_soup, final_url, max_links=max_links if include_links else 0)
+
+     # Build compact Markdown
+     md = _format_markdown(
+         meta=meta,
+         body=body_text,
+         body_truncated=truncated,
+         links=links,
+         include_text=include_text,
+         include_metadata=include_metadata,
+         include_links=include_links,
+         verbosity=verbosity
+     )
+     return md or "No content could be extracted."
+
+
+ # -----------------
+ # Gradio UI (Blocks)
+ # -----------------
+ with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
+     # Title & subtitle for clarity
+     gr.Markdown("# Fetch MCP — Clean Extract")
+     gr.Markdown(
+         "Extract **title**, **metadata**, **clean text**, and **links** — without the noisy HTML. "
+         "Use Verbosity and caps to keep it tight."
+     )
+
+     with gr.Row():
+         url_in = gr.Textbox(label="URL", placeholder="https://example.com/article")
+         fetch_btn = gr.Button("Fetch Clean Content")
+
+     with gr.Accordion("Options", open=False):
+         with gr.Row():
+             verbosity = gr.Dropdown(
+                 label="Verbosity",
+                 choices=["Brief", "Standard", "Full"],
+                 value="Standard",
+                 info="Controls how much text you get back."
+             )
+             max_chars = gr.Slider(
+                 400, 12000, value=3000, step=100,
+                 label="Max Characters (body text)",
+                 info="Hard cap for body text. Lower = less verbose."
+             )
+             max_links = gr.Slider(
+                 0, 100, value=20, step=1,
+                 label="Max Links",
+                 info="Limit how many hyperlinks we include."
+             )
+         with gr.Row():
+             include_metadata = gr.Checkbox(value=True, label="Include Metadata")
+             include_text = gr.Checkbox(value=True, label="Include Main Text")
+             include_links = gr.Checkbox(value=True, label="Include Links")
+
+     # Output as Markdown (compact and readable)
+     out = gr.Markdown(label="Result")
+
+     # Wire up the click
+     fetch_btn.click(
+         fn=extract_relevant,
+         inputs=[url_in, verbosity, include_metadata, include_text, include_links, max_chars, max_links],
+         outputs=out
+     )

+ # Keep MCP server enabled
  if __name__ == "__main__":
  demo.launch(mcp_server=True)
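
For reference, here is a minimal, untested sketch of calling the new `extract_relevant` entry point directly, outside the Gradio UI. It assumes app.py's dependencies (requests, beautifulsoup4, readability-lxml, lxml, gradio) are installed, that the snippet sits next to app.py, and that network access is available; importing `app` builds the Blocks UI but does not launch it because of the `__main__` guard. The URL and argument values are illustrative only.

# Hypothetical smoke test (not part of this commit).
from app import extract_relevant

# Ask for a brief, link-capped summary of a single page.
summary_md = extract_relevant(
    "https://example.com",
    verbosity="Brief",          # "Brief" | "Standard" | "Full"
    include_metadata=True,
    include_text=True,
    include_links=True,
    max_chars=1200,             # body-text cap; the Brief preset also caps at 1200
    max_links=5,
)
print(summary_md)               # compact Markdown: title, metadata bullets, text, links

When a cap is hit, `_truncate` appends " …" and `_format_markdown` adds a "> (Trimmed for brevity)" note, so callers should not treat the returned body as the complete page text.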