Nymbo committed
Commit 60bdd74 · verified · 1 Parent(s): ac9f3b0

Update app.py

Files changed (1)
app.py +89 -351
app.py CHANGED
@@ -1,364 +1,102 @@
-# File: main/app.py
-# Purpose: Fetch only the "relevant" page content (title, key metadata, clean body text, and hyperlinks)
-# instead of returning full HTML. Output is compact and configurable to reduce verbosity.
-
-import gradio as gr  # UI library
-import requests  # HTTP client
-from bs4 import BeautifulSoup  # HTML parsing
-from readability import Document  # Readability algorithm to isolate main content
-from urllib.parse import urljoin, urldefrag, urlparse  # URL helpers
-import re  # For whitespace cleanup and simple formatting
-
-
-# -------------------------------
-# HTTP fetching with sane defaults
-# -------------------------------
-def _http_get(url: str) -> requests.Response:
-    """
-    Make an HTTP GET request with headers and a timeout.
-    Layman's terms: downloads the webpage safely and politely.
-    """
-    headers = {
-        "User-Agent": "Mozilla/5.0 (compatible; NymboFetcher/1.0; +https://example.com)",
-        "Accept-Language": "en-US,en;q=0.9",
-        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
-    }
-    # Short timeouts so the app isn't stuck forever
-    return requests.get(url, headers=headers, timeout=15)
-
-
-# ----------------------------------------
-# Helpers: text cleanup & friendly trimming
-# ----------------------------------------
-def _normalize_whitespace(text: str) -> str:
-    """
-    Layman's terms: squash weird spacing and too many blank lines.
-    """
-    text = re.sub(r"[ \t\u00A0]+", " ", text)  # collapse runs of spaces
-    text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text.strip())  # max 1 blank line at a time
-    return text.strip()
-
-
-def _truncate(text: str, max_chars: int) -> tuple[str, bool]:
-    """
-    Layman's terms: cut the text if it’s too long and tell the caller if we cut it.
-    """
-    if max_chars is None or max_chars <= 0 or len(text) <= max_chars:
-        return text, False
-    return text[:max_chars].rstrip() + " …", True
-
-
-def _domain_of(url: str) -> str:
-    """
-    Layman's terms: show a friendly domain like example.com.
-    """
-    try:
-        return urlparse(url).netloc or ""
-    except Exception:
-        return ""
-

-# -----------------------------------
-# Metadata extraction (title, etc.)
-# -----------------------------------
-def _extract_metadata(soup: BeautifulSoup, final_url: str) -> dict:
-    """
-    Layman's terms: grab useful fields like title, description, site name, and canonical link.
-    """
-    meta = {}

-    # Title preference: <title> > og:title > twitter:title
-    title_candidates = [
-        (soup.title.string if soup.title and soup.title.string else None),
-        _og(soup, "og:title"),
-        _meta(soup, "twitter:title"),
     ]
-    meta["title"] = next((t.strip() for t in title_candidates if t and t.strip()), "")

-    # Description preference: meta[name=description] > og:description > twitter:description
-    desc_candidates = [
-        _meta(soup, "description"),
-        _og(soup, "og:description"),
-        _meta(soup, "twitter:description"),
     ]
-    meta["description"] = next((d.strip() for d in desc_candidates if d and d.strip()), "")
-
-    # Canonical URL if provided (helps dedupe / standardize)
-    link_canonical = soup.find("link", rel=lambda v: v and "canonical" in v)
-    meta["canonical"] = (link_canonical.get("href") or "").strip() if link_canonical else ""
-
-    # Site name (nice for context)
-    meta["site_name"] = (_og(soup, "og:site_name") or "").strip()
-
-    # Language (if present)
-    html_tag = soup.find("html")
-    meta["lang"] = (html_tag.get("lang") or "").strip() if html_tag else ""
-
-    # Final resolved URL and domain
-    meta["fetched_url"] = final_url
-    meta["domain"] = _domain_of(final_url)
-
-    return meta
-
-
-def _meta(soup: BeautifulSoup, name: str) -> str | None:
-    tag = soup.find("meta", attrs={"name": name})
-    return tag.get("content") if tag and tag.has_attr("content") else None


-def _og(soup: BeautifulSoup, prop: str) -> str | None:
-    tag = soup.find("meta", attrs={"property": prop})
-    return tag.get("content") if tag and tag.has_attr("content") else None
-
-
-# ---------------------------------------------------------
-# Main content extraction with Readability + gentle cleanup
-# ---------------------------------------------------------
-def _extract_main_text(html: str) -> tuple[str, BeautifulSoup]:
-    """
-    Layman's terms: use Readability to find the article body, then clean it to plain text.
-    Returns (clean_text, soup_of_readable_html) for link scraping.
     """
-    # Readability gives us a simplified article HTML
-    doc = Document(html)
-    readable_html = doc.summary(html_partial=True)
-
-    # Parse the simplified HTML so we can clean it up further
-    s = BeautifulSoup(readable_html, "lxml")
-
-    # Remove obviously noisy elements if present
-    for sel in ["script", "style", "noscript", "iframe", "svg"]:
-        for tag in s.select(sel):
-            tag.decompose()
-
-    # Extract text with paragraphs preserved, then normalize whitespace
-    text_parts = []
-    for p in s.find_all(["p", "li", "h2", "h3", "h4", "blockquote"]):
-        # Keep list items and headers to retain structure without being too verbose
-        chunk = p.get_text(" ", strip=True)
-        if chunk:
-            text_parts.append(chunk)
-
-    clean_text = _normalize_whitespace("\n\n".join(text_parts))
-    return clean_text, s
-
-
-# ------------------------------------------
-# Link extraction from the simplified content
-# ------------------------------------------
-def _extract_links(readable_soup: BeautifulSoup, base_url: str, max_links: int) -> list[tuple[str, str]]:
     """
-    Layman's terms: pull out clickable links from the article content only,
-    turn them into absolute URLs, drop junk, dedupe, and cap the list.
-    """
-    seen = set()
-    links: list[tuple[str, str]] = []
-
-    for a in readable_soup.find_all("a", href=True):
-        href = a.get("href").strip()
-        # Ignore anchors, mailto, javascript, and empty
-        if not href or href.startswith("#") or href.startswith("mailto:") or href.startswith("javascript:"):
-            continue
-
-        # Resolve relative URLs and strip URL fragments (#section)
-        absolute = urljoin(base_url, href)
-        absolute, _ = urldefrag(absolute)
-
-        if absolute in seen:
-            continue
-        seen.add(absolute)
-
-        text = a.get_text(" ", strip=True)
-        # Keep link text concise
-        if len(text) > 120:
-            text = text[:117] + "…"
-
-        links.append((text or absolute, absolute))
-
-        if len(links) >= max_links > 0:
-            break
-
-    return links
-
-
-# -------------------------
-# Formatter: compact output
-# -------------------------
-def _format_markdown(meta: dict, body: str, body_truncated: bool, links: list[tuple[str, str]],
-                     include_text: bool, include_metadata: bool, include_links: bool, verbosity: str) -> str:
-    """
-    Layman's terms: turn the pieces into a neat, compact Markdown string.
-    """
-    lines = []
-
-    # Title header
-    title = meta.get("title") or meta.get("domain") or "Untitled"
-    lines.append(f"# {title}")
-
-    # Metadata (compact)
-    if include_metadata:
-        md = []
-        # Only show fields that exist to keep things tight
-        if meta.get("description"):
-            md.append(f"- **Description:** {meta['description']}")
-        if meta.get("site_name"):
-            md.append(f"- **Site:** {meta['site_name']}")
-        if meta.get("canonical"):
-            md.append(f"- **Canonical:** {meta['canonical']}")
-        if meta.get("lang"):
-            md.append(f"- **Language:** {meta['lang']}")
-        if meta.get("fetched_url"):
-            md.append(f"- **Fetched From:** {meta['fetched_url']}")
-
-        if md:
-            lines.append("## Metadata")
-            lines.extend(md)
-
-    # Body text
-    if include_text and body:
-        # For "Brief", show a very small excerpt even after truncation
-        if verbosity == "Brief":
-            brief, was_more = _truncate(body, 800)
-            lines.append("## Text")
-            lines.append(brief)
-            if was_more or body_truncated:
-                lines.append("\n> (Trimmed for brevity)")
-        else:
-            lines.append("## Text")
-            lines.append(body)
-            if body_truncated:
-                lines.append("\n> (Trimmed for brevity)")
-
-    # Links
-    if include_links and links:
-        lines.append(f"## Links ({len(links)})")
-        for text, url in links:
-            lines.append(f"- [{text}]({url})")
-
-    return "\n\n".join(lines).strip()
-
-
-# --------------------------------
-# Gradio-facing function (the app)
-# --------------------------------
-def extract_relevant(
-    url: str,
-    verbosity: str = "Standard",
-    include_metadata: bool = True,
-    include_text: bool = True,
-    include_links: bool = True,
-    max_chars: int = 3000,
-    max_links: int = 20
-) -> str:
-    """
-    Layman's terms: the main button action.
-    Given a URL, fetch the page, extract just the good stuff, and return a compact Markdown summary.
-    """
-    if not url or not url.strip():
-        return "Please enter a valid URL."
-
     try:
-        resp = _http_get(url)
-        resp.raise_for_status()
-    except requests.exceptions.RequestException as e:
-        return f"An error occurred: {e}"
-
-    # Respect the final resolved URL (after redirects)
-    final_url = str(resp.url)
-
-    # Only process HTML-ish responses
-    ctype = resp.headers.get("Content-Type", "")
-    if "html" not in ctype.lower():
-        return f"Unsupported content type for extraction: {ctype or 'unknown'}"
-
-    # Decode as text (requests usually sets encoding; otherwise guess)
-    resp.encoding = resp.encoding or resp.apparent_encoding
-    html = resp.text
-
-    # Full page soup (to extract metadata accurately)
-    full_soup = BeautifulSoup(html, "lxml")
-    meta = _extract_metadata(full_soup, final_url)
-
-    # Extract main body text using Readability
-    body_text, readable_soup = _extract_main_text(html)
-
-    # If the body is suspiciously empty, fall back to a simpler text strategy
-    if not body_text:
-        fallback_text = full_soup.get_text(" ", strip=True)
-        body_text = _normalize_whitespace(fallback_text)
-
-    # Enforce verbosity presets unless user overrides via slider
-    preset_caps = {"Brief": 1200, "Standard": 3000, "Full": 999999}
-    target_cap = preset_caps.get(verbosity, 3000)
-    # Use the *smaller* of user cap and preset to keep things tidy
-    cap = min(max_chars if max_chars > 0 else target_cap, target_cap)
-    body_text, truncated = _truncate(body_text, cap) if include_text else ("", False)
-
-    # Extract links from the readable portion only (cleaner than whole DOM)
-    links = _extract_links(readable_soup, final_url, max_links=max_links if include_links else 0)
-
-    # Build compact Markdown
-    md = _format_markdown(
-        meta=meta,
-        body=body_text,
-        body_truncated=truncated,
-        links=links,
-        include_text=include_text,
-        include_metadata=include_metadata,
-        include_links=include_links,
-        verbosity=verbosity
-    )
-    return md or "No content could be extracted."
-
-
-# -----------------
-# Gradio UI (Blocks)
-# -----------------
-with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
-    # Title & subtitle for clarity
-    gr.Markdown("# Fetch MCP — Clean Extract")
-    gr.Markdown(
-        "Extract **title**, **metadata**, **clean text**, and **links** — without the noisy HTML. "
-        "Use Verbosity and caps to keep it tight."
-    )
-
-    with gr.Row():
-        url_in = gr.Textbox(label="URL", placeholder="https://example.com/article")
-        fetch_btn = gr.Button("Fetch Clean Content")
-
-    with gr.Accordion("Options", open=False):
-        with gr.Row():
-            verbosity = gr.Dropdown(
-                label="Verbosity",
-                choices=["Brief", "Standard", "Full"],
-                value="Standard",
-                info="Controls how much text you get back."
-            )
-            max_chars = gr.Slider(
-                400, 12000, value=3000, step=100,
-                label="Max Characters (body text)",
-                info="Hard cap for body text. Lower = less verbose."
-            )
-            max_links = gr.Slider(
-                0, 100, value=20, step=1,
-                label="Max Links",
-                info="Limit how many hyperlinks we include."
-            )
-        with gr.Row():
-            include_metadata = gr.Checkbox(value=True, label="Include Metadata")
-            include_text = gr.Checkbox(value=True, label="Include Main Text")
-            include_links = gr.Checkbox(value=True, label="Include Links")
-
-    # Output as Markdown (compact and readable)
-    out = gr.Markdown(label="Result")
-
-    # Wire up the click
-    fetch_btn.click(
-        fn=extract_relevant,
-        inputs=[url_in, verbosity, include_metadata, include_text, include_links, max_chars, max_links],
-        outputs=out
-    )

-# Keep MCP server enabled
 if __name__ == "__main__":
     demo.launch(mcp_server=True)
 
+# app.py
+# Hugging Face Space: Cleaner web-page fetcher
+# -------------------------------------------------------------
+# Fetches a URL and returns a concise, human-readable snapshot:
+#   • Title
+#   • Meta description
+#   • Main text (readability-extracted)
+#   • Hyperlinks (anchor text → absolute URL)
+# -------------------------------------------------------------
+
+import requests                   # HTTP client
+from bs4 import BeautifulSoup     # HTML parsing
+from readability import Document  # Boiler-plate removal
+from urllib.parse import urljoin  # Build absolute link URLs
+import gradio as gr               # UI framework
+
+def extract_relevant_text(html: str, base_url: str) -> str:
+    """
+    Convert raw HTML into a clean, plain-text summary.
+    - html:     the page's HTML source
+    - base_url: needed for resolving relative <a href="">
+    Returns a formatted string ready for display.
+    """
+    # 1) Let readability isolate the primary article/content
+    doc = Document(html)
+    title = doc.short_title()

+    summary_html = doc.summary()  # cleaned, minimal HTML
+    summary_soup = BeautifulSoup(summary_html, "lxml")

+    # 2) Grab visible paragraph & list text
+    body_parts = [
+        tag.get_text(" ", strip=True)
+        for tag in summary_soup.find_all(["p", "li"])
+        if tag.get_text(strip=True)
     ]
+    main_text = "\n\n".join(body_parts) or "[No main text extracted]"

+    # 3) Extract meta description from the *full* document
+    full_soup = BeautifulSoup(html, "lxml")
+    meta_desc = ""
+    meta_tag = full_soup.find("meta", attrs={"name": "description"})
+    if meta_tag and meta_tag.get("content"):
+        meta_desc = meta_tag["content"].strip()
+    else:  # Fallback to Open Graph description
+        og_tag = full_soup.find("meta", attrs={"property": "og:description"})
+        if og_tag and og_tag.get("content"):
+            meta_desc = og_tag["content"].strip()
+
+    # 4) Build a neat list of hyperlinks (anchor text → absolute URL)
+    links = []
+    for a in summary_soup.find_all("a", href=True):
+        href_abs = urljoin(base_url, a["href"])
+        text = a.get_text(" ", strip=True) or "[link]"
+        links.append(f"• {text} → {href_abs}")
+
+    # 5) Compose the final plaintext output
+    sections = [
+        f"Title: {title}",
+        f"Description: {meta_desc or '[None]'}",
+        f"Body:\n{main_text}",
+        "Links:\n" + ("\n".join(links) if links else "[No links]")
     ]
+    return "\n\n".join(sections)


+def fetch_content(url: str) -> str:
     """
+    Fetch the URL and return a concise summary.
+    Includes basic error handling for network issues.
     """
     try:
+        # Friendly user-agent prevents some 403s
+        headers = {"User-Agent": "Mozilla/5.0 (compatible; CleanFetcher/1.0)"}
+        response = requests.get(url, headers=headers, timeout=15)
+        response.raise_for_status()  # 4xx/5xx exception
+
+        return extract_relevant_text(response.text, url)
+
+    except requests.exceptions.RequestException as err:
+        # Any network or HTTP error bubbles up here
+        return f"[Error] {err}"
+
+
+# -------------------------- Gradio UI --------------------------
+demo = gr.Interface(
+    fn=fetch_content,
+    inputs=gr.Textbox(label="URL", placeholder="https://example.com"),
+    outputs=gr.Textbox(
+        label="Clean Page Snapshot",
+        interactive=False,
+        lines=25,  # taller box for readability
+    ),
+    title="Clean Web Snapshot",
+    description="Enter a URL to retrieve a tidy text summary (title, description, main content, and links).",
+    allow_flagging="never",
+    theme="Nymbo/Nymbo_Theme",
+)

 if __name__ == "__main__":
+    # Expose as an MCP server so you can chain it with other Spaces
     demo.launch(mcp_server=True)
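
For a quick local check of the new fetch_content helper, a minimal sketch such as the one below should work. It is an illustration rather than part of the commit: the script name check_fetch.py and the target URL are assumptions, and it presumes app.py from this commit is importable alongside the requests, beautifulsoup4, readability-lxml, lxml, and gradio packages.

# check_fetch.py (hypothetical helper script, not part of this commit)
# Calls the fetch_content() function defined in the new app.py above
# and prints the plain-text snapshot (Title, Description, Body, Links).
from app import fetch_content

if __name__ == "__main__":
    # example.com is only a placeholder target URL
    print(fetch_content("https://example.com"))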