Nymbo committed on
Commit ed27cf5 · verified · 1 Parent(s): 39ae379

Update app.py

Files changed (1)
  1. app.py +188 -175
app.py CHANGED
@@ -1,228 +1,241 @@
  # File: app.py
- # Purpose: Fetch only the readable text from a web page and return it as Markdown
- # Notes: This version is more efficient and user-friendly than returning raw HTML.

  import re
- import time
  import gradio as gr
  import requests
  from urllib.parse import urlparse
- from bs4 import BeautifulSoup  # used as a fallback cleaner
- from readability import Document  # isolates the "main content" like reader view
- import html2text  # converts HTML to Markdown
-
- # ----------------------------
- # Simple in-memory cache (tiny LRU-ish)
- # ----------------------------
- # layman's terms: we remember recent results so repeated requests for the same URL are instant
- _CACHE = {}
- _CACHE_ORDER = []
- _CACHE_MAX = 64
- _CACHE_TTL_SECONDS = 10 * 60  # 10 minutes
-
- def _cache_get(key):
-     # layman's terms: give me the saved value if it's still fresh
-     item = _CACHE.get(key)
-     if not item:
-         return None
-     value, ts = item
-     if time.time() - ts > _CACHE_TTL_SECONDS:
-         _CACHE.pop(key, None)
-         return None
-     # refresh order
-     if key in _CACHE_ORDER:
-         _CACHE_ORDER.remove(key)
-     _CACHE_ORDER.append(key)
-     return value
-
- def _cache_set(key, value):
-     # layman's terms: save a result and keep the list from growing too large
-     _CACHE[key] = (value, time.time())
-     if key in _CACHE_ORDER:
-         _CACHE_ORDER.remove(key)
-     _CACHE_ORDER.append(key)
-     while len(_CACHE_ORDER) > _CACHE_MAX:
-         oldest = _CACHE_ORDER.pop(0)
-         _CACHE.pop(oldest, None)
-
- # ----------------------------
  # Helpers
- # ----------------------------

  def _normalize_url(url: str) -> str:
      """
-     layman's terms: if the user forgot 'https://', add it.
      """
-     url = url.strip()
      parsed = urlparse(url)
      if not parsed.scheme:
          url = "https://" + url
      return url

- def _too_large_via_head(url: str, max_bytes: int = 2_500_000) -> bool:
      """
-     layman's terms: do a quick HEAD request; if the server says the page is huge, we skip it.
      """
      try:
-         head = requests.head(
-             url,
-             allow_redirects=True,
-             timeout=(5, 10),
-             headers={
-                 "User-Agent": "Mozilla/5.0",
-                 "Accept": "text/html,application/xhtml+xml",
-                 "Accept-Encoding": "gzip, deflate, br",
-             },
-         )
-         size = head.headers.get("Content-Length")
-         if size and size.isdigit():
-             return int(size) > max_bytes
-     except requests.exceptions.RequestException:
-         # layman's terms: if HEAD fails, we won't block the GET just because of that
-         pass
-     return False
-
- def _fetch_html(url: str) -> str:
      """
-     layman's terms: download the page HTML (not images/scripts), with a timeout and errors handled.
      """
-     resp = requests.get(
-         url,
-         timeout=(5, 20),  # connect, read
-         headers={
-             "User-Agent": "Mozilla/5.0",
-             "Accept": "text/html,application/xhtml+xml",
-             "Accept-Encoding": "gzip, deflate, br",
-             "Accept-Language": "en-US,en;q=0.8",
-         },
      )
-     resp.raise_for_status()

-     # Only proceed for text/html payloads
-     ctype = resp.headers.get("Content-Type", "")
-     if "text/html" not in ctype.lower():
-         # layman's terms: if it's not a web page (maybe JSON/PDF/etc), just give raw text
-         return resp.text

-     # Respect declared encoding where possible
-     resp.encoding = resp.encoding or "utf-8"
-     return resp.text

- def _extract_main_html(html: str) -> str:
      """
-     layman's terms: use reader mode (Readability) to isolate the main article/body content.
-     Falls back to stripping scripts/styles if Readability can't find a core.
      """
      try:
-         doc = Document(html)
-         main_html = doc.summary(html_partial=True)  # main content as HTML
-         # Make sure we still have something useful
-         if main_html and len(main_html) > 40:
-             return main_html
      except Exception:
-         pass

-     # Fallback: strip scripts/styles and return a body-only HTML
-     soup = BeautifulSoup(html, "html.parser")
-     for tag in soup(["script", "style", "noscript"]):
-         tag.decompose()
-     body = soup.body or soup
-     return str(body)

- def _html_to_markdown(html: str) -> str:
      """
-     layman's terms: convert the cleaned HTML into nice Markdown with links and headings.
      """
-     h = html2text.HTML2Text()
-     h.ignore_images = True  # don't inline images in Markdown
-     h.ignore_links = False  # keep links as [text](url)
-     h.body_width = 0  # don't hard-wrap lines
-     h.protect_links = True
-     h.single_line_break = True
-     md = h.handle(html)
-
-     # Tidy up excessive blank lines/whitespace
-     md = re.sub(r"\n{3,}", "\n\n", md).strip()
-     return md or "_No readable text found on this page._"
-
- # ----------------------------
- # Main callable for Gradio
- # ----------------------------
-
- def fetch_markdown(url: str) -> str:
      """
-     layman's terms: the function the UI calls.
-     Steps:
-       1) sanitize the URL
-       2) quick HEAD check to avoid massive pages
-       3) GET the HTML
-       4) isolate the main content
-       5) convert to Markdown
-       6) return Markdown
      """
-     if not url or not url.strip():
-         return "_Please enter a URL._"
-
      try:
          url = _normalize_url(url)

-         # Return cached value if available
-         cached = _cache_get(url)
-         if cached:
-             return cached

-         # Optional efficiency: skip very large pages before downloading
-         if _too_large_via_head(url):
-             return "_The page is too large to fetch efficiently (over ~2.5 MB)._"

-         html = _fetch_html(url)
-         # If server returned non-HTML (e.g., JSON), just code-fence it
-         if "text/html" not in (requests.utils.get_encoding_from_headers({"content-type": "text/html"}) or "text/html"):
-             # This condition is a no-op; we already content-typed in _fetch_html.
-             pass

-         main_html = _extract_main_html(html)
-         markdown = _html_to_markdown(main_html)

-         _cache_set(url, markdown)
-         return markdown

      except requests.exceptions.RequestException as e:
-         # layman's terms: network or HTTP error
-         return f"_Network error: {e}_"
      except Exception as e:
-         # layman's terms: any other unexpected error
-         return f"_Unexpected error: {e}_"

- # ----------------------------
- # Gradio UI
- # ----------------------------
- with gr.Blocks(theme="Nymbo/Nymbo_Theme", title="Fetch MCP — Markdown") as demo:
-     # layman's terms: a simple, centered header explaining what this tool does
-     gr.Markdown("# Fetch MCP (Markdown)\nFetch a page and show just its readable text as Markdown.")

      with gr.Row():
-         url_box = gr.Textbox(
-             label="URL",
-             placeholder="example.com or https://example.com/article",
          )
-         fetch_btn = gr.Button("Fetch")
-
-     # layman's terms: show the result as rendered Markdown (not a plain textbox)
-     output_md = gr.Markdown(label="Readable Markdown")
-
-     # layman's terms: helpful example URLs to try with one click
-     gr.Examples(
-         examples=[
-             ["https://en.wikipedia.org/wiki/Hugging_Face"],
-             ["https://huggingface.co/blog"],
-             ["https://www.bbc.com/news"],
-         ],
-         inputs=[url_box],
-     )

-     fetch_btn.click(fetch_markdown, inputs=url_box, outputs=output_md)
-     url_box.submit(fetch_markdown, inputs=url_box, outputs=output_md)

  if __name__ == "__main__":
      demo.launch(mcp_server=True)
  # File: app.py
+ # Purpose: Fetch only relevant text (not raw HTML) from a URL, with a smart extractor and a clean fallback.

  import re
  import gradio as gr
  import requests
  from urllib.parse import urlparse
+ from bs4 import BeautifulSoup
+
+ # Try to import the smart extractor; if it's unavailable for any reason, we'll gracefully fall back.
+ try:
+     import trafilatura  # Best-in-class main-content extractor
+ except Exception:
+     trafilatura = None
+
+
+ # ---------------------------
  # Helpers
+ # ---------------------------

  def _normalize_url(url: str) -> str:
      """
+     Make sure the URL has a scheme; default to https:// if missing.
+     This avoids 'Invalid URL' errors for inputs like 'example.com'.
      """
+     url = (url or "").strip()
+     if not url:
+         raise ValueError("Please enter a URL.")
      parsed = urlparse(url)
      if not parsed.scheme:
          url = "https://" + url
      return url

+
+ def _fetch(url: str, timeout: int = 15) -> requests.Response:
+     """
+     Fetch the page with a reasonable User-Agent and a timeout.
+     We allow redirects and raise on HTTP errors for clearer feedback.
      """
+     headers = {
+         "User-Agent": "Mozilla/5.0 (compatible; SmartTextFetcher/1.0; +https://huggingface.co/spaces)",
+         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+     }
+     resp = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True)
+     resp.raise_for_status()
+     return resp
+
+
+ def _extract_title_from_html(html: str) -> str | None:
+     """
+     Pull the <title> tag text, if present, for a nicer header.
      """
      try:
+         soup = BeautifulSoup(html, "lxml")
+     except Exception:
+         soup = BeautifulSoup(html, "html.parser")
+     title_tag = soup.find("title")
+     if title_tag and title_tag.string:
+         return title_tag.string.strip()
+     return None
+
+
+ def _visible_text_from_html(html: str) -> str:
      """
+     Fallback extractor: strip scripts/styles/nav/ads and return visible text.
+     This is a heuristic but works well when the smart extractor isn't available.
      """
+     try:
+         soup = BeautifulSoup(html, "lxml")
+     except Exception:
+         soup = BeautifulSoup(html, "html.parser")
+
+     # Remove obviously non-content elements (scripts, styles, nav, ads, etc.)
+     for tag in soup(["script", "style", "noscript", "svg", "path", "form",
+                      "header", "footer", "nav", "aside", "iframe"]):
+         tag.decompose()
+
+     # Also drop things that *look* like boilerplate (by id/class)
+     kill_words = (
+         "nav", "menu", "footer", "header", "cookie", "banner", "subscribe",
+         "newsletter", "sidebar", "social", "share", "comment", "promo",
+         "advert", "ad", "breadcrumbs", "breadcrumb"
      )
+     for el in soup.find_all(True):
+         meta = " ".join(el.get("class", []) + [el.get("id", "")]).lower()
+         if any(k in meta for k in kill_words):
+             el.decompose()
+
+     # Prefer the main/article region when available
+     main = soup.find("article") or soup.find("main") or soup.body or soup
+
+     # Gather block-level text for nicer spacing
+     blocks = main.find_all(["h1","h2","h3","h4","h5","h6","p","li","blockquote"])
+     lines = []
+     for b in blocks:
+         text = b.get_text(" ", strip=True)
+         if len(text) >= 3:
+             lines.append(text)
+
+     text = "\n\n".join(lines) if lines else main.get_text(" ", strip=True)

+     # Tidy whitespace a bit
+     text = re.sub(r"\n{3,}", "\n\n", text)
+     text = re.sub(r"[ \t]{2,}", " ", text)
+     return text.strip()


+ def _smart_main_text(html: str, url: str) -> str | None:
      """
+     Use Trafilatura to pull the main/article text when available.
+     Returns None if extraction fails.
      """
+     if not trafilatura:
+         return None
      try:
+         # Trafilatura works best when we give it the page content as a string.
+         extracted = trafilatura.extract(
+             html,
+             include_comments=False,
+             favor_recall=True,  # a bit more inclusive; better for varied sites
+             url=url
+         )
+         return (extracted or None)
      except Exception:
+         return None


+ def _truncate(text: str, max_chars: int) -> str:
      """
+     Optional safety guard so outputs stay small and responsive.
      """
+     if max_chars is None or max_chars <= 0:
+         return text
+     if len(text) <= max_chars:
+         return text
+     return text[:max_chars].rstrip() + "\n\n… [truncated]"
+
+
+ # ---------------------------
+ # Gradio callback
+ # ---------------------------
+
+ def fetch_relevant_text(
+     url: str,
+     mode: str = "Main article (smart)",
+     max_chars: int = 8000,
+     include_title: bool = True
+ ) -> str:
      """
+     Main entry point powered by the UI.
+     - Validates the URL
+     - Fetches the page
+     - Extracts relevant text based on the selected mode
+     - Optionally prefixes the page <title>
      """
      try:
          url = _normalize_url(url)
+         resp = _fetch(url)
+         content_type = (resp.headers.get("Content-Type") or "").lower()
+
+         # If it's plain text, just return it directly.
+         if "text/plain" in content_type and resp.text:
+             text = resp.text.strip()

+         # If it's HTML/XHTML, run extractors.
+         elif "text/html" in content_type or "application/xhtml+xml" in content_type or "<html" in resp.text.lower():
+             html = resp.text

+             if mode.startswith("Main article"):
+                 text = _smart_main_text(html, url) or _visible_text_from_html(html)
+             elif mode.startswith("Visible text"):
+                 text = _visible_text_from_html(html)
+             else:  # Raw HTML (debug) — exposed in UI but not the default
+                 text = html

+             # Prepend title if requested and available (but don't do it in Raw HTML mode)
+             if include_title and not mode.startswith("Raw HTML"):
+                 title = _extract_title_from_html(html)
+                 if title:
+                     text = f"{title}\n\n{text}".strip()

+         else:
+             # Not HTML or plain text — provide a helpful hint.
+             return f"Unsupported content type: {content_type or 'unknown'}. This tool extracts text from HTML pages."

+         # Keep response snappy by trimming overly long outputs.
+         return _truncate(text, max_chars)

      except requests.exceptions.RequestException as e:
+         return f"Network error while fetching the URL: {e}"
+     except ValueError as ve:
+         return f"{ve}"
      except Exception as e:
+         return f"Unexpected error: {e}"
+

+ # ---------------------------
+ # UI (Gradio)
+ # ---------------------------
+
+ with gr.Blocks(theme="Nymbo/Nymbo_Theme", title="Fetch MCP Smart Text") as demo:
+     # Headline & quick explainer (human-friendly)
+     gr.Markdown(
+         """
+         # Fetch MCP — Smart Text
+         Enter a URL and get the **relevant text** back (not the raw HTML).
+         Use “Main article (smart)” for best results; switch to “Visible text” if needed.
+         """
+     )

      with gr.Row():
+         url_in = gr.Textbox(label="URL", placeholder="https://example.com/some-article", scale=4)
+     with gr.Row():
+         mode_in = gr.Radio(
+             label="Extraction mode",
+             choices=[
+                 "Main article (smart)",
+                 "Visible text (fallback)",
+                 "Raw HTML (debug)"
+             ],
+             value="Main article (smart)",
+             scale=3
          )
+         include_title_in = gr.Checkbox(label="Include page title", value=True, scale=1)
+         max_chars_in = gr.Slider(
+             label="Max characters (to keep responses fast)",
+             minimum=500,
+             maximum=40000,
+             step=500,
+             value=8000,
+             scale=3
+         )
+
+     out = gr.Textbox(label="Extracted Text", lines=22)

+     go = gr.Button("Fetch")
+     go.click(fetch_relevant_text, inputs=[url_in, mode_in, max_chars_in, include_title_in], outputs=out)

+ # Keep MCP server flag for your Space
  if __name__ == "__main__":
      demo.launch(mcp_server=True)
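
A quick way to sanity-check this revision locally is a short script against the new helpers and the UI callback. This is only a sketch, not part of the commit: it assumes app.py is importable from the working directory, that its dependencies (gradio, requests, beautifulsoup4, and optionally trafilatura and lxml) are installed, and that network access is available for the final call; the example URL is just a placeholder.

    # smoke_test.py (hypothetical helper script, not included in this commit)
    from app import _normalize_url, _truncate, fetch_relevant_text

    # URL normalization: a bare domain gains an https:// scheme.
    assert _normalize_url("example.com") == "https://example.com"

    # Truncation: long outputs are clipped and marked.
    clipped = _truncate("x" * 10_000, max_chars=100)
    assert clipped.endswith("[truncated]") and len(clipped) < 10_000

    # End-to-end: fetch a page in the default mode and show the start of the result.
    text = fetch_relevant_text("https://example.com", mode="Main article (smart)", max_chars=2000)
    print(text[:400])

If trafilatura is not installed, the "Main article (smart)" mode should silently fall back to the BeautifulSoup-based visible-text extractor, so the script still runs.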