Nymbo committed
Commit 39ae379 · verified · 1 Parent(s): 072fbc2

Update app.py

Files changed (1)
  app.py +219 -20
app.py CHANGED
@@ -1,29 +1,228 @@
 
 
 
 
 
 
+# File: app.py
+# Purpose: Fetch only the readable text from a web page and return it as Markdown
+# Notes: This version is more efficient and user-friendly than returning raw HTML.
+
+import re
+import time
 import gradio as gr
 import requests
+from urllib.parse import urlparse
+from bs4 import BeautifulSoup     # used as a fallback cleaner
+from readability import Document  # isolates the "main content" like reader view
+import html2text                  # converts HTML to Markdown
-
-def fetch_content(url):
-    """
-    This function takes a URL as input, fetches its HTML content,
-    and returns it as a string. It includes error handling for common
-    request issues.
-    """
-    try:
-        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
-        response.raise_for_status()  # Raises an HTTPError for bad responses (4xx or 5xx)
-        return response.text
-    except requests.exceptions.RequestException as e:
-        return f"An error occurred: {e}"
-
-# Define the Gradio interface
-demo = gr.Interface(
-    fn=fetch_content,
-    inputs=gr.Textbox(label="URL", placeholder="https://www.google.com"),
-    outputs=gr.Textbox(label="Page Content"),
-    title="Fetch MCP",
-    description="Enter a URL to fetch the full HTML content of the web page.",
-    allow_flagging="never",
-    theme="Nymbo/Nymbo_Theme"
-)
+
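
The new imports pull in three third-party packages on top of gradio and requests. Assuming the standard PyPI distributions (the readability module ships as readability-lxml), a matching requirements sketch, not part of this commit, would be:

    # assumed dependency list; package names are the usual PyPI ones
    gradio
    requests
    beautifulsoup4
    readability-lxml
    html2text
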
+# ----------------------------
+# Simple in-memory cache (tiny LRU-ish)
+# ----------------------------
+# layman's terms: we remember recent results so repeated requests for the same URL are instant
+_CACHE = {}
+_CACHE_ORDER = []
+_CACHE_MAX = 64
+_CACHE_TTL_SECONDS = 10 * 60  # 10 minutes
+
+def _cache_get(key):
+    # layman's terms: give me the saved value if it's still fresh
+    item = _CACHE.get(key)
+    if not item:
+        return None
+    value, ts = item
+    if time.time() - ts > _CACHE_TTL_SECONDS:
+        _CACHE.pop(key, None)
+        return None
+    # refresh order
+    if key in _CACHE_ORDER:
+        _CACHE_ORDER.remove(key)
+    _CACHE_ORDER.append(key)
+    return value
+
+def _cache_set(key, value):
+    # layman's terms: save a result and keep the list from growing too large
+    _CACHE[key] = (value, time.time())
+    if key in _CACHE_ORDER:
+        _CACHE_ORDER.remove(key)
+    _CACHE_ORDER.append(key)
+    while len(_CACHE_ORDER) > _CACHE_MAX:
+        oldest = _CACHE_ORDER.pop(0)
+        _CACHE.pop(oldest, None)
+
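
To make the cache behavior concrete, a minimal sketch of how the two helpers interact (URLs and values are illustrative):

    _cache_set("https://a.example", "# A")  # stored with a timestamp
    _cache_get("https://a.example")         # -> "# A" within the 10-minute TTL
    _cache_get("https://b.example")         # -> None (never stored)
    # once 64 entries exist, the least recently touched key is evicted
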
+# ----------------------------
+# Helpers
+# ----------------------------
+
+def _normalize_url(url: str) -> str:
+    """
+    layman's terms: if the user forgot 'https://', add it.
+    """
+    url = url.strip()
+    parsed = urlparse(url)
+    if not parsed.scheme:
+        url = "https://" + url
+    return url
+
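
For example (illustrative inputs):

    _normalize_url("example.com")             # -> "https://example.com"
    _normalize_url("  http://example.com  ")  # -> "http://example.com" (whitespace stripped, scheme kept)
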
+def _too_large_via_head(url: str, max_bytes: int = 2_500_000) -> bool:
+    """
+    layman's terms: do a quick HEAD request; if the server says the page is huge, we skip it.
+    """
+    try:
+        head = requests.head(
+            url,
+            allow_redirects=True,
+            timeout=(5, 10),
+            headers={
+                "User-Agent": "Mozilla/5.0",
+                "Accept": "text/html,application/xhtml+xml",
+                "Accept-Encoding": "gzip, deflate, br",
+            },
+        )
+        size = head.headers.get("Content-Length")
+        if size and size.isdigit():
+            return int(size) > max_bytes
+    except requests.exceptions.RequestException:
+        # layman's terms: if HEAD fails, we won't block the GET just because of that
+        pass
+    return False
+
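
Note that Content-Length is optional, so this check is best-effort: if the server omits the header or rejects HEAD, the function returns False and the GET proceeds. A stricter variant, sketched here as an assumption rather than what this commit does, would cap the bytes actually read:

    # hypothetical hard cap on the response body
    resp = requests.get(url, stream=True, timeout=(5, 20))
    body = resp.raw.read(max_bytes + 1, decode_content=True)
    if len(body) > max_bytes:
        raise ValueError("page exceeds max_bytes")
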
+def _fetch_html(url: str) -> str:
+    """
+    layman's terms: download the page HTML (not images/scripts), with a timeout and errors handled.
+    """
+    resp = requests.get(
+        url,
+        timeout=(5, 20),  # connect, read
+        headers={
+            "User-Agent": "Mozilla/5.0",
+            "Accept": "text/html,application/xhtml+xml",
+            "Accept-Encoding": "gzip, deflate, br",
+            "Accept-Language": "en-US,en;q=0.8",
+        },
+    )
+    resp.raise_for_status()
+
+    # Only proceed for text/html payloads
+    ctype = resp.headers.get("Content-Type", "")
+    if "text/html" not in ctype.lower():
+        # layman's terms: if it's not a web page (maybe JSON/PDF/etc), just give raw text
+        return resp.text
+
+    # Respect declared encoding where possible
+    resp.encoding = resp.encoding or "utf-8"
+    return resp.text
+
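
One caveat: for text/* responses without a declared charset, requests sets resp.encoding to ISO-8859-1 rather than None, so the `or "utf-8"` fallback rarely fires. If garbled characters appear, a common alternative (a sketch, not what this commit does) is to trust the content-based guess:

    if "charset" not in ctype.lower():
        resp.encoding = resp.apparent_encoding  # charset detection from the body
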
+def _extract_main_html(html: str) -> str:
+    """
+    layman's terms: use reader mode (Readability) to isolate the main article/body content.
+    Falls back to stripping scripts/styles if Readability can't find a core.
+    """
+    try:
+        doc = Document(html)
+        main_html = doc.summary(html_partial=True)  # main content as HTML
+        # Make sure we still have something useful
+        if main_html and len(main_html) > 40:
+            return main_html
+    except Exception:
+        pass
+
+    # Fallback: strip scripts/styles and return a body-only HTML
+    soup = BeautifulSoup(html, "html.parser")
+    for tag in soup(["script", "style", "noscript"]):
+        tag.decompose()
+    body = soup.body or soup
+    return str(body)
+
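
Readability can also recover the page title, which this commit leaves unused. If a heading were wanted at the top of the output, a small hypothetical extension could be:

    doc = Document(html)
    title = doc.title()  # doc.short_title() is a terser variant
    main_html = doc.summary(html_partial=True)
    # a caller could then prepend "# " + title to the converted Markdown
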
+def _html_to_markdown(html: str) -> str:
+    """
+    layman's terms: convert the cleaned HTML into nice Markdown with links and headings.
+    """
+    h = html2text.HTML2Text()
+    h.ignore_images = True   # don't inline images in Markdown
+    h.ignore_links = False   # keep links as [text](url)
+    h.body_width = 0         # don't hard-wrap lines
+    h.protect_links = True
+    h.single_line_break = True
+    md = h.handle(html)
+
+    # Tidy up excessive blank lines/whitespace
+    md = re.sub(r"\n{3,}", "\n\n", md).strip()
+    return md or "_No readable text found on this page._"
+
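
As a rough illustration of what these settings produce (the exact spacing depends on the html2text version):

    h.handle("<h1>Hello</h1><p>See <a href='https://example.com'>this</a>.</p>")
    # -> roughly "# Hello\nSee [this](https://example.com)."
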
+# ----------------------------
+# Main callable for Gradio
+# ----------------------------
+
+def fetch_markdown(url: str) -> str:
+    """
+    layman's terms: the function the UI calls.
+    Steps:
+      1) sanitize the URL
+      2) quick HEAD check to avoid massive pages
+      3) GET the HTML
+      4) isolate the main content
+      5) convert to Markdown
+      6) return Markdown
+    """
+    if not url or not url.strip():
+        return "_Please enter a URL._"
+
+    try:
+        url = _normalize_url(url)
+
+        # Return cached value if available
+        cached = _cache_get(url)
+        if cached:
+            return cached
+
+        # Optional efficiency: skip very large pages before downloading
+        if _too_large_via_head(url):
+            return "_The page is too large to fetch efficiently (over ~2.5 MB)._"
+
+        # Non-HTML payloads (e.g., JSON) are already returned as raw text by _fetch_html
+        html = _fetch_html(url)
+        main_html = _extract_main_html(html)
+        markdown = _html_to_markdown(main_html)
+
+        _cache_set(url, markdown)
+        return markdown
+
+    except requests.exceptions.RequestException as e:
+        # layman's terms: network or HTTP error
+        return f"_Network error: {e}_"
+    except Exception as e:
+        # layman's terms: any other unexpected error
+        return f"_Unexpected error: {e}_"
+
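
Because fetch_markdown is a plain function, it can be exercised without the UI; a hypothetical quick check:

    print(fetch_markdown("example.com")[:500])  # first 500 chars of the Markdown
    print(fetch_markdown(""))                   # -> "_Please enter a URL._"
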
+# ----------------------------
+# Gradio UI
+# ----------------------------
+with gr.Blocks(theme="Nymbo/Nymbo_Theme", title="Fetch MCP — Markdown") as demo:
+    # layman's terms: a simple header explaining what this tool does
+    gr.Markdown("# Fetch MCP (Markdown)\nFetch a page and show just its readable text as Markdown.")
+
+    with gr.Row():
+        url_box = gr.Textbox(
+            label="URL",
+            placeholder="example.com or https://example.com/article",
+        )
+        fetch_btn = gr.Button("Fetch")
+
+    # layman's terms: show the result as rendered Markdown (not a plain textbox)
+    output_md = gr.Markdown(label="Readable Markdown")
+
+    # layman's terms: helpful example URLs to try with one click
+    gr.Examples(
+        examples=[
+            ["https://en.wikipedia.org/wiki/Hugging_Face"],
+            ["https://huggingface.co/blog"],
+            ["https://www.bbc.com/news"],
+        ],
+        inputs=[url_box],
+    )
+
+    fetch_btn.click(fetch_markdown, inputs=url_box, outputs=output_md)
+    url_box.submit(fetch_markdown, inputs=url_box, outputs=output_md)
 
 if __name__ == "__main__":
     demo.launch(mcp_server=True)
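
With mcp_server=True, recent Gradio releases (with the gradio[mcp] extra installed) also expose fetch_markdown as an MCP tool next to the web UI, using its docstring as the tool description; MCP clients connect via the app's /gradio_api/mcp/ endpoint (the exact path can vary by Gradio version).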