Update app.py
app.py
CHANGED
@@ -1,29 +1,228 @@
 1   import gradio as gr
 2   import requests
 5   """
 8 -     request issues.
 9   """
10   try:
14   except requests.exceptions.RequestException as e:
26 - )
28   if __name__ == "__main__":
29 -     demo.launch(mcp_server=True)
# File: app.py
# Purpose: Fetch only the readable text from a web page and return it as Markdown
# Notes: This version is more efficient and user-friendly than returning raw HTML.

import re
import time
import gradio as gr
import requests
from urllib.parse import urlparse
from bs4 import BeautifulSoup      # used as a fallback cleaner
from readability import Document   # isolates the "main content" like reader view
import html2text                   # converts HTML to Markdown
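# layman's terms: besides gradio and requests, this expects the beautifulsoup4, html2text,
# and readability-lxml packages (the latter provides "from readability import Document"),
# typically pinned in the Space's requirements.txt.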

# ----------------------------
# Simple in-memory cache (tiny LRU-ish)
# ----------------------------
# layman's terms: we remember recent results so repeated requests for the same URL are instant
_CACHE = {}
_CACHE_ORDER = []
_CACHE_MAX = 64
_CACHE_TTL_SECONDS = 10 * 60  # 10 minutes

def _cache_get(key):
    # layman's terms: give me the saved value if it's still fresh
    item = _CACHE.get(key)
    if not item:
        return None
    value, ts = item
    if time.time() - ts > _CACHE_TTL_SECONDS:
        _CACHE.pop(key, None)
        return None
    # refresh order
    if key in _CACHE_ORDER:
        _CACHE_ORDER.remove(key)
    _CACHE_ORDER.append(key)
    return value

def _cache_set(key, value):
    # layman's terms: save a result and keep the list from growing too large
    _CACHE[key] = (value, time.time())
    if key in _CACHE_ORDER:
        _CACHE_ORDER.remove(key)
    _CACHE_ORDER.append(key)
    while len(_CACHE_ORDER) > _CACHE_MAX:
        oldest = _CACHE_ORDER.pop(0)
        _CACHE.pop(oldest, None)
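
# layman's terms: a quick illustration of the two helpers above (hypothetical values):
#   _cache_set("https://example.com", "# Example heading")
#   _cache_get("https://example.com")   -> "# Example heading" until the 10-minute TTL lapses
#   once 64 different URLs are stored, the least recently used entry is dropped first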

# ----------------------------
# Helpers
# ----------------------------

def _normalize_url(url: str) -> str:
    """
    layman's terms: if the user forgot 'https://', add it.
    """
    url = url.strip()
    parsed = urlparse(url)
    if not parsed.scheme:
        url = "https://" + url
    return url
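
# layman's terms: illustrative behaviour of the helper above:
#   _normalize_url("example.com/article")  -> "https://example.com/article"
#   _normalize_url("https://example.com")  -> unchanged (a scheme is already present)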

def _too_large_via_head(url: str, max_bytes: int = 2_500_000) -> bool:
    """
    layman's terms: do a quick HEAD request; if the server says the page is huge, we skip it.
    """
    try:
        head = requests.head(
            url,
            allow_redirects=True,
            timeout=(5, 10),
            headers={
                "User-Agent": "Mozilla/5.0",
                "Accept": "text/html,application/xhtml+xml",
                "Accept-Encoding": "gzip, deflate, br",
            },
        )
        size = head.headers.get("Content-Length")
        if size and size.isdigit():
            return int(size) > max_bytes
    except requests.exceptions.RequestException:
        # layman's terms: if HEAD fails, we won't block the GET just because of that
        pass
    return False
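
# layman's terms: e.g. a response header of "Content-Length: 5000000" makes this return True
# (the page gets skipped); a missing or non-numeric header, or a failed HEAD, returns False.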

def _fetch_html(url: str) -> str:
    """
    layman's terms: download the page HTML (not images/scripts), with a timeout and errors handled.
    """
    resp = requests.get(
        url,
        timeout=(5, 20),  # connect, read
        headers={
            "User-Agent": "Mozilla/5.0",
            "Accept": "text/html,application/xhtml+xml",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "en-US,en;q=0.8",
        },
    )
    resp.raise_for_status()

    # Only proceed for text/html payloads
    ctype = resp.headers.get("Content-Type", "")
    if "text/html" not in ctype.lower():
        # layman's terms: if it's not a web page (maybe JSON/PDF/etc), just give raw text
        return resp.text

    # Respect declared encoding where possible
    resp.encoding = resp.encoding or "utf-8"
    return resp.text
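
# layman's terms: _fetch_html("https://example.com") returns the page's HTML text; network
# problems and 4xx/5xx statuses raise requests.exceptions.RequestException (raise_for_status
# raises HTTPError, a RequestException subclass), which fetch_markdown() below catches and
# turns into a friendly message.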

def _extract_main_html(html: str) -> str:
    """
    layman's terms: use reader mode (Readability) to isolate the main article/body content.
    Falls back to stripping scripts/styles if Readability can't find a core.
    """
    try:
        doc = Document(html)
        main_html = doc.summary(html_partial=True)  # main content as HTML
        # Make sure we still have something useful
        if main_html and len(main_html) > 40:
            return main_html
    except Exception:
        pass

    # Fallback: strip scripts/styles and return a body-only HTML
    soup = BeautifulSoup(html, "html.parser")
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()
    body = soup.body or soup
    return str(body)
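
# layman's terms: for a typical page like "<html><body><nav>menu</nav><article>story
# text</article></body></html>", Readability usually keeps just the <article> part, while
# the BeautifulSoup fallback returns the whole <body> minus <script>/<style>/<noscript>.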

def _html_to_markdown(html: str) -> str:
    """
    layman's terms: convert the cleaned HTML into nice Markdown with links and headings.
    """
    h = html2text.HTML2Text()
    h.ignore_images = True   # don't inline images in Markdown
    h.ignore_links = False   # keep links as [text](url)
    h.body_width = 0         # don't hard-wrap lines
    h.protect_links = True
    h.single_line_break = True
    md = h.handle(html)

    # Tidy up excessive blank lines/whitespace
    md = re.sub(r"\n{3,}", "\n\n", md).strip()
    return md or "_No readable text found on this page._"
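
# layman's terms: roughly, "<h1>Title</h1><p>See <a href='https://example.com'>this</a></p>"
# comes out as "# Title" followed by "See [this](https://example.com)"; links are kept,
# images are dropped, and lines are not hard-wrapped.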

# ----------------------------
# Main callable for Gradio
# ----------------------------

def fetch_markdown(url: str) -> str:
    """
    layman's terms: the function the UI calls.
    Steps:
      1) sanitize the URL
      2) quick HEAD check to avoid massive pages
      3) GET the HTML
      4) isolate the main content
      5) convert to Markdown
      6) return Markdown
    """
    if not url or not url.strip():
        return "_Please enter a URL._"

    try:
        url = _normalize_url(url)

        # Return cached value if available
        cached = _cache_get(url)
        if cached:
            return cached

        # Optional efficiency: skip very large pages before downloading
        if _too_large_via_head(url):
            return "_The page is too large to fetch efficiently (over ~2.5 MB)._"

        html = _fetch_html(url)
        # Non-HTML responses (e.g., JSON) are already handled in _fetch_html, which returns
        # their raw text, so no second content-type check is needed here.

        main_html = _extract_main_html(html)
        markdown = _html_to_markdown(main_html)

        _cache_set(url, markdown)
        return markdown

    except requests.exceptions.RequestException as e:
        # layman's terms: network or HTTP error
        return f"_Network error: {e}_"
    except Exception as e:
        # layman's terms: any other unexpected error
        return f"_Unexpected error: {e}_"
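
# layman's terms: fetch_markdown can also be called directly, e.g.
#   print(fetch_markdown("en.wikipedia.org/wiki/Hugging_Face"))
# which prints the article as Markdown, or an underscore-wrapped (italic) error string on failure.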

# ----------------------------
# Gradio UI
# ----------------------------
with gr.Blocks(theme="Nymbo/Nymbo_Theme", title="Fetch MCP — Markdown") as demo:
    # layman's terms: a simple, centered header explaining what this tool does
    gr.Markdown("# Fetch MCP (Markdown)\nFetch a page and show just its readable text as Markdown.")

    with gr.Row():
        url_box = gr.Textbox(
            label="URL",
            placeholder="example.com or https://example.com/article",
        )
        fetch_btn = gr.Button("Fetch")

    # layman's terms: show the result as rendered Markdown (not a plain textbox)
    output_md = gr.Markdown(label="Readable Markdown")

    # layman's terms: helpful example URLs to try with one click
    gr.Examples(
        examples=[
            ["https://en.wikipedia.org/wiki/Hugging_Face"],
            ["https://huggingface.co/blog"],
            ["https://www.bbc.com/news"],
        ],
        inputs=[url_box],
    )

    fetch_btn.click(fetch_markdown, inputs=url_box, outputs=output_md)
    url_box.submit(fetch_markdown, inputs=url_box, outputs=output_md)
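    # layman's terms: both bindings above do the same thing: clicking "Fetch" or pressing
    # Enter in the URL box runs fetch_markdown and renders its result in the Markdown area.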

if __name__ == "__main__":
    demo.launch(mcp_server=True)
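    # layman's terms: with mcp_server=True (assuming a recent Gradio release with MCP
    # support), the same fetch_markdown function is also exposed as an MCP tool, so MCP
    # clients can call it programmatically in addition to this web UI.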