Nymbo committed
Commit 5832786 · verified · 1 Parent(s): 301aafb

Update app.py

Files changed (1): app.py (+37 -226)
app.py CHANGED
@@ -1,241 +1,52 @@
-# File: app.py
-# Purpose: Fetch only relevant text (not raw HTML) from a URL, with a smart extractor and a clean fallback.
-
-import re
 import gradio as gr
 import requests
-from urllib.parse import urlparse
 from bs4 import BeautifulSoup
 
-# Try to import the smart extractor — if unavailable for any reason, we'll gracefully fall back.
-try:
-    import trafilatura  # Best-in-class main-content extractor
-except Exception:
-    trafilatura = None
-
-
-# ---------------------------
-# Helpers
-# ---------------------------
-
-def _normalize_url(url: str) -> str:
-    """
-    Make sure the URL has a scheme; default to https:// if missing.
-    This avoids 'Invalid URL' errors for inputs like 'example.com'.
-    """
-    url = (url or "").strip()
-    if not url:
-        raise ValueError("Please enter a URL.")
-    parsed = urlparse(url)
-    if not parsed.scheme:
-        url = "https://" + url
-    return url
-
-
-def _fetch(url: str, timeout: int = 15) -> requests.Response:
-    """
-    Fetch the page with a reasonable User-Agent and a timeout.
-    We allow redirects and raise on HTTP errors for clearer feedback.
-    """
-    headers = {
-        "User-Agent": "Mozilla/5.0 (compatible; SmartTextFetcher/1.0; +https://huggingface.co/spaces)",
-        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
-    }
-    resp = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True)
-    resp.raise_for_status()
-    return resp
-
-
-def _extract_title_from_html(html: str) -> str | None:
-    """
-    Pull the <title> tag text, if present, for a nicer header.
-    """
-    try:
-        soup = BeautifulSoup(html, "lxml")
-    except Exception:
-        soup = BeautifulSoup(html, "html.parser")
-    title_tag = soup.find("title")
-    if title_tag and title_tag.string:
-        return title_tag.string.strip()
-    return None
-
-
-def _visible_text_from_html(html: str) -> str:
-    """
-    Fallback extractor: strip scripts/styles/nav/ads and return visible text.
-    This is a heuristic but works well when the smart extractor isn't available.
-    """
-    try:
-        soup = BeautifulSoup(html, "lxml")
-    except Exception:
-        soup = BeautifulSoup(html, "html.parser")
-
-    # Remove obviously non-content elements (scripts, styles, nav, ads, etc.)
-    for tag in soup(["script", "style", "noscript", "svg", "path", "form",
-                     "header", "footer", "nav", "aside", "iframe"]):
-        tag.decompose()
-
-    # Also drop things that *look* like boilerplate (by id/class)
-    kill_words = (
-        "nav", "menu", "footer", "header", "cookie", "banner", "subscribe",
-        "newsletter", "sidebar", "social", "share", "comment", "promo",
-        "advert", "ad", "breadcrumbs", "breadcrumb"
-    )
-    for el in soup.find_all(True):
-        meta = " ".join(el.get("class", []) + [el.get("id", "")]).lower()
-        if any(k in meta for k in kill_words):
-            el.decompose()
-
-    # Prefer the main/article region when available
-    main = soup.find("article") or soup.find("main") or soup.body or soup
-
-    # Gather block-level text for nicer spacing
-    blocks = main.find_all(["h1","h2","h3","h4","h5","h6","p","li","blockquote"])
-    lines = []
-    for b in blocks:
-        text = b.get_text(" ", strip=True)
-        if len(text) >= 3:
-            lines.append(text)
-
-    text = "\n\n".join(lines) if lines else main.get_text(" ", strip=True)
-
-    # Tidy whitespace a bit
-    text = re.sub(r"\n{3,}", "\n\n", text)
-    text = re.sub(r"[ \t]{2,}", " ", text)
-    return text.strip()
-
-
-def _smart_main_text(html: str, url: str) -> str | None:
+def fetch_content(url):
     """
-    Use Trafilatura to pull the main/article text when available.
-    Returns None if extraction fails.
+    This function takes a URL as input, fetches its HTML content,
+    extracts the clean text, and returns it as a string.
+    It includes error handling for common request issues.
     """
-    if not trafilatura:
-        return None
     try:
-        # Trafilatura works best when we give it the page content as a string.
-        extracted = trafilatura.extract(
-            html,
-            include_comments=False,
-            favor_recall=True,  # a bit more inclusive; better for varied sites
-            url=url
-        )
-        return (extracted or None)
-    except Exception:
-        return None
-
-
-def _truncate(text: str, max_chars: int) -> str:
-    """
-    Optional safety guard so outputs stay small and responsive.
-    """
-    if max_chars is None or max_chars <= 0:
-        return text
-    if len(text) <= max_chars:
-        return text
-    return text[:max_chars].rstrip() + "\n\n… [truncated]"
-
-
-# ---------------------------
-# Gradio callback
-# ---------------------------
-
-def fetch_relevant_text(
-    url: str,
-    mode: str = "Main article (smart)",
-    max_chars: int = 8000,
-    include_title: bool = True
-) -> str:
-    """
-    Main entry point powered by the UI.
-    - Validates the URL
-    - Fetches the page
-    - Extracts relevant text based on the selected mode
-    - Optionally prefixes the page <title>
-    """
-    try:
-        url = _normalize_url(url)
-        resp = _fetch(url)
-        content_type = (resp.headers.get("Content-Type") or "").lower()
-
-        # If it's plain text, just return it directly.
-        if "text/plain" in content_type and resp.text:
-            text = resp.text.strip()
-
-        # If it's HTML/XHTML, run extractors.
-        elif "text/html" in content_type or "application/xhtml+xml" in content_type or "<html" in resp.text.lower():
-            html = resp.text
-
-            if mode.startswith("Main article"):
-                text = _smart_main_text(html, url) or _visible_text_from_html(html)
-            elif mode.startswith("Visible text"):
-                text = _visible_text_from_html(html)
-            else:  # Raw HTML (debug) — exposed in UI but not the default
-                text = html
-
-            # Prepend title if requested and available (but don't do it in Raw HTML mode)
-            if include_title and not mode.startswith("Raw HTML"):
-                title = _extract_title_from_html(html)
-                if title:
-                    text = f"{title}\n\n{text}".strip()
-
-        else:
-            # Not HTML or plain text — provide a helpful hint.
-            return f"Unsupported content type: {content_type or 'unknown'}. This tool extracts text from HTML pages."
-
-        # Keep response snappy by trimming overly long outputs.
-        return _truncate(text, max_chars)
-
+        # Send a GET request to the URL with a user-agent header
+        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
+        # Raise an exception for bad status codes (4xx or 5xx)
+        response.raise_for_status()
+
+        # Create a BeautifulSoup object to parse the HTML content
+        soup = BeautifulSoup(response.text, 'html.parser')
+
+        # Find and remove all script and style elements from the parsed HTML
+        for script_or_style in soup(['script', 'style']):
+            script_or_style.decompose()
+
+        # Get the text from the soup and clean up whitespace
+        text = soup.get_text()
+        # Split the text into lines and strip leading/trailing whitespace from each
+        lines = (line.strip() for line in text.splitlines())
+        # Further break down lines into phrases and strip whitespace
+        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+        # Join the chunks back together with a single newline, removing any blank lines
+        clean_text = '\n'.join(chunk for chunk in chunks if chunk)
+
+        return clean_text
+
     except requests.exceptions.RequestException as e:
-        return f"Network error while fetching the URL: {e}"
-    except ValueError as ve:
-        return f"{ve}"
-    except Exception as e:
-        return f"Unexpected error: {e}"
-
-
-# ---------------------------
-# UI (Gradio)
-# ---------------------------
-
-with gr.Blocks(theme="Nymbo/Nymbo_Theme", title="Fetch MCP — Smart Text") as demo:
-    # Headline & quick explainer (human-friendly)
-    gr.Markdown(
-        """
-        # Fetch MCP — Smart Text
-        Enter a URL and get the **relevant text** back (not the raw HTML).
-        Use “Main article (smart)” for best results; switch to “Visible text” if needed.
-        """
-    )
-
-    with gr.Row():
-        url_in = gr.Textbox(label="URL", placeholder="https://example.com/some-article", scale=4)
-    with gr.Row():
-        mode_in = gr.Radio(
-            label="Extraction mode",
-            choices=[
-                "Main article (smart)",
-                "Visible text (fallback)",
-                "Raw HTML (debug)"
-            ],
-            value="Main article (smart)",
-            scale=3
-        )
-        include_title_in = gr.Checkbox(label="Include page title", value=True, scale=1)
-        max_chars_in = gr.Slider(
-            label="Max characters (to keep responses fast)",
-            minimum=500,
-            maximum=40000,
-            step=500,
-            value=8000,
-            scale=3
-        )
-
-    out = gr.Textbox(label="Extracted Text", lines=22)
-
-    go = gr.Button("Fetch")
-    go.click(fetch_relevant_text, inputs=[url_in, mode_in, max_chars_in, include_title_in], outputs=out)
 
-# Keep MCP server flag for your Space
+        # Handle any network-related errors
+        return f"An error occurred: {e}"
+
+# Define the Gradio interface
+demo = gr.Interface(
+    fn=fetch_content,
+    inputs=gr.Textbox(label="URL", placeholder="https://www.google.com"),
+    outputs=gr.Textbox(label="Cleaned Page Content"),
+    title="Webpage Text Extractor",
+    description="Enter a URL to fetch the clean text content of the web page, stripped of HTML, scripts, and styles.",
+    allow_flagging="never",
+    theme="Nymbo/Nymbo_Theme"
+)
 
 if __name__ == "__main__":
-    demo.launch(mcp_server=True)
+    # Launch the Gradio app
+    demo.launch()
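
For reviewers who want to sanity-check the new fetch_content outside the Gradio UI, here is a minimal sketch. It assumes it is saved next to the updated app.py; the file name quick_check.py and the example URL are illustrative, not part of the commit:

    # quick_check.py - hypothetical helper, not part of this commit
    from app import fetch_content  # importing app.py builds the Interface but does not launch it

    if __name__ == "__main__":
        text = fetch_content("https://example.com")  # placeholder URL
        print(text[:300])  # preview the first 300 characters of the cleaned text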