Nymbo committed on
Commit
32db98e
·
verified ·
1 Parent(s): 598ab39

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +128 -39
app.py CHANGED
@@ -1,46 +1,135 @@
1
- import gradio as gr
2
- import requests
3
- from bs4 import BeautifulSoup
 
4
 
5
- def fetch_content(url):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  """
7
- This function takes a URL as input, fetches its HTML content,
8
- extracts only the relevant text content, and returns it as a clean string.
9
- It includes error handling for common request issues.
10
  """
11
  try:
12
- response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
13
- response.raise_for_status() # Raises an HTTPError for bad responses (4xx or 5xx)
14
-
15
- # Parse the HTML content
16
- soup = BeautifulSoup(response.text, 'html.parser')
17
-
18
- # Remove script, style, and other non-content elements
19
- for element in soup(["script", "style", "header", "footer", "nav", "aside"]):
20
- element.extract()
21
-
22
- # Get the text content
23
- text = soup.get_text()
24
-
25
- # Clean up the text: remove extra whitespace, empty lines, etc.
26
- lines = (line.strip() for line in text.splitlines())
27
- chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
28
- text = '\n'.join(chunk for chunk in chunks if chunk)
29
-
30
- return text
31
  except requests.exceptions.RequestException as e:
32
- return f"An error occurred: {e}"
33
-
34
- # Define the Gradio interface
35
- demo = gr.Interface(
36
- fn=fetch_content,
37
- inputs=gr.Textbox(label="URL", placeholder="https://www.example.com"),
38
- outputs=gr.Textbox(label="Page Content"),
39
- title="Web Page Text Extractor",
40
- description="Enter a URL to extract and display only the text content of the web page.",
41
- allow_flagging="never",
42
- theme="Nymbo/Nymbo_Theme"
43
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
 
45
  if __name__ == "__main__":
46
- demo.launch(mcp_server=True)
 
1
+ # File: app.py
2
+ # Purpose: Provide a Gradio UI that fetches a URL and (by default) returns only the
3
+ # relevant human-readable text instead of the entire HTML.
4
+ # Includes robust error handling, timeouts, and fallbacks.
5
 
6
+ import gradio as gr # UI framework
7
+ import requests # makes the web request
8
+ from bs4 import BeautifulSoup # parses HTML so we can work with it
9
+ from readability import Document # distills a page down to its "main article" content
10
+ import html # unescapes HTML entities like &amp; → &
11
+ import re # simple cleanup with regex
12
+
13
+ # ---- helper: clean up text nicely -------------------------------------------
14
+ def _normalize_text(text: str) -> str:
15
+ """
16
+ Layman's terms: This tidies up the text we extracted so it looks nice.
17
+ - Converts & things back to normal characters
18
+ - Collapses too many blank lines
19
+ - Trims leading/trailing whitespace
20
+ """
21
+ text = html.unescape(text)
22
+ # Replace Windows/Mac line endings with Unix and normalize spaces
23
+ text = text.replace("\r\n", "\n").replace("\r", "\n")
24
+ # Collapse 3+ newlines down to 2
25
+ text = re.sub(r"\n{3,}", "\n\n", text)
26
+ return text.strip()
27
+
28
# ---- core fetcher: return main text or raw HTML ------------------------------
def fetch_page(url: str, extract_text: bool = True) -> str:
    """
    Download a web page and return its content as a string.

    Args:
        url: Address to fetch (e.g. "https://example.com/article").
        extract_text: When True (default), return only the main readable
            text of the page. When False, return the raw HTML unchanged
            (the original behavior of this app).

    Returns:
        The extracted text, the raw HTML, or a human-readable error string
        if the request or extraction fails. This function never raises.
    """
    try:
        # Browser-like User-Agent avoids some naive bot blocks; the timeout
        # keeps a dead server from hanging the UI forever.
        resp = requests.get(
            url,
            headers={"User-Agent": "Mozilla/5.0 (compatible; FetchMCP/1.0)"},
            timeout=15,
            allow_redirects=True,
        )
        resp.raise_for_status()  # surface 4xx/5xx as an exception
    except requests.exceptions.RequestException as e:
        # Any network/HTTP problem is reported as text rather than raised.
        return f"Request error: {e}"

    # If the user wants full HTML, behave like the original version.
    if not extract_text:
        return resp.text

    # Try readability first (usually best for articles/blog posts).
    try:
        # readability distills the page down to its "main article" HTML.
        doc = Document(resp.text)
        main_html = doc.summary(html_partial=True)

        # Parse the article-only HTML and keep just the visible text.
        soup = _make_soup(main_html)
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()

        main_text = _normalize_text(soup.get_text(separator="\n"))

        # If extraction produced almost nothing, fall back to the simpler path.
        if len(main_text.split()) < 40:
            raise ValueError("Readability extraction too short; falling back")

        return main_text

    except Exception:
        # Simpler fallback: strip tags from the whole page, minus obvious noise.
        try:
            soup = _make_soup(resp.text)

            # Remove common non-content elements.
            for tag in soup(["script", "style", "noscript", "header", "footer", "nav", "form", "aside"]):
                tag.decompose()

            # Prefer an explicit main-content container when one exists.
            candidate = soup.find("main") or soup.find("article") or soup.find("div", attrs={"role": "main"})
            text = candidate.get_text(separator="\n") if candidate else soup.get_text(separator="\n")

            return _normalize_text(text)

        except Exception as e:
            # Last resort: give raw HTML if even fallback parsing fails.
            return f"Extraction fallback failed: {e}\n\n--- Raw HTML below ---\n{resp.text}"


def _make_soup(markup: str):
    """Parse HTML with lxml when available, else the stdlib parser.

    Fix: the original hard-coded "lxml", so a missing lxml install made
    BeautifulSoup raise FeatureNotFound on every call — both extraction
    paths failed and the function always dumped raw HTML with an error.
    """
    try:
        return BeautifulSoup(markup, "lxml")
    except Exception:
        # bs4 raises FeatureNotFound when lxml is not installed; the built-in
        # html.parser is slower but always present.
        return BeautifulSoup(markup, "html.parser")
95
+
96
# ---- Gradio UI ---------------------------------------------------------------
# The app window: paste a URL, choose readable-text vs. full HTML, click Fetch.
with gr.Blocks(theme="Nymbo/Nymbo_Theme", title="Fetch MCP") as demo:
    gr.Markdown(
        """
        # Fetch MCP
        Small utility that fetches a web page and returns **just the readable text** by default
        *(toggle off to get the full HTML like before)*.
        """
    )

    with gr.Row():
        url_box = gr.Textbox(label="URL", placeholder="https://example.com/article", lines=1)
    with gr.Row():
        text_only = gr.Checkbox(value=True, label="Extract only the main readable text (recommended)")

    run_button = gr.Button("Fetch", variant="primary")

    # Plain-text output so it’s easy to copy or pipe into other tools.
    result_box = gr.Textbox(
        label="Output",
        lines=20,
        interactive=False,
        placeholder="Fetched content will appear here…",
    )

    # Clicking the button runs the fetcher with the URL and toggle state.
    run_button.click(fn=fetch_page, inputs=[url_box, text_only], outputs=result_box)
132
 
133
# Script entry point: launch the Gradio app.
# mcp_server=True additionally exposes the app's functions as MCP tools,
# so fetch_page can be called programmatically by MCP clients.
if __name__ == "__main__":
    demo.launch(mcp_server=True)