Nymbo committed on
Commit 8e5c5da · verified · 1 Parent(s): 65add20

Update app.py

Files changed (1)
  1. app.py +104 -128
app.py CHANGED
@@ -1,135 +1,111 @@
- # File: app.py
- # Purpose: Provide a Gradio UI that fetches a URL and (by default) returns only the
- # relevant human-readable text instead of the entire HTML.
- # Includes robust error handling, timeouts, and fallbacks.
-
- import gradio as gr  # UI framework
- import requests  # makes the web request
- from bs4 import BeautifulSoup  # parses HTML so we can work with it
- from readability import Document  # distills a page down to its "main article" content
- import html  # unescapes HTML entities like &amp; → &
- import re  # simple cleanup with regex
-
- # ---- helper: clean up text nicely -------------------------------------------
- def _normalize_text(text: str) -> str:
-     """
-     Layman's terms: This tidies up the text we extracted so it looks nice.
-     - Converts &amp;-style entities back to normal characters
-     - Collapses too many blank lines
-     - Trims leading/trailing whitespace
-     """
-     text = html.unescape(text)
-     # Replace Windows/Mac line endings with Unix and normalize spaces
-     text = text.replace("\r\n", "\n").replace("\r", "\n")
-     # Collapse 3+ newlines down to 2
-     text = re.sub(r"\n{3,}", "\n\n", text)
-     return text.strip()
-
- # ---- core fetcher: return main text or raw HTML ------------------------------
- def fetch_page(url: str, extract_text: bool = True) -> str:
      """
-     Layman's terms: We download the web page. If 'extract_text' is True,
-     we try to grab only the main article/important text. Otherwise we
-     return the raw HTML (like your original app).
      """
      try:
-         # Make the request with a friendly browser-like header and a timeout
-         resp = requests.get(
-             url,
-             headers={"User-Agent": "Mozilla/5.0 (compatible; FetchMCP/1.0)"},
-             timeout=15,
-             allow_redirects=True,
-         )
-         resp.raise_for_status()  # If site returns 4xx/5xx, this will raise an error

      except requests.exceptions.RequestException as e:
-         # Layman's terms: If anything goes wrong with the request, report it nicely.
-         return f"Request error: {e}"

-     # If the user wants full HTML, behave like the original version
-     if not extract_text:
-         return resp.text
-
-     # Try readability first (usually best for articles/blog posts)
-     try:
-         # readability extracts the "main" content and returns HTML of just that part
-         doc = Document(resp.text)
-         main_html = doc.summary(html_partial=True)
-
-         # Parse the article-only HTML and get just the visible text
-         soup = BeautifulSoup(main_html, "lxml")
-         # Remove script/style just in case
-         for tag in soup(["script", "style", "noscript"]):
-             tag.decompose()
-
-         main_text = soup.get_text(separator="\n")
-         main_text = _normalize_text(main_text)
-
-         # Fallback: if extraction produced nearly nothing, try a simpler approach
-         if len(main_text.split()) < 40:
-             raise ValueError("Readability extraction too short; falling back")
-
-         return main_text
-
-     except Exception:
-         # Simpler fallback: strip tags from the whole page but ignore obviously noisy areas
-         try:
-             soup = BeautifulSoup(resp.text, "lxml")
-
-             # Remove common noise: scripts, styles, nav, footer, header, forms
-             for tag in soup(["script", "style", "noscript", "header", "footer", "nav", "form", "aside"]):
-                 tag.decompose()
-
-             # If there's a <main> or an article-like block, prefer that
-             candidate = soup.find("main") or soup.find("article") or soup.find("div", attrs={"role": "main"})
-             if candidate:
-                 text = candidate.get_text(separator="\n")
-             else:
-                 text = soup.get_text(separator="\n")
-
-             return _normalize_text(text)
-
-         except Exception as e:
-             # Last resort: give raw HTML if even fallback parsing fails
-             return f"Extraction fallback failed: {e}\n\n--- Raw HTML below ---\n{resp.text}"
-
- # ---- Gradio UI ---------------------------------------------------------------
- # Layman's terms: This is the app window. You paste a URL and choose whether to
- # extract readable text or keep full HTML. Then click "Fetch".
- with gr.Blocks(theme="Nymbo/Nymbo_Theme", title="Fetch MCP") as demo:
-     gr.Markdown(
-         """
-         # Fetch MCP
-         Small utility that fetches a web page and returns **just the readable text** by default
-         *(toggle off to get the full HTML like before)*.
-         """
-     )
-
-     with gr.Row():
-         url_input = gr.Textbox(
-             label="URL",
-             placeholder="https://example.com/article",
-             lines=1,
-         )
-     with gr.Row():
-         extract_toggle = gr.Checkbox(
-             value=True,
-             label="Extract only the main readable text (recommended)",
-         )
-
-     fetch_btn = gr.Button("Fetch", variant="primary")
-
-     # Output as plain text so it’s easy to copy or pipe into other tools
-     output = gr.Textbox(
-         label="Output",
-         lines=20,
-         interactive=False,
-         placeholder="Fetched content will appear here…",
-     )
-
-     # Wire the button to our function
-     fetch_btn.click(fn=fetch_page, inputs=[url_input, extract_toggle], outputs=output)
-
- # Run as normal, keeping MCP server enabled
  if __name__ == "__main__":
-     demo.launch(mcp_server=True)
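For context, the removed fetch_page exposed a single extract_text toggle, as its docstring describes. A minimal usage sketch (not part of this commit; it assumes the old module is importable and the target URL is reachable):

    text_only = fetch_page("https://example.com/article", extract_text=True)   # main readable text via readability + BeautifulSoup
    raw_html = fetch_page("https://example.com/article", extract_text=False)   # full HTML, like the original app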
 
+ import gradio as gr
+ import requests
+ from bs4 import BeautifulSoup
+ import urllib.parse
+
+ def fetch_and_parse_hn(url):
  """
8
+ This function takes a Hacker News URL, fetches its content, parses it,
9
+ and returns a formatted Markdown string with titles, metadata, and hyperlinks.
 
10
  """
11
+ if not url.strip():
12
+ return "Please enter a URL."
13
+
14
  try:
15
+ headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
16
+ response = requests.get(url, headers=headers)
17
+ response.raise_for_status() # Raises an HTTPError for bad responses
18
+
19
+ soup = BeautifulSoup(response.text, 'html.parser')
20
+
21
+ # Extract page title
22
+ page_title = soup.title.string if soup.title else "Hacker News"
23
+ output_md = [f"# {page_title}\n"]
24
+
25
+ # HN stories are in 'tr' tags with class 'athing'
26
+ story_rows = soup.find_all('tr', class_='athing')
27
+
28
+ if not story_rows:
29
+ return "Could not find any stories on this page. Please make sure it's a valid Hacker News URL."
30
+
31
+ for story_row in story_rows:
32
+ # --- Story Details (title, link, rank) ---
33
+ title_span = story_row.find('span', class_='titleline')
34
+ if not title_span:
35
+ continue
36
+
37
+ rank_span = story_row.find('span', class_='rank')
38
+ rank = rank_span.text.strip() if rank_span else ""
39
+
40
+ link_tag = title_span.find('a')
41
+ title = link_tag.text if link_tag else "No Title"
42
+ article_url = link_tag.get('href', '#')
43
+
44
+ # Handle relative URLs for internal posts (e.g., "Ask HN:")
45
+ if not article_url.startswith('http'):
46
+ article_url = urllib.parse.urljoin(url, article_url)
47
+
48
+ site_span = title_span.find('span', class_='sitebit')
49
+ site = f"({site_span.text})" if site_span else ""
50
+
51
+ # --- Metadata (points, user, comments) ---
52
+ # Metadata is in the next 'tr' sibling
53
+ metadata_row = story_row.find_next_sibling('tr')
54
+ if not metadata_row:
55
+ output_md.append(f"{rank} **[{title}]({article_url})** {site}\n")
56
+ continue
57
+
58
+ subtext = metadata_row.find('td', class_='subtext')
59
+ if not subtext:
60
+ output_md.append(f"{rank} **[{title}]({article_url})** {site}\n")
61
+ continue
62
+
63
+ score = subtext.find('span', class_='score')
64
+ user = subtext.find('a', class_='hnuser')
65
+
66
+ # The comments link is usually the last link in the subtext
67
+ comments_link = subtext.find_all('a')[-1]
68
+
69
+ # Build metadata string
70
+ meta_parts = []
71
+ if score:
72
+ meta_parts.append(score.text)
73
+ if user:
74
+ meta_parts.append(f"by {user.text}")
75
+ if comments_link and 'item?id=' in comments_link.get('href', ''):
76
+ comments_text = comments_link.text.replace('\xa0', ' ') # Handle non-breaking space
77
+ comments_url = urllib.parse.urljoin(url, comments_link['href'])
78
+ meta_parts.append(f"[{comments_text}]({comments_url})")
79
+
80
+ metadata_str = " | ".join(meta_parts)
81
+
82
+ # Assemble the final markdown for the item
83
+ output_md.append(f"{rank} **[{title}]({article_url})** {site}")
84
+ if metadata_str:
85
+ output_md.append(f" - *{metadata_str}*\n")
86
+
87
+ return "\n".join(output_md)
88
 
89
  except requests.exceptions.RequestException as e:
90
+ return f"An error occurred: {e}"
91
+ except Exception as e:
92
+ return f"An unexpected error occurred during parsing: {e}"
93
+
94
+ # Define the Gradio interface
95
+ demo = gr.Interface(
96
+ fn=fetch_and_parse_hn,
97
+ inputs=gr.Textbox(
98
+ label="Hacker News URL",
99
+ placeholder="e.g., https://news.ycombinator.com",
100
+ value="https://news.ycombinator.com"
101
+ ),
102
+ outputs=gr.Markdown(label="Hacker News Digest"),
103
+ title="Hacker News Digest Fetcher",
104
+ description="Enter a Hacker News URL (like the front page, 'new', or 'ask') to get a clean, readable digest. You can click on the story titles to go to the articles and on the comment links to see the discussions.",
105
+ allow_flagging="never",
106
+ theme="Nymbo/Nymbo_Theme",
107
+ examples=[["https://news.ycombinator.com"], ["https://news.ycombinator.com/news?p=2"], ["https://news.ycombinator.com/ask"]]
108
+ )

  if __name__ == "__main__":
+     demo.launch()
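The new fetch_and_parse_hn can also be exercised outside the Gradio UI. A minimal sketch (not part of this commit; it assumes app.py is importable, gradio/requests/beautifulsoup4 are installed, and news.ycombinator.com is reachable):

    from app import fetch_and_parse_hn

    digest = fetch_and_parse_hn("https://news.ycombinator.com")
    print(digest)  # "# Hacker News" heading, then one rank-prefixed Markdown line per story with its metadata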