Nymbo committed on
Commit
85cbad8
·
verified ·
1 Parent(s): dca43df

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -28
app.py CHANGED
@@ -1,46 +1,29 @@
1
  import gradio as gr
2
  import requests
3
- from bs4 import BeautifulSoup
4
 
5
  def fetch_content(url):
6
  """
7
- This function takes a URL as input, fetches its HTML,
8
- parses it to extract the clean, relevant text content,
9
- and returns it as a formatted string.
10
  """
11
  try:
12
- # Step 1: Fetch the HTML content
13
- response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
14
  response.raise_for_status() # Raises an HTTPError for bad responses (4xx or 5xx)
15
-
16
- # Step 2: Parse the HTML with BeautifulSoup
17
- soup = BeautifulSoup(response.text, 'html.parser')
18
-
19
- # Step 3: Remove script and style tags, as they don't contain readable content
20
- for script_or_style in soup(['script', 'style']):
21
- script_or_style.decompose()
22
-
23
- # Step 4: Get the text and clean it up
24
- text = soup.get_text()
25
- lines = (line.strip() for line in text.splitlines())
26
- chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
27
- clean_text = '\n'.join(chunk for chunk in chunks if chunk)
28
-
29
- return clean_text
30
-
31
  except requests.exceptions.RequestException as e:
32
  return f"An error occurred: {e}"
33
 
34
- # Define the Gradio interface with an updated description
35
  demo = gr.Interface(
36
  fn=fetch_content,
37
- inputs=gr.Textbox(label="URL", placeholder="Enter a webpage URL..."),
38
- outputs=gr.Textbox(label="Clean Text Content", lines=20),
39
- title="Webpage Text Extractor",
40
- description="Enter a URL to fetch the clean text content from the web page, stripped of all HTML, scripts, and styles.",
41
  allow_flagging="never",
42
  theme="Nymbo/Nymbo_Theme"
43
  )
44
 
45
  if __name__ == "__main__":
46
- demo.launch()
 
1
  import gradio as gr
2
  import requests
 
3
 
4
  def fetch_content(url):
5
  """
6
+ This function takes a URL as input, fetches its HTML content,
7
+ and returns it as a string. It includes error handling for common
8
+ request issues.
9
  """
10
  try:
11
+ response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
 
12
  response.raise_for_status() # Raises an HTTPError for bad responses (4xx or 5xx)
13
+ return response.text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  except requests.exceptions.RequestException as e:
15
  return f"An error occurred: {e}"
16
 
17
+ # Define the Gradio interface
18
  demo = gr.Interface(
19
  fn=fetch_content,
20
+ inputs=gr.Textbox(label="URL", placeholder="https://www.google.com"),
21
+ outputs=gr.Textbox(label="Page Content"),
22
+ title="Fetch MCP",
23
+ description="Enter a URL to fetch the full HTML content of the web page.",
24
  allow_flagging="never",
25
  theme="Nymbo/Nymbo_Theme"
26
  )
27
 
28
  if __name__ == "__main__":
29
+ demo.launch(mcp_server=True)