Nymbo committed on
Commit
598ab39
·
verified ·
1 Parent(s): 85cbad8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -6
app.py CHANGED
@@ -1,26 +1,43 @@
1
  import gradio as gr
2
  import requests
 
3
 
4
  def fetch_content(url):
5
  """
6
  This function takes a URL as input, fetches its HTML content,
7
- and returns it as a string. It includes error handling for common
8
- request issues.
9
  """
10
  try:
11
  response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
12
  response.raise_for_status() # Raises an HTTPError for bad responses (4xx or 5xx)
13
- return response.text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  except requests.exceptions.RequestException as e:
15
  return f"An error occurred: {e}"
16
 
17
  # Define the Gradio interface
18
  demo = gr.Interface(
19
  fn=fetch_content,
20
- inputs=gr.Textbox(label="URL", placeholder="https://www.google.com"),
21
  outputs=gr.Textbox(label="Page Content"),
22
- title="Fetch MCP",
23
- description="Enter a URL to fetch the full HTML content of the web page.",
24
  allow_flagging="never",
25
  theme="Nymbo/Nymbo_Theme"
26
  )
 
1
  import gradio as gr
2
  import requests
3
+ from bs4 import BeautifulSoup
4
 
5
  def fetch_content(url):
6
  """
7
  This function takes a URL as input, fetches its HTML content,
8
+ extracts only the relevant text content, and returns it as a clean string.
9
+ It includes error handling for common request issues.
10
  """
11
  try:
12
  response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
13
  response.raise_for_status() # Raises an HTTPError for bad responses (4xx or 5xx)
14
+
15
+ # Parse the HTML content
16
+ soup = BeautifulSoup(response.text, 'html.parser')
17
+
18
+ # Remove script, style, and other non-content elements
19
+ for element in soup(["script", "style", "header", "footer", "nav", "aside"]):
20
+ element.extract()
21
+
22
+ # Get the text content
23
+ text = soup.get_text()
24
+
25
+ # Clean up the text: remove extra whitespace, empty lines, etc.
26
+ lines = (line.strip() for line in text.splitlines())
27
+ chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
28
+ text = '\n'.join(chunk for chunk in chunks if chunk)
29
+
30
+ return text
31
  except requests.exceptions.RequestException as e:
32
  return f"An error occurred: {e}"
33
 
34
  # Define the Gradio interface
35
  demo = gr.Interface(
36
  fn=fetch_content,
37
+ inputs=gr.Textbox(label="URL", placeholder="https://www.example.com"),
38
  outputs=gr.Textbox(label="Page Content"),
39
+ title="Web Page Text Extractor",
40
+ description="Enter a URL to extract and display only the text content of the web page.",
41
  allow_flagging="never",
42
  theme="Nymbo/Nymbo_Theme"
43
  )