Spaces:

Nymbo
/

Fetch

Running

App Files Files Community

Nymbo committed 5 days ago

Commit

598ab39

verified ·

1 Parent(s): 85cbad8

Update app.py

Browse files

Files changed (1) hide show

app.py +23 -6

app.py CHANGED Viewed

@@ -1,26 +1,43 @@
 import gradio as gr
 import requests
 def fetch_content(url):
     """Fetch a web page and return its HTML as a string.

     Args:
         url: Address of the page to download.

     Returns:
         The response body decoded as text on success. On any
         ``requests`` failure (invalid URL, connection error, timeout,
         4xx/5xx status) a string of the form
         ``"An error occurred: <details>"`` is returned instead of raising.
     """
     # requests applies no timeout by default, so an unresponsive server
     # would block this handler indefinitely. Timeouts raise a
     # RequestException subclass and are reported like any other failure.
     request_timeout = 10  # seconds
     try:
         response = requests.get(
             url,
             headers={'User-Agent': 'Mozilla/5.0'},
             timeout=request_timeout,
         )
         response.raise_for_status()  # Raises an HTTPError for bad responses (4xx or 5xx)
         return response.text
     except requests.exceptions.RequestException as e:
         return f"An error occurred: {e}"
 # Build the Gradio UI: a single URL text box wired to fetch_content,
 # with the result shown in a single output text box.
 demo = gr.Interface(
     title="Fetch MCP",
     description="Enter a URL to fetch the full HTML content of the web page.",
     theme="Nymbo/Nymbo_Theme",
     allow_flagging="never",
     fn=fetch_content,
     inputs=gr.Textbox(placeholder="https://www.google.com", label="URL"),
     outputs=gr.Textbox(label="Page Content"),
 )

 import gradio as gr
 import requests
+from bs4 import BeautifulSoup
 def fetch_content(url):
     """Fetch a web page and return only its readable text.

     The page is downloaded, parsed as HTML, stripped of non-content
     elements (scripts, styles, header/footer/nav/aside), and the remaining
     text is collapsed to one non-empty chunk per line.

     Args:
         url: Address of the page to download.

     Returns:
         The cleaned text of the page on success. On any ``requests``
         failure (invalid URL, connection error, timeout, 4xx/5xx status)
         a string of the form ``"An error occurred: <details>"`` is
         returned instead of raising.
     """
     # requests applies no timeout by default, so an unresponsive server
     # would block this handler indefinitely. Timeouts raise a
     # RequestException subclass and are reported like any other failure.
     request_timeout = 10  # seconds
     try:
         response = requests.get(
             url,
             headers={'User-Agent': 'Mozilla/5.0'},
             timeout=request_timeout,
         )
         response.raise_for_status()  # Raises an HTTPError for bad responses (4xx or 5xx)
     except requests.exceptions.RequestException as e:
         return f"An error occurred: {e}"
     # Parse the HTML content
     soup = BeautifulSoup(response.text, 'html.parser')
     # Remove script, style, and other non-content elements
     for element in soup.find_all(["script", "style", "header", "footer", "nav", "aside"]):
         element.extract()
     # Get the text content
     text = soup.get_text()
     # Clean up the text: strip each line, split lines on double spaces
     # into separate chunks, and drop anything left empty.
     lines = (line.strip() for line in text.splitlines())
     chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
     return '\n'.join(chunk for chunk in chunks if chunk)
 # Build the Gradio UI: a single URL text box wired to fetch_content,
 # with the extracted text shown in a single output text box.
 demo = gr.Interface(
     title="Web Page Text Extractor",
     description="Enter a URL to extract and display only the text content of the web page.",
     theme="Nymbo/Nymbo_Theme",
     allow_flagging="never",
     fn=fetch_content,
     inputs=gr.Textbox(placeholder="https://www.example.com", label="URL"),
     outputs=gr.Textbox(label="Page Content"),
 )