Spaces:

Nymbo
/

Fetch

Running

App Files Community

Nymbo committed 5 days ago

Commit

dca43df

verified ·

1 Parent(s): 5832786

Update app.py

Browse files

Files changed (1) hide show

app.py +19 -25

app.py CHANGED Viewed

@@ -4,49 +4,43 @@ from bs4 import BeautifulSoup
 def fetch_content(url):
     """
-    This function takes a URL as input, fetches its HTML content,
-    extracts the clean text, and returns it as a string.
-    It includes error handling for common request issues.
     """
     try:
-        # Send a GET request to the URL with a user-agent header
-        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
-        # Raise an exception for bad status codes (4xx or 5xx)
-        response.raise_for_status()
-        # Create a BeautifulSoup object to parse the HTML content
         soup = BeautifulSoup(response.text, 'html.parser')
-        # Find and remove all script and style elements from the parsed HTML
         for script_or_style in soup(['script', 'style']):
             script_or_style.decompose()
-        # Get the text from the soup and clean up whitespace
         text = soup.get_text()
-        # Split the text into lines and strip leading/trailing whitespace from each
         lines = (line.strip() for line in text.splitlines())
-        # Further break down lines into phrases and strip whitespace
         chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
-        # Join the chunks back together with a single newline, removing any blank lines
         clean_text = '\n'.join(chunk for chunk in chunks if chunk)
         return clean_text
     except requests.exceptions.RequestException as e:
-        # Handle any network-related errors
         return f"An error occurred: {e}"
-# Define the Gradio interface
 demo = gr.Interface(
     fn=fetch_content,
-    inputs=gr.Textbox(label="URL", placeholder="https://www.google.com"),
-    outputs=gr.Textbox(label="Cleaned Page Content"),
     title="Webpage Text Extractor",
-    description="Enter a URL to fetch the clean text content of the web page, stripped of HTML, scripts, and styles.",
     allow_flagging="never",
     theme="Nymbo/Nymbo_Theme"
 )
 if __name__ == "__main__":
-    # Launch the Gradio app
-    demo.launch()

 def fetch_content(url):
     """
+    This function takes a URL as input, fetches its HTML,
+    parses it to extract the clean, relevant text content,
+    and returns it as a formatted string.
     """
     try:
+        # Step 1: Fetch the HTML content
+        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
+        response.raise_for_status()  # Raises an HTTPError for bad responses (4xx or 5xx)
+        # Step 2: Parse the HTML with BeautifulSoup
         soup = BeautifulSoup(response.text, 'html.parser')
+        # Step 3: Remove script and style tags, as they don't contain readable content
         for script_or_style in soup(['script', 'style']):
             script_or_style.decompose()
+        # Step 4: Get the text and clean it up
         text = soup.get_text()
         lines = (line.strip() for line in text.splitlines())
         chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
         clean_text = '\n'.join(chunk for chunk in chunks if chunk)
         return clean_text
     except requests.exceptions.RequestException as e:
         return f"An error occurred: {e}"
+# Define the Gradio interface with an updated description
 demo = gr.Interface(
     fn=fetch_content,
+    inputs=gr.Textbox(label="URL", placeholder="Enter a webpage URL..."),
+    outputs=gr.Textbox(label="Clean Text Content", lines=20),
     title="Webpage Text Extractor",
+    description="Enter a URL to fetch the clean text content from the web page, stripped of all HTML, scripts, and styles.",
     allow_flagging="never",
     theme="Nymbo/Nymbo_Theme"
 )
 if __name__ == "__main__":
+    demo.launch()