Nymbo committed on
Commit
dca43df
·
verified ·
1 Parent(s): 5832786

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -25
app.py CHANGED
@@ -4,49 +4,43 @@ from bs4 import BeautifulSoup
4
 
5
  def fetch_content(url):
6
  """
7
- This function takes a URL as input, fetches its HTML content,
8
- extracts the clean text, and returns it as a string.
9
- It includes error handling for common request issues.
10
  """
11
  try:
12
- # Send a GET request to the URL with a user-agent header
13
- response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
14
- # Raise an exception for bad status codes (4xx or 5xx)
15
- response.raise_for_status()
16
-
17
- # Create a BeautifulSoup object to parse the HTML content
18
  soup = BeautifulSoup(response.text, 'html.parser')
19
-
20
- # Find and remove all script and style elements from the parsed HTML
21
  for script_or_style in soup(['script', 'style']):
22
  script_or_style.decompose()
23
-
24
- # Get the text from the soup and clean up whitespace
25
  text = soup.get_text()
26
- # Split the text into lines and strip leading/trailing whitespace from each
27
  lines = (line.strip() for line in text.splitlines())
28
- # Further break down lines into phrases and strip whitespace
29
  chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
30
- # Join the chunks back together with a single newline, removing any blank lines
31
  clean_text = '\n'.join(chunk for chunk in chunks if chunk)
32
-
33
  return clean_text
34
-
35
  except requests.exceptions.RequestException as e:
36
- # Handle any network-related errors
37
  return f"An error occurred: {e}"
38
 
39
- # Define the Gradio interface
40
  demo = gr.Interface(
41
  fn=fetch_content,
42
- inputs=gr.Textbox(label="URL", placeholder="https://www.google.com"),
43
- outputs=gr.Textbox(label="Cleaned Page Content"),
44
  title="Webpage Text Extractor",
45
- description="Enter a URL to fetch the clean text content of the web page, stripped of HTML, scripts, and styles.",
46
  allow_flagging="never",
47
  theme="Nymbo/Nymbo_Theme"
48
  )
49
 
50
  if __name__ == "__main__":
51
- # Launch the Gradio app
52
- demo.launch()
 
4
 
5
  def fetch_content(url):
6
  """
7
+ This function takes a URL as input, fetches its HTML,
8
+ parses it to extract the clean, relevant text content,
9
+ and returns it as a formatted string.
10
  """
11
  try:
12
+ # Step 1: Fetch the HTML content
13
+ response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
14
+ response.raise_for_status() # Raises an HTTPError for bad responses (4xx or 5xx)
15
+
16
+ # Step 2: Parse the HTML with BeautifulSoup
 
17
  soup = BeautifulSoup(response.text, 'html.parser')
18
+
19
+ # Step 3: Remove script and style tags, as they don't contain readable content
20
  for script_or_style in soup(['script', 'style']):
21
  script_or_style.decompose()
22
+
23
+ # Step 4: Get the text and clean it up
24
  text = soup.get_text()
 
25
  lines = (line.strip() for line in text.splitlines())
 
26
  chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
 
27
  clean_text = '\n'.join(chunk for chunk in chunks if chunk)
28
+
29
  return clean_text
30
+
31
  except requests.exceptions.RequestException as e:
 
32
  return f"An error occurred: {e}"
33
 
34
+ # Define the Gradio interface with an updated description
35
  demo = gr.Interface(
36
  fn=fetch_content,
37
+ inputs=gr.Textbox(label="URL", placeholder="Enter a webpage URL..."),
38
+ outputs=gr.Textbox(label="Clean Text Content", lines=20),
39
  title="Webpage Text Extractor",
40
+ description="Enter a URL to fetch the clean text content from the web page, stripped of all HTML, scripts, and styles.",
41
  allow_flagging="never",
42
  theme="Nymbo/Nymbo_Theme"
43
  )
44
 
45
  if __name__ == "__main__":
46
+ demo.launch()