KrishanRao committed on
Commit
171a83a
·
verified ·
1 Parent(s): c3af3c6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -6
app.py CHANGED
@@ -5,19 +5,31 @@
5
 
6
 
7
  import gradio as gr
8
- from urllib.request import urlopen, Request
9
  from bs4 import BeautifulSoup
10
  from transformers import pipeline
11
  import os
12
 
13
- # Function to extract text from the URL
14
  def extract_text(url):
15
  try:
16
- req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
17
- html = urlopen(req).read()
18
- text = ' '.join(BeautifulSoup(html, "html.parser").stripped_strings)
 
 
 
 
 
 
 
 
 
 
 
 
19
  return text
20
- except Exception as e:
21
  return f"Error extracting text from URL: {str(e)}"
22
 
23
  # Load Hugging Face model (for extracting named entities or QA)
@@ -106,3 +118,4 @@ demo = gr.Interface(
106
  if __name__ == "__main__":
107
  demo.launch(show_api=False)
108
 
 
 
5
 
6
 
7
  import gradio as gr
8
+ import requests
9
  from bs4 import BeautifulSoup
10
  from transformers import pipeline
11
  import os
12
 
13
+ # Function to extract text from the URL using requests
14
  def extract_text(url):
15
  try:
16
+ headers = {
17
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
18
+ 'Accept-Language': 'en-US,en;q=0.9',
19
+ 'Accept-Encoding': 'gzip, deflate, br',
20
+ 'Connection': 'keep-alive'
21
+ }
22
+ # Sending GET request with headers
23
+ response = requests.get(url, headers=headers)
24
+
25
+ # Check if the response is successful
26
+ response.raise_for_status() # Raise an error for bad status codes
27
+
28
+ # Parse HTML and extract text
29
+ soup = BeautifulSoup(response.text, "html.parser")
30
+ text = ' '.join(soup.stripped_strings)
31
  return text
32
+ except requests.exceptions.RequestException as e:
33
  return f"Error extracting text from URL: {str(e)}"
34
 
35
  # Load Hugging Face model (for extracting named entities or QA)
 
118
  if __name__ == "__main__":
119
  demo.launch(show_api=False)
120
 
121
+