Update application/utils/web_search.py
application/utils/web_search.py  CHANGED  (+27 −35)
@@ -1,51 +1,43 @@
 import requests
-from googlesearch import search
 from bs4 import BeautifulSoup
 import re
-import random
+from duckduckgo_search import DDGS
+
 class WebScarper:
     def __init__(self):
-
-
-
-
-
-
-            return random.choice(results)
-        else:
-            return None
+        self.ddgs = DDGS()
+
+    def get_urls(self, query):
+        results = self.ddgs.text(query, max_results=3)
+        return [result['href'] for result in results] if results else []
+
     def fetch_url(self, url):
         try:
             headers = {
                 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
                 'Accept-Language': 'en-US,en;q=0.9',
                 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
             }
-
-            response = requests.get(url, headers=headers)
-            if response.status_code != 200:
-                raise Exception(f"Unable to fetch URL, status code: {response.status_code}")
+            response = requests.get(url, headers=headers, timeout=10)
+            response.raise_for_status()
             return response.text
-
-
-            print(f"Error: {e}")
+        except requests.exceptions.RequestException as e:
+            print(f"Error fetching URL {url}: {e}")
             return None

-
     def get_text(self, data):
         soup = BeautifulSoup(data, 'html.parser')
         text = soup.get_text()
         cleaned_text = re.sub(r'\s+', ' ', text).strip()
-        if len(cleaned_text) > 4000:
-            return cleaned_text[:4000]
-        else:
-            return cleaned_text
-
+        return cleaned_text[:4000] if len(cleaned_text) > 4000 else cleaned_text

-    def scarpe(self,query):
-
-
-        if(data==None or url==None):
+    def scarpe(self, query):
+        urls = self.get_urls(query)
+        if not urls:
             return None
-
-
+
+        for url in urls:
+            data = self.fetch_url(url)
+            if data:
+                return self.get_text(data)
+        return None
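For reference, a minimal usage sketch of the updated scraper. This caller is hypothetical and not part of the commit; it assumes the file is importable as application.utils.web_search and that the requests, beautifulsoup4 and duckduckgo_search packages are installed.

# Hypothetical caller (not part of this commit) exercising the updated class.
from application.utils.web_search import WebScarper

scraper = WebScarper()
# scarpe() searches DuckDuckGo for the query, fetches the first reachable
# result, and returns up to 4000 characters of cleaned page text, or None.
text = scraper.scarpe("example search query")
print(text[:200] if text else "No result could be fetched.")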