pratham0011 committed on
Commit 792e562 (verified)
1 Parent(s): dc4a1e0

Update services/search.py

Files changed (1)
  1. services/search.py +84 -84
services/search.py CHANGED
@@ -1,85 +1,85 @@
 import logging
 from typing import List, Dict
 
 import requests
 from bs4 import BeautifulSoup
 from urllib3.exceptions import InsecureRequestWarning
 
 # Disable SSL warnings for requests
 requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
 
 logger = logging.getLogger(__name__)
 
 class WebSearcher:
     def __init__(self):
         self.headers = {
             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0"
         }
 
     def extract_text(self, html_content: str) -> str:
         soup = BeautifulSoup(html_content, 'html.parser')
         # Remove unwanted elements
         for element in soup(['script', 'style', 'nav', 'header', 'footer', 'iframe']):
             element.decompose()
         text = ' '.join(soup.stripped_strings)
         return text[:8000]  # Limit text length
 
     def search(self, query: str, max_results: int = 3) -> List[Dict]:
         results = []
         try:
             with requests.Session() as session:
                 # Google search parameters
                 search_url = "https://www.google.com/search"
                 params = {
                     "q": query,
                     "num": max_results,
                     "hl": "en"
                 }
 
                 response = session.get(
                     search_url,
                     headers=self.headers,
                     params=params,
-                    timeout=3,
+                    timeout=10,
                     verify=False
                 )
                 response.raise_for_status()
 
                 # Parse search results
                 soup = BeautifulSoup(response.text, 'html.parser')
                 search_results = soup.select('div.g')
 
                 for result in search_results[:max_results]:
                     link = result.find('a')
                     if not link:
                         continue
 
                     url = link.get('href', '')
                     if not url.startswith('http'):
                         continue
 
                     try:
                         # Fetch webpage content
                         page_response = session.get(
                             url,
                             headers=self.headers,
                             timeout=5,
                             verify=False
                         )
                         page_response.raise_for_status()
 
                         content = self.extract_text(page_response.text)
                         results.append({
                             "url": url,
                             "content": content
                         })
                         logger.info(f"Successfully fetched content from {url}")
 
                     except Exception as e:
                         logger.warning(f"Failed to fetch {url}: {str(e)}")
                         continue
 
         except Exception as e:
             logger.error(f"Search failed: {str(e)}")
 
         return results[:max_results]
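
For context, a minimal sketch of how the updated class might be driven. The module path services.search is assumed from the file location services/search.py, and the query string and logging setup are illustrative, not part of the commit:

import logging

from services.search import WebSearcher  # assumed import path, mirroring services/search.py

# Surface the logger.info / logger.warning calls WebSearcher makes internally
logging.basicConfig(level=logging.INFO)

searcher = WebSearcher()
# search() returns at most max_results dicts with "url" and "content" keys;
# "content" is plain text truncated to 8000 characters by extract_text()
for result in searcher.search("huggingface transformers", max_results=3):
    print(result["url"])
    print(result["content"][:200])

With this commit, the initial Google request gets up to 10 seconds before requests raises a timeout (previously 3), while per-page fetches keep their 5-second limit. Either timeout surfaces only as the "Search failed:" or "Failed to fetch" log lines rather than as an exception, since search() catches all errors and returns whatever results it has collected.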