pratham0011 committed on
Commit ed96a4f · verified · 1 Parent(s): ffe3553

Delete search.py

Files changed (1)
  1. search.py +0 -85
search.py DELETED
@@ -1,85 +0,0 @@
- import logging
- from typing import List, Dict
-
- import requests
- from bs4 import BeautifulSoup
- from urllib3.exceptions import InsecureRequestWarning
-
- # Disable SSL warnings for requests
- requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
-
- logger = logging.getLogger(__name__)
-
- class WebSearcher:
-     def __init__(self):
-         self.headers = {
-             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0"
-         }
-
-     def extract_text(self, html_content: str) -> str:
-         soup = BeautifulSoup(html_content, 'html.parser')
-         # Remove unwanted elements
-         for element in soup(['script', 'style', 'nav', 'header', 'footer', 'iframe']):
-             element.decompose()
-         text = ' '.join(soup.stripped_strings)
-         return text[:8000]  # Limit text length
-
-     def search(self, query: str, max_results: int = 3) -> List[Dict]:
-         results = []
-         try:
-             with requests.Session() as session:
-                 # Google search parameters
-                 search_url = "https://www.google.com/search"
-                 params = {
-                     "q": query,
-                     "num": max_results,
-                     "hl": "en"
-                 }
-
-                 response = session.get(
-                     search_url,
-                     headers=self.headers,
-                     params=params,
-                     timeout=3,
-                     verify=False
-                 )
-                 response.raise_for_status()
-
-                 # Parse search results
-                 soup = BeautifulSoup(response.text, 'html.parser')
-                 search_results = soup.select('div.g')
-
-                 for result in search_results[:max_results]:
-                     link = result.find('a')
-                     if not link:
-                         continue
-
-                     url = link.get('href', '')
-                     if not url.startswith('http'):
-                         continue
-
-                     try:
-                         # Fetch webpage content
-                         page_response = session.get(
-                             url,
-                             headers=self.headers,
-                             timeout=5,
-                             verify=False
-                         )
-                         page_response.raise_for_status()
-
-                         content = self.extract_text(page_response.text)
-                         results.append({
-                             "url": url,
-                             "content": content
-                         })
-                         logger.info(f"Successfully fetched content from {url}")
-
-                     except Exception as e:
-                         logger.warning(f"Failed to fetch {url}: {str(e)}")
-                         continue
-
-         except Exception as e:
-             logger.error(f"Search failed: {str(e)}")
-
-         return results[:max_results]
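For reference, a minimal sketch of how the removed WebSearcher could have been exercised before this commit; the query string and the calling code are illustrative assumptions, not part of the repository:

# Hypothetical caller; assumes search.py was still importable (it is deleted by this commit).
from search import WebSearcher

searcher = WebSearcher()
for result in searcher.search("beautifulsoup extract page text", max_results=3):
    print(result["url"])
    print(result["content"][:200])  # preview the first 200 characters of extracted text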