#
# SPDX-FileCopyrightText: Hadad <[email protected]>
# SPDX-License-Identifier: Apache-2.0
#

import asyncio  # Used to run the blocking requests calls in a worker thread from async code
import requests  # HTTP client used to perform the actual network requests
from urllib.parse import quote_plus  # URL-encode query strings before embedding them in URLs

from src.utils.ip_generator import generate_ip  # Generates random IP addresses for request headers

class SearchTools:
    """
    A class providing tools to perform web searches and read content from URLs using various search engines
    and a reader API service.

    Attributes:
        searxng_url (str): Base URL for the SearXNG search proxy service.
        baidu_url (str): Base URL for Baidu search engine.
        timeout (int): Timeout duration in seconds for HTTP requests.
        reader_api (str): Base URL for the reader API service used to extract content from URLs.

    Methods:
        read_url(url): Asynchronously reads and returns the textual content of the specified URL via the reader API.
        search(query, engine): Asynchronously performs a web search with the given query on the specified search engine,
                               returning the response text, or None on failure.
    """

    def __init__(self):
        """
        Initialize the SearchTools instance with predefined URLs and timeout settings.
        """
        self.searxng_url = "https://paulgo.io/search"  # URL for the SearXNG search proxy service
        self.baidu_url = "https://www.baidu.com/s"  # URL for Baidu search engine
        self.timeout = 30  # Timeout in seconds for HTTP requests to avoid long hanging connections
        self.reader_api = "https://r.jina.ai/"  # Reader API endpoint to extract readable content from URLs

    async def read_url(self, url: str) -> str | None:
        """
        Asynchronously read and retrieve the textual content of a given URL using the reader API.

        Args:
            url (str): The URL of the webpage to read content from.

        Returns:
            str: The textual content extracted from the URL if successful.
            None: If the request fails or an exception occurs.
        """
        try:
            data = {"url": url}  # POST payload carrying the target URL
            # Run the blocking POST request in a worker thread so the event loop is not stalled
            response = await asyncio.to_thread(
                requests.post, self.reader_api, data=data, timeout=self.timeout
            )
            response.raise_for_status()  # Raise an exception on HTTP error statuses
            return response.text  # Return the textual content of the response
        except requests.RequestException:
            # Return None if the request fails or the server responds with an error status
            return None

    async def search(self, query: str, engine: str = "google") -> str | None:
        """
        Asynchronously perform a web search for the given query using the specified search engine.

        Args:
            query (str): The search query string.
            engine (str, optional): The search engine to use. Supported values are "google", "baidu",
                                    and "bing"; any unrecognized value falls back to Bing.
                                    Defaults to "google".

        Returns:
            str: The content of the search results page, as returned by the reader API, if successful.
            None: If the request fails or an exception occurs.
        """
        try:
            if engine == "baidu":
                # Build the Baidu search URL, URL-encoding the query for the 'wd' parameter,
                # and route it through the reader API
                url = f"{self.reader_api}{self.baidu_url}?wd={quote_plus(query)}"
                headers = {
                    # Ask the reader API to extract only the main content container of Baidu results
                    "X-Target-Selector": "#content_left",
                    "X-Forwarded-For": generate_ip()  # Random IP address to simulate different client origins
                }
            else:
                # For other engines, prepend a SearXNG bang command (!go for Google, !bi for Bing)
                prefix = "!go" if engine == "google" else "!bi"
                # Build the SearXNG search URL with the URL-encoded, prefixed query, routed through the reader API
                url = f"{self.reader_api}{self.searxng_url}?q={quote_plus(f'{prefix} {query}')}"
                headers = {
                    # Ask the reader API to extract only the results list container
                    "X-Target-Selector": "#urls",
                    "X-Forwarded-For": generate_ip()  # Random IP address to simulate different client origins
                }

            # Run the blocking GET request in a worker thread so the event loop is not stalled
            response = await asyncio.to_thread(
                requests.get, url, headers=headers, timeout=self.timeout
            )
            response.raise_for_status()  # Raise an exception on HTTP error statuses
            return response.text  # Return the content of the search results page
        except requests.RequestException:
            # Return None if the request fails or the server responds with an error status
            return None
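

# A minimal usage sketch, not part of the original module; the query string and
# URL below are illustrative only. Running this file directly assumes the
# project's src.utils.ip_generator module is importable from the working directory.
if __name__ == "__main__":
    async def _demo():
        tools = SearchTools()
        # Search through the SearXNG proxy; the result is None on failure
        results = await tools.search("apache license 2.0", engine="google")
        print(results[:500] if results else "search failed")
        # Read a single page through the reader API
        page = await tools.read_url("https://example.com")
        print(page[:500] if page else "read failed")

    asyncio.run(_demo())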