# NOTE(review): stray "Spaces: Running" page-header text removed here — it was
# web-scrape residue from the hosting page, not valid Python.
#
# SPDX-FileCopyrightText: Hadad <[email protected]>
# SPDX-License-Identifier: Apache-2.0
#
from typing import Optional  # Optional[str] return annotations for best-effort methods
from urllib.parse import quote_plus  # Percent-encode search queries before URL interpolation

import requests  # Synchronous HTTP client used for all outbound requests

from src.utils.ip_generator import generate_ip  # Random IP addresses for request headers
# SearchTools encapsulates deep-search functionality: performing web searches
# and reading page content through a reader API.
class SearchTools:
    """
    Tools for performing web searches and reading page content via a reader API.

    Search requests are proxied through the reader API (r.jina.ai), so the
    response body is the readable text extracted from the results page rather
    than the page served directly by the search engine.

    Attributes:
        searxng_url (str): Base URL for the SearXNG search proxy service.
        baidu_url (str): Base URL for the Baidu search engine.
        timeout (int): Timeout in seconds applied to every HTTP request.
        reader_api (str): Base URL of the reader API used to extract content.

    Methods:
        read_url(url): Read and return the textual content of a URL.
        search(query, engine): Perform a web search and return the response text.
    """

    def __init__(self) -> None:
        """Initialize endpoint URLs and the shared request timeout."""
        self.searxng_url = "https://paulgo.io/search"  # SearXNG search proxy service
        self.baidu_url = "https://www.baidu.com/s"  # Baidu search engine
        self.timeout = 30  # Seconds before an HTTP request is abandoned
        self.reader_api = "https://r.jina.ai/"  # Reader API that extracts readable content

    def _search_url(self, query: str, engine: str) -> str:
        """
        Build the reader-API URL that fetches search results for ``query``.

        Args:
            query (str): Raw search query. It is percent-encoded here so that
                characters such as spaces, '&', '#', and '+' survive being
                nested inside the reader-API URL.
            engine (str): "baidu" targets Baidu directly; "google" uses the
                SearXNG ``!go`` bang; any other value falls back to Bing (``!bi``).

        Returns:
            str: The fully assembled URL to GET through the reader API.
        """
        # Fix: the query was previously interpolated raw, so '&'/'#' in a
        # query corrupted the constructed URL.
        encoded = quote_plus(query)
        if engine == "baidu":
            # Baidu takes the search term in the 'wd' query parameter.
            return f"{self.reader_api}{self.baidu_url}?wd={encoded}"
        # SearXNG bang prefixes select the backend: !go = Google, !bi = Bing.
        prefix = "!go" if engine == "google" else "!bi"
        return f"{self.reader_api}{self.searxng_url}?q={prefix} {encoded}"

    async def read_url(self, url: str) -> Optional[str]:
        """
        Read and return the textual content of ``url`` via the reader API.

        Args:
            url (str): The URL of the webpage to read content from.

        Returns:
            Optional[str]: The extracted text on success, or None when the
            request fails or the server answers with an HTTP error status.
        """
        try:
            data = {"url": url}  # Reader API accepts the target URL as POST form data
            # NOTE(review): requests is synchronous, so this blocks the event
            # loop despite the async signature — consider asyncio.to_thread.
            response = requests.post(self.reader_api, data=data, timeout=self.timeout)
            response.raise_for_status()  # Surface 4xx/5xx responses as exceptions
            return response.text
        except Exception:
            # Best-effort contract: any failure is reported as None, never raised.
            return None

    async def search(self, query: str, engine: str = "google") -> Optional[str]:
        """
        Perform a web search for ``query`` on the chosen engine.

        Args:
            query (str): The search query string.
            engine (str, optional): "google" (default), "baidu", or any other
                value, which selects Bing via SearXNG.

        Returns:
            Optional[str]: The content of the search results page on success,
            or None when the request fails or returns an HTTP error status.
        """
        try:
            url = self._search_url(query, engine)
            headers = {
                # CSS selector the reader API should extract from the results
                # page: Baidu's main content container vs. SearXNG's URL list.
                "X-Target-Selector": "#content_left" if engine == "baidu" else "#urls",
                "X-Forwarded-For": generate_ip(),  # Randomize apparent client origin
            }
            # NOTE(review): synchronous call inside an async method — blocks
            # the event loop; consider asyncio.to_thread.
            response = requests.get(url, headers=headers, timeout=self.timeout)
            response.raise_for_status()  # Surface 4xx/5xx responses as exceptions
            return response.text
        except Exception:
            # Best-effort contract: any failure is reported as None, never raised.
            return None