# ai/src/tools/deep_search.py
#
# SPDX-FileCopyrightText: Hadad <[email protected]>
# SPDX-License-Identifier: Apache-2.0
#
import asyncio  # Run blocking HTTP calls in a worker thread so coroutines do not block the event loop
from typing import Optional  # Optional return type: methods return None on failure
from urllib.parse import quote_plus  # Safely percent-encode query strings into URLs

import requests  # Synchronous HTTP client used for all outbound requests

from src.utils.ip_generator import generate_ip  # Generate random IP addresses for request headers
class SearchTools:
    """
    Tools for performing web searches and reading page content from URLs through
    search engines and a reader API service.

    Attributes:
        searxng_url (str): Base URL for the SearXNG search proxy service.
        baidu_url (str): Base URL for the Baidu search engine.
        timeout (int): Timeout in seconds for HTTP requests.
        reader_api (str): Base URL of the reader API used to extract content from URLs.

    Methods:
        read_url(url): Read and return the textual content of the specified URL using
            the reader API.
        search(query, engine): Perform a web search for the query on the specified
            engine ("google", "bing", or "baidu") and return the raw HTML response text.
    """
    def __init__(self):
        """
        Initialize the SearchTools instance with predefined URLs and timeout settings.
        """
        self.searxng_url = "https://paulgo.io/search"  # SearXNG search proxy service
        self.baidu_url = "https://www.baidu.com/s"  # Baidu search engine
        self.timeout = 30  # Timeout in seconds to avoid long-hanging connections
        self.reader_api = "https://r.jina.ai/"  # Reader API that extracts readable content from URLs
    async def read_url(self, url: str) -> Optional[str]:
        """
        Read and return the textual content of a given URL using the reader API.

        Args:
            url (str): The URL of the webpage to read content from.

        Returns:
            Optional[str]: The textual content extracted from the URL, or None if the
                request fails.
        """
        try:
            data = {"url": url}  # POST body carrying the target URL
            # Run the blocking POST in a worker thread so the event loop is not blocked
            response = await asyncio.to_thread(
                requests.post, self.reader_api, data=data, timeout=self.timeout
            )
            response.raise_for_status()  # Raise on HTTP error status codes
            return response.text  # Textual content extracted by the reader API
        except requests.RequestException:
            # Return None on any request or HTTP error
            return None
    async def search(self, query: str, engine: str = "google") -> Optional[str]:
        """
        Perform a web search for the given query using the specified search engine.

        Args:
            query (str): The search engine to use: "google", "bing", or "baidu".
                Defaults to "google".

        Returns:
            Optional[str]: The raw HTML of the search results page, or None if the
                request fails.
        """
        try:
            if engine == "baidu":
                # Route the Baidu results page through the reader API; 'wd' carries the query
                url = f"{self.reader_api}{self.baidu_url}?wd={quote_plus(query)}"
                # Target the main content container of Baidu search results
                headers = {
                    "X-Target-Selector": "#content_left",
                    "X-Forwarded-For": generate_ip(),  # Random IP to vary the apparent client origin
                }
            else:
                # SearXNG bang prefixes select the backend: !go for Google, !bi for Bing
                prefix = "!go" if engine == "google" else "!bi"
                # Route the SearXNG results page through the reader API with the prefixed query
                url = f"{self.reader_api}{self.searxng_url}?q={quote_plus(f'{prefix} {query}')}"
                # Target the URLs container in the SearXNG results page
                headers = {
                    "X-Target-Selector": "#urls",
                    "X-Forwarded-For": generate_ip(),  # Random IP to vary the apparent client origin
                }
            # Run the blocking GET in a worker thread so the event loop is not blocked
            response = await asyncio.to_thread(
                requests.get, url, headers=headers, timeout=self.timeout
            )
            response.raise_for_status()  # Raise on HTTP error status codes
            return response.text  # Raw HTML of the search results
        except requests.RequestException:
            # Return None on any request or HTTP error
            return None
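

# Minimal usage sketch (not part of the library API): assumes this module is run
# from the project root so that `src.utils.ip_generator` is importable, and that
# the external services configured above are reachable. Both methods are
# coroutines, so they must be driven by an event loop, e.g. via asyncio.run().
if __name__ == "__main__":
    async def demo():
        tools = SearchTools()
        html = await tools.search("python asyncio", engine="google")  # Raw results HTML or None
        print("search ok" if html else "search failed")
        text = await tools.read_url("https://example.com")  # Extracted page text or None
        print("read ok" if text else "read failed")

    asyncio.run(demo())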