# ai/src/tools/deep_search.py
#
# SPDX-FileCopyrightText: Hadad <[email protected]>
# SPDX-License-Identifier: Apache-2.0
#
import asyncio  # Run blocking HTTP calls in a worker thread so coroutines do not block the event loop
from typing import Optional  # Optional return type: methods return None on failure
from urllib.parse import quote_plus  # Safely percent-encode query strings into URLs

import requests  # Synchronous HTTP client used for all outbound requests

from src.utils.ip_generator import generate_ip  # Generate random IP addresses for request headers
class SearchTools:
    """
    Tools for performing web searches and reading page content from URLs through
    search engines and a reader API service.

    Attributes:
        searxng_url (str): Base URL for the SearXNG search proxy service.
        baidu_url (str): Base URL for the Baidu search engine.
        timeout (int): Timeout in seconds for HTTP requests.
        reader_api (str): Base URL of the reader API used to extract content from URLs.

    Methods:
        read_url(url): Read and return the textual content of the specified URL using
            the reader API.
        search(query, engine): Perform a web search for the query on the specified
            engine ("google", "bing", or "baidu") and return the raw HTML response text.
    """
    def __init__(self):
        """
        Initialize the SearchTools instance with predefined URLs and timeout settings.
        """
        self.searxng_url = "https://paulgo.io/search"  # SearXNG search proxy service
        self.baidu_url = "https://www.baidu.com/s"  # Baidu search engine
        self.timeout = 30  # Timeout in seconds to avoid long-hanging connections
        self.reader_api = "https://r.jina.ai/"  # Reader API that extracts readable content from URLs
    async def read_url(self, url: str) -> Optional[str]:
        """
        Read and return the textual content of a given URL using the reader API.

        Args:
            url (str): The URL of the webpage to read content from.

        Returns:
            Optional[str]: The textual content extracted from the URL, or None if the
                request fails.
        """
        try:
            data = {"url": url}  # POST body carrying the target URL
            # Run the blocking POST in a worker thread so the event loop is not blocked
            response = await asyncio.to_thread(
                requests.post, self.reader_api, data=data, timeout=self.timeout
            )
            response.raise_for_status()  # Raise on HTTP error status codes
            return response.text  # Textual content extracted by the reader API
        except requests.RequestException:
            # Return None on any request or HTTP error
            return None
    async def search(self, query: str, engine: str = "google") -> Optional[str]:
        """
        Perform a web search for the given query using the specified search engine.

        Args:
            query (str): The search engine to use: "google", "bing", or "baidu".
                Defaults to "google".

        Returns:
            Optional[str]: The raw HTML of the search results page, or None if the
                request fails.
        """
        try:
            if engine == "baidu":
                # Route the Baidu results page through the reader API; 'wd' carries the query
                url = f"{self.reader_api}{self.baidu_url}?wd={quote_plus(query)}"
                # Target the main content container of Baidu search results
                headers = {
                    "X-Target-Selector": "#content_left",
                    "X-Forwarded-For": generate_ip(),  # Random IP to vary the apparent client origin
                }
            else:
                # SearXNG bang prefixes select the backend: !go for Google, !bi for Bing
                prefix = "!go" if engine == "google" else "!bi"
                # Route the SearXNG results page through the reader API with the prefixed query
                url = f"{self.reader_api}{self.searxng_url}?q={quote_plus(f'{prefix} {query}')}"
                # Target the URLs container in the SearXNG results page
                headers = {
                    "X-Target-Selector": "#urls",
                    "X-Forwarded-For": generate_ip(),  # Random IP to vary the apparent client origin
                }
            # Run the blocking GET in a worker thread so the event loop is not blocked
            response = await asyncio.to_thread(
                requests.get, url, headers=headers, timeout=self.timeout
            )
            response.raise_for_status()  # Raise on HTTP error status codes
            return response.text  # Raw HTML of the search results
        except requests.RequestException:
            # Return None on any request or HTTP error
            return None
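

# Minimal usage sketch (not part of the library API): assumes this module is run
# from the project root so that `src.utils.ip_generator` is importable, and that
# the external services configured above are reachable. Both methods are
# coroutines, so they must be driven by an event loop, e.g. via asyncio.run().
if __name__ == "__main__":
    async def demo():
        tools = SearchTools()
        html = await tools.search("python asyncio", engine="google")  # Raw results HTML or None
        print("search ok" if html else "search failed")
        text = await tools.read_url("https://example.com")  # Extracted page text or None
        print("read ok" if text else "read failed")

    asyncio.run(demo())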