from __future__ import annotations

import asyncio

from aiohttp import ClientSession, ClientTimeout

try:
    from duckduckgo_search import DDGS
    from bs4 import BeautifulSoup
    has_requirements = True
except ImportError:
    has_requirements = False

from ...errors import MissingRequirementsError
from ... import debug

class SearchResults:
    """Container for search result entries plus the word budget they consumed."""

    def __init__(self, results: list, used_words: int):
        self.results = results
        self.used_words = used_words

    def __iter__(self):
        yield from self.results

    def __str__(self) -> str:
        search = ""
        for idx, result in enumerate(self.results):
            if search:
                search += "\n\n\n"
            search += f"Title: {result.title}\n\n"
            # Prefer the full scraped page text; fall back to the search snippet.
            if result.text:
                search += result.text
            else:
                search += result.snippet
            search += f"\n\nSource: [[{idx}]]({result.url})"
        return search

    def __len__(self) -> int:
        return len(self.results)
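
# Rendered via __str__, a SearchResults instance looks roughly like this
# (values are illustrative):
#
#   Title: Example Page
#
#   Scraped page text, or the search snippet when no text was fetched ...
#
#   Source: [[0]](https://example.com)
#
# Entries are separated by blank lines, and each carries a numbered Markdown
# source link that a downstream prompt can cite.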

class SearchResultEntry:
    """A single search hit: title, URL, snippet, and optional full page text."""

    def __init__(self, title: str, url: str, snippet: str, text: str = None):
        self.title = title
        self.url = url
        self.snippet = snippet
        self.text = text

    def set_text(self, text: str):
        self.text = text

def scrape_text(html: str, max_words: int = None) -> str:
    """Extract readable text from HTML, optionally capped at max_words words."""
    soup = BeautifulSoup(html, "html.parser")
    # Narrow the document to its main content container when one is present.
    for selector in [
        "main",
        ".main-content-wrapper",
        ".main-content",
        ".emt-container-inner",
        ".content-wrapper",
        "#content",
        "#mainContent",
    ]:
        select = soup.select_one(selector)
        if select:
            soup = select
            break

    # Strip known boilerplate elements such as disclosure banners.
    for remove in [".c-globalDisclosure"]:
        select = soup.select_one(remove)
        if select:
            select.extract()

    clean_text = ""
    for paragraph in soup.select("p, h1, h2, h3, h4, h5, h6"):
        for line in paragraph.get_text().splitlines():
            words = [word for word in line.replace("\t", " ").split(" ") if word]
            if not words:
                continue
            if max_words is not None:
                max_words -= len(words)
                if max_words <= 0:
                    # Budget exhausted: drop the overflowing line and stop
                    # scraping entirely. (Checking "is not None" instead of
                    # truthiness keeps a budget that reaches exactly 0 from
                    # silently disabling the cap.)
                    return clean_text
            if clean_text:
                clean_text += "\n"
            clean_text += " ".join(words)

    return clean_text
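
# A quick illustration with hypothetical HTML (the markup and budget here are
# only examples):
#
#   >>> scrape_text("<main><h1>Hi</h1><p>one two three</p></main>", max_words=2)
#   'Hi'
#
# "Hi" spends one word of the budget; the paragraph's three words would
# overflow the single remaining word, so that line is dropped and scraping
# stops.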

async def fetch_and_scrape(session: ClientSession, url: str, max_words: int = None) -> str:
    """Fetch a URL and return its scraped text; returns None on errors or non-200."""
    try:
        async with session.get(url) as response:
            if response.status == 200:
                html = await response.text()
                return scrape_text(html, max_words)
    except Exception:
        # Network errors, timeouts, and decoding failures all degrade to None;
        # a bare "except:" would also swallow KeyboardInterrupt, so avoid it.
        return None

async def search(query: str, n_results: int = 5, max_words: int = 2500, add_text: bool = True) -> SearchResults:
    if not has_requirements:
        raise MissingRequirementsError('Install the "duckduckgo-search" and "beautifulsoup4" packages | pip install -U g4f[search]')
    with DDGS() as ddgs:
        results = []
        # Worldwide region, moderate safe search, results from the past year.
        for result in ddgs.text(
            query,
            region="wt-wt",
            safesearch="moderate",
            timelimit="y",
            max_results=n_results,
        ):
            results.append(SearchResultEntry(
                result["title"],
                result["href"],
                result["body"]
            ))

    if add_text:
        requests = []
        async with ClientSession(timeout=ClientTimeout(total=5)) as session:
            for entry in results:
                # Give each page a share of the word budget; max() guards
                # against a zero division when n_results is 1.
                requests.append(fetch_and_scrape(session, entry.url, int(max_words / max(n_results - 1, 1))))
            texts = await asyncio.gather(*requests)

    formatted_results = []
    used_words = 0
    left_words = max_words
    for i, entry in enumerate(results):
        if add_text:
            entry.text = texts[i]
        # Charge each entry's title, text (or snippet), and formatting overhead
        # against the remaining word budget; stop once the budget is exceeded.
        left_words -= entry.title.count(" ") + 5
        if entry.text:
            left_words -= entry.text.count(" ")
        else:
            left_words -= entry.snippet.count(" ")
        if left_words < 0:
            break
        used_words = max_words - left_words
        formatted_results.append(entry)

    return SearchResults(formatted_results, used_words)
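
# Typical direct usage (search() is a coroutine, so it needs an event loop);
# the query below is purely illustrative:
#
#   results = asyncio.run(search("python asyncio tutorial", n_results=3))
#   for entry in results:
#       print(entry.title, entry.url)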

def get_search_message(prompt, n_results: int = 5, max_words: int = 2500) -> str:
    """Prepend web search results and citation instructions to a user prompt.

    Falls back to returning the prompt unchanged if the search fails.
    """
    try:
        search_results = asyncio.run(search(prompt, n_results, max_words))
        message = f"""
{search_results}


Instruction: Using the provided web search results, write a comprehensive reply to the user request.
Make sure to cite sources using [[Number]](Url) notation after the reference. Example: [[0]](http://google.com)

User request:
{prompt}
"""
        debug.log(f"Web search: '{prompt.strip()[:50]}...' {search_results.used_words} Words")
        return message
    except Exception as e:
        debug.log(f"Couldn't do web search: {e.__class__.__name__}: {e}")
        return prompt
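
# A minimal sketch of exercising this module from the command line. The
# default query is purely illustrative; get_search_message() returns the
# prompt unchanged whenever the search fails, so the demo degrades cleanly
# without network access or the optional dependencies.
if __name__ == "__main__":
    import sys

    query = sys.argv[1] if len(sys.argv) > 1 else "what is the g4f python package?"
    # Prints either the augmented prompt (results + citation instructions +
    # the original request) or the bare query on failure.
    print(get_search_message(query, n_results=3, max_words=1000))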