|
from contextlib import asynccontextmanager |
|
from typing import Optional |
|
from duckduckgo_search import DDGS |
|
from pydantic import BaseModel |
|
from playwright.async_api import Browser, BrowserContext, Page, TimeoutError |
|
from urllib.parse import quote_plus |
|
import logging |
|
import re |
|
|
|
class SerpResults(BaseModel):
    """Model for SERP scraping results"""

    # Human-readable failure description; None when the scrape succeeded.
    # Explicit default so the field stays optional under pydantic v2, where an
    # Optional annotation without a default is a *required* field.
    error: Optional[str] = None

    # Scraped result entries; None when the scrape failed.
    results: Optional[list[dict]] = None
|
|
|
|
|
class BraveSearchBlockedException(Exception):
    """Raised when Brave Search flags the headless browser as suspicious.

    The exception always carries the same fixed message. Positional arguments
    are accepted by the constructor but are ignored.
    """

    def __init__(self, *args):
        super().__init__(
            "Brave Search blocked the request, likely due to flagging browser as suspicious"
        )
|
|
|
|
|
@asynccontextmanager
async def playwright_open_page(browser: Browser):
    """Async context manager yielding a page in a fresh browser context.

    A new browser context is created for every use, a single page is opened
    in it and yielded to the caller. On exit the page is closed first and the
    context afterwards.

    Args:
        browser: Playwright browser on which the context is created.

    Yields:
        The newly opened page.
    """
    context: BrowserContext = await browser.new_context()
    try:
        page: Page = await context.new_page()
        try:
            yield page
        finally:
            await page.close()
    finally:
        # Outer finally: the context must be released even when opening or
        # closing the page fails, otherwise it leaks on the browser.
        await context.close()
|
|
|
|
|
async def query_google_scholar(browser: Browser, q: str, n_results: int = 10):
    """Query Google Scholar and return up to ``n_results`` matching papers.

    Args:
        browser: Playwright browser used to open a throwaway page.
        q: Search query; URL-encoded before it is placed in the request URL.
        n_results: Maximum number of results requested and returned.

    Returns:
        A list of dicts with the keys ``title``, ``body`` and ``href``.
        Result entries whose title, snippet or link cannot be read within
        one second are skipped.

    Raises:
        TimeoutError: Propagated from ``wait_for_selector`` when no
            ``div.gs_ri`` element shows up.
    """
    async with playwright_open_page(browser) as page:

        async def _block_resources(route, request):
            # Stylesheets and images are not needed to read result text.
            if request.resource_type in ["stylesheet", "image"]:
                await route.abort()
            else:
                await route.continue_()

        await page.route("**/*", _block_resources)

        url = f"https://scholar.google.com/scholar?q={quote_plus(q)}&num={n_results}"
        await page.goto(url)

        await page.wait_for_selector("div.gs_ri")

        items = await page.locator("div.gs_ri").all()
        results = []
        for item in items[:n_results]:
            try:
                title = await item.locator("h3").inner_text(timeout=1000)
                body = await item.locator("div.gs_rs").inner_text(timeout=1000)
                # Explicit timeout: without it a result that has no link
                # would stall for Playwright's default timeout.
                href = await item.locator("h3 > a").get_attribute("href", timeout=1000)
            except TimeoutError as exc:
                # One incomplete entry must not discard the entries that
                # were already parsed; skip it and carry on.
                logging.warning(
                    "Skipping Google Scholar result with missing fields: %s", exc)
                continue

            results.append({
                "title": title,
                "body": body,
                "href": href,
            })

        return results
|
|
|
|
|
async def query_google_patents(browser: Browser, q: str, n_results: int = 10):
    """Query Google Patents and return up to ``n_results`` matching patents.

    Args:
        browser: Playwright browser used to open a throwaway page.
        q: Search query; URL-encoded before it is placed in the request URL.
        n_results: Maximum number of results requested and returned.

    Returns:
        A list of dicts with the keys ``id``, ``href``, ``title`` and
        ``body``. Result items without a recognisable patent id, title or
        snippet are skipped.

    Raises:
        TimeoutError: Propagated from ``wait_for_function`` when no
            ``search-result-item`` element appears within 30 seconds.
    """
    # Two capital letters, at least six digits, optional kind code
    # (a capital letter optionally followed by one digit).
    patent_id_pattern = re.compile(r"\b[A-Z]{2}\d{6,}(?:[A-Z]\d?)?\b")

    async with playwright_open_page(browser) as page:

        async def _block_resources(route, request):
            # Stylesheets and images are not needed to read result text.
            if request.resource_type in ["stylesheet", "image"]:
                await route.abort()
            else:
                await route.continue_()

        await page.route("**/*", _block_resources)

        url = f"https://patents.google.com/?q={quote_plus(q)}&num={n_results}"
        await page.goto(url)

        # Block until at least one result item is present in the DOM.
        await page.wait_for_function(
            """() => document.querySelectorAll('search-result-item').length >= 1""",
            timeout=30_000
        )

        items = await page.locator("search-result-item").all()
        results = []
        for item in items:
            text = " ".join(await item.locator("span").all_inner_texts())
            match = patent_id_pattern.search(text)
            if not match:
                continue

            patent_id = match.group()

            try:
                title = await item.locator("h3, h4").first.inner_text(timeout=1000)
                body = await item.locator(
                    "div.abstract, div.result-snippet, .snippet, .result-text"
                ).first.inner_text(timeout=1000)
            except Exception as exc:
                # Best-effort parsing: skip items that cannot be read.
                # ``Exception`` instead of a bare ``except`` so that task
                # cancellation and KeyboardInterrupt still propagate.
                logging.debug("Skipping patent result %s: %s", patent_id, exc)
                continue

            results.append({
                "id": patent_id,
                "href": f"https://patents.google.com/patent/{patent_id}/en",
                "title": title,
                "body": body,
            })

            # Enough collected; no need to wait on the remaining items.
            if len(results) >= n_results:
                break

        return results[:n_results]
|
|
|
|
|
async def query_brave_search(browser: Browser, q: str, n_results: int = 10):
    """Query Brave Search and return the parsed result cards.

    Args:
        browser: Playwright browser used to open a throwaway page.
        q: Search query; URL-encoded before it is placed in the request URL.
        n_results: Collection stops once this many results were gathered.

    Returns:
        A list of dicts with the keys ``title``, ``body`` and ``href``.
        Cards without a link, or whose link starts with ``/``, are skipped.

    Raises:
        BraveSearchBlockedException: When no result card is found and the
            page content contains the word "suspicious".
    """
    async with playwright_open_page(browser) as page:

        async def _block_resources(route, request):
            # Stylesheets and images are not needed to read result text.
            if request.resource_type in ["stylesheet", "image"]:
                await route.abort()
            else:
                await route.continue_()

        await page.route("**/*", _block_resources)

        search_url = f"https://search.brave.com/search?q={quote_plus(q)}"
        await page.goto(search_url)

        results_cards = await page.locator('.snippet').all()

        if not results_cards:
            page_content = await page.content()
            if "suspicious" in page_content:
                raise BraveSearchBlockedException()

        results = []
        for card in results_cards:
            try:
                title = await card.locator('.title').all_inner_texts()
                description = await card.locator('.snippet-description').all_inner_texts()
                # Short explicit timeout: a card without any anchor would
                # otherwise stall for Playwright's default timeout.
                href = await card.locator('a').nth(0).get_attribute('href', timeout=1000)
            except TimeoutError as e:
                # Handled per card so one unreadable card does not drop
                # the cards that follow it.
                logging.warning(
                    "Timeout on selector while parsing Brave Search SERP: %s", e)
                continue

            # Hrefs starting with "/" are site-relative; they are dropped.
            if href is None or href.startswith('/'):
                continue

            results.append({
                "title": title[0] if title else "",
                "body": description[0] if description else "",
                "href": href,
            })

            if len(results) >= n_results:
                break

        return results
|
|
|
|
|
async def query_bing_search(browser: Browser, q: str, n_results: int = 10):
    """Run a Bing web search and return the parsed organic results.

    Args:
        browser: Playwright browser used to open a throwaway page.
        q: Search query; URL-encoded before it is placed in the request URL.
        n_results: Only the first ``n_results`` ``li.b_algo`` entries are
            inspected.

    Returns:
        A list of dicts with the keys ``title``, ``href`` and ``body``
        (all stripped). Entries lacking a title or link are left out.
    """
    # Tried in this order; the first selector yielding non-blank text wins.
    snippet_selectors = (
        "div.b_caption p",
        "div.b_caption",
        "div.b_snippet",
        "div.b_text",
        "p",
    )

    async with playwright_open_page(browser) as page:

        async def _skip_heavy_assets(route, request):
            # Let everything through except stylesheets and images.
            if request.resource_type not in ("stylesheet", "image"):
                await route.continue_()
                return
            await route.abort()

        await page.route("**/*", _skip_heavy_assets)
        await page.goto(f"https://www.bing.com/search?q={quote_plus(q)}")
        await page.wait_for_selector("li.b_algo")

        hits = []
        entries = await page.query_selector_all("li.b_algo")

        for entry in entries[:n_results]:
            link = await entry.query_selector("h2 > a")
            if link:
                href = await link.get_attribute("href")
                heading = await link.inner_text()
            else:
                href = None
                heading = ""

            summary = ""
            for css in snippet_selectors:
                node = await entry.query_selector(css)
                if not node:
                    continue
                summary = await node.inner_text()
                if summary.strip():
                    break

            if not (heading and href):
                continue

            hits.append({
                "title": heading.strip(),
                "href": href.strip(),
                "body": summary.strip(),
            })

        return hits
|
|
|
|
|
async def query_ddg_search(q: str, n_results: int = 10):
    """Query DuckDuckGo text search for the specified query.

    Args:
        q: Search query passed to ``DDGS.text``.
        n_results: Forwarded to ``DDGS.text`` as ``max_results``.

    Returns:
        A list of dicts with the keys ``title``, ``body`` and ``href``.
    """
    # Function-scope import keeps this block self-contained.
    import asyncio

    def _search():
        # ``DDGS.text`` is a synchronous call; it is consumed entirely inside
        # the worker thread so the event loop is never blocked by it.
        return [
            {"title": hit["title"], "body": hit["body"], "href": hit["href"]}
            for hit in DDGS().text(q, max_results=n_results)
        ]

    return await asyncio.to_thread(_search)
|
|