from contextlib import asynccontextmanager
from typing import Optional

from fastapi import APIRouter, FastAPI
import httpx
from pydantic import BaseModel, Field
from playwright.async_api import async_playwright, Browser
import logging
import uvicorn

from scrap import scrap_patent_async, scrap_patent_bulk_async
from serp import SerpResults, query_bing_search, query_brave_search, query_ddg_search, query_google_patents, query_google_scholar

logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s][%(levelname)s][%(filename)s:%(lineno)d]: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)

# Playwright global context
playwright = None
pw_browser: Optional[Browser] = None

# Shared httpx client
httpx_client = httpx.AsyncClient(timeout=30, limits=httpx.Limits(
    max_connections=30, max_keepalive_connections=20))


@asynccontextmanager
async def api_lifespan(app: FastAPI):
    """Starts Playwright and launches a headless Chromium instance for the app's lifetime."""
    global playwright, pw_browser
    playwright = await async_playwright().start()
    pw_browser = await playwright.chromium.launch(headless=True)
    yield
    await pw_browser.close()
    await playwright.stop()


with open("docs/docs.md") as f:
    api_description = f.read()

app = FastAPI(lifespan=api_lifespan, docs_url="/",
              title="SERPent", description=api_description)

# Router for scraping-related endpoints
scrap_router = APIRouter(prefix="/scrap", tags=["scraping"])
# Router for SERP-scraping related endpoints
serp_router = APIRouter(prefix="/serp", tags=["serp scraping"])

# ===================== Search endpoints =====================


class SerpQuery(BaseModel):
    queries: list[str] = Field(...,
                               description="The list of queries to search for")
    n_results: int = Field(
        10, description="Number of results to return for each query. Valid values are 10, 25, 50 and 100")
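
# Example request body accepted by the SERP endpoints below (illustrative
# values, not canonical queries):
#   {"queries": ["perovskite solar cell"], "n_results": 10}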


@serp_router.post("/google_scholar")
async def search_google_scholar(params: SerpQuery) -> SerpResults:
    """Queries Google Scholar for the specified queries."""
    results = []
    for q in params.queries:
        logging.info(f"Searching Google Scholar with query `{q}`")
        try:
            res = await query_google_scholar(pw_browser, q, params.n_results)
            results.extend(res)
        except Exception as e:
            logging.error(
                f"Failed to query Google Scholar with query `{q}`: {e}")
    return SerpResults(results=results, error=None)


@serp_router.post("/google_patents")
async def search_patents(params: SerpQuery) -> SerpResults:
    """Searches Google Patents for the specified queries and returns the found documents."""
    results = []
    for q in params.queries:
        logging.info(f"Searching Google Patents with query `{q}`")
        try:
            res = await query_google_patents(pw_browser, q, params.n_results)
            results.extend(res)
        except Exception as e:
            logging.error(
                f"Failed to query Google Patents with query `{q}`: {e}")
    return SerpResults(results=results, error=None)


@serp_router.post("/brave")
async def search_brave(params: SerpQuery) -> SerpResults:
    """Searches Brave Search for the specified queries and returns the found documents."""
    results = []
    last_exception: Optional[Exception] = None
    for q in params.queries:
        logging.info(f"Searching Brave Search with query `{q}`")
        try:
            res = await query_brave_search(pw_browser, q, params.n_results)
            results.extend(res)
        except Exception as e:
            last_exception = e
            logging.error(
                f"Failed to query Brave Search with query `{q}`: {e}")
    return SerpResults(results=results, error=str(last_exception) if len(results) == 0 and last_exception else None)


@serp_router.post("/bing")
async def search_bing(params: SerpQuery) -> SerpResults:
    """Searches Bing for the specified queries and returns the found documents."""
    results = []
    last_exception: Optional[Exception] = None
    for q in params.queries:
        logging.info(f"Searching Bing with query `{q}`")
        try:
            res = await query_bing_search(pw_browser, q, params.n_results)
            results.extend(res)
        except Exception as e:
            last_exception = e
            logging.error(
                f"Failed to query Bing with query `{q}`: {e}")
    return SerpResults(results=results, error=str(last_exception) if len(results) == 0 and last_exception else None)


@serp_router.post("/duck")
async def search_duck(params: SerpQuery) -> SerpResults:
    """Searches DuckDuckGo for the specified queries and returns the found documents."""
    results = []
    last_exception: Optional[Exception] = None
    for q in params.queries:
        logging.info(f"Querying DDG with query: `{q}`")
        try:
            res = await query_ddg_search(q, params.n_results)
            results.extend(res)
        except Exception as e:
            last_exception = e
            logging.error(f"Failed to query DDG with query `{q}`: {e}")
    return SerpResults(results=results, error=str(last_exception) if len(results) == 0 and last_exception else None)
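
# Note on the error contract above: the Brave, Bing and DDG endpoints set
# `error` to the last exception only when *no* query returned results;
# partial failures are logged but still yield a successful response.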


@serp_router.post("/search")
async def search(params: SerpQuery) -> SerpResults:
    """Attempts to search the specified queries using ALL backends, falling back from one to the next."""
    results = []
    for q in params.queries:
        try:
            logging.info(f"Querying DDG with query: `{q}`")
            res = await query_ddg_search(q, params.n_results)
            results.extend(res)
            continue
        except Exception as e:
            logging.error(f"Failed to query DDG with query `{q}`: {e}")
            logging.info("Trying with next backend.")

        try:
            logging.info(f"Querying Brave Search with query: `{q}`")
            res = await query_brave_search(pw_browser, q, params.n_results)
            results.extend(res)
            continue
        except Exception as e:
            logging.error(
                f"Failed to query Brave Search with query `{q}`: {e}")
            logging.info("Trying with next backend.")

        try:
            logging.info(f"Querying Bing with query: `{q}`")
            res = await query_bing_search(pw_browser, q, params.n_results)
            results.extend(res)
            continue
        except Exception as e:
            logging.error(f"Failed to query Bing with query `{q}`: {e}")
            logging.info("No more backends to try for this query.")

    if len(results) == 0:
        return SerpResults(results=[], error="All backends failed or are rate-limited.")

    return SerpResults(results=results, error=None)
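
# The combined endpoint tries the browser-less DDG backend first, then falls
# back to the browser-driven Brave and Bing scrapers for each query, so one
# rate-limited backend does not fail the whole request.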

# =========================== Scraping endpoints ===========================


@scrap_router.get("/patent/{patent_id}")
async def scrap_patent(patent_id: str):
    """Scrapes the specified patent from Google Patents."""
    patent = await scrap_patent_async(httpx_client, f"https://patents.google.com/patent/{patent_id}/en")
    return patent


class ScrapPatentsRequest(BaseModel):
    """Request model for scraping multiple patents."""
    patent_ids: list[str] = Field(...,
                                  description="List of patent IDs to scrape")


@scrap_router.post("/patents")
async def scrap_patents(params: ScrapPatentsRequest):
    """Scrapes multiple patents from Google Patents."""
    patents = await scrap_patent_bulk_async(httpx_client, [
        f"https://patents.google.com/patent/{pid}/en" for pid in params.patent_ids])
    return patents

# ===============================================================================

app.include_router(serp_router)
app.include_router(scrap_router)

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)
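
# A minimal client sketch (assumes a server on localhost:7860 and the route
# paths declared above; adjust if your deployment differs):
#
#   import asyncio
#   import httpx
#
#   async def main():
#       async with httpx.AsyncClient(base_url="http://localhost:7860") as client:
#           resp = await client.post(
#               "/serp/search", json={"queries": ["lithium anode"], "n_results": 10})
#           print(resp.json())
#
#   asyncio.run(main())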