from contextlib import asynccontextmanager
from typing import Optional

from fastapi import APIRouter, FastAPI
import httpx
from pydantic import BaseModel, Field
from playwright.async_api import async_playwright, Browser
import logging
import uvicorn

from scrap import scrap_patent_async, scrap_patent_bulk_async
from serp import (SerpResults, query_bing_search, query_brave_search,
                  query_ddg_search, query_google_patents, query_google_scholar)

logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s][%(levelname)s][%(filename)s:%(lineno)d]: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)

# Playwright global context, initialized and torn down by the app lifespan
playwright = None
pw_browser: Optional[Browser] = None

# Shared httpx client for plain-HTTP scraping
httpx_client = httpx.AsyncClient(timeout=30, limits=httpx.Limits(
    max_connections=30, max_keepalive_connections=20))


@asynccontextmanager
async def api_lifespan(app: FastAPI):
    """Starts Playwright and launches a headless Chromium instance for the
    lifetime of the app, closing both on shutdown."""
    global playwright, pw_browser
    playwright = await async_playwright().start()
    pw_browser = await playwright.chromium.launch(headless=True)

    yield

    await pw_browser.close()
    await playwright.stop()


app = FastAPI(lifespan=api_lifespan, docs_url="/", title="SERPent",
              description=open("docs/docs.md").read())

# Router for scraping-related endpoints
scrap_router = APIRouter(prefix="/scrap", tags=["scraping"])
# Router for SERP-scraping related endpoints
serp_router = APIRouter(prefix="/serp", tags=["serp scraping"])

# ===================== Search endpoints =====================


class SerpQuery(BaseModel):
    queries: list[str] = Field(...,
                               description="The list of queries to search for")
    n_results: int = Field(
        10, description="Number of results to return for each query. Valid values are 10, 25, 50 and 100")


@serp_router.post("/search_scholar")
async def search_google_scholar(params: SerpQuery) -> SerpResults:
    """Searches Google Scholar for the specified queries and returns the found documents."""
    results = []
    last_exception: Optional[Exception] = None
    for q in params.queries:
        logging.info(f"Searching Google Scholar with query `{q}`")
        try:
            res = await query_google_scholar(pw_browser, q, params.n_results)
            results.extend(res)
        except Exception as e:
            last_exception = e
            logging.error(
                f"Failed to query Google Scholar with query `{q}`: {e}")
    return SerpResults(results=results, error=str(last_exception) if len(results) == 0 and last_exception else None)


@serp_router.post("/search_patents")
async def search_patents(params: SerpQuery) -> SerpResults:
    """Searches Google Patents for the specified queries and returns the found documents."""
    results = []
    last_exception: Optional[Exception] = None
    for q in params.queries:
        logging.info(f"Searching Google Patents with query `{q}`")
        try:
            res = await query_google_patents(pw_browser, q, params.n_results)
            results.extend(res)
        except Exception as e:
            last_exception = e
            logging.error(
                f"Failed to query Google Patents with query `{q}`: {e}")
    return SerpResults(results=results, error=str(last_exception) if len(results) == 0 and last_exception else None)


@serp_router.post("/search_brave")
async def search_brave(params: SerpQuery) -> SerpResults:
    """Searches Brave Search for the specified queries and returns the found documents."""
    results = []
    last_exception: Optional[Exception] = None
    for q in params.queries:
        logging.info(f"Searching Brave Search with query `{q}`")
        try:
            res = await query_brave_search(pw_browser, q, params.n_results)
            results.extend(res)
        except Exception as e:
            last_exception = e
            logging.error(
                f"Failed to query Brave Search with query `{q}`: {e}")
    return SerpResults(results=results, error=str(last_exception) if len(results) == 0 and last_exception else None)


@serp_router.post("/search_bing")
async def search_bing(params: SerpQuery) -> SerpResults:
    """Searches Bing for the specified queries and returns the found documents."""
    results = []
    last_exception: Optional[Exception] = None
    for q in params.queries:
        logging.info(f"Searching Bing with query `{q}`")
        try:
            res = await query_bing_search(pw_browser, q, params.n_results)
            results.extend(res)
        except Exception as e:
            last_exception = e
            logging.error(
                f"Failed to query Bing with query `{q}`: {e}")
    return SerpResults(results=results, error=str(last_exception) if len(results) == 0 and last_exception else None)


@serp_router.post("/search_duck")
async def search_duck(params: SerpQuery) -> SerpResults:
    """Searches DuckDuckGo for the specified queries and returns the found documents."""
    results = []
    last_exception: Optional[Exception] = None
    for q in params.queries:
        logging.info(f"Querying DDG with query: `{q}`")
        try:
            res = await query_ddg_search(q, params.n_results)
            results.extend(res)
        except Exception as e:
            last_exception = e
            logging.error(f"Failed to query DDG with query `{q}`: {e}")
    return SerpResults(results=results, error=str(last_exception) if len(results) == 0 and last_exception else None)


@serp_router.post("/search")
async def search(params: SerpQuery) -> SerpResults:
    """Attempts to search the specified queries using ALL backends, falling
    back from DuckDuckGo to Brave Search to Bing for each query."""
    results = []
    for q in params.queries:
        try:
            logging.info(f"Querying DDG with query: `{q}`")
            res = await query_ddg_search(q, params.n_results)
            results.extend(res)
            continue
        except Exception as e:
            logging.error(f"Failed to query DDG with query `{q}`: {e}")
            logging.info("Trying with next browser backend.")

        try:
            logging.info(f"Querying Brave Search with query: `{q}`")
            res = await query_brave_search(pw_browser, q, params.n_results)
            results.extend(res)
            continue
        except Exception as e:
            logging.error(
                f"Failed to query Brave Search with query `{q}`: {e}")
            logging.info("Trying with next browser backend.")

        try:
            logging.info(f"Querying Bing with query: `{q}`")
            res = await query_bing_search(pw_browser, q, params.n_results)
            results.extend(res)
            continue
        except Exception as e:
            logging.error(f"Failed to query Bing with query `{q}`: {e}")
            logging.info(f"All backends failed for query `{q}`.")

    if len(results) == 0:
        return SerpResults(results=[], error="All backends are rate-limited.")

    return SerpResults(results=results, error=None)


# =========================== Scraping endpoints ===========================


@scrap_router.get("/scrap_patent/{patent_id}")
async def scrap_patent(patent_id: str):
    """Scrapes the specified patent from Google Patents."""
    patent = await scrap_patent_async(httpx_client, f"https://patents.google.com/patent/{patent_id}/en")
    return patent


class ScrapPatentsRequest(BaseModel):
    """Request model for scraping multiple patents."""
    patent_ids: list[str] = Field(...,
                                  description="List of patent IDs to scrape")


@scrap_router.post("/scrap_patents_bulk")
async def scrap_patents(params: ScrapPatentsRequest):
    """Scrapes multiple patents from Google Patents."""
    patents = await scrap_patent_bulk_async(httpx_client, [
        f"https://patents.google.com/patent/{pid}/en" for pid in params.patent_ids])
    return patents


# ===============================================================================

app.include_router(serp_router)
app.include_router(scrap_router)

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)
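# ------------------------------------------------------------------------------
# Illustrative client sketch (comment only, not executed by this module): one
# way a caller might exercise the fallback /serp/search endpoint, assuming the
# server is running locally on the port configured above. The query string and
# result handling below are placeholders, not part of this service.
#
#   import httpx
#   resp = httpx.post(
#       "http://localhost:7860/serp/search",
#       json={"queries": ["example query"], "n_results": 10},
#   )
#   resp.raise_for_status()
#   print(resp.json())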