from contextlib import asynccontextmanager
from typing import Optional

from fastapi import APIRouter, FastAPI
import httpx
from pydantic import BaseModel, Field
from playwright.async_api import async_playwright, Browser
import logging
import uvicorn

from scrap import scrap_patent_async, scrap_patent_bulk_async
from serp import SerpResults, query_bing_search, query_brave_search, query_ddg_search, query_google_patents, query_google_scholar

logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s][%(levelname)s][%(filename)s:%(lineno)d]: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)

# Playwright global context
playwright = None
pw_browser: Optional[Browser] = None

# Shared httpx client
httpx_client = httpx.AsyncClient(timeout=30, limits=httpx.Limits(
    max_connections=30, max_keepalive_connections=20))


@asynccontextmanager
async def api_lifespan(app: FastAPI):
    """Starts Playwright and launches a headless Chromium instance for the app's lifetime."""
    global playwright, pw_browser
    playwright = await async_playwright().start()
    pw_browser = await playwright.chromium.launch(headless=True)
    yield
    await pw_browser.close()
    await playwright.stop()


with open("docs/docs.md") as f:
    api_description = f.read()

app = FastAPI(lifespan=api_lifespan, docs_url="/",
              title="SERPent", description=api_description)

# Router for scraping-related endpoints
scrap_router = APIRouter(prefix="/scrap", tags=["scraping"])
# Router for SERP-scraping related endpoints
serp_router = APIRouter(prefix="/serp", tags=["serp scraping"])

# ===================== Search endpoints =====================


class SerpQuery(BaseModel):
    queries: list[str] = Field(...,
                               description="The list of queries to search for")
    n_results: int = Field(
        10, description="Number of results to return for each query. Valid values are 10, 25, 50 and 100")
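
# Example request body accepted by the SERP endpoints below (illustrative
# values, not canonical queries):
#   {"queries": ["perovskite solar cell"], "n_results": 10}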


@serp_router.post("/google_scholar")
async def search_google_scholar(params: SerpQuery) -> SerpResults:
    """Queries Google Scholar for the specified queries."""
    results = []
    for q in params.queries:
        logging.info(f"Searching Google Scholar with query `{q}`")
        try:
            res = await query_google_scholar(pw_browser, q, params.n_results)
            results.extend(res)
        except Exception as e:
            logging.error(
                f"Failed to query Google Scholar with query `{q}`: {e}")
    return SerpResults(results=results, error=None)


@serp_router.post("/google_patents")
async def search_patents(params: SerpQuery) -> SerpResults:
    """Searches Google Patents for the specified queries and returns the found documents."""
    results = []
    for q in params.queries:
        logging.info(f"Searching Google Patents with query `{q}`")
        try:
            res = await query_google_patents(pw_browser, q, params.n_results)
            results.extend(res)
        except Exception as e:
            logging.error(
                f"Failed to query Google Patents with query `{q}`: {e}")
    return SerpResults(results=results, error=None)


@serp_router.post("/brave")
async def search_brave(params: SerpQuery) -> SerpResults:
    """Searches Brave Search for the specified queries and returns the found documents."""
    results = []
    last_exception: Optional[Exception] = None
    for q in params.queries:
        logging.info(f"Searching Brave Search with query `{q}`")
        try:
            res = await query_brave_search(pw_browser, q, params.n_results)
            results.extend(res)
        except Exception as e:
            last_exception = e
            logging.error(
                f"Failed to query Brave Search with query `{q}`: {e}")
    return SerpResults(results=results, error=str(last_exception) if len(results) == 0 and last_exception else None)


@serp_router.post("/bing")
async def search_bing(params: SerpQuery) -> SerpResults:
    """Searches Bing for the specified queries and returns the found documents."""
    results = []
    last_exception: Optional[Exception] = None
    for q in params.queries:
        logging.info(f"Searching Bing with query `{q}`")
        try:
            res = await query_bing_search(pw_browser, q, params.n_results)
            results.extend(res)
        except Exception as e:
            last_exception = e
            logging.error(
                f"Failed to query Bing with query `{q}`: {e}")
    return SerpResults(results=results, error=str(last_exception) if len(results) == 0 and last_exception else None)


@serp_router.post("/duck")
async def search_duck(params: SerpQuery) -> SerpResults:
    """Searches DuckDuckGo for the specified queries and returns the found documents."""
    results = []
    last_exception: Optional[Exception] = None
    for q in params.queries:
        logging.info(f"Querying DDG with query: `{q}`")
        try:
            res = await query_ddg_search(q, params.n_results)
            results.extend(res)
        except Exception as e:
            last_exception = e
            logging.error(f"Failed to query DDG with query `{q}`: {e}")
    return SerpResults(results=results, error=str(last_exception) if len(results) == 0 and last_exception else None)
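
# Note on the error contract above: the Brave, Bing and DDG endpoints set
# `error` to the last exception only when *no* query returned results;
# partial failures are logged but still yield a successful response.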


@serp_router.post("/search")
async def search(params: SerpQuery) -> SerpResults:
    """Attempts to search the specified queries using ALL backends, falling back from one to the next."""
    results = []
    for q in params.queries:
        try:
            logging.info(f"Querying DDG with query: `{q}`")
            res = await query_ddg_search(q, params.n_results)
            results.extend(res)
            continue
        except Exception as e:
            logging.error(f"Failed to query DDG with query `{q}`: {e}")
            logging.info("Trying with next backend.")

        try:
            logging.info(f"Querying Brave Search with query: `{q}`")
            res = await query_brave_search(pw_browser, q, params.n_results)
            results.extend(res)
            continue
        except Exception as e:
            logging.error(
                f"Failed to query Brave Search with query `{q}`: {e}")
            logging.info("Trying with next backend.")

        try:
            logging.info(f"Querying Bing with query: `{q}`")
            res = await query_bing_search(pw_browser, q, params.n_results)
            results.extend(res)
            continue
        except Exception as e:
            logging.error(f"Failed to query Bing with query `{q}`: {e}")
            logging.info("No more backends to try for this query.")

    if len(results) == 0:
        return SerpResults(results=[], error="All backends failed or are rate-limited.")

    return SerpResults(results=results, error=None)
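
# The combined endpoint tries the browser-less DDG backend first, then falls
# back to the browser-driven Brave and Bing scrapers for each query, so one
# rate-limited backend does not fail the whole request.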

# =========================== Scraping endpoints ===========================


@scrap_router.get("/patent/{patent_id}")
async def scrap_patent(patent_id: str):
    """Scrapes the specified patent from Google Patents."""
    patent = await scrap_patent_async(httpx_client, f"https://patents.google.com/patent/{patent_id}/en")
    return patent


class ScrapPatentsRequest(BaseModel):
    """Request model for scraping multiple patents."""
    patent_ids: list[str] = Field(...,
                                  description="List of patent IDs to scrape")


@scrap_router.post("/patents")
async def scrap_patents(params: ScrapPatentsRequest):
    """Scrapes multiple patents from Google Patents."""
    patents = await scrap_patent_bulk_async(httpx_client, [
        f"https://patents.google.com/patent/{pid}/en" for pid in params.patent_ids])
    return patents

# ===============================================================================

app.include_router(serp_router)
app.include_router(scrap_router)

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)
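
# A minimal client sketch (assumes a server on localhost:7860 and the route
# paths declared above; adjust if your deployment differs):
#
#   import asyncio
#   import httpx
#
#   async def main():
#       async with httpx.AsyncClient(base_url="http://localhost:7860") as client:
#           resp = await client.post(
#               "/serp/search", json={"queries": ["lithium anode"], "n_results": 10})
#           print(resp.json())
#
#   asyncio.run(main())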