Deploy scrap endpoints

- app.py +50 -18
- requirements.txt +3 -1
- scrap.py +63 -0
- backends.py → search.py +4 -1
app.py
CHANGED
@@ -1,18 +1,16 @@
 from contextlib import asynccontextmanager
-import json
 from typing import Optional
-from duckduckgo_search import DDGS
-from duckduckgo_search.exceptions import RatelimitException
 import expiringdict
-from fastapi import FastAPI
+from fastapi import APIRouter, FastAPI
+from fastapi.routing import APIRouter as Router
+import httpx
 from pydantic import BaseModel, Field
 from playwright.async_api import async_playwright, Browser, BrowserContext, Page
-from urllib.parse import quote_plus
 import logging
-import re
 import uvicorn
 
-from backends import APISearchResults, APIPatentResults, query_bing_search, query_brave_search, query_ddg_search, query_google_patents
+from scrap import scrap_patent_async, scrap_patent_bulk_async
+from search import APISearchResults, APIPatentResults, query_bing_search, query_brave_search, query_ddg_search, query_google_patents
 
 logging.basicConfig(
     level=logging.INFO,
@@ -24,13 +22,16 @@ logging.basicConfig(
 playwright = None
 pw_browser: Optional[Browser] = None
 
+# httpx client
+httpx_client = httpx.AsyncClient(timeout=30, limits=httpx.Limits(
+    max_connections=15, max_keepalive_connections=20))
+
 
 @asynccontextmanager
 async def api_lifespan(app: FastAPI):
     global playwright, pw_browser
     playwright = await async_playwright().start()
     pw_browser = await playwright.chromium.launch(headless=True)
-
     yield
 
     await pw_browser.close()
@@ -39,6 +40,20 @@ async def api_lifespan(app: FastAPI):
 app = FastAPI(lifespan=api_lifespan)
 backend_status = expiringdict.ExpiringDict(max_len=5, max_age_seconds=15*60)
 
+# Router for scrapping related endpoints
+scrap_router = APIRouter(prefix="/scrap", tags=["scrapping"])
+
+@app.get('/')
+async def status():
+    backend_keys = [k[0] for k in backend_status.items()]
+    backend_status_dict = {}
+
+    for k in backend_keys:
+        backend_status_dict[k] = backend_status.get(k)
+    return {"status": "running", "backend_status": backend_status_dict}
+
+# ===================== Search endpoints =====================
+
 
 class APISearchParams(BaseModel):
     queries: list[str] = Field(...,
@@ -53,16 +68,6 @@ async def query_google_scholar(params: APISearchParams):
     return {"error": "Unimplemented"}
 
 
-@app.get('/')
-async def status():
-    backend_keys = [k[0] for k in backend_status.items()]
-    backend_status_dict = {}
-
-    for k in backend_keys:
-        backend_status_dict[k] = backend_status.get(k)
-    return {"status": "running", "backend_status": backend_status_dict}
-
-
 @app.post("/search_patents")
 async def search_patents(params: APISearchParams) -> APIPatentResults:
     """Searches google patents for the specified queries and returns the found documents."""
@@ -175,4 +180,31 @@ async def search(params: APISearchParams):
 
     return APISearchResults(results=results, error=None)
 
+# =========================== Scrapping endpoints ===========================
+
+
+@scrap_router.get("/scrap_patent/{patent_id}")
+async def scrap_patent(patent_id: str):
+    """Scraps the specified patent from Google Patents."""
+    patent = await scrap_patent_async(httpx_client, f"https://patents.google.com/patent/{patent_id}/en")
+    return patent
+
+
+class ScrapPatentsRequest(BaseModel):
+    """Request model for scrapping multiple patents."""
+    patent_ids: list[str] = Field(...,
+                                  description="List of patent IDs to scrap")
+
+
+@scrap_router.post("/scrap_patents_bulk")
+async def scrap_patents(params: ScrapPatentsRequest):
+    """Scraps multiple patents from Google Patents."""
+    patents = await scrap_patent_bulk_async(httpx_client, [
+        f"https://patents.google.com/patent/{pid}/en" for pid in params.patent_ids])
+    return patents
+
+# ===============================================================================
+
+app.include_router(scrap_router)
+
 uvicorn.run(app, host="0.0.0.0", port=7860)
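With scrap_router mounted under the /scrap prefix, the commit exposes two new endpoints: GET /scrap/scrap_patent/{patent_id} and POST /scrap/scrap_patents_bulk. A minimal client sketch, assuming the Space is reachable at localhost:7860 (the port uvicorn binds above) and using made-up patent IDs purely for illustration:

import asyncio
import httpx


async def main():
    async with httpx.AsyncClient(base_url="http://localhost:7860") as client:
        # Single patent: GET /scrap/scrap_patent/{patent_id}
        one = await client.get("/scrap/scrap_patent/US1234567A")  # hypothetical ID
        print(one.json())

        # Bulk: the POST body matches the ScrapPatentsRequest model
        many = await client.post(
            "/scrap/scrap_patents_bulk",
            json={"patent_ids": ["US1234567A", "US7654321B2"]},  # hypothetical IDs
        )
        print(many.json())

asyncio.run(main())

Note that scrap_patent returns whatever scrap_patent_async yields, including None when the scrape fails, so clients should be prepared for a null JSON body.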
requirements.txt
CHANGED
@@ -3,4 +3,6 @@ uvicorn
 pydantic
 playwright
 duckduckgo_search
-expiringdict
+expiringdict
+beautifulsoup4
+httpx
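The two new entries back scrap.py; note that the PyPI package beautifulsoup4 installs under the import name bs4 used there. A quick sanity check, assuming the dependencies are freshly installed:

# verify the new dependencies resolve under their import names
import bs4    # provided by the beautifulsoup4 package
import httpx
print(bs4.__version__, httpx.__version__)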
scrap.py
ADDED
@@ -0,0 +1,63 @@
+import asyncio
+import logging
+from typing import Optional
+from httpx import AsyncClient
+from bs4 import BeautifulSoup
+from pydantic import BaseModel
+
+
+class PatentScrapResult(BaseModel):
+    """Schema for the result of scraping a google patents page."""
+    title: str
+    abstract: Optional[str] = None
+    description: Optional[str] = None
+    claims: Optional[str] = None
+
+
+async def scrap_patent_async(client: AsyncClient, patent_url: str) -> PatentScrapResult:
+    headers = {
+        "User-Agent": "Mozilla/5.0 (compatible; GPTBot/1.0; +https://openai.com/gptbot)"
+    }
+    try:
+        response = await client.get(patent_url, headers=headers)
+        response.raise_for_status()
+
+        soup = BeautifulSoup(response.text, "html.parser")
+
+        # Abstract
+        abstract_div = soup.find("div", {"class": "abstract"})
+        abstract = abstract_div.get_text(
+            strip=True) if abstract_div else None
+
+        # Description
+        description_section = soup.find("section", itemprop="description")
+        description = description_section.get_text(
+            separator="\n", strip=True) if description_section else None
+
+        # Claims
+        claims_section = soup.find("section", itemprop="claims")
+        claims = claims_section.get_text(
+            separator="\n", strip=True) if claims_section else None
+
+        # Patent Title
+        meta_title = soup.find("meta", {"name": "DC.title"}).get(
+            "content").strip()
+
+        return PatentScrapResult(
+            abstract=abstract,
+            description=description,
+            claims=claims,
+            title=meta_title
+        )
+    except Exception as e:
+        logging.error(f"Error scraping {patent_url}: {e}")
+        return None
+
+
+async def scrap_patent_bulk_async(client: AsyncClient, patent_urls: list[str]) -> list[PatentScrapResult]:
+    """Scrape multiple patents asynchronously."""
+    tasks = [scrap_patent_async(client, url) for url in patent_urls]
+    results = await asyncio.gather(*tasks)
+
+    # Filter out None results (failed scrapes)
+    return [res for res in results if res is not None]
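scrap_patent_bulk_async fans out one task per URL with asyncio.gather and relies on the shared client's connection limits (max_connections=15 in app.py) for throttling. A standalone usage sketch, with placeholder patent URLs:

import asyncio
import httpx
from scrap import scrap_patent_bulk_async


async def main():
    async with httpx.AsyncClient(timeout=30) as client:
        urls = [
            # placeholder IDs for illustration
            "https://patents.google.com/patent/US1234567A/en",
            "https://patents.google.com/patent/US7654321B2/en",
        ]
        for patent in await scrap_patent_bulk_async(client, urls):
            print(patent.title)

asyncio.run(main())

Since scrap_patent_async swallows exceptions and returns None, the bulk result silently drops failed URLs; a caller that needs to know which IDs failed would have to return the URL alongside each result.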
backends.py → search.py
RENAMED
@@ -21,6 +21,9 @@ class APISearchResults(BaseModel):
 
 class BraveSearchBlockedException(Exception):
     """Dummy exception to detect when the headless browser is flagged as suspicious."""
+
+    def __init__(self, *args):
+        super().__init__("Brave Search blocked the request, likely due to flagging browser as suspicious")
     pass
 
 
@@ -36,7 +39,7 @@ async def playwright_open_page(browser: Browser):
     await context.close()
 
 
-#TODO: update to return same format for results
+# TODO: update to return same format for results
 async def query_google_patents(browser: Browser, q: str, n_results: int = 10):
     """Queries google patents for the specified query and number of results. Returns relevant patents"""
 
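Beyond the rename, the functional change here is that BraveSearchBlockedException now bakes its message into __init__, so call sites can raise it bare and still produce a readable error. A small sketch of the behavior (the raise site is hypothetical; the real detection lives in the Brave query path):

from search import BraveSearchBlockedException

try:
    raise BraveSearchBlockedException()
except BraveSearchBlockedException as e:
    # -> Brave Search blocked the request, likely due to flagging browser as suspicious
    print(e)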