Game4all committed c38bd79 · Parent(s): b517821

Deploy scrap endpoints

Files changed (4)
  1. app.py +50 -18
  2. requirements.txt +3 -1
  3. scrap.py +63 -0
  4. backends.py → search.py +4 -1
app.py CHANGED
@@ -1,18 +1,16 @@
 from contextlib import asynccontextmanager
-import json
 from typing import Optional
-from duckduckgo_search import DDGS
-from duckduckgo_search.exceptions import RatelimitException
 import expiringdict
-from fastapi import FastAPI
+from fastapi import APIRouter, FastAPI
+from fastapi.routing import APIRouter as Router
+import httpx
 from pydantic import BaseModel, Field
 from playwright.async_api import async_playwright, Browser, BrowserContext, Page
-from urllib.parse import quote_plus
 import logging
-import re
 import uvicorn
 
-from backends import APISearchResults, APIPatentResults, query_bing_search, query_brave_search, query_ddg_search, query_google_patents
+from scrap import scrap_patent_async, scrap_patent_bulk_async
+from search import APISearchResults, APIPatentResults, query_bing_search, query_brave_search, query_ddg_search, query_google_patents
 
 logging.basicConfig(
     level=logging.INFO,
@@ -24,13 +22,16 @@ logging.basicConfig(
 playwright = None
 pw_browser: Optional[Browser] = None
 
+# httpx client
+httpx_client = httpx.AsyncClient(timeout=30, limits=httpx.Limits(
+    max_connections=15, max_keepalive_connections=20))
+
 
 @asynccontextmanager
 async def api_lifespan(app: FastAPI):
     global playwright, pw_browser
     playwright = await async_playwright().start()
     pw_browser = await playwright.chromium.launch(headless=True)
-
     yield
 
     await pw_browser.close()
@@ -39,6 +40,20 @@ async def api_lifespan(app: FastAPI):
 app = FastAPI(lifespan=api_lifespan)
 backend_status = expiringdict.ExpiringDict(max_len=5, max_age_seconds=15*60)
 
+# Router for scraping-related endpoints
+scrap_router = APIRouter(prefix="/scrap", tags=["scrapping"])
+
+@app.get('/')
+async def status():
+    backend_keys = [k[0] for k in backend_status.items()]
+    backend_status_dict = {}
+
+    for k in backend_keys:
+        backend_status_dict[k] = backend_status.get(k)
+    return {"status": "running", "backend_status": backend_status_dict}
+
+# ===================== Search endpoints =====================
+
 
 class APISearchParams(BaseModel):
     queries: list[str] = Field(...,
@@ -53,16 +68,6 @@ async def query_google_scholar(params: APISearchParams):
     return {"error": "Unimplemented"}
 
 
-@app.get('/')
-async def status():
-    backend_keys = [k[0] for k in backend_status.items()]
-    backend_status_dict = {}
-
-    for k in backend_keys:
-        backend_status_dict[k] = backend_status.get(k)
-    return {"status": "running", "backend_status": backend_status_dict}
-
-
 @app.post("/search_patents")
 async def search_patents(params: APISearchParams) -> APIPatentResults:
     """Searches google patents for the specified queries and returns the found documents."""
@@ -175,4 +180,31 @@ async def search(params: APISearchParams):
 
     return APISearchResults(results=results, error=None)
 
+# =========================== Scraping endpoints ===========================
+
+
+@scrap_router.get("/scrap_patent/{patent_id}")
+async def scrap_patent(patent_id: str):
+    """Scrapes the specified patent from Google Patents."""
+    patent = await scrap_patent_async(httpx_client, f"https://patents.google.com/patent/{patent_id}/en")
+    return patent
+
+
+class ScrapPatentsRequest(BaseModel):
+    """Request model for scraping multiple patents."""
+    patent_ids: list[str] = Field(...,
+                                  description="List of patent IDs to scrape")
+
+
+@scrap_router.post("/scrap_patents_bulk")
+async def scrap_patents(params: ScrapPatentsRequest):
+    """Scrapes multiple patents from Google Patents."""
+    patents = await scrap_patent_bulk_async(httpx_client, [
+        f"https://patents.google.com/patent/{pid}/en" for pid in params.patent_ids])
+    return patents
+
+# ===============================================================================
+
+app.include_router(scrap_router)
+
 uvicorn.run(app, host="0.0.0.0", port=7860)
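For reference, a minimal client sketch against the new endpoints, assuming the service is running locally on the port configured above (7860); the patent IDs are hypothetical examples:

import httpx

with httpx.Client(base_url="http://localhost:7860", timeout=60) as client:
    # Single patent: GET /scrap/scrap_patent/{patent_id}
    print(client.get("/scrap/scrap_patent/US9145048B2").json())

    # Bulk: POST /scrap/scrap_patents_bulk with a body matching ScrapPatentsRequest
    print(client.post("/scrap/scrap_patents_bulk",
                      json={"patent_ids": ["US9145048B2", "EP1234567A1"]}).json())

Note that the router's prefix="/scrap" combines with each route path, so the full paths are /scrap/scrap_patent/{patent_id} and /scrap/scrap_patents_bulk.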
requirements.txt CHANGED
@@ -3,4 +3,6 @@ uvicorn
 pydantic
 playwright
 duckduckgo_search
-expiringdict
+expiringdict
+beautifulsoup4
+httpx
scrap.py ADDED
@@ -0,0 +1,63 @@
+import asyncio
+import logging
+from typing import Optional
+from httpx import AsyncClient
+from bs4 import BeautifulSoup
+from pydantic import BaseModel
+
+
+class PatentScrapResult(BaseModel):
+    """Schema for the result of scraping a google patents page."""
+    title: str
+    abstract: Optional[str] = None
+    description: Optional[str] = None
+    claims: Optional[str] = None
+
+
+async def scrap_patent_async(client: AsyncClient, patent_url: str) -> PatentScrapResult:
+    headers = {
+        "User-Agent": "Mozilla/5.0 (compatible; GPTBot/1.0; +https://openai.com/gptbot)"
+    }
+    try:
+        response = await client.get(patent_url, headers=headers)
+        response.raise_for_status()
+
+        soup = BeautifulSoup(response.text, "html.parser")
+
+        # Abstract
+        abstract_div = soup.find("div", {"class": "abstract"})
+        abstract = abstract_div.get_text(
+            strip=True) if abstract_div else None
+
+        # Description
+        description_section = soup.find("section", itemprop="description")
+        description = description_section.get_text(
+            separator="\n", strip=True) if description_section else None
+
+        # Claims
+        claims_section = soup.find("section", itemprop="claims")
+        claims = claims_section.get_text(
+            separator="\n", strip=True) if claims_section else None
+
+        # Patent Title
+        meta_title = soup.find("meta", {"name": "DC.title"}).get(
+            "content").strip()
+
+        return PatentScrapResult(
+            abstract=abstract,
+            description=description,
+            claims=claims,
+            title=meta_title
+        )
+    except Exception as e:
+        logging.error(f"Error scraping {patent_url}: {e}")
+        return None
+
+
+async def scrap_patent_bulk_async(client: AsyncClient, patent_urls: list[str]) -> list[PatentScrapResult]:
+    """Scrape multiple patents asynchronously."""
+    tasks = [scrap_patent_async(client, url) for url in patent_urls]
+    results = await asyncio.gather(*tasks)
+
+    # Filter out None results (failed scrapes)
+    return [res for res in results if res is not None]
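scrap.py can also be exercised standalone; a minimal sketch, assuming it is imported from the repository root (the patent URL is a hypothetical example):

import asyncio
import httpx
from scrap import scrap_patent_bulk_async

async def main():
    async with httpx.AsyncClient(timeout=30) as client:
        patents = await scrap_patent_bulk_async(client, [
            "https://patents.google.com/patent/US9145048B2/en"])
        for p in patents:
            print(p.title)

asyncio.run(main())

Since scrap_patent_async returns None on any failure and scrap_patent_bulk_async filters those out, the returned list may be shorter than the input list; callers cannot tell which URLs failed without consulting the logs.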
backends.py → search.py RENAMED
@@ -21,6 +21,9 @@ class APISearchResults(BaseModel):
 
 class BraveSearchBlockedException(Exception):
     """Dummy exception to detect when the headless browser is flagged as suspicious."""
+
+    def __init__(self, *args):
+        super().__init__("Brave Search blocked the request, likely due to flagging browser as suspicious")
     pass
 
 
@@ -36,7 +39,7 @@ async def playwright_open_page(browser: Browser):
     await context.close()
 
 
-#TODO: update to return same format for results
+# TODO: update to return same format for results
 async def query_google_patents(browser: Browser, q: str, n_results: int = 10):
     """Queries google patents for the specified query and number of results. Returns relevant patents"""
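The reworked exception now carries a fixed message regardless of the arguments passed to it; a quick sketch of what a caller sees (assuming the project dependencies are installed so that search.py imports cleanly):

from search import BraveSearchBlockedException

try:
    raise BraveSearchBlockedException()
except BraveSearchBlockedException as e:
    # Prints: Brave Search blocked the request, likely due to flagging browser as suspicious
    print(e)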