Game4all committed on
Commit
8d6fbc5
·
1 Parent(s): cf1c265

Reorganize endpoints

Browse files
Files changed (3) hide show
  1. README.md +3 -1
  2. app.py +11 -6
  3. search.py +22 -7
README.md CHANGED
@@ -9,4 +9,6 @@ short_description: A SERP scrapping API for AI projects
9
  ---
10
 
11
 
12
- # `SERPent`
 
 
 
9
  ---
10
 
11
 
12
+ # `SERPent`
13
+
14
+ `SERPent` provides a SERP / scraping API for use by AI agents / projects.
app.py CHANGED
@@ -42,6 +42,9 @@ backend_status = expiringdict.ExpiringDict(max_len=5, max_age_seconds=15*60)
42
 
43
  # Router for scrapping related endpoints
44
  scrap_router = APIRouter(prefix="/scrap", tags=["scrapping"])
 
 
 
45
 
46
  @app.get('/')
47
  async def status():
@@ -62,13 +65,13 @@ class APISearchParams(BaseModel):
62
  10, description="Number of results to return for each query. Valid values are 10, 25, 50 and 100")
63
 
64
 
65
- @app.post("/search_scholar")
66
  async def query_google_scholar(params: APISearchParams):
67
  """Queries google scholar for the specified query"""
68
  return {"error": "Unimplemented"}
69
 
70
 
71
- @app.post("/search_patents")
72
  async def search_patents(params: APISearchParams) -> APIPatentResults:
73
  """Searches google patents for the specified queries and returns the found documents."""
74
  results = []
@@ -84,7 +87,7 @@ async def search_patents(params: APISearchParams) -> APIPatentResults:
84
  return APIPatentResults(results=results, error=None)
85
 
86
 
87
- @app.post("/search_brave")
88
  async def search_brave(params: APISearchParams) -> APISearchResults:
89
  """Searches brave search for the specified queries and returns the found documents."""
90
  results = []
@@ -103,7 +106,7 @@ async def search_brave(params: APISearchParams) -> APISearchResults:
103
  return APISearchResults(results=results, error=str(last_exception) if len(results) == 0 and last_exception else None)
104
 
105
 
106
- @app.post("/search_bing")
107
  async def search_bing(params: APISearchParams) -> APISearchResults:
108
  """Searches Bing search for the specified queries and returns the found documents."""
109
  results = []
@@ -111,7 +114,7 @@ async def search_bing(params: APISearchParams) -> APISearchResults:
111
  for q in params.queries:
112
  logging.info(f"Searching Bing search with query `{q}`")
113
  try:
114
- res = await query_brave_search(pw_browser, q, params.n_results)
115
  results.extend(res)
116
  except Exception as e:
117
  last_exception = e
@@ -122,7 +125,7 @@ async def search_bing(params: APISearchParams) -> APISearchResults:
122
  return APISearchResults(results=results, error=str(last_exception) if len(results) == 0 and last_exception else None)
123
 
124
 
125
- @app.post("/search_duck")
126
  async def search_duck(params: APISearchParams) -> APISearchResults:
127
  """Searches duckduckgo for the specified queries and returns the found documents"""
128
  results = []
@@ -141,6 +144,7 @@ async def search_duck(params: APISearchParams) -> APISearchResults:
141
  return APISearchResults(results=results, error=str(last_exception) if len(results) == 0 and last_exception else None)
142
 
143
 
 
144
  @app.post("/search")
145
  async def search(params: APISearchParams):
146
  """Attempts to search the specified queries using ALL backends"""
@@ -205,6 +209,7 @@ async def scrap_patents(params: ScrapPatentsRequest):
205
 
206
  # ===============================================================================
207
 
 
208
  app.include_router(scrap_router)
209
 
210
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
42
 
43
  # Router for scrapping related endpoints
44
  scrap_router = APIRouter(prefix="/scrap", tags=["scrapping"])
45
+ # Router for search related endpoints
46
+ search_router = APIRouter(prefix="/search", tags=["search"])
47
+
48
 
49
  @app.get('/')
50
  async def status():
 
65
  10, description="Number of results to return for each query. Valid values are 10, 25, 50 and 100")
66
 
67
 
68
+ @search_router.post("/search_scholar")
69
  async def query_google_scholar(params: APISearchParams):
70
  """Queries google scholar for the specified query"""
71
  return {"error": "Unimplemented"}
72
 
73
 
74
+ @search_router.post("/search_patents")
75
  async def search_patents(params: APISearchParams) -> APIPatentResults:
76
  """Searches google patents for the specified queries and returns the found documents."""
77
  results = []
 
87
  return APIPatentResults(results=results, error=None)
88
 
89
 
90
+ @search_router.post("/search_brave")
91
  async def search_brave(params: APISearchParams) -> APISearchResults:
92
  """Searches brave search for the specified queries and returns the found documents."""
93
  results = []
 
106
  return APISearchResults(results=results, error=str(last_exception) if len(results) == 0 and last_exception else None)
107
 
108
 
109
+ @search_router.post("/search_bing")
110
  async def search_bing(params: APISearchParams) -> APISearchResults:
111
  """Searches Bing search for the specified queries and returns the found documents."""
112
  results = []
 
114
  for q in params.queries:
115
  logging.info(f"Searching Bing search with query `{q}`")
116
  try:
117
+ res = await query_bing_search(pw_browser, q, params.n_results)
118
  results.extend(res)
119
  except Exception as e:
120
  last_exception = e
 
125
  return APISearchResults(results=results, error=str(last_exception) if len(results) == 0 and last_exception else None)
126
 
127
 
128
+ @search_router.post("/search_duck")
129
  async def search_duck(params: APISearchParams) -> APISearchResults:
130
  """Searches duckduckgo for the specified queries and returns the found documents"""
131
  results = []
 
144
  return APISearchResults(results=results, error=str(last_exception) if len(results) == 0 and last_exception else None)
145
 
146
 
147
+ @search_router.post("/search")
148
  @app.post("/search")
149
  async def search(params: APISearchParams):
150
  """Attempts to search the specified queries using ALL backends"""
 
209
 
210
  # ===============================================================================
211
 
212
+ app.include_router(search_router)
213
  app.include_router(scrap_router)
214
 
215
  uvicorn.run(app, host="0.0.0.0", port=7860)
search.py CHANGED
@@ -39,7 +39,6 @@ async def playwright_open_page(browser: Browser):
39
  await context.close()
40
 
41
 
42
- # TODO: update to return same format for results
43
  async def query_google_patents(browser: Browser, q: str, n_results: int = 10):
44
  """Queries google patents for the specified query and number of results. Returns relevant patents"""
45
 
@@ -65,17 +64,33 @@ async def query_google_patents(browser: Browser, q: str, n_results: int = 10):
65
  PATENT_ID_REGEX = r"\b[A-Z]{2}\d{6,}(?:[A-Z]\d?)?\b"
66
 
67
  items = await page.locator("search-result-item").all()
68
- id_matches = []
69
  for item in items:
 
70
  all_text = " ".join(await item.locator("span").all_inner_texts())
71
  found = re.findall(PATENT_ID_REGEX, all_text)
72
- if found:
73
- id_matches.append(found[0])
 
 
 
 
 
 
 
 
 
 
 
74
 
75
- patents = [{"href": f"https://patents.google.com/patent/{id}/en", "id": id}
76
- for id in id_matches]
 
 
 
 
77
 
78
- return patents[:n_results]
79
 
80
 
81
  async def query_brave_search(browser: Browser, q: str, n_results: int = 10):
 
39
  await context.close()
40
 
41
 
 
42
  async def query_google_patents(browser: Browser, q: str, n_results: int = 10):
43
  """Queries google patents for the specified query and number of results. Returns relevant patents"""
44
 
 
64
  PATENT_ID_REGEX = r"\b[A-Z]{2}\d{6,}(?:[A-Z]\d?)?\b"
65
 
66
  items = await page.locator("search-result-item").all()
67
+ results = []
68
  for item in items:
69
+ # Extract all inner texts from spans (still used for patent ID)
70
  all_text = " ".join(await item.locator("span").all_inner_texts())
71
  found = re.findall(PATENT_ID_REGEX, all_text)
72
+ if not found:
73
+ continue
74
+
75
+ # get the first match as patent ID
76
+ patent_id = found[0]
77
+
78
+ # extract patent title
79
+ title = await item.locator("h3, h4").first.inner_text(timeout=1000)
80
+
81
+ # extract patent body
82
+ snippet_locator = item.locator(
83
+ "div.abstract, div.result-snippet, .snippet, .result-text")
84
+ body = await snippet_locator.first.inner_text(timeout=1000)
85
 
86
+ results.append({
87
+ "id": patent_id,
88
+ "href": f"https://patents.google.com/patent/{patent_id}/en",
89
+ "title": title,
90
+ "body": body
91
+ })
92
 
93
+ return results[:n_results]
94
 
95
 
96
  async def query_brave_search(browser: Browser, q: str, n_results: int = 10):