Spaces:
Running
Running
Reorganize endpoints
Browse files
README.md
CHANGED
@@ -9,4 +9,6 @@ short_description: A SERP scrapping API for AI projects
|
|
9 |
---
|
10 |
|
11 |
|
12 |
-
# `SERPent`
|
|
|
|
|
|
9 |
---
|
10 |
|
11 |
|
12 |
+
# `SERPent`
|
13 |
+
|
14 |
+
`SERPent` provides a SERP / scraping API for use by AI agents / projects.
|
app.py
CHANGED
@@ -42,6 +42,9 @@ backend_status = expiringdict.ExpiringDict(max_len=5, max_age_seconds=15*60)
|
|
42 |
|
43 |
# Router for scrapping related endpoints
|
44 |
scrap_router = APIRouter(prefix="/scrap", tags=["scrapping"])
|
|
|
|
|
|
|
45 |
|
46 |
@app.get('/')
|
47 |
async def status():
|
@@ -62,13 +65,13 @@ class APISearchParams(BaseModel):
|
|
62 |
10, description="Number of results to return for each query. Valid values are 10, 25, 50 and 100")
|
63 |
|
64 |
|
65 |
-
@
|
66 |
async def query_google_scholar(params: APISearchParams):
|
67 |
"""Queries google scholar for the specified query"""
|
68 |
return {"error": "Unimplemented"}
|
69 |
|
70 |
|
71 |
-
@
|
72 |
async def search_patents(params: APISearchParams) -> APIPatentResults:
|
73 |
"""Searches google patents for the specified queries and returns the found documents."""
|
74 |
results = []
|
@@ -84,7 +87,7 @@ async def search_patents(params: APISearchParams) -> APIPatentResults:
|
|
84 |
return APIPatentResults(results=results, error=None)
|
85 |
|
86 |
|
87 |
-
@
|
88 |
async def search_brave(params: APISearchParams) -> APISearchResults:
|
89 |
"""Searches brave search for the specified queries and returns the found documents."""
|
90 |
results = []
|
@@ -103,7 +106,7 @@ async def search_brave(params: APISearchParams) -> APISearchResults:
|
|
103 |
return APISearchResults(results=results, error=str(last_exception) if len(results) == 0 and last_exception else None)
|
104 |
|
105 |
|
106 |
-
@
|
107 |
async def search_bing(params: APISearchParams) -> APISearchResults:
|
108 |
"""Searches Bing search for the specified queries and returns the found documents."""
|
109 |
results = []
|
@@ -111,7 +114,7 @@ async def search_bing(params: APISearchParams) -> APISearchResults:
|
|
111 |
for q in params.queries:
|
112 |
logging.info(f"Searching Bing search with query `{q}`")
|
113 |
try:
|
114 |
-
res = await
|
115 |
results.extend(res)
|
116 |
except Exception as e:
|
117 |
last_exception = e
|
@@ -122,7 +125,7 @@ async def search_bing(params: APISearchParams) -> APISearchResults:
|
|
122 |
return APISearchResults(results=results, error=str(last_exception) if len(results) == 0 and last_exception else None)
|
123 |
|
124 |
|
125 |
-
@
|
126 |
async def search_duck(params: APISearchParams) -> APISearchResults:
|
127 |
"""Searches duckduckgo for the specified queries and returns the found documents"""
|
128 |
results = []
|
@@ -141,6 +144,7 @@ async def search_duck(params: APISearchParams) -> APISearchResults:
|
|
141 |
return APISearchResults(results=results, error=str(last_exception) if len(results) == 0 and last_exception else None)
|
142 |
|
143 |
|
|
|
144 |
@app.post("/search")
|
145 |
async def search(params: APISearchParams):
|
146 |
"""Attempts to search the specified queries using ALL backends"""
|
@@ -205,6 +209,7 @@ async def scrap_patents(params: ScrapPatentsRequest):
|
|
205 |
|
206 |
# ===============================================================================
|
207 |
|
|
|
208 |
app.include_router(scrap_router)
|
209 |
|
210 |
uvicorn.run(app, host="0.0.0.0", port=7860)
|
|
|
42 |
|
43 |
# Router for scrapping related endpoints
|
44 |
scrap_router = APIRouter(prefix="/scrap", tags=["scrapping"])
|
45 |
+
# Router for search related endpoints
|
46 |
+
search_router = APIRouter(prefix="/search", tags=["search"])
|
47 |
+
|
48 |
|
49 |
@app.get('/')
|
50 |
async def status():
|
|
|
65 |
10, description="Number of results to return for each query. Valid values are 10, 25, 50 and 100")
|
66 |
|
67 |
|
68 |
+
@search_router.post("/search_scholar")
|
69 |
async def query_google_scholar(params: APISearchParams):
|
70 |
"""Queries google scholar for the specified query"""
|
71 |
return {"error": "Unimplemented"}
|
72 |
|
73 |
|
74 |
+
@search_router.post("/search_patents")
|
75 |
async def search_patents(params: APISearchParams) -> APIPatentResults:
|
76 |
"""Searches google patents for the specified queries and returns the found documents."""
|
77 |
results = []
|
|
|
87 |
return APIPatentResults(results=results, error=None)
|
88 |
|
89 |
|
90 |
+
@search_router.post("/search_brave")
|
91 |
async def search_brave(params: APISearchParams) -> APISearchResults:
|
92 |
"""Searches brave search for the specified queries and returns the found documents."""
|
93 |
results = []
|
|
|
106 |
return APISearchResults(results=results, error=str(last_exception) if len(results) == 0 and last_exception else None)
|
107 |
|
108 |
|
109 |
+
@search_router.post("/search_bing")
|
110 |
async def search_bing(params: APISearchParams) -> APISearchResults:
|
111 |
"""Searches Bing search for the specified queries and returns the found documents."""
|
112 |
results = []
|
|
|
114 |
for q in params.queries:
|
115 |
logging.info(f"Searching Bing search with query `{q}`")
|
116 |
try:
|
117 |
+
res = await query_bing_search(pw_browser, q, params.n_results)
|
118 |
results.extend(res)
|
119 |
except Exception as e:
|
120 |
last_exception = e
|
|
|
125 |
return APISearchResults(results=results, error=str(last_exception) if len(results) == 0 and last_exception else None)
|
126 |
|
127 |
|
128 |
+
@search_router.post("/search_duck")
|
129 |
async def search_duck(params: APISearchParams) -> APISearchResults:
|
130 |
"""Searches duckduckgo for the specified queries and returns the found documents"""
|
131 |
results = []
|
|
|
144 |
return APISearchResults(results=results, error=str(last_exception) if len(results) == 0 and last_exception else None)
|
145 |
|
146 |
|
147 |
+
@search_router.post("/search")
|
148 |
@app.post("/search")
|
149 |
async def search(params: APISearchParams):
|
150 |
"""Attempts to search the specified queries using ALL backends"""
|
|
|
209 |
|
210 |
# ===============================================================================
|
211 |
|
212 |
+
app.include_router(search_router)
|
213 |
app.include_router(scrap_router)
|
214 |
|
215 |
uvicorn.run(app, host="0.0.0.0", port=7860)
|
search.py
CHANGED
@@ -39,7 +39,6 @@ async def playwright_open_page(browser: Browser):
|
|
39 |
await context.close()
|
40 |
|
41 |
|
42 |
-
# TODO: update to return same format for results
|
43 |
async def query_google_patents(browser: Browser, q: str, n_results: int = 10):
|
44 |
"""Queries google patents for the specified query and number of results. Returns relevant patents"""
|
45 |
|
@@ -65,17 +64,33 @@ async def query_google_patents(browser: Browser, q: str, n_results: int = 10):
|
|
65 |
PATENT_ID_REGEX = r"\b[A-Z]{2}\d{6,}(?:[A-Z]\d?)?\b"
|
66 |
|
67 |
items = await page.locator("search-result-item").all()
|
68 |
-
|
69 |
for item in items:
|
|
|
70 |
all_text = " ".join(await item.locator("span").all_inner_texts())
|
71 |
found = re.findall(PATENT_ID_REGEX, all_text)
|
72 |
-
if found:
|
73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
|
75 |
-
|
76 |
-
|
|
|
|
|
|
|
|
|
77 |
|
78 |
-
return
|
79 |
|
80 |
|
81 |
async def query_brave_search(browser: Browser, q: str, n_results: int = 10):
|
|
|
39 |
await context.close()
|
40 |
|
41 |
|
|
|
42 |
async def query_google_patents(browser: Browser, q: str, n_results: int = 10):
|
43 |
"""Queries google patents for the specified query and number of results. Returns relevant patents"""
|
44 |
|
|
|
64 |
PATENT_ID_REGEX = r"\b[A-Z]{2}\d{6,}(?:[A-Z]\d?)?\b"
|
65 |
|
66 |
items = await page.locator("search-result-item").all()
|
67 |
+
results = []
|
68 |
for item in items:
|
69 |
+
# Extract all inner texts from spans (still used for patent ID)
|
70 |
all_text = " ".join(await item.locator("span").all_inner_texts())
|
71 |
found = re.findall(PATENT_ID_REGEX, all_text)
|
72 |
+
if not found:
|
73 |
+
continue
|
74 |
+
|
75 |
+
# get the first match as patent ID
|
76 |
+
patent_id = found[0]
|
77 |
+
|
78 |
+
# extract patent title
|
79 |
+
title = await item.locator("h3, h4").first.inner_text(timeout=1000)
|
80 |
+
|
81 |
+
# extract patent body
|
82 |
+
snippet_locator = item.locator(
|
83 |
+
"div.abstract, div.result-snippet, .snippet, .result-text")
|
84 |
+
body = await snippet_locator.first.inner_text(timeout=1000)
|
85 |
|
86 |
+
results.append({
|
87 |
+
"id": patent_id,
|
88 |
+
"href": f"https://patents.google.com/patent/{patent_id}/en",
|
89 |
+
"title": title,
|
90 |
+
"body": body
|
91 |
+
})
|
92 |
|
93 |
+
return results[:n_results]
|
94 |
|
95 |
|
96 |
async def query_brave_search(browser: Browser, q: str, n_results: int = 10):
|