Initial commit
Browse files- Dockerfile +49 -0
- README.md +6 -7
- app.py +156 -0
- requirements.txt +4 -0
Dockerfile
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM python:3.11-slim
|
2 |
+
|
3 |
+
WORKDIR /app
|
4 |
+
|
5 |
+
RUN apt-get update && apt-get install -y \
|
6 |
+
wget \
|
7 |
+
gnupg \
|
8 |
+
ca-certificates \
|
9 |
+
fonts-liberation \
|
10 |
+
libasound2 \
|
11 |
+
libatk-bridge2.0-0 \
|
12 |
+
libatk1.0-0 \
|
13 |
+
libc6 \
|
14 |
+
libcairo2 \
|
15 |
+
libcups2 \
|
16 |
+
libdbus-1-3 \
|
17 |
+
libexpat1 \
|
18 |
+
libfontconfig1 \
|
19 |
+
libgcc1 \
|
20 |
+
libglib2.0-0 \
|
21 |
+
libgtk-3-0 \
|
22 |
+
libnspr4 \
|
23 |
+
libnss3 \
|
24 |
+
libx11-6 \
|
25 |
+
libx11-xcb1 \
|
26 |
+
libxcb1 \
|
27 |
+
libxcomposite1 \
|
28 |
+
libxcursor1 \
|
29 |
+
libxdamage1 \
|
30 |
+
libxext6 \
|
31 |
+
libxfixes3 \
|
32 |
+
libxi6 \
|
33 |
+
libxrandr2 \
|
34 |
+
libxrender1 \
|
35 |
+
libxss1 \
|
36 |
+
libxtst6 \
|
37 |
+
xdg-utils \
|
38 |
+
&& apt-get clean && rm -rf /var/lib/apt/lists/*
|
39 |
+
|
40 |
+
COPY requirements.txt .
|
41 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
42 |
+
|
43 |
+
RUN playwright install chromium
|
44 |
+
|
45 |
+
COPY . .
|
46 |
+
|
47 |
+
EXPOSE 7860
|
48 |
+
|
49 |
+
# Exec form must be a valid JSON array, so the elements need a comma between them.
CMD ["python3", "./app.py"]
|
README.md
CHANGED
@@ -1,12 +1,11 @@
|
|
1 |
---
|
2 |
title: SERPent
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: docker
|
7 |
-
|
8 |
-
license: mit
|
9 |
-
short_description: Reusable SERP scrapping API for AI projects
|
10 |
---
|
11 |
|
12 |
-
|
|
|
|
1 |
---
|
2 |
title: SERPent
|
3 |
+
emoji: 🐍
|
4 |
+
colorFrom: green
|
5 |
+
colorTo: yellow
|
6 |
sdk: docker
|
7 |
+
app_port: 7860
|
|
|
|
|
8 |
---
|
9 |
|
10 |
+
|
11 |
+
# `SERPent`
|
app.py
ADDED
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from contextlib import asynccontextmanager
|
2 |
+
from typing import Optional
|
3 |
+
from fastapi import FastAPI
|
4 |
+
from pydantic import BaseModel, Field
|
5 |
+
from playwright.async_api import async_playwright, Browser, BrowserContext, Page
|
6 |
+
from urllib.parse import quote_plus
|
7 |
+
import logging
|
8 |
+
import re
|
9 |
+
import uvicorn
|
10 |
+
|
11 |
+
logging.basicConfig(level=logging.INFO)
|
12 |
+
|
13 |
+
# playwright global context
|
14 |
+
playwright = None
|
15 |
+
pw_browser: Browser = None
|
16 |
+
|
17 |
+
|
18 |
+
@asynccontextmanager
async def api_lifespan(app: FastAPI):
    """Manage the shared Playwright driver and Chromium browser for the app.

    Starts Playwright and launches a headless Chromium instance, publishing
    both through the module-level ``playwright`` and ``pw_browser`` globals,
    then yields control to the application. After the yield the browser is
    closed and the driver stopped; the nested ``try/finally`` blocks make sure
    that also happens when the launch fails or an exception is raised at the
    yield point.
    """
    global playwright, pw_browser
    playwright = await async_playwright().start()
    try:
        pw_browser = await playwright.chromium.launch(headless=True)
        try:
            yield
        finally:
            await pw_browser.close()
    finally:
        await playwright.stop()
|
28 |
+
|
29 |
+
app = FastAPI(lifespan=api_lifespan)
|
30 |
+
|
31 |
+
|
32 |
+
class APISearchParams(BaseModel):
    """Request body accepted by the search endpoints."""
    queries: list[str] = Field(
        ...,
        description="The list of queries to search for",
    )
    n_results: int = Field(
        default=10,
        description="Number of results to return for each query. Valid values are 10, 25, 50 and 100",
    )
|
37 |
+
|
38 |
+
|
39 |
+
class APIPatentResults(BaseModel):
    """Response of /search_patents endpoint"""
    # Explicit None defaults keep both fields optional to construct; without
    # them pydantic v2 treats Optional[...] fields as required.
    error: Optional[str] = None
    # The /search_patents endpoint fills this with {"href": ..., "id": ...} dicts.
    results: Optional[list[dict]] = None
|
43 |
+
|
44 |
+
|
45 |
+
class APIBraveResults(BaseModel):
    """Response of /search_brave endpoint"""
    # Explicit None defaults keep both fields optional to construct; without
    # them pydantic v2 treats Optional[...] fields as required.
    error: Optional[str] = None
    # The /search_brave endpoint fills this with {"title", "body", "href"} dicts.
    results: Optional[list[dict]] = None
|
49 |
+
|
50 |
+
|
51 |
+
async def query_google_patents(browser: Browser, q: str, n_results: int = 10):
    """Query Google Patents and return the patent ids found on the results page.

    Args:
        browser: Playwright browser on which a new context is opened for this query.
        q: Search query; URL-encoded before being placed in the request URL.
        n_results: Value sent as the ``num`` URL parameter. The function waits
            until at least this many ``search-result-item`` elements exist.

    Returns:
        A list holding, for each result item in page order, the first token in
        its ``span`` texts that matches the patent-id pattern. Items without a
        match are skipped.

    Raises:
        Any exception Playwright raises during navigation, or when the wait
        for ``n_results`` items exceeds the 30 second timeout.
    """
    # regex to locate a patent id (compiled once instead of per result item)
    patent_id_regex = re.compile(r"\b[A-Z]{2}\d{6,}(?:[A-Z]\d?)?\b")

    context: BrowserContext = await browser.new_context()
    try:
        page: Page = await context.new_page()

        async def _block_resources(route, request):
            # Stylesheets and images are aborted; only text is read from the page.
            if request.resource_type in ["stylesheet", "image"]:
                await route.abort()
            else:
                await route.continue_()

        await page.route("**/*", _block_resources)

        url = f"https://patents.google.com/?q=({quote_plus(q)})&oq={quote_plus(q)}&num={n_results}"
        await page.goto(url)

        await page.wait_for_function(
            f"""() => document.querySelectorAll('search-result-item').length >= {n_results}""",
            timeout=30_000
        )

        items = await page.locator("search-result-item").all()
        matches = []
        for item in items:
            all_text = " ".join(await item.locator("span").all_inner_texts())
            found = patent_id_regex.search(all_text)
            if found:
                matches.append(found.group(0))
        return matches
    finally:
        # Close the context on every path; previously a failed navigation or
        # wait timeout left it open on the shared browser.
        await context.close()
|
85 |
+
|
86 |
+
|
87 |
+
async def query_brave_search(browser: Browser, q: str, n_results: int = 10):
    """Query Brave Search and return the result cards found on the page.

    Args:
        browser: Playwright browser on which a new context is opened for this query.
        q: Search query; URL-encoded before being placed in the request URL.
        n_results: Maximum number of results returned (the list is truncated
            to this length).

    Returns:
        A list of dicts with keys ``title``, ``body`` and ``href``, one per
        ``.snippet`` element whose first link has an href that does not start
        with ``/``. ``title`` and ``body`` are empty strings when the
        corresponding element is absent.

    Raises:
        Any exception Playwright raises during navigation or element lookup.
    """
    context: BrowserContext = await browser.new_context()
    try:
        page: Page = await context.new_page()

        async def _block_resources(route, request):
            # Stylesheets and images are aborted; only text is read from the page.
            if request.resource_type in ["stylesheet", "image"]:
                await route.abort()
            else:
                await route.continue_()

        await page.route("**/*", _block_resources)

        search_url = f"https://search.brave.com/search?q={quote_plus(q)}"
        await page.goto(search_url)

        results_cards = await page.locator('.snippet').all()

        results = []

        for result in results_cards:
            title = await result.locator('.title').all_inner_texts()
            description = await result.locator('.snippet-description').all_inner_texts()
            href = await result.locator('a').nth(0).get_attribute('href')

            # Skip links without an href (get_attribute returns None) and
            # hrefs starting with '/'.
            if not href or href.startswith('/'):
                continue

            results.append({
                "title": title[0] if title else "",
                "body": description[0] if description else "",
                "href": href,
            })

        return results[:n_results]
    finally:
        # The original never closed the context, leaking one per query on the
        # shared browser.
        await context.close()
|
119 |
+
|
120 |
+
|
121 |
+
@app.post("/search_scholar")
async def query_google_scholar(params: APISearchParams):
    """Placeholder for Google Scholar search; always answers with an "Unimplemented" error."""
    response = {"error": "Unimplemented"}
    return response
|
125 |
+
|
126 |
+
|
127 |
+
@app.post("/search_patents")
async def search_patents(params: APISearchParams) -> APIPatentResults:
    """Searches google patents for the specified queries and returns the found documents.

    Queries run one after another on the shared browser. A failing query is
    logged and skipped so the remaining queries still contribute results; the
    failures are reported through the ``error`` field instead of being
    silently dropped. ``error`` is ``None`` when every query succeeded.
    """
    patent_ids = []
    errors = []
    for q in params.queries:
        logging.info(f"Searching Google Patents with query `{q}`")
        try:
            res = await query_google_patents(pw_browser, q, params.n_results)
            patent_ids.extend(res)
        except Exception as e:
            logging.error(
                f"Failed to query Google Patents with query `{q}`: {e}")
            errors.append(f"`{q}`: {e}")

    return APIPatentResults(
        results=[
            {"href": f"https://patents.google.com/patent/{patent_id}/en", "id": patent_id}
            for patent_id in patent_ids
        ],
        error="; ".join(errors) or None,
    )
|
140 |
+
|
141 |
+
|
142 |
+
@app.post("/search_brave")
async def search_brave(params: APISearchParams) -> APIBraveResults:
    """Searches Brave search for the specified queries and returns the found results.

    Queries run one after another on the shared browser. A failing query is
    logged and skipped so the remaining queries still contribute results; the
    failures are reported through the ``error`` field instead of being
    silently dropped. ``error`` is ``None`` when every query succeeded.
    """
    results = []
    errors = []
    for q in params.queries:
        logging.info(f"Searching Brave search with query `{q}`")
        try:
            res = await query_brave_search(pw_browser, q, params.n_results)
            results.extend(res)
        except Exception as e:
            logging.error(
                f"Failed to query Brave search with query `{q}`: {e}")
            errors.append(f"`{q}`: {e}")

    return APIBraveResults(results=results, error="; ".join(errors) or None)
|
155 |
+
|
156 |
+
if __name__ == "__main__":
    # Bind to all interfaces: uvicorn's default host is 127.0.0.1, which is
    # not reachable through the container port the Dockerfile exposes (7860).
    uvicorn.run(app, host="0.0.0.0", port=7860)
|
requirements.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
fastapi
|
2 |
+
uvicorn
|
3 |
+
pydantic
|
4 |
+
playwright
|