Game4all committed on
Commit
8dcc49f
·
1 Parent(s): e3eedf3

Initial commit

Browse files
Files changed (4) hide show
  1. Dockerfile +49 -0
  2. README.md +6 -7
  3. app.py +156 -0
  4. requirements.txt +4 -0
Dockerfile ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.11-slim

WORKDIR /app

# Shared libraries installed ahead of the Chromium download below
# (presumably the runtime dependencies of headless Chromium - confirm against
# Playwright's documented dependency list when bumping the base image).
RUN apt-get update && apt-get install -y \
    wget \
    gnupg \
    ca-certificates \
    fonts-liberation \
    libasound2 \
    libatk-bridge2.0-0 \
    libatk1.0-0 \
    libc6 \
    libcairo2 \
    libcups2 \
    libdbus-1-3 \
    libexpat1 \
    libfontconfig1 \
    libgcc1 \
    libglib2.0-0 \
    libgtk-3-0 \
    libnspr4 \
    libnss3 \
    libx11-6 \
    libx11-xcb1 \
    libxcb1 \
    libxcomposite1 \
    libxcursor1 \
    libxdamage1 \
    libxext6 \
    libxfixes3 \
    libxi6 \
    libxrandr2 \
    libxrender1 \
    libxss1 \
    libxtst6 \
    xdg-utils \
    && apt-get clean && rm -rf /var/lib/apt/lists/*

# Copy the requirements first so the dependency layers are cached
# independently of application source changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

RUN playwright install chromium

COPY . .

EXPOSE 7860

# Exec form must be a valid JSON array: the elements need a comma between
# them, otherwise Docker does not parse the instruction as exec form.
CMD ["python3", "./app.py"]
README.md CHANGED
@@ -1,12 +1,11 @@
1
  ---
2
  title: SERPent
3
- emoji: 🌍
4
- colorFrom: purple
5
- colorTo: gray
6
  sdk: docker
7
- pinned: false
8
- license: mit
9
- short_description: Reusable SERP scraping API for AI projects
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
1
  ---
2
  title: SERPent
3
+ emoji: 🐍
4
+ colorFrom: green
5
+ colorTo: yellow
6
  sdk: docker
7
+ app_port: 7860
 
 
8
  ---
9
 
10
+
11
+ # `SERPent`
app.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from contextlib import asynccontextmanager
2
+ from typing import Optional
3
+ from fastapi import FastAPI
4
+ from pydantic import BaseModel, Field
5
+ from playwright.async_api import async_playwright, Browser, BrowserContext, Page
6
+ from urllib.parse import quote_plus
7
+ import logging
8
+ import re
9
+ import uvicorn
10
+
11
logging.basicConfig(level=logging.INFO)

# Process-wide Playwright handles. Both are None at import time and are
# assigned by api_lifespan when the application starts, so the annotation
# has to admit None.
playwright = None
pw_browser: Optional[Browser] = None
16
+
17
+
18
@asynccontextmanager
async def api_lifespan(app: FastAPI):
    """Start Playwright and a headless Chromium for the lifetime of the app.

    The driver and browser are stored in the module-level ``playwright`` and
    ``pw_browser`` globals before control is handed to the application, and
    are torn down again when the application shuts down.

    Args:
        app: The FastAPI application this lifespan is attached to (unused).
    """
    global playwright, pw_browser
    playwright = await async_playwright().start()
    try:
        pw_browser = await playwright.chromium.launch(headless=True)
        try:
            yield
        finally:
            # Runs even if the lifespan is cancelled or an error is thrown
            # into the generator, so the browser process is not orphaned.
            await pw_browser.close()
    finally:
        # Also reached when the browser launch itself fails, so the
        # Playwright driver is always stopped.
        await playwright.stop()


app = FastAPI(lifespan=api_lifespan)
30
+
31
+
32
class APISearchParams(BaseModel):
    # Request body accepted by the search endpoints.

    # Required: every query in this list is searched in turn.
    queries: list[str] = Field(
        ...,
        description="The list of queries to search for",
    )
    # Optional, defaults to 10. The description lists the values considered
    # valid, but no validation is applied to the number itself.
    n_results: int = Field(
        default=10,
        description="Number of results to return for each query. Valid values are 10, 25, 50 and 100",
    )
37
+
38
+
39
class APIPatentResults(BaseModel):
    """Response of /search_patents endpoint"""
    # Explicit None defaults: without them pydantic v2 treats an Optional
    # field as required (nullable, but it must still be passed).

    # Error description, or None.
    error: Optional[str] = None
    # One dict per patent found, or None.
    results: Optional[list[dict]] = None
43
+
44
+
45
class APIBraveResults(BaseModel):
    """Response of /search_brave endpoint"""
    # Explicit None defaults: without them pydantic v2 treats an Optional
    # field as required (nullable, but it must still be passed).

    # Error description, or None.
    error: Optional[str] = None
    # One dict per search result, or None.
    results: Optional[list[dict]] = None
49
+
50
+
51
async def query_google_patents(browser: Browser, q: str, n_results: int = 10):
    """Search Google Patents for ``q`` and return the patent ids found.

    A fresh browser context is opened for the query and is always closed
    again, including when navigation or the wait for results fails.

    Args:
        browser: Playwright browser used to open the throwaway context.
        q: Free-text query; it is URL-encoded before being put in the URL.
        n_results: Value sent as the ``num`` URL parameter, and the number of
            ``search-result-item`` elements to wait for.

    Returns:
        A list of patent id strings in page order, at most one per result
        item (the first id-looking token found in the item's ``span`` text).

    Raises:
        Any Playwright error raised while loading or reading the page. Note
        that the wait below fails after 30 seconds when the page shows fewer
        than ``n_results`` items.
    """
    # regex to locate a patent id; compiled once rather than per result item
    patent_id_pattern = re.compile(r"\b[A-Z]{2}\d{6,}(?:[A-Z]\d?)?\b")

    async def _block_resources(route, request):
        # Stylesheets and images are not needed to read the result text.
        if request.resource_type in ("stylesheet", "image"):
            await route.abort()
        else:
            await route.continue_()

    context: BrowserContext = await browser.new_context()
    try:
        page: Page = await context.new_page()
        await page.route("**/*", _block_resources)

        encoded_query = quote_plus(q)
        url = f"https://patents.google.com/?q=({encoded_query})&oq={encoded_query}&num={n_results}"
        await page.goto(url)

        # The result list is rendered client-side; wait until enough items exist.
        await page.wait_for_function(
            f"""() => document.querySelectorAll('search-result-item').length >= {n_results}""",
            timeout=30_000,
        )

        matches = []
        for item in await page.locator("search-result-item").all():
            all_text = " ".join(await item.locator("span").all_inner_texts())
            found = patent_id_pattern.search(all_text)
            if found:
                matches.append(found.group(0))
        return matches
    finally:
        # Release the context on every path so failed queries do not leak it.
        await context.close()
85
+
86
+
87
async def query_brave_search(browser: Browser, q: str, n_results: int = 10):
    """Search Brave for ``q`` and return the scraped result cards.

    A fresh browser context is opened for the query and is always closed
    again, on success and on failure.

    Args:
        browser: Playwright browser used to open the throwaway context.
        q: Free-text query; it is URL-encoded before being put in the URL.
        n_results: Maximum number of results to return (applied as a slice
            over the scraped results).

    Returns:
        A list of dicts with the keys ``title``, ``body`` and ``href``.
        ``title`` and ``body`` are empty strings when the card has no
        matching element. Cards without a link, or whose link is
        site-relative (starts with ``/``), are skipped.

    Raises:
        Any Playwright error raised while loading or reading the page.
    """
    async def _block_resources(route, request):
        # Stylesheets and images are not needed to read the result text.
        if request.resource_type in ("stylesheet", "image"):
            await route.abort()
        else:
            await route.continue_()

    context: BrowserContext = await browser.new_context()
    try:
        page: Page = await context.new_page()
        await page.route("**/*", _block_resources)

        search_url = f"https://search.brave.com/search?q={quote_plus(q)}"
        await page.goto(search_url)

        results = []
        for card in await page.locator('.snippet').all():
            titles = await card.locator('.title').all_inner_texts()
            descriptions = await card.locator('.snippet-description').all_inner_texts()

            # A card with no anchor would otherwise block in get_attribute
            # until the timeout and fail the whole query.
            first_link = card.locator('a').nth(0)
            if await first_link.count() == 0:
                continue
            href = await first_link.get_attribute('href')

            # The href attribute can be absent (None); site-relative links
            # are skipped as well.
            if not href or href.startswith('/'):
                continue

            results.append({
                "title": titles[0] if titles else "",
                "body": descriptions[0] if descriptions else "",
                "href": href,
            })

        return results[:n_results]
    finally:
        # The context was previously never closed, leaking one per query.
        await context.close()
119
+
120
+
121
@app.post("/search_scholar")
async def query_google_scholar(params: APISearchParams):
    """Placeholder for the Google Scholar search endpoint.

    Not implemented yet: ``params`` is accepted but ignored, and a fixed
    error payload is returned.
    """
    unimplemented_payload = {"error": "Unimplemented"}
    return unimplemented_payload
125
+
126
+
127
@app.post("/search_patents")
async def search_patents(params: APISearchParams) -> APIPatentResults:
    """Searches google patents for the specified queries and returns the found documents.

    Queries run one after another. A query that fails is logged and skipped,
    so the results of the other queries are still returned; the failed
    queries are listed in the ``error`` field of the response.

    Args:
        params: The queries to run and the number of results wanted per query.

    Returns:
        ``results`` holds one ``{"href", "id"}`` dict per patent id found.
        ``error`` is ``None`` when every query succeeded.
    """
    patent_ids = []
    failed_queries = []
    for q in params.queries:
        logging.info(f"Searching Google Patents with query `{q}`")
        try:
            res = await query_google_patents(pw_browser, q, params.n_results)
            patent_ids.extend(res)
        except Exception as e:
            # Best effort: one failing query must not discard the others.
            logging.error(
                f"Failed to query Google Patents with query `{q}`: {e}")
            failed_queries.append(q)

    error = None
    if failed_queries:
        # Exception details stay in the server log; only the queries are named.
        error = "Failed to query Google Patents for: " + ", ".join(
            f"`{q}`" for q in failed_queries)

    return APIPatentResults(
        results=[
            {"href": f"https://patents.google.com/patent/{patent_id}/en", "id": patent_id}
            for patent_id in patent_ids
        ],
        error=error,
    )
140
+
141
+
142
@app.post("/search_brave")
async def search_brave(params: APISearchParams) -> APIBraveResults:
    """Searches Brave for the specified queries and returns the found results.

    Queries run one after another. A query that fails is logged and skipped,
    so the results of the other queries are still returned; the failed
    queries are listed in the ``error`` field of the response.

    Args:
        params: The queries to run and the number of results wanted per query.

    Returns:
        ``results`` holds the result dicts of all successful queries, in
        query order. ``error`` is ``None`` when every query succeeded.
    """
    results = []
    failed_queries = []
    for q in params.queries:
        logging.info(f"Searching Brave search with query `{q}`")
        try:
            res = await query_brave_search(pw_browser, q, params.n_results)
            results.extend(res)
        except Exception as e:
            # Best effort: one failing query must not discard the others.
            logging.error(
                f"Failed to query Brave search with query `{q}`: {e}")
            failed_queries.append(q)

    error = None
    if failed_queries:
        # Exception details stay in the server log; only the queries are named.
        error = "Failed to query Brave search for: " + ", ".join(
            f"`{q}`" for q in failed_queries)

    return APIBraveResults(results=results, error=error)
155
+
156
if __name__ == "__main__":
    # Bind to all interfaces: uvicorn's default host is the loopback address,
    # which cannot be reached through the container's published port 7860.
    # The guard keeps the server from starting when this module is imported.
    uvicorn.run(app, host="0.0.0.0", port=7860)
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ pydantic
4
+ playwright