Game4all committed on
Commit
e14c7a4
·
1 Parent(s): a007a27

Add scholar endpoint + rework

Browse files
Files changed (2) hide show
  1. app.py +12 -3
  2. serp.py +50 -18
app.py CHANGED
@@ -9,7 +9,7 @@ import logging
9
  import uvicorn
10
 
11
  from scrap import scrap_patent_async, scrap_patent_bulk_async
12
- from serp import SerpResults, query_bing_search, query_brave_search, query_ddg_search, query_google_patents
13
 
14
  logging.basicConfig(
15
  level=logging.INFO,
@@ -54,9 +54,18 @@ class SerpQuery(BaseModel):
54
 
55
 
56
  @serp_router.post("/search_scholar")
57
async def query_google_scholar(params: SerpQuery):
    """Placeholder handler for the Google Scholar search endpoint.

    The request payload is accepted but not used; a fixed error payload is
    returned to signal that the endpoint is not implemented.
    """
    unimplemented_response = {"error": "Unimplemented"}
    return unimplemented_response
 
 
 
 
 
 
 
 
 
60
 
61
 
62
  @serp_router.post("/search_patents")
 
9
  import uvicorn
10
 
11
  from scrap import scrap_patent_async, scrap_patent_bulk_async
12
+ from serp import SerpResults, query_bing_search, query_brave_search, query_ddg_search, query_google_patents, query_google_scholar
13
 
14
  logging.basicConfig(
15
  level=logging.INFO,
 
54
 
55
 
56
  @serp_router.post("/search_scholar")
57
async def search_google_scholar(params: SerpQuery):
    """Run every query in ``params.queries`` against Google Scholar.

    Queries are executed one after another. A query that raises is logged
    and skipped, so the response carries the results of the queries that
    succeeded and its ``error`` field is always ``None``.
    """
    collected = []
    for q in params.queries:
        logging.info(f"Searching Google Scholar with query `{q}`")
        try:
            hits = await query_google_scholar(pw_browser, q, params.n_results)
            collected += hits
        except Exception as e:
            # Best effort: one failing query must not discard the others.
            logging.error(
                f"Failed to query Google Scholar with query `{q}`: {e}")
    return SerpResults(results=collected, error=None)
69
 
70
 
71
  @serp_router.post("/search_patents")
serp.py CHANGED
@@ -33,9 +33,46 @@ async def playwright_open_page(browser: Browser):
33
  await context.close()
34
 
35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  async def query_google_patents(browser: Browser, q: str, n_results: int = 10):
37
  """Queries google patents for the specified query and number of results. Returns relevant patents"""
38
 
 
 
 
39
  async with playwright_open_page(browser) as page:
40
 
41
  async def _block_resources(route, request):
@@ -46,36 +83,31 @@ async def query_google_patents(browser: Browser, q: str, n_results: int = 10):
46
 
47
  await page.route("**/*", _block_resources)
48
 
49
- url = f"https://patents.google.com/?q=({quote_plus(q)})&num={n_results}"
50
  await page.goto(url)
51
 
 
 
52
  await page.wait_for_function(
53
- f"""() => document.querySelectorAll('search-result-item').length >= {n_results}""",
54
  timeout=30_000
55
  )
56
 
57
- # regex to locate a patent id
58
- PATENT_ID_REGEX = r"\b[A-Z]{2}\d{6,}(?:[A-Z]\d?)?\b"
59
-
60
  items = await page.locator("search-result-item").all()
61
  results = []
62
  for item in items:
63
- # Extract all inner texts from spans (still used for patent ID)
64
- all_text = " ".join(await item.locator("span").all_inner_texts())
65
- found = re.findall(PATENT_ID_REGEX, all_text)
66
- if not found:
67
  continue
68
 
69
- # get the first match as patent ID
70
- patent_id = found[0]
71
-
72
- # extract patent title
73
- title = await item.locator("h3, h4").first.inner_text(timeout=1000)
74
 
75
- # extract patent body
76
- snippet_locator = item.locator(
77
- "div.abstract, div.result-snippet, .snippet, .result-text")
78
- body = await snippet_locator.first.inner_text(timeout=1000)
 
79
 
80
  results.append({
81
  "id": patent_id,
 
33
  await context.close()
34
 
35
 
36
async def query_google_scholar(browser: Browser, q: str, n_results: int = 10):
    """Query Google Scholar and return up to ``n_results`` scraped results.

    Args:
        browser: Playwright browser handed to ``playwright_open_page`` to
            obtain the page used for scraping.
        q: Search query; it is URL-encoded before being placed in the URL.
        n_results: Maximum number of results to return. The value is also
            sent to Scholar as the ``num`` URL parameter.

    Returns:
        A list of dicts with the keys ``"title"``, ``"body"`` and ``"href"``.
        Result entries whose title, snippet or link cannot be read are
        skipped instead of failing the whole query.

    Raises:
        playwright.async_api.Error: Errors raised while navigating or while
            waiting for the first result block (``div.gs_ri``) are not
            caught here and propagate to the caller.
    """
    # Function-scope import: keeps this block independent of the exact names
    # imported at the top of the module.
    from playwright.async_api import Error as PlaywrightError

    async with playwright_open_page(browser) as page:

        async def _block_resources(route, request):
            # Stylesheets and images are not needed to read the result text.
            if request.resource_type in ("stylesheet", "image"):
                await route.abort()
            else:
                await route.continue_()

        await page.route("**/*", _block_resources)

        url = f"https://scholar.google.com/scholar?q={quote_plus(q)}&num={n_results}"
        await page.goto(url)

        await page.wait_for_selector("div.gs_ri")

        items = await page.locator("div.gs_ri").all()
        results = []
        for item in items[:n_results]:
            try:
                title = await item.locator("h3").inner_text(timeout=1000)
                body = await item.locator("div.gs_rs").inner_text(timeout=1000)
                # Same 1 s bound as the lookups above; without it a missing
                # link would wait for Playwright's default timeout.
                href = await item.locator("h3 > a").get_attribute(
                    "href", timeout=1000)
            except PlaywrightError:
                # An entry without a title, snippet or link is skipped so a
                # single incomplete entry does not abort the whole query.
                continue

            results.append({
                "title": title,
                "body": body,
                "href": href,
            })

        return results
68
+
69
+
70
  async def query_google_patents(browser: Browser, q: str, n_results: int = 10):
71
  """Queries google patents for the specified query and number of results. Returns relevant patents"""
72
 
73
+ # regex to locate a patent id
74
+ PATENT_ID_REGEX = r"\b[A-Z]{2}\d{6,}(?:[A-Z]\d?)?\b"
75
+
76
  async with playwright_open_page(browser) as page:
77
 
78
  async def _block_resources(route, request):
 
83
 
84
  await page.route("**/*", _block_resources)
85
 
86
+ url = f"https://patents.google.com/?q={quote_plus(q)}&num={n_results}"
87
  await page.goto(url)
88
 
89
+ # Wait for at least one search result item to appear
90
+ # This ensures the page has loaded enough to start scraping
91
  await page.wait_for_function(
92
+ f"""() => document.querySelectorAll('search-result-item').length >= 1""",
93
  timeout=30_000
94
  )
95
 
 
 
 
96
  items = await page.locator("search-result-item").all()
97
  results = []
98
  for item in items:
99
+ text = " ".join(await item.locator("span").all_inner_texts())
100
+ match = re.search(PATENT_ID_REGEX, text)
101
+ if not match:
 
102
  continue
103
 
104
+ patent_id = match.group()
 
 
 
 
105
 
106
+ try:
107
+ title = await item.locator("h3, h4").first.inner_text(timeout=1000)
108
+ body = await item.locator("div.abstract, div.result-snippet, .snippet, .result-text").first.inner_text(timeout=1000)
109
+ except:
110
+ continue # If we can't get title or body, skip this item
111
 
112
  results.append({
113
  "id": patent_id,