Add scholar endpoint + rework
app.py
CHANGED
@@ -9,7 +9,7 @@ import logging
 import uvicorn
 
 from scrap import scrap_patent_async, scrap_patent_bulk_async
-from serp import SerpResults, query_bing_search, query_brave_search, query_ddg_search, query_google_patents
+from serp import SerpResults, query_bing_search, query_brave_search, query_ddg_search, query_google_patents, query_google_scholar
 
 logging.basicConfig(
     level=logging.INFO,
@@ -54,9 +54,18 @@ class SerpQuery(BaseModel):
 
 
 @serp_router.post("/search_scholar")
-async def
+async def search_google_scholar(params: SerpQuery):
     """Queries google scholar for the specified query"""
-
+    results = []
+    for q in params.queries:
+        logging.info(f"Searching Google Scholar with query `{q}`")
+        try:
+            res = await query_google_scholar(pw_browser, q, params.n_results)
+            results.extend(res)
+        except Exception as e:
+            logging.error(
+                f"Failed to query Google Scholar with query `{q}`: {e}")
+    return SerpResults(results=results, error=None)
 
 
 @serp_router.post("/search_patents")
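For reference, a minimal sketch of exercising the new /search_scholar route once the app is running. The base URL, port, and any router prefix are assumptions; only the route name, the SerpQuery fields (queries, n_results), and the shape of the response come from the diff above.

# Hypothetical client call for the new endpoint; host, port, and prefix are
# assumptions -- adjust to wherever the serp_router is actually mounted.
import httpx

payload = {
    "queries": ["graph neural networks for drug discovery"],
    "n_results": 5,
}

resp = httpx.post("http://localhost:8000/search_scholar", json=payload, timeout=60)
resp.raise_for_status()
for paper in resp.json()["results"]:
    print(paper["title"], "->", paper["href"])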
serp.py
CHANGED
@@ -33,9 +33,46 @@ async def playwright_open_page(browser: Browser):
         await context.close()
 
 
+async def query_google_scholar(browser: Browser, q: str, n_results: int = 10):
+    """Queries google scholar for the specified query and number of results. Returns relevant papers"""
+
+    async with playwright_open_page(browser) as page:
+
+        async def _block_resources(route, request):
+            if request.resource_type in ["stylesheet", "image"]:
+                await route.abort()
+            else:
+                await route.continue_()
+
+        await page.route("**/*", _block_resources)
+
+        url = f"https://scholar.google.com/scholar?q={quote_plus(q)}&num={n_results}"
+        await page.goto(url)
+
+        await page.wait_for_selector("div.gs_ri")
+
+        items = await page.locator("div.gs_ri").all()
+        results = []
+        for item in items[:n_results]:
+            title = await item.locator("h3").inner_text(timeout=1000)
+            body = await item.locator("div.gs_rs").inner_text(timeout=1000)
+            href = await item.locator("h3 > a").get_attribute("href")
+
+            results.append({
+                "title": title,
+                "body": body,
+                "href": href
+            })
+
+        return results
+
+
 async def query_google_patents(browser: Browser, q: str, n_results: int = 10):
     """Queries google patents for the specified query and number of results. Returns relevant patents"""
 
+    # regex to locate a patent id
+    PATENT_ID_REGEX = r"\b[A-Z]{2}\d{6,}(?:[A-Z]\d?)?\b"
+
     async with playwright_open_page(browser) as page:
 
         async def _block_resources(route, request):
@@ -46,36 +83,31 @@ async def query_google_patents(browser: Browser, q: str, n_results: int = 10):
 
         await page.route("**/*", _block_resources)
 
-        url = f"https://patents.google.com/?q=
+        url = f"https://patents.google.com/?q={quote_plus(q)}&num={n_results}"
         await page.goto(url)
 
+        # Wait for at least one search result item to appear
+        # This ensures the page has loaded enough to start scraping
         await page.wait_for_function(
-            f"""() => document.querySelectorAll('search-result-item').length >=
+            f"""() => document.querySelectorAll('search-result-item').length >= 1""",
             timeout=30_000
         )
 
-        # regex to locate a patent id
-        PATENT_ID_REGEX = r"\b[A-Z]{2}\d{6,}(?:[A-Z]\d?)?\b"
-
         items = await page.locator("search-result-item").all()
         results = []
        for item in items:
-
-
-
-            if not found:
+            text = " ".join(await item.locator("span").all_inner_texts())
+            match = re.search(PATENT_ID_REGEX, text)
+            if not match:
                 continue
 
-
-            patent_id = found[0]
-
-            # extract patent title
-            title = await item.locator("h3, h4").first.inner_text(timeout=1000)
+            patent_id = match.group()
 
-
-
-                "div.abstract, div.result-snippet, .snippet, .result-text")
-
+            try:
+                title = await item.locator("h3, h4").first.inner_text(timeout=1000)
+                body = await item.locator("div.abstract, div.result-snippet, .snippet, .result-text").first.inner_text(timeout=1000)
+            except:
+                continue  # If we can't get title or body, skip this item
 
             results.append({
                 "id": patent_id,