Spaces:
Running
Running
Extract field, background and description
Browse files- .gitignore +2 -1
- app.py +1 -1
- scrap.py +29 -1
.gitignore
CHANGED
@@ -1,2 +1,3 @@
|
|
1 |
__pycache__
|
2 |
-
.vscode
|
|
|
|
1 |
__pycache__
|
2 |
+
.vscode
|
3 |
+
.venv
|
app.py
CHANGED
@@ -24,7 +24,7 @@ pw_browser: Optional[Browser] = None
|
|
24 |
|
25 |
# httpx client
|
26 |
httpx_client = httpx.AsyncClient(timeout=30, limits=httpx.Limits(
|
27 |
-
max_connections=
|
28 |
|
29 |
|
30 |
@asynccontextmanager
|
|
|
24 |
|
25 |
# httpx client
|
26 |
httpx_client = httpx.AsyncClient(timeout=30, limits=httpx.Limits(
|
27 |
+
max_connections=30, max_keepalive_connections=20))
|
28 |
|
29 |
|
30 |
@asynccontextmanager
|
scrap.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
import asyncio
|
2 |
import logging
|
|
|
3 |
from typing import Optional
|
4 |
from httpx import AsyncClient
|
5 |
from bs4 import BeautifulSoup
|
@@ -8,10 +9,18 @@ from pydantic import BaseModel
|
|
8 |
|
9 |
class PatentScrapResult(BaseModel):
    """Schema for the result of scraping a google patents page."""
    # The title of the patent (required; scraped from the page's "DC.title" meta tag).
    title: str
    # The abstract of the patent, if available.
    abstract: Optional[str] = None
    # The full description text of the patent, if available.
    description: Optional[str] = None
    # The full claims text of the patent, if available.
    claims: Optional[str] = None
|
|
|
|
|
|
|
|
|
15 |
|
16 |
|
17 |
async def scrap_patent_async(client: AsyncClient, patent_url: str) -> PatentScrapResult:
|
@@ -34,6 +43,18 @@ async def scrap_patent_async(client: AsyncClient, patent_url: str) -> PatentScra
|
|
34 |
description = description_section.get_text(
|
35 |
separator="\n", strip=True) if description_section else None
|
36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
# Claims
|
38 |
claims_section = soup.find("section", itemprop="claims")
|
39 |
claims = claims_section.get_text(
|
@@ -43,11 +64,18 @@ async def scrap_patent_async(client: AsyncClient, patent_url: str) -> PatentScra
|
|
43 |
meta_title = soup.find("meta", {"name": "DC.title"}).get(
|
44 |
"content").strip()
|
45 |
|
|
|
|
|
|
|
|
|
46 |
return PatentScrapResult(
|
|
|
47 |
abstract=abstract,
|
48 |
description=description,
|
49 |
claims=claims,
|
50 |
-
title=meta_title
|
|
|
|
|
51 |
)
|
52 |
except Exception as e:
|
53 |
logging.error(f"Error scraping {patent_url}: {e}")
|
|
|
1 |
import asyncio
|
2 |
import logging
|
3 |
+
import re
|
4 |
from typing import Optional
|
5 |
from httpx import AsyncClient
|
6 |
from bs4 import BeautifulSoup
|
|
|
9 |
|
10 |
class PatentScrapResult(BaseModel):
    """Schema for the result of scraping a google patents page."""
    # The title of the patent (required; scraped from the page's "DC.title" meta tag).
    title: str
    # The abstract of the patent, if available.
    abstract: Optional[str] = None
    # The full description of the patent containing the field of the invention, background, summary, etc.
    description: Optional[str] = None
    # The full claims of the patent.
    claims: Optional[str] = None
    # The field of the invention, extracted from the description via regex, if available.
    field_of_invention: Optional[str] = None
    # The background of the invention, extracted from the description via regex, if available.
    background: Optional[str] = None
|
24 |
|
25 |
|
26 |
async def scrap_patent_async(client: AsyncClient, patent_url: str) -> PatentScrapResult:
|
|
|
43 |
description = description_section.get_text(
|
44 |
separator="\n", strip=True) if description_section else None
|
45 |
|
46 |
+
# Field of the Invention
|
47 |
+
invention_field_match = re.findall(
|
48 |
+
r"(FIELD OF THE INVENTION|TECHNICAL FIELD)(.*?)(?:(BACKGROUND|BACKGROUND OF THE INVENTION|SUMMARY|BRIEF SUMMARY|DETAILED DESCRIPTION|DESCRIPTION OF THE RELATED ART))", description, re.IGNORECASE | re.DOTALL) if description_section else None
|
49 |
+
invention_field = invention_field_match[0][1].strip(
|
50 |
+
) if invention_field_match else None
|
51 |
+
|
52 |
+
# Background of the Invention
|
53 |
+
invention_background_match = re.findall(
|
54 |
+
r"(BACKGROUND OF THE INVENTION|BACKGROUND)(.*?)(?:(SUMMARY|BRIEF SUMMARY|DETAILED DESCRIPTION|DESCRIPTION OF THE PREFERRED EMBODIMENTS|DESCRIPTION))", description, re.IGNORECASE | re.DOTALL) if description_section else None
|
55 |
+
invention_background = invention_background_match[0][1].strip(
|
56 |
+
) if invention_background_match else None
|
57 |
+
|
58 |
# Claims
|
59 |
claims_section = soup.find("section", itemprop="claims")
|
60 |
claims = claims_section.get_text(
|
|
|
64 |
meta_title = soup.find("meta", {"name": "DC.title"}).get(
|
65 |
"content").strip()
|
66 |
|
67 |
+
# Patent publication number
|
68 |
+
# pub_num = soup.select_one("h2#pubnum").get_text(strip=True)
|
69 |
+
# get the h2 with id ="pubnum" and extract the text
|
70 |
+
|
71 |
return PatentScrapResult(
|
72 |
+
# publication_number=pub_num,
|
73 |
abstract=abstract,
|
74 |
description=description,
|
75 |
claims=claims,
|
76 |
+
title=meta_title,
|
77 |
+
field_of_invention=invention_field,
|
78 |
+
background=invention_background
|
79 |
)
|
80 |
except Exception as e:
|
81 |
logging.error(f"Error scraping {patent_url}: {e}")
|