Spaces:
Running
Running
Extract field, background and description
Browse files- .gitignore +2 -1
- app.py +1 -1
- scrap.py +29 -1
.gitignore
CHANGED
@@ -1,2 +1,3 @@
|
|
1 |
__pycache__
|
2 |
-
.vscode
|
|
|
|
1 |
__pycache__
|
2 |
+
.vscode
|
3 |
+
.venv
|
app.py
CHANGED
@@ -24,7 +24,7 @@ pw_browser: Optional[Browser] = None
|
|
24 |
|
25 |
# httpx client
|
26 |
httpx_client = httpx.AsyncClient(timeout=30, limits=httpx.Limits(
|
27 |
-
max_connections=
|
28 |
|
29 |
|
30 |
@asynccontextmanager
|
|
|
24 |
|
25 |
# httpx client
|
26 |
httpx_client = httpx.AsyncClient(timeout=30, limits=httpx.Limits(
|
27 |
+
max_connections=30, max_keepalive_connections=20))
|
28 |
|
29 |
|
30 |
@asynccontextmanager
|
scrap.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
import asyncio
|
2 |
import logging
|
|
|
3 |
from typing import Optional
|
4 |
from httpx import AsyncClient
|
5 |
from bs4 import BeautifulSoup
|
@@ -8,10 +9,18 @@ from pydantic import BaseModel
|
|
8 |
|
9 |
class PatentScrapResult(BaseModel):
    """Schema for the result of scraping a google patents page."""
    # The title of the patent (required; scraped from the page's "DC.title" meta tag).
    title: str
    # The abstract of the patent, if available.
    abstract: Optional[str] = None
    # The full description text of the patent, if available.
    description: Optional[str] = None
    # The full claims text of the patent, if available.
    claims: Optional[str] = None
|
|
|
|
|
|
|
|
|
15 |
|
16 |
|
17 |
async def scrap_patent_async(client: AsyncClient, patent_url: str) -> PatentScrapResult:
|
@@ -34,6 +43,18 @@ async def scrap_patent_async(client: AsyncClient, patent_url: str) -> PatentScra
|
|
34 |
description = description_section.get_text(
|
35 |
separator="\n", strip=True) if description_section else None
|
36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
# Claims
|
38 |
claims_section = soup.find("section", itemprop="claims")
|
39 |
claims = claims_section.get_text(
|
@@ -43,11 +64,18 @@ async def scrap_patent_async(client: AsyncClient, patent_url: str) -> PatentScra
|
|
43 |
meta_title = soup.find("meta", {"name": "DC.title"}).get(
|
44 |
"content").strip()
|
45 |
|
|
|
|
|
|
|
|
|
46 |
return PatentScrapResult(
|
|
|
47 |
abstract=abstract,
|
48 |
description=description,
|
49 |
claims=claims,
|
50 |
-
title=meta_title
|
|
|
|
|
51 |
)
|
52 |
except Exception as e:
|
53 |
logging.error(f"Error scraping {patent_url}: {e}")
|
|
|
1 |
import asyncio
|
2 |
import logging
|
3 |
+
import re
|
4 |
from typing import Optional
|
5 |
from httpx import AsyncClient
|
6 |
from bs4 import BeautifulSoup
|
|
|
9 |
|
10 |
class PatentScrapResult(BaseModel):
    """Schema for the result of scraping a google patents page."""
    # The title of the patent (required; scraped from the page's "DC.title" meta tag).
    title: str
    # The abstract of the patent, if available.
    abstract: Optional[str] = None
    # The full description of the patent containing the field of the invention, background, summary, etc.
    description: Optional[str] = None
    # The full claims of the patent.
    claims: Optional[str] = None
    # The field of the invention, extracted from the description via regex, if available.
    field_of_invention: Optional[str] = None
    # The background of the invention, extracted from the description via regex, if available.
    background: Optional[str] = None
|
24 |
|
25 |
|
26 |
async def scrap_patent_async(client: AsyncClient, patent_url: str) -> PatentScrapResult:
|
|
|
43 |
description = description_section.get_text(
|
44 |
separator="\n", strip=True) if description_section else None
|
45 |
|
46 |
+
# Field of the Invention
|
47 |
+
invention_field_match = re.findall(
|
48 |
+
r"(FIELD OF THE INVENTION|TECHNICAL FIELD)(.*?)(?:(BACKGROUND|BACKGROUND OF THE INVENTION|SUMMARY|BRIEF SUMMARY|DETAILED DESCRIPTION|DESCRIPTION OF THE RELATED ART))", description, re.IGNORECASE | re.DOTALL) if description_section else None
|
49 |
+
invention_field = invention_field_match[0][1].strip(
|
50 |
+
) if invention_field_match else None
|
51 |
+
|
52 |
+
# Background of the Invention
|
53 |
+
invention_background_match = re.findall(
|
54 |
+
r"(BACKGROUND OF THE INVENTION|BACKGROUND)(.*?)(?:(SUMMARY|BRIEF SUMMARY|DETAILED DESCRIPTION|DESCRIPTION OF THE PREFERRED EMBODIMENTS|DESCRIPTION))", description, re.IGNORECASE | re.DOTALL) if description_section else None
|
55 |
+
invention_background = invention_background_match[0][1].strip(
|
56 |
+
) if invention_background_match else None
|
57 |
+
|
58 |
# Claims
|
59 |
claims_section = soup.find("section", itemprop="claims")
|
60 |
claims = claims_section.get_text(
|
|
|
64 |
meta_title = soup.find("meta", {"name": "DC.title"}).get(
|
65 |
"content").strip()
|
66 |
|
67 |
+
# Patent publication number
|
68 |
+
# pub_num = soup.select_one("h2#pubnum").get_text(strip=True)
|
69 |
+
# get the h2 with id ="pubnum" and extract the text
|
70 |
+
|
71 |
return PatentScrapResult(
|
72 |
+
# publication_number=pub_num,
|
73 |
abstract=abstract,
|
74 |
description=description,
|
75 |
claims=claims,
|
76 |
+
title=meta_title,
|
77 |
+
field_of_invention=invention_field,
|
78 |
+
background=invention_background
|
79 |
)
|
80 |
except Exception as e:
|
81 |
logging.error(f"Error scraping {patent_url}: {e}")
|