Game4all committed on
Commit
cf1c265
·
1 Parent(s): c38bd79

Extract field, background and description

Browse files
Files changed (3) hide show
  1. .gitignore +2 -1
  2. app.py +1 -1
  3. scrap.py +29 -1
.gitignore CHANGED
@@ -1,2 +1,3 @@
1
  __pycache__
2
- .vscode
 
 
1
  __pycache__
2
+ .vscode
3
+ .venv
app.py CHANGED
@@ -24,7 +24,7 @@ pw_browser: Optional[Browser] = None
24
 
25
  # httpx client
26
  httpx_client = httpx.AsyncClient(timeout=30, limits=httpx.Limits(
27
- max_connections=15, max_keepalive_connections=20))
28
 
29
 
30
  @asynccontextmanager
 
24
 
25
# Shared module-level async HTTP client, reused across all requests.
# Pool limits: at most 30 concurrent connections, of which up to 20 may
# be kept alive between requests for reuse.
httpx_client = httpx.AsyncClient(timeout=30, limits=httpx.Limits(
    max_connections=30, max_keepalive_connections=20))
28
 
29
 
30
  @asynccontextmanager
scrap.py CHANGED
@@ -1,5 +1,6 @@
1
  import asyncio
2
  import logging
 
3
  from typing import Optional
4
  from httpx import AsyncClient
5
  from bs4 import BeautifulSoup
@@ -8,10 +9,18 @@ from pydantic import BaseModel
8
 
9
  class PatentScrapResult(BaseModel):
10
  """Schema for the result of scraping a google patents page."""
 
11
  title: str
 
12
  abstract: Optional[str] = None
 
13
  description: Optional[str] = None
 
14
  claims: Optional[str] = None
 
 
 
 
15
 
16
 
17
  async def scrap_patent_async(client: AsyncClient, patent_url: str) -> PatentScrapResult:
@@ -34,6 +43,18 @@ async def scrap_patent_async(client: AsyncClient, patent_url: str) -> PatentScra
34
  description = description_section.get_text(
35
  separator="\n", strip=True) if description_section else None
36
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  # Claims
38
  claims_section = soup.find("section", itemprop="claims")
39
  claims = claims_section.get_text(
@@ -43,11 +64,18 @@ async def scrap_patent_async(client: AsyncClient, patent_url: str) -> PatentScra
43
  meta_title = soup.find("meta", {"name": "DC.title"}).get(
44
  "content").strip()
45
 
 
 
 
 
46
  return PatentScrapResult(
 
47
  abstract=abstract,
48
  description=description,
49
  claims=claims,
50
- title=meta_title
 
 
51
  )
52
  except Exception as e:
53
  logging.error(f"Error scraping {patent_url}: {e}")
 
1
  import asyncio
2
  import logging
3
+ import re
4
  from typing import Optional
5
  from httpx import AsyncClient
6
  from bs4 import BeautifulSoup
 
9
 
10
class PatentScrapResult(BaseModel):
    """Schema for the result of scraping a google patents page."""
    # The title of the patent (taken from the page's DC.title meta tag).
    title: str
    # The abstract of the patent, if available.
    abstract: Optional[str] = None
    # The full description of the patent containing the field of the
    # invention, background, summary, etc., as newline-separated text.
    description: Optional[str] = None
    # The full claims text of the patent, if available.
    claims: Optional[str] = None
    # The "FIELD OF THE INVENTION" / "TECHNICAL FIELD" section, extracted
    # from the description by regex when it can be located.
    field_of_invention: Optional[str] = None
    # The "BACKGROUND [OF THE INVENTION]" section, extracted from the
    # description by regex when it can be located.
    background: Optional[str] = None
24
 
25
 
26
  async def scrap_patent_async(client: AsyncClient, patent_url: str) -> PatentScrapResult:
 
43
  description = description_section.get_text(
44
  separator="\n", strip=True) if description_section else None
45
 
46
+ # Field of the Invention
47
+ invention_field_match = re.findall(
48
+ r"(FIELD OF THE INVENTION|TECHNICAL FIELD)(.*?)(?:(BACKGROUND|BACKGROUND OF THE INVENTION|SUMMARY|BRIEF SUMMARY|DETAILED DESCRIPTION|DESCRIPTION OF THE RELATED ART))", description, re.IGNORECASE | re.DOTALL) if description_section else None
49
+ invention_field = invention_field_match[0][1].strip(
50
+ ) if invention_field_match else None
51
+
52
+ # Background of the Invention
53
+ invention_background_match = re.findall(
54
+ r"(BACKGROUND OF THE INVENTION|BACKGROUND)(.*?)(?:(SUMMARY|BRIEF SUMMARY|DETAILED DESCRIPTION|DESCRIPTION OF THE PREFERRED EMBODIMENTS|DESCRIPTION))", description, re.IGNORECASE | re.DOTALL) if description_section else None
55
+ invention_background = invention_background_match[0][1].strip(
56
+ ) if invention_background_match else None
57
+
58
  # Claims
59
  claims_section = soup.find("section", itemprop="claims")
60
  claims = claims_section.get_text(
 
64
  meta_title = soup.find("meta", {"name": "DC.title"}).get(
65
  "content").strip()
66
 
67
+ # Patent publication number
68
+ # pub_num = soup.select_one("h2#pubnum").get_text(strip=True)
69
+ # get the h2 with id ="pubnum" and extract the text
70
+
71
  return PatentScrapResult(
72
+ # publication_number=pub_num,
73
  abstract=abstract,
74
  description=description,
75
  claims=claims,
76
+ title=meta_title,
77
+ field_of_invention=invention_field,
78
+ background=invention_background
79
  )
80
  except Exception as e:
81
  logging.error(f"Error scraping {patent_url}: {e}")