SERPent / scrap.py
Game4all's picture
Extract field, background and description
cf1c265
raw
history blame
3.71 kB
import asyncio
import logging
import re
from typing import Optional
from httpx import AsyncClient
from bs4 import BeautifulSoup
from pydantic import BaseModel
class PatentScrapResult(BaseModel):
    """Schema for the result of scraping a google patents page.

    Only ``title`` is always populated; every other attribute defaults to
    ``None`` when the corresponding section could not be found on the page.
    """

    # Patent title (read from the page's DC.title meta tag).
    title: str
    # Abstract text, when an abstract div is present on the page.
    abstract: Optional[str] = None
    # Full description text: field of the invention, background, summary, etc.
    description: Optional[str] = None
    # Full claims text, when a claims section is present.
    claims: Optional[str] = None
    # "Field of the invention" section extracted from the description.
    field_of_invention: Optional[str] = None
    # "Background of the invention" section extracted from the description.
    background: Optional[str] = None
async def scrap_patent_async(client: AsyncClient, patent_url: str) -> Optional[PatentScrapResult]:
    """Scrape a single google patents page.

    Args:
        client: Shared httpx ``AsyncClient`` used to perform the request.
        patent_url: Full URL of the google patents page to scrape.

    Returns:
        A populated ``PatentScrapResult`` on success, or ``None`` if the
        request or parsing failed (the error is logged, never raised).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; GPTBot/1.0; +https://openai.com/gptbot)"
    }
    try:
        response = await client.get(patent_url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Abstract
        abstract_div = soup.find("div", {"class": "abstract"})
        abstract = abstract_div.get_text(
            strip=True) if abstract_div else None

        # Full description (contains field, background, summary, ...)
        description_section = soup.find("section", itemprop="description")
        description = description_section.get_text(
            separator="\n", strip=True) if description_section else None

        # Extract sub-sections from the description by their conventional
        # headings. Only the first occurrence matters, so re.search (not
        # findall) is enough. Guard on `description` — the value actually
        # searched — rather than on the section element.
        invention_field = None
        invention_background = None
        if description:
            # Field of the Invention: text between the FIELD heading and the
            # next section heading.
            field_match = re.search(
                r"(?:FIELD OF THE INVENTION|TECHNICAL FIELD)(.*?)(?:BACKGROUND|BACKGROUND OF THE INVENTION|SUMMARY|BRIEF SUMMARY|DETAILED DESCRIPTION|DESCRIPTION OF THE RELATED ART)",
                description, re.IGNORECASE | re.DOTALL)
            invention_field = field_match.group(1).strip() if field_match else None

            # Background of the Invention: longest heading listed first so
            # the alternation consumes the full heading when present.
            background_match = re.search(
                r"(?:BACKGROUND OF THE INVENTION|BACKGROUND)(.*?)(?:SUMMARY|BRIEF SUMMARY|DETAILED DESCRIPTION|DESCRIPTION OF THE PREFERRED EMBODIMENTS|DESCRIPTION)",
                description, re.IGNORECASE | re.DOTALL)
            invention_background = background_match.group(1).strip() if background_match else None

        # Claims
        claims_section = soup.find("section", itemprop="claims")
        claims = claims_section.get_text(
            separator="\n", strip=True) if claims_section else None

        # Patent title. The original chained .get() on a possibly-missing tag,
        # which raised an opaque AttributeError; fail with a clear message
        # instead (still caught and logged below).
        meta_title_tag = soup.find("meta", {"name": "DC.title"})
        if meta_title_tag is None or not meta_title_tag.get("content"):
            raise ValueError("DC.title meta tag not found on page")
        meta_title = meta_title_tag.get("content").strip()

        return PatentScrapResult(
            abstract=abstract,
            description=description,
            claims=claims,
            title=meta_title,
            field_of_invention=invention_field,
            background=invention_background
        )
    except Exception as e:
        logging.error(f"Error scraping {patent_url}: {e}")
        return None
async def scrap_patent_bulk_async(client: AsyncClient, patent_urls: list[str]) -> list[PatentScrapResult]:
    """Scrape multiple patents asynchronously.

    Each URL is scraped concurrently; scrapes that fail return ``None`` and
    are dropped, so the result may be shorter than ``patent_urls``.
    """
    scraped = await asyncio.gather(
        *(scrap_patent_async(client, url) for url in patent_urls)
    )
    return [patent for patent in scraped if patent is not None]