import asyncio
import logging
import re
from typing import Optional

from httpx import AsyncClient
from bs4 import BeautifulSoup
from pydantic import BaseModel

logger = logging.getLogger(__name__)

# Compiled once at import time: both patterns scan the full (often large)
# description text, so recompiling per page would be wasted work.
# Each pattern captures: (heading)(body)(next heading); group 2 is the body.
_FIELD_OF_INVENTION_RE = re.compile(
    r"(FIELD OF THE INVENTION|TECHNICAL FIELD)(.*?)"
    r"(?:(BACKGROUND|BACKGROUND OF THE INVENTION|SUMMARY|BRIEF SUMMARY|"
    r"DETAILED DESCRIPTION|DESCRIPTION OF THE RELATED ART))",
    re.IGNORECASE | re.DOTALL,
)
_BACKGROUND_RE = re.compile(
    r"(BACKGROUND OF THE INVENTION|BACKGROUND)(.*?)"
    r"(?:(SUMMARY|BRIEF SUMMARY|DETAILED DESCRIPTION|"
    r"DESCRIPTION OF THE PREFERRED EMBODIMENTS|DESCRIPTION))",
    re.IGNORECASE | re.DOTALL,
)


class PatentScrapResult(BaseModel):
    """Schema for the result of scraping a google patents page."""

    # The title of the patent.
    title: str
    # The abstract of the patent, if available.
    abstract: Optional[str] = None
    # The full description of the patent containing the field of the
    # invention, background, summary, etc.
    description: Optional[str] = None
    # The full claims of the patent.
    claims: Optional[str] = None
    # The field of the invention, if available.
    field_of_invention: Optional[str] = None
    # The background of the invention, if available.
    background: Optional[str] = None


def _extract_section(pattern: "re.Pattern[str]", text: Optional[str]) -> Optional[str]:
    """Return the stripped body captured between a heading and the next heading.

    Args:
        pattern: One of the precompiled section patterns above (body in group 2).
        text: The full description text, or None when no description exists.

    Returns:
        The section body, or None when *text* is None/empty or no heading matched.
    """
    if not text:
        return None
    match = pattern.search(text)
    return match.group(2).strip() if match else None


async def scrap_patent_async(client: AsyncClient, patent_url: str) -> Optional[PatentScrapResult]:
    """Scrape a single Google Patents page.

    Args:
        client: Shared httpx async client used for the HTTP request.
        patent_url: Full URL of the patent page to scrape.

    Returns:
        The parsed patent data, or None when the request or parsing fails
        (the error is logged rather than raised, so bulk scrapes continue).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; GPTBot/1.0; +https://openai.com/gptbot)"
    }
    try:
        response = await client.get(patent_url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Abstract
        abstract_div = soup.find("div", {"class": "abstract"})
        abstract = abstract_div.get_text(strip=True) if abstract_div else None

        # Full description (contains field of invention, background, etc.)
        description_section = soup.find("section", itemprop="description")
        description = (
            description_section.get_text(separator="\n", strip=True)
            if description_section
            else None
        )

        # Sub-sections are located by their conventional all-caps headings
        # inside the description text.
        invention_field = _extract_section(_FIELD_OF_INVENTION_RE, description)
        invention_background = _extract_section(_BACKGROUND_RE, description)

        # Claims
        claims_section = soup.find("section", itemprop="claims")
        claims = (
            claims_section.get_text(separator="\n", strip=True)
            if claims_section
            else None
        )

        # Patent title. Guard explicitly instead of letting a missing meta tag
        # raise AttributeError into the broad except below — the failure mode
        # (return None) is the same, but the log message is now actionable.
        meta_title_tag = soup.find("meta", {"name": "DC.title"})
        title_content = meta_title_tag.get("content") if meta_title_tag else None
        if title_content is None:
            logger.error("No DC.title meta tag found at %s", patent_url)
            return None

        return PatentScrapResult(
            abstract=abstract,
            description=description,
            claims=claims,
            title=title_content.strip(),
            field_of_invention=invention_field,
            background=invention_background,
        )
    except Exception as e:
        # Boundary catch: any network/parse failure is logged and reported as
        # None so one bad page never aborts a bulk scrape.
        logger.error("Error scraping %s: %s", patent_url, e)
        return None


async def scrap_patent_bulk_async(client: AsyncClient, patent_urls: list[str]) -> list[PatentScrapResult]:
    """Scrape multiple patents asynchronously.

    Args:
        client: Shared httpx async client reused across all requests.
        patent_urls: Patent page URLs to scrape concurrently.

    Returns:
        Results for the pages that scraped successfully; failed scrapes
        (which `scrap_patent_async` reports as None) are dropped.
    """
    tasks = [scrap_patent_async(client, url) for url in patent_urls]
    results = await asyncio.gather(*tasks)
    # Filter out None results (failed scrapes)
    return [res for res in results if res is not None]