Spaces:

Game4all
/

SERPent

Paused

File size: 3,707 Bytes

c38bd79
 
cf1c265
c38bd79
 
 
 
 
 
 
 
cf1c265
c38bd79
cf1c265
c38bd79
cf1c265
c38bd79
cf1c265
c38bd79
cf1c265
 
 
 
c38bd79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cf1c265
 
 
 
 
 
 
 
 
 
 
 
c38bd79
 
 
 
 
 
 
 
 
cf1c265
 
 
 
c38bd79
cf1c265
c38bd79
 
 
cf1c265
 
 
c38bd79

import asyncio
import logging
import re
from typing import Optional
from httpx import AsyncClient
from bs4 import BeautifulSoup
from pydantic import BaseModel


class PatentScrapResult(BaseModel):
    """Schema for the result of scraping a google patents page."""
    # The title of the patent. Required: it is the only field without a default.
    title: str
    # The abstract of the patent; None when the page has no abstract.
    abstract: Optional[str] = None
    # The full description text of the patent (field of the invention,
    # background, summary, etc.); None when the page has no description.
    description: Optional[str] = None
    # The full claims text of the patent; None when the page has no claims.
    claims: Optional[str] = None
    # The "field of the invention" portion of the description, if one
    # could be identified.
    field_of_invention: Optional[str] = None
    # The "background of the invention" portion of the description, if one
    # could be identified.
    background: Optional[str] = None


# Headings are matched case-insensitively anywhere in the description text;
# DOTALL lets the lazily captured section body span multiple lines.
_SECTION_FLAGS = re.IGNORECASE | re.DOTALL

# Group 1 is the opening heading, group 2 the section body, and the trailing
# group is the heading of whichever section follows.
_FIELD_OF_INVENTION_RE = re.compile(
    r"(FIELD OF THE INVENTION|TECHNICAL FIELD)(.*?)(?:(BACKGROUND|BACKGROUND OF THE INVENTION|SUMMARY|BRIEF SUMMARY|DETAILED DESCRIPTION|DESCRIPTION OF THE RELATED ART))",
    _SECTION_FLAGS,
)
_BACKGROUND_RE = re.compile(
    r"(BACKGROUND OF THE INVENTION|BACKGROUND)(.*?)(?:(SUMMARY|BRIEF SUMMARY|DETAILED DESCRIPTION|DESCRIPTION OF THE PREFERRED EMBODIMENTS|DESCRIPTION))",
    _SECTION_FLAGS,
)


def _extract_section(pattern: "re.Pattern[str]", description: Optional[str]) -> Optional[str]:
    """Return the stripped body of the first section matched by *pattern*, or None."""
    if description is None:
        return None
    # Only the first occurrence is used, so search() avoids scanning the
    # whole description for every match the way findall() would.
    found = pattern.search(description)
    return found.group(2).strip() if found else None


def _extract_title(soup: "BeautifulSoup") -> str:
    """Return the stripped content of the DC.title meta tag; raise ValueError if absent."""
    meta_tag = soup.find("meta", {"name": "DC.title"})
    content = meta_tag.get("content") if meta_tag is not None else None
    if content is None:
        raise ValueError("missing DC.title meta tag")
    return content.strip()


async def scrap_patent_async(client: AsyncClient, patent_url: str) -> Optional[PatentScrapResult]:
    """Fetch a patent page and extract its title and main text sections.

    Args:
        client: The HTTP client used to issue the GET request.
        patent_url: URL of the patent page to scrape.

    Returns:
        A ``PatentScrapResult`` on success. ``None`` when anything goes wrong
        (request failure, non-success HTTP status, missing title, parsing or
        validation error); the failure is logged rather than raised.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; GPTBot/1.0; +https://openai.com/gptbot)"
    }
    # The broad try/except is a deliberate best-effort boundary: one bad page
    # must not abort a batch of scrapes, so every failure becomes a logged None.
    try:
        response = await client.get(patent_url, headers=headers)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")

        # The title is the only required field, so fail fast (with a clear
        # message) before doing any further extraction work.
        title = _extract_title(soup)

        # Abstract
        abstract_div = soup.find("div", {"class": "abstract"})
        abstract = (
            abstract_div.get_text(strip=True) if abstract_div is not None else None
        )

        # Description
        description_section = soup.find("section", itemprop="description")
        description = (
            description_section.get_text(separator="\n", strip=True)
            if description_section is not None
            else None
        )

        # Sub-sections carved out of the description by heading heuristics.
        invention_field = _extract_section(_FIELD_OF_INVENTION_RE, description)
        invention_background = _extract_section(_BACKGROUND_RE, description)

        # Claims
        claims_section = soup.find("section", itemprop="claims")
        claims = (
            claims_section.get_text(separator="\n", strip=True)
            if claims_section is not None
            else None
        )

        return PatentScrapResult(
            title=title,
            abstract=abstract,
            description=description,
            claims=claims,
            field_of_invention=invention_field,
            background=invention_background,
        )
    except Exception as e:
        logging.error("Error scraping %s: %s", patent_url, e)
        return None


async def scrap_patent_bulk_async(client: AsyncClient, patent_urls: list[str]) -> list[PatentScrapResult]:
    """Scrape every URL in ``patent_urls`` concurrently over one shared client.

    Each URL is handed to ``scrap_patent_async``; scrapes that failed (and
    therefore produced ``None``) are left out of the returned list.
    """
    outcomes = await asyncio.gather(
        *(scrap_patent_async(client, patent_url) for patent_url in patent_urls)
    )

    # Keep only the successful scrapes.
    return [outcome for outcome in outcomes if outcome is not None]