SERPent / scrap.py
Game4all's picture
Extract field, background and description
cf1c265
raw
history blame
3.71 kB
import asyncio
import logging
import re
from typing import Optional
from httpx import AsyncClient
from bs4 import BeautifulSoup
from pydantic import BaseModel
class PatentScrapResult(BaseModel):
    """Schema for the result of scraping a google patents page.

    Only ``title`` is always populated; every other attribute defaults to
    ``None`` when the corresponding section could not be found on the page.
    """

    # Patent title (read from the page's DC.title meta tag).
    title: str
    # Abstract text, when an abstract div is present on the page.
    abstract: Optional[str] = None
    # Full description text: field of the invention, background, summary, etc.
    description: Optional[str] = None
    # Full claims text, when a claims section is present.
    claims: Optional[str] = None
    # "Field of the invention" section extracted from the description.
    field_of_invention: Optional[str] = None
    # "Background of the invention" section extracted from the description.
    background: Optional[str] = None
async def scrap_patent_async(client: AsyncClient, patent_url: str) -> Optional[PatentScrapResult]:
    """Scrape a single google patents page.

    Args:
        client: Shared httpx ``AsyncClient`` used to perform the request.
        patent_url: Full URL of the google patents page to scrape.

    Returns:
        A populated ``PatentScrapResult`` on success, or ``None`` if the
        request or parsing failed (the error is logged, never raised).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; GPTBot/1.0; +https://openai.com/gptbot)"
    }
    try:
        response = await client.get(patent_url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Abstract
        abstract_div = soup.find("div", {"class": "abstract"})
        abstract = abstract_div.get_text(
            strip=True) if abstract_div else None

        # Full description (contains field, background, summary, ...)
        description_section = soup.find("section", itemprop="description")
        description = description_section.get_text(
            separator="\n", strip=True) if description_section else None

        # Extract sub-sections from the description by their conventional
        # headings. Only the first occurrence matters, so re.search (not
        # findall) is enough. Guard on `description` — the value actually
        # searched — rather than on the section element.
        invention_field = None
        invention_background = None
        if description:
            # Field of the Invention: text between the FIELD heading and the
            # next section heading.
            field_match = re.search(
                r"(?:FIELD OF THE INVENTION|TECHNICAL FIELD)(.*?)(?:BACKGROUND|BACKGROUND OF THE INVENTION|SUMMARY|BRIEF SUMMARY|DETAILED DESCRIPTION|DESCRIPTION OF THE RELATED ART)",
                description, re.IGNORECASE | re.DOTALL)
            invention_field = field_match.group(1).strip() if field_match else None

            # Background of the Invention: longest heading listed first so
            # the alternation consumes the full heading when present.
            background_match = re.search(
                r"(?:BACKGROUND OF THE INVENTION|BACKGROUND)(.*?)(?:SUMMARY|BRIEF SUMMARY|DETAILED DESCRIPTION|DESCRIPTION OF THE PREFERRED EMBODIMENTS|DESCRIPTION)",
                description, re.IGNORECASE | re.DOTALL)
            invention_background = background_match.group(1).strip() if background_match else None

        # Claims
        claims_section = soup.find("section", itemprop="claims")
        claims = claims_section.get_text(
            separator="\n", strip=True) if claims_section else None

        # Patent title. The original chained .get() on a possibly-missing tag,
        # which raised an opaque AttributeError; fail with a clear message
        # instead (still caught and logged below).
        meta_title_tag = soup.find("meta", {"name": "DC.title"})
        if meta_title_tag is None or not meta_title_tag.get("content"):
            raise ValueError("DC.title meta tag not found on page")
        meta_title = meta_title_tag.get("content").strip()

        return PatentScrapResult(
            abstract=abstract,
            description=description,
            claims=claims,
            title=meta_title,
            field_of_invention=invention_field,
            background=invention_background
        )
    except Exception as e:
        logging.error(f"Error scraping {patent_url}: {e}")
        return None
async def scrap_patent_bulk_async(client: AsyncClient, patent_urls: list[str]) -> list[PatentScrapResult]:
    """Scrape multiple patents asynchronously.

    Each URL is scraped concurrently; scrapes that fail return ``None`` and
    are dropped, so the result may be shorter than ``patent_urls``.
    """
    scraped = await asyncio.gather(
        *(scrap_patent_async(client, url) for url in patent_urls)
    )
    return [patent for patent in scraped if patent is not None]