SERPent / scrap.py
import asyncio
import logging
from typing import Optional
from httpx import AsyncClient
from bs4 import BeautifulSoup
from pydantic import BaseModel


class PatentScrapResult(BaseModel):
    """Schema for the result of scraping a Google Patents page."""
title: str
abstract: Optional[str] = None
description: Optional[str] = None
claims: Optional[str] = None
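
# Illustrative only (not in the original module): constructing the schema by
# hand and dumping it to a dict with pydantic v2's model_dump() (use .dict()
# on pydantic v1). The field values here are made up.
# >>> result = PatentScrapResult(title="Example patent", abstract="An example.")
# >>> result.model_dump()
# {'title': 'Example patent', 'abstract': 'An example.', 'description': None, 'claims': None}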


async def scrap_patent_async(client: AsyncClient, patent_url: str) -> Optional[PatentScrapResult]:
    """Scrape a single Google Patents page, returning None if the scrape fails."""
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; GPTBot/1.0; +https://openai.com/gptbot)"
    }
    try:
        response = await client.get(patent_url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Abstract
        abstract_div = soup.find("div", {"class": "abstract"})
        abstract = abstract_div.get_text(strip=True) if abstract_div else None

        # Description
        description_section = soup.find("section", itemprop="description")
        description = (
            description_section.get_text(separator="\n", strip=True)
            if description_section else None
        )

        # Claims
        claims_section = soup.find("section", itemprop="claims")
        claims = (
            claims_section.get_text(separator="\n", strip=True)
            if claims_section else None
        )

        # Patent title (from the DC.title meta tag; an AttributeError when the
        # tag is missing is caught below, since a title is required)
        meta_title = soup.find("meta", {"name": "DC.title"}).get("content").strip()

        return PatentScrapResult(
            title=meta_title,
            abstract=abstract,
            description=description,
            claims=claims,
        )
    except Exception as e:
        logging.error(f"Error scraping {patent_url}: {e}")
        return None


async def scrap_patent_bulk_async(client: AsyncClient, patent_urls: list[str]) -> list[PatentScrapResult]:
    """Scrape multiple Google Patents pages concurrently."""
    tasks = [scrap_patent_async(client, url) for url in patent_urls]
    results = await asyncio.gather(*tasks)
    # Filter out None results (failed scrapes)
    return [res for res in results if res is not None]
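

# A minimal usage sketch, not part of the original module: the patent URLs are
# hypothetical placeholders, and running it requires network access. It opens a
# shared httpx.AsyncClient and scrapes both pages in one concurrent batch.
if __name__ == "__main__":
    async def main():
        urls = [
            "https://patents.google.com/patent/US1234567A/en",   # placeholder
            "https://patents.google.com/patent/US7654321B1/en",  # placeholder
        ]
        async with AsyncClient(follow_redirects=True, timeout=30.0) as client:
            results = await scrap_patent_bulk_async(client, urls)
        for result in results:
            print(result.title)

    asyncio.run(main())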