website-RAG/modules/soup_extractor.py
beeguy's picture
styles and installs
6edcaaa
raw
history blame contribute delete
357 Bytes
import re
from bs4 import BeautifulSoup, SoupStrainer
# Default parse filter: only elements carrying the CSS class "content" are
# kept when this strainer is passed to BeautifulSoup via ``parse_only``.
def_strainer = SoupStrainer(class_="content")
def bs4_extractor(html: str, strainer: SoupStrainer = def_strainer) -> str:
    """Return the text content of ``html`` with blank-line runs collapsed.

    The markup is parsed with BeautifulSoup's "lxml" parser, restricted to
    whatever ``strainer`` selects (``parse_only``). Any run of two or more
    consecutive newlines in the resulting text is reduced to exactly two,
    and leading/trailing whitespace is stripped.

    Args:
        html: Raw HTML markup to extract text from.
        strainer: Filter handed to BeautifulSoup as ``parse_only``; defaults
            to the module-level ``def_strainer``.

    Returns:
        The extracted, whitespace-normalised text.
    """
    parsed = BeautifulSoup(html, "lxml", parse_only=strainer)
    # Squeeze 2+ consecutive newlines down to a single blank line.
    collapsed = re.sub(r"\n\n+", "\n\n", parsed.text)
    return collapsed.strip()