Spaces:
Sleeping
Sleeping
File size: 357 Bytes
6edcaaa |
1 2 3 4 5 6 7 8 9 10 11 |
import re
from bs4 import BeautifulSoup, SoupStrainer
# Default parse filter: only elements carrying the CSS class "content" are
# kept when this strainer is handed to BeautifulSoup via ``parse_only``.
def_strainer = SoupStrainer(class_="content")
def bs4_extractor(html: str, strainer: SoupStrainer = def_strainer) -> str:
    """Extract text from an HTML string using BeautifulSoup.

    Args:
        html: Raw HTML markup to parse.
        strainer: Filter passed to BeautifulSoup as ``parse_only`` so that
            only matching elements are parsed. Defaults to the module-level
            ``def_strainer``.

    Returns:
        The text of the parsed elements, with every run of two or more
        consecutive newlines collapsed to exactly two, and leading and
        trailing whitespace stripped.
    """
    soup = BeautifulSoup(html, "lxml", parse_only=strainer)
    # Squash runs of blank lines so the output has at most one empty line
    # between pieces of text.
    return re.sub(r"\n\n+", "\n\n", soup.text).strip()