File size: 357 Bytes
6edcaaa
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
import re
from bs4 import BeautifulSoup, SoupStrainer

# Default parse filter: only elements carrying the CSS class "content" are kept.
def_strainer = SoupStrainer(class_="content")

def bs4_extractor(html: str, strainer: SoupStrainer = def_strainer) -> str:
    """Return the text of ``html`` with runs of blank lines collapsed.

    The markup is parsed by BeautifulSoup using the "lxml" parser, with
    ``strainer`` passed as ``parse_only`` so that only matching parts of the
    document are parsed. In the extracted text, every run of two or more
    consecutive newlines is reduced to exactly two, and leading/trailing
    whitespace is stripped from the final result.
    """
    parsed = BeautifulSoup(html, "lxml", parse_only=strainer)
    raw_text = parsed.text
    # Squeeze any run of 2+ newlines down to a single blank line.
    squeezed = re.sub(r"\n\n+", "\n\n", raw_text)
    return squeezed.strip()