Spaces:
Runtime error
Runtime error
"""Loader that uses bs4 to load HTML files, enriching metadata with page title.""" | |
import logging | |
from typing import Dict, List, Union | |
from bs4 import BeautifulSoup | |
from langchain.docstore.document import Document | |
from langchain.document_loaders.base import BaseLoader | |
# Module-level logger; named after the module (not the file path) so it
# participates in the dotted package logger hierarchy.
logger = logging.getLogger(__name__)
class BSHTMLLoader(BaseLoader):
    """Loader that uses Beautiful Soup to parse an HTML file.

    The file is parsed with the ``lxml`` parser. The extracted text becomes
    the document content, and the file path and page title are stored in the
    document metadata.
    """

    def __init__(
        self, file_path: str, open_encoding: Union[str, None] = None
    ) -> None:
        """Initialize the loader.

        Args:
            file_path: Path of the HTML file to load.
            open_encoding: Encoding passed to ``open`` when reading the file.
                ``None`` (the default) keeps the platform default encoding.
        """
        self.file_path = file_path
        self.open_encoding = open_encoding

    def load(self) -> List[Document]:
        """Load the HTML file into a single-element list of documents.

        Returns:
            A list with one ``Document`` whose ``page_content`` is the text
            of the page and whose metadata holds ``source`` (the file path)
            and ``title`` (the page title, or ``""`` when there is none).
        """
        with open(self.file_path, "r", encoding=self.open_encoding) as f:
            soup = BeautifulSoup(f, features="lxml")

        text = soup.get_text()

        title = ""
        title_tag = soup.title
        if title_tag is not None:
            # ``.string`` is None for an empty <title> or one containing
            # nested markup; str(None) would store the literal "None", so
            # fall back to the tag's concatenated text in that case.
            title_string = title_tag.string
            if title_string is not None:
                title = str(title_string)
            else:
                title = title_tag.get_text()

        metadata: Dict[str, Union[str, None]] = {
            "source": self.file_path,
            "title": title,
        }
        return [Document(page_content=text, metadata=metadata)]