Spaces:
Runtime error
Runtime error
"""Loader that uses unstructured to load HTML files.""" | |
from typing import List | |
from langchain.docstore.document import Document | |
from langchain.document_loaders.base import BaseLoader | |
class UnstructuredURLLoader(BaseLoader):
    """Loader that uses unstructured to parse the HTML found at a list of URLs.

    Each URL is handed to ``unstructured.partition.html.partition_html`` and
    turned into one ``Document`` whose ``page_content`` is the string form of
    the returned elements, joined by blank lines.
    """

    def __init__(self, urls: List[str]):
        """Initialize with the URLs to load.

        Args:
            urls: URLs to load; stored unchanged on ``self.urls``.

        Raises:
            ValueError: If the ``unstructured`` package cannot be imported.
        """
        try:
            import unstructured  # noqa:F401
        except ImportError as err:
            # The exception type stays ValueError so existing callers that
            # catch it keep working; the original ImportError is chained so
            # the real cause is still visible in the traceback.
            raise ValueError(
                "unstructured package not found, please install it with "
                "`pip install unstructured`"
            ) from err
        self.urls = urls

    def load(self) -> List[Document]:
        """Load every URL and return one document per URL.

        Returns:
            Documents in the same order as ``self.urls``. Each document's
            metadata is ``{"source": url}``.

        Any exception raised by ``partition_html`` for a URL propagates to the
        caller; documents built for earlier URLs are not returned in that case.
        """
        # Imported here rather than at module level, so importing this module
        # does not itself require unstructured to be installed.
        from unstructured.partition.html import partition_html

        docs: List[Document] = []
        for url in self.urls:
            elements = partition_html(url=url)
            # Blank-line separator between the string forms of the elements.
            text = "\n\n".join(str(el) for el in elements)
            docs.append(Document(page_content=text, metadata={"source": url}))
        return docs