Spaces:
Running
Running
import requests | |
import logging | |
from typing import Iterator, List, Union | |
from langchain_core.document_loaders import BaseLoader | |
from langchain_core.documents import Document | |
from open_webui.env import SRC_LOG_LEVELS | |
# Module-level logger; its threshold is taken from the "RAG" entry of SRC_LOG_LEVELS.
log = logging.getLogger(__name__)
log.setLevel(SRC_LOG_LEVELS["RAG"])
class ExternalLoader(BaseLoader):
    """Load documents by delegating URL extraction to an external HTTP service.

    URLs are POSTed to ``external_url`` in batches as ``{"urls": [...]}`` with a
    bearer token. The JSON response is iterated, and every item is turned into a
    ``Document`` from its ``page_content`` and ``metadata`` fields.
    """

    def __init__(
        self,
        web_paths: Union[str, List[str]],
        external_url: str,
        external_api_key: str,
        continue_on_failure: bool = True,
        *,
        timeout: Union[float, None] = None,
        **kwargs,
    ) -> None:
        """Store the loader configuration.

        Args:
            web_paths: A single URL or a list of URLs to extract.
            external_url: Endpoint of the external extraction service.
            external_api_key: Token sent as ``Authorization: Bearer <token>``.
            continue_on_failure: When true, a failing batch is logged and
                skipped; when false, the error is re-raised.
            timeout: Optional timeout in seconds forwarded to ``requests.post``.
                ``None`` (the default) waits indefinitely, as before.
            **kwargs: Accepted for call-site compatibility and ignored.
        """
        self.external_url = external_url
        self.external_api_key = external_api_key
        # Normalise a single URL into a one-element list.
        self.urls = web_paths if isinstance(web_paths, list) else [web_paths]
        self.continue_on_failure = continue_on_failure
        self.timeout = timeout

    def lazy_load(self) -> Iterator[Document]:
        """Yield one ``Document`` per item returned by the external service.

        URLs are sent in batches of 20, one POST request per batch.

        Raises:
            Exception: Any error from the request, the JSON decoding or the
                result handling, but only when ``continue_on_failure`` is false.
        """
        batch_size = 20
        for start in range(0, len(self.urls), batch_size):
            urls = self.urls[start : start + batch_size]
            # The try block deliberately spans the yield loop: a malformed item
            # aborts the rest of its batch, but documents already yielded stay.
            try:
                response = requests.post(
                    self.external_url,
                    headers={
                        "User-Agent": "Open WebUI (https://github.com/open-webui/open-webui) RAG Bot",
                        "Authorization": f"Bearer {self.external_api_key}",
                    },
                    json={
                        "urls": urls,
                    },
                    timeout=self.timeout,
                )
                response.raise_for_status()
                for result in response.json():
                    yield Document(
                        page_content=result.get("page_content", ""),
                        metadata=result.get("metadata", {}),
                    )
            except Exception as e:
                if not self.continue_on_failure:
                    # Bare raise keeps the original traceback intact.
                    raise
                log.error(f"Error extracting content from batch {urls}: {e}")