|
import socket |
|
import urllib.parse |
|
import validators |
|
from typing import Union, Sequence, Iterator |
|
|
|
from langchain_community.document_loaders import ( |
|
WebBaseLoader, |
|
) |
|
from langchain_core.documents import Document |
|
|
|
|
|
from open_webui.constants import ERROR_MESSAGES |
|
from open_webui.config import ENABLE_RAG_LOCAL_WEB_FETCH |
|
from open_webui.env import SRC_LOG_LEVELS |
|
|
|
import logging |
|
|
|
log = logging.getLogger(__name__) |
|
log.setLevel(SRC_LOG_LEVELS["RAG"]) |
|
|
|
|
|
def validate_url(url: Union[str, Sequence[str]]):
    """Validate that a URL (or every URL in a sequence) is safe to fetch.

    Args:
        url: A single URL string, or a sequence of URL strings.

    Returns:
        True when the URL(s) pass validation; False for unsupported types.

    Raises:
        ValueError: If a URL is malformed, its hostname cannot be resolved,
            or (when local web fetch is disabled) it resolves to a private
            address.
    """
    if isinstance(url, str):
        if isinstance(validators.url(url), validators.ValidationError):
            raise ValueError(ERROR_MESSAGES.INVALID_URL)
        if not ENABLE_RAG_LOCAL_WEB_FETCH:
            # Local web fetch is disabled: reject URLs whose hostname resolves
            # to a private address to prevent SSRF into the local network.
            parsed_url = urllib.parse.urlparse(url)
            if parsed_url.hostname is None:
                # validators.url should guarantee a hostname, but guard anyway.
                raise ValueError(ERROR_MESSAGES.INVALID_URL)
            try:
                ipv4_addresses, ipv6_addresses = resolve_hostname(
                    parsed_url.hostname
                )
            except socket.gaierror:
                # An unresolvable hostname cannot be fetched; report it as an
                # invalid URL instead of leaking a raw socket error.
                raise ValueError(ERROR_MESSAGES.INVALID_URL)
            for ip in ipv4_addresses:
                if validators.ipv4(ip, private=True):
                    raise ValueError(ERROR_MESSAGES.INVALID_URL)
            for ip in ipv6_addresses:
                if validators.ipv6(ip, private=True):
                    raise ValueError(ERROR_MESSAGES.INVALID_URL)
        return True
    elif isinstance(url, Sequence):
        # Validate each URL in the sequence; all must pass.
        return all(validate_url(u) for u in url)
    else:
        return False
|
|
|
|
|
def resolve_hostname(hostname):
    """Resolve *hostname* and return its (IPv4 addresses, IPv6 addresses).

    Raises:
        socket.gaierror: If the hostname cannot be resolved.
    """
    records = socket.getaddrinfo(hostname, None)

    ipv4_addresses = []
    ipv6_addresses = []
    # Each record is (family, type, proto, canonname, sockaddr); the address
    # string is the first element of sockaddr for both IPv4 and IPv6.
    for family, _type, _proto, _canonname, sockaddr in records:
        if family == socket.AF_INET:
            ipv4_addresses.append(sockaddr[0])
        elif family == socket.AF_INET6:
            ipv6_addresses.append(sockaddr[0])

    return ipv4_addresses, ipv6_addresses
|
|
|
|
|
class SafeWebBaseLoader(WebBaseLoader):
    """WebBaseLoader with enhanced error handling for URLs."""

    def lazy_load(self) -> Iterator[Document]:
        """Lazy load text from the url(s) in web_path with error handling."""
        for path in self.web_paths:
            try:
                soup = self._scrape(path, bs_kwargs=self.bs_kwargs)
                text = soup.get_text(**self.bs_get_text_kwargs)

                # Collect whatever page metadata is present; use a fallback
                # string when a tag exists but lacks the expected attribute.
                metadata = {"source": path}

                title_tag = soup.find("title")
                if title_tag:
                    metadata["title"] = title_tag.get_text()

                description_tag = soup.find("meta", attrs={"name": "description"})
                if description_tag:
                    metadata["description"] = description_tag.get(
                        "content", "No description found."
                    )

                html_tag = soup.find("html")
                if html_tag:
                    metadata["language"] = html_tag.get("lang", "No language found.")

                yield Document(page_content=text, metadata=metadata)
            except Exception as e:
                # Log and skip pages that fail to fetch or parse instead of
                # aborting the whole batch.
                log.error(f"Error loading {path}: {e}")
|
|
|
|
|
def get_web_loader(
    url: Union[str, Sequence[str]],
    verify_ssl: bool = True,
    requests_per_second: int = 2,
):
    """Build a SafeWebBaseLoader for *url* after validating it.

    Args:
        url: A single URL string or a sequence of URL strings.
        verify_ssl: Whether to verify SSL certificates when fetching.
        requests_per_second: Rate limit applied by the loader.

    Raises:
        ValueError: If the URL (or any URL in the sequence) fails validation.
    """
    # Reject invalid or unsafe URLs before constructing the loader.
    if not validate_url(url):
        raise ValueError(ERROR_MESSAGES.INVALID_URL)

    loader = SafeWebBaseLoader(
        url,
        verify_ssl=verify_ssl,
        requests_per_second=requests_per_second,
        continue_on_failure=True,
    )
    return loader
|
|