Spaces:
Paused
Paused
from langchain_community.document_loaders import PyMuPDFLoader, TextLoader, UnstructuredURLLoader | |
from langchain_community.vectorstores import Qdrant | |
import os | |
import requests | |
def process_file(file):
    """Write an uploaded file to disk and load it into documents.

    The upload's bytes (``file.content``) are saved to ``"./" + file.path``
    and then parsed with ``PyMuPDFLoader`` when the path ends in ``.pdf``
    (compared case-insensitively) or with ``TextLoader`` otherwise. The saved
    copy is left on disk after loading.

    Args:
        file: Upload object exposing ``path`` (a string) and ``content``
            (bytes-like data, since it is written in binary mode).

    Returns:
        A list holding the documents returned by the selected loader.
    """
    # Save the file temporarily. The handle gets its own name so it does not
    # shadow the ``file`` argument, whose ``path`` and ``content`` attributes
    # are still needed here and below.
    temp_file = "./" + file.path
    with open(temp_file, "wb") as handle:
        handle.write(file.content)

    if file.path.lower().endswith(".pdf"):
        loader = PyMuPDFLoader(temp_file)
    else:
        loader = TextLoader(temp_file)

    documents = []
    documents.extend(loader.load())
    return documents
def load_documents_from_url(url):
    """Load documents from a URL, choosing a loader from the URL/content type.

    URLs ending in ``.pdf`` are loaded with ``PyMuPDFLoader``. YouTube links
    and resources whose ``Content-Type`` header mentions ``video`` are
    skipped. Everything else is treated as an HTML page and loaded with
    ``UnstructuredURLLoader``.

    This function never raises: every failure is printed and reported as
    ``None``.

    Args:
        url: Address of the resource to load.

    Returns:
        The documents returned by the selected loader, or ``None`` when the
        URL is skipped (video content) or any step fails.
    """
    try:
        lowered_url = url.lower()

        # Check if it's a PDF (extension compared case-insensitively).
        if lowered_url.endswith(".pdf"):
            try:
                return PyMuPDFLoader(url).load()
            except Exception as e:
                print(f"Error loading PDF from {url}: {e}")
                return None

        # Ignore video content (flagged for now). The URL-based check runs
        # first because it needs no network round-trip.
        if "youtube" in lowered_url or "youtu.be" in lowered_url:
            return None

        # Fetch only the headers to detect video pages. HEAD requests do not
        # follow redirects unless asked to, and the Content-Type of a 3xx
        # response says nothing about the final resource.
        try:
            response = requests.head(url, timeout=10, allow_redirects=True)
            content_type = response.headers.get('Content-Type', '')
        except Exception as e:
            print(f"Error fetching headers from {url}: {e}")
            return None

        if 'video' in content_type.lower():
            return None

        # Otherwise, treat it as an HTML page.
        try:
            return UnstructuredURLLoader([url]).load()
        except Exception as e:
            print(f"Error loading HTML from {url}: {e}")
            return None
    except Exception as e:
        # Last-resort guard (e.g. ``url`` is not a string) so callers always
        # get ``None`` instead of an exception.
        print(f"General error loading from {url}: {e}")
        return None
def add_to_qdrant(documents, embeddings, qdrant_client, collection_name):
    """Embed documents and store them in a Qdrant collection.

    Args:
        documents: Documents to index. ``None`` or an empty collection is
            skipped with a printed notice instead of being sent to Qdrant.
        embeddings: Embedding model handed to ``Qdrant.from_documents``.
        qdrant_client: Object whose ``url`` attribute is used as the Qdrant
            server address.
        collection_name: Name of the collection to write into.

    Returns:
        None.
    """
    # Loaders in this module report failure as ``None``; indexing nothing
    # would only make ``Qdrant.from_documents`` raise, so skip explicitly.
    if not documents:
        print(f"No documents to add to collection {collection_name}; skipping.")
        return

    Qdrant.from_documents(
        documents,
        embeddings,
        url=qdrant_client.url,
        prefer_grpc=True,
        collection_name=collection_name,
    )