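"""Document ingestion pipeline: downloads PDFs, extracts their text and metadata,
chunks the text with a token-aware splitter, and builds an in-memory Qdrant
vector store exposed as a LangChain retriever."""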
import os

import fitz
import requests
import tiktoken
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.vectorstores import Qdrant
from langchain_openai.embeddings import OpenAIEmbeddings

from utilities.debugger import dprint

def tiktoken_len(text):
    """Return the number of tokens in `text` using the gpt-4o tokenizer."""
    tokens = tiktoken.encoding_for_model("gpt-4o").encode(text)
    return len(tokens)

def download_document(state, url, file_name, download_folder):
    """Download `url` into `download_folder` unless the file is already cached locally.

    Returns the local file path; if the download failed, the path may not exist
    and the caller will fail when trying to open it.
    """
    file_path = os.path.join(download_folder, file_name)
    if not os.path.exists(download_folder):
        os.makedirs(download_folder)
    if not os.path.exists(file_path):
        dprint(state, f"Downloading {file_name} from {url}...")
        response = requests.get(url)
        if response.status_code == 200:
            with open(file_path, "wb") as f:
                f.write(response.content)
        else:
            dprint(state, f"Failed to download document from {url}. Status code: {response.status_code}")
    else:
        dprint(state, f"{file_name} already exists locally.")
    return file_path

def get_documents(state):
    """Download each configured PDF, load it, and record its text and metadata on the state."""
    for url in state.document_urls:
        dprint(state, f"Downloading and loading document from {url}...")
        file_name = url.split("/")[-1]
        file_path = download_document(state, url, file_name, state.download_folder)
        loader = PyMuPDFLoader(file_path)
        loaded_document = loader.load()
        # Collapse the per-page documents into one text blob per source document.
        single_text_document = "\n".join([doc.page_content for doc in loaded_document])
dprint(state, f"Number of pages: {len(loaded_document)}") | |
# lets get titles and metadata | |
pdf = fitz.open(file_path) | |
metadata = pdf.metadata | |
title = metadata.get('title', 'Document 1') | |
        document = {
            "url": url,
            "title": title,
            "metadata": metadata,
            "single_text_document": single_text_document,
        }
        state.add_document(document)
        dprint(state, f"Title of document: {title}")
        dprint(state, f"Full metadata for {title}: {metadata}")
        pdf.close()
    dprint(state, f"documents: {state.documents}")

def create_chunked_documents(state):
    """Split each loaded document into token-sized chunks wrapped as LangChain Documents."""
    get_documents(state)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=state.chunk_size,
        chunk_overlap=state.chunk_overlap,
        length_function=tiktoken_len,
    )
    combined_document_objects = []
    dprint(state, "Chunking documents and creating document objects")
    for doc_index, document in enumerate(state.documents, start=1):
        dprint(state, f"Processing document: {document['title']}")
        text = document["single_text_document"]
        dprint(state, text)
        title = document["title"]
        chunks_document = text_splitter.split_text(text)
        # Tag each chunk with its source title and a per-document id (doc1, doc2, ...).
        document_objects = [
            Document(page_content=chunk, metadata={"source": title, "document_id": f"doc{doc_index}"})
            for chunk in chunks_document
        ]
        dprint(state, f"Number of chunks for document: {len(chunks_document)}")
        combined_document_objects.extend(document_objects)
    state.add_combined_document_objects(combined_document_objects)

def create_vector_store(state):
    """Chunk the documents, embed them, and build an in-memory Qdrant retriever."""
    create_chunked_documents(state)
    embedding_model = OpenAIEmbeddings(model=state.embedding_model)
    qdrant_vectorstore = Qdrant.from_documents(
        documents=state.combined_document_objects,
        embedding=embedding_model,
        location=":memory:",
    )
    qdrant_retriever = qdrant_vectorstore.as_retriever()
    state.set_retriever(qdrant_retriever)
    return qdrant_retriever
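

# A minimal usage sketch. The real application supplies its own state object;
# the _DemoState class below is only an assumption that mirrors the attributes
# and helpers this module relies on (document_urls, download_folder, chunk_size,
# chunk_overlap, embedding_model, add_document, add_combined_document_objects,
# set_retriever, plus a debug flag assumed to be consumed by utilities.debugger.dprint).
if __name__ == "__main__":
    class _DemoState:
        def __init__(self):
            self.document_urls = ["https://example.com/sample.pdf"]  # replace with a real PDF URL
            self.download_folder = "data"
            self.chunk_size = 500
            self.chunk_overlap = 50
            self.embedding_model = "text-embedding-3-small"
            self.debug = True  # assumed flag for dprint output
            self.documents = []
            self.combined_document_objects = []
            self.retriever = None

        def add_document(self, document):
            self.documents.append(document)

        def add_combined_document_objects(self, objects):
            self.combined_document_objects = objects

        def set_retriever(self, retriever):
            self.retriever = retriever

    demo_state = _DemoState()
    retriever = create_vector_store(demo_state)  # requires OPENAI_API_KEY in the environment
    print(retriever.invoke("What is this document about?"))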