# AI4Midterm / utilities / rag_utilities.py
# (Hugging Face viewer metadata: rchrdgwr — "Add files for midterm project",
#  commit 5a9839d; raw / history / blame; 4.44 kB)
import io
import os
from functools import lru_cache

import fitz
import requests
import tiktoken
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.vectorstores import Qdrant
from langchain_openai.embeddings import OpenAIEmbeddings

from utilities.debugger import dprint
@lru_cache(maxsize=None)
def _get_encoding(model_name: str):
    """Return (and cache) the tiktoken encoding for *model_name*.

    encoding_for_model is comparatively expensive; caching avoids
    re-resolving it on every chunk-length measurement.
    """
    return tiktoken.encoding_for_model(model_name)


def tiktoken_len(text):
    """Return the number of gpt-4o tokens in *text*.

    Used as the ``length_function`` of the text splitter so chunk sizes
    are measured in model tokens rather than characters.
    """
    return len(_get_encoding("gpt-4o").encode(text))
def download_document(state, url, file_name, download_folder):
    """Download *url* to ``download_folder/file_name`` unless it already exists.

    Args:
        state: pipeline state object, used only for debug logging via dprint.
        url: source URL of the document.
        file_name: local file name to save under.
        download_folder: directory the file is saved into (created if missing).

    Returns:
        The local file path. Note: on a failed download the path is still
        returned but the file will not exist — callers should check.
    """
    file_path = os.path.join(download_folder, file_name)
    # exist_ok avoids the exists()/makedirs() race of the original version
    os.makedirs(download_folder, exist_ok=True)
    if os.path.exists(file_path):
        dprint(state, f"{file_name} already exists locally.")
        return file_path
    print(f"Downloading {file_name} from {url}...")
    # A timeout keeps a hung connection from blocking the whole pipeline.
    response = requests.get(url, timeout=60)
    if response.status_code == 200:
        with open(file_path, 'wb') as f:
            f.write(response.content)
    else:
        dprint(state, f"Failed to download document from {url}. Status code: {response.status_code}")
    return file_path
def get_documents(state):
    """Download every URL in ``state.document_urls``, extract its text and PDF
    metadata, and register one document dict per URL via ``state.add_document``.

    Each registered dict has keys: url, title, metadata, single_text_document.
    """
    for url in state.document_urls:
        dprint(state, f"Downloading and loading document from {url}...")
        file_name = url.split("/")[-1]
        file_path = download_document(state, url, file_name, state.download_folder)
        loaded_document = PyMuPDFLoader(file_path).load()
        # One page per loaded Document; join into a single text body.
        single_text_document = "\n".join(doc.page_content for doc in loaded_document)
        dprint(state, f"Number of pages: {len(loaded_document)}")
        # Context manager guarantees the PDF handle is closed even on error.
        with fitz.open(file_path) as pdf:
            metadata = pdf.metadata
        # PyMuPDF metadata usually carries a 'title' key that may be an empty
        # string, so `or` (not a .get default) is needed for the fallback.
        title = metadata.get('title') or file_name
        document = {
            "url": url,
            "title": title,
            "metadata": metadata,
            "single_text_document": single_text_document,
        }
        state.add_document(document)
        dprint(state, f"Title of Document: {title}")
        dprint(state, f"Full metadata for {title}: {metadata}")
    dprint(state, f"documents: {state.documents}")
def create_chunked_documents(state):
    """Split every document on *state* into token-bounded chunks and store the
    resulting langchain ``Document`` objects via
    ``state.add_combined_document_objects``.

    Chunk size and overlap come from ``state.chunk_size`` /
    ``state.chunk_overlap`` and are measured in gpt-4o tokens (tiktoken_len).
    """
    get_documents(state)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=state.chunk_size,
        chunk_overlap=state.chunk_overlap,
        length_function=tiktoken_len,
    )
    combined_document_objects = []
    dprint(state, "Chunking documents and creating document objects")
    # enumerate gives each source document a distinct id; the original tagged
    # every chunk of every document "doc1".
    for doc_number, document in enumerate(state.documents, start=1):
        dprint(state, f"processing document: {document['title']}")
        text = document["single_text_document"]
        dprint(state, text)
        title = document["title"]
        chunks = text_splitter.split_text(text)
        dprint(state, len(chunks))
        document_objects = [
            Document(
                page_content=chunk,
                metadata={"source": title, "document_id": f"doc{doc_number}"},
            )
            for chunk in chunks
        ]
        dprint(state, f"Number of chunks for Document: {len(chunks)}")
        combined_document_objects.extend(document_objects)
    state.add_combined_document_objects(combined_document_objects)
def create_vector_store(state):
    """Build an in-memory Qdrant vector store over the chunked documents on
    *state*, register its retriever via ``state.set_retriever``, and return it.

    Embeddings use the OpenAI model named by ``state.embedding_model``.
    """
    create_chunked_documents(state)
    embedder = OpenAIEmbeddings(model=state.embedding_model)
    vector_store = Qdrant.from_documents(
        documents=state.combined_document_objects,
        embedding=embedder,
        location=":memory:",
    )
    retriever = vector_store.as_retriever()
    state.set_retriever(retriever)
    return retriever