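"""Vector store and document chunking utilities.

Chunks the application's documents with one of several strategies
(recursive, table-aware, section-based, or semantic) and builds an
in-memory Qdrant vector store whose retriever is stored on the model
run state.
"""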
from utilities.constants import (
CHUNKING_STRATEGY_TABLE_AWARE,
CHUNKING_STRATEGY_SECTION_BASED,
CHUNKING_STRATEGY_SEMANTIC
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain_community.vectorstores import Qdrant
from langchain_openai.embeddings import OpenAIEmbeddings
import pdfplumber
import re
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import tiktoken
from utilities.debugger import dprint
def create_vector_store(app_state, model_run_state, **kwargs):
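    """Apply keyword overrides to model_run_state, chunk the documents,
    and build an in-memory Qdrant vector store; the resulting retriever
    is stored on model_run_state.retriever."""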
for key, value in kwargs.items():
if hasattr(model_run_state, key):
setattr(model_run_state, key, value)
else:
print(f"Warning: {key} is not an attribute of the state object")
    # Chunk the documents and build the in-memory Qdrant vector store
dprint(app_state, f"Chunk size after update: {model_run_state.chunk_size}")
create_chunked_documents(app_state, model_run_state)
qdrant_vectorstore = Qdrant.from_documents(
documents=model_run_state.combined_document_objects,
embedding=model_run_state.embedding_model,
location=":memory:"
)
qdrant_retriever = qdrant_vectorstore.as_retriever()
model_run_state.retriever = qdrant_retriever
print("Vector store created")
def tiktoken_len(text):
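    """Return the number of gpt-4o tokens in text (used as the splitter's length function)."""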
tokens = tiktoken.encoding_for_model("gpt-4o").encode(
text,
)
return len(tokens)
def create_chunked_documents(app_state, model_run_state):
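    """Dispatch to the chunking strategy selected on model_run_state and store
    the resulting Document objects on model_run_state.combined_document_objects."""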
dprint(app_state, model_run_state.chunking_strategy)
if model_run_state.chunking_strategy == CHUNKING_STRATEGY_TABLE_AWARE:
combined_document_objects = chunk_with_table_aware(app_state, model_run_state)
elif model_run_state.chunking_strategy == CHUNKING_STRATEGY_SECTION_BASED:
combined_document_objects = chunk_with_section_based(app_state, model_run_state)
elif model_run_state.chunking_strategy == CHUNKING_STRATEGY_SEMANTIC:
combined_document_objects = chunk_with_semantic_splitter(app_state, model_run_state)
else:
combined_document_objects = chunk_with_recursive_splitter(app_state, model_run_state)
model_run_state.combined_document_objects = combined_document_objects
dprint(app_state, "Chunking completed successfully")
def chunk_with_recursive_splitter(app_state, model_run_state):
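    """Split each document into token-counted chunks with RecursiveCharacterTextSplitter."""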
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=model_run_state.chunk_size,
chunk_overlap=model_run_state.chunk_overlap,
        length_function=tiktoken_len,
)
combined_document_objects = []
dprint(app_state, "Chunking documents and creating document objects")
for document in app_state.documents:
dprint(app_state, f"processing documend: {document['title']}")
text = document["single_text_document"]
dprint(app_state, text)
title = document["title"]
# document_id = document["document_id"]
chunks_document = text_splitter.split_text(text)
dprint(app_state, len(chunks_document))
for chunk_number, chunk in enumerate(chunks_document, start=1):
document_objects = Document(
page_content=chunk,
metadata={
"source": title,
"document_id": document.get("document_id", "default_id"),
"chunk_number": chunk_number # Add unique chunk number
}
)
combined_document_objects.append(document_objects)
return combined_document_objects
def chunk_with_table_aware(app_state, model_run_state):
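    """For PDF documents, extract each table (via pdfplumber) as its own chunk,
    then recursively chunk the document text."""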
combined_document_objects = []
dprint(app_state, "Using Table-Aware Chunking for documents.")
for document in app_state.documents:
title = document["title"]
text = document["single_text_document"]
        # If the document is a PDF, extract each table as its own chunk
if document.get("is_pdf", False):
with pdfplumber.open(document["file_path"]) as pdf:
for page in pdf.pages:
tables = page.extract_tables()
for table in tables:
table_content = "\n".join([str(row) for row in table])
document_objects = Document(
page_content=table_content,
metadata={
"source": title,
"document_id": document.get("document_id", "default_id"),
"chunk_number": "table"
}
)
combined_document_objects.append(document_objects)
        # Chunk the full document text (the tables above are emitted as extra chunks)
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=model_run_state.chunk_size,
            chunk_overlap=model_run_state.chunk_overlap,
        )
chunks_document = text_splitter.split_text(text)
for chunk_number, chunk in enumerate(chunks_document, start=1):
document_objects = Document(
page_content=chunk,
metadata={
"source": title,
"document_id": document.get("document_id", "default_id"),
"chunk_number": chunk_number
}
)
combined_document_objects.append(document_objects)
return combined_document_objects
def chunk_with_section_based(app_state, model_run_state):
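    """Split each document into sections on heading-like lines, then
    recursively chunk each section, recording the section number."""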
combined_document_objects = []
dprint(app_state, "Using Section-Based Chunking for documents.")
for document in app_state.documents:
text = document["single_text_document"]
title = document["title"]
# Split the text by headings
sections = re.split(r"\n[A-Z].+?\n", text)
# Chunk each section
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=model_run_state.chunk_size,
            chunk_overlap=model_run_state.chunk_overlap,
        )
for section_number, section in enumerate(sections, start=1):
chunks_document = text_splitter.split_text(section)
for chunk_number, chunk in enumerate(chunks_document, start=1):
document_objects = Document(
page_content=chunk,
metadata={
"source": title,
"document_id": document.get("document_id", "default_id"),
"section_number": section_number,
"chunk_number": chunk_number
}
)
combined_document_objects.append(document_objects)
return combined_document_objects
def chunk_with_semantic_splitter(app_state, model_run_state):
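    """Group consecutive sentences into chunks, starting a new chunk when the
    embedding similarity between neighbours drops below 0.7 or the chunk
    reaches chunk_size sentences."""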
# Load pre-trained model for embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')
combined_document_objects = []
dprint(app_state, "Using Semantic-Based Chunking for documents.")
for document in app_state.documents:
text = document["single_text_document"]
title = document["title"]
# Split text into sentences or paragraphs
        sentences = text.split(". ")  # Naive sentence split; a sentence tokenizer could refine this
sentence_embeddings = model.encode(sentences)
# Group sentences into chunks based on semantic similarity
chunks = []
current_chunk = []
for i in range(len(sentences) - 1):
current_chunk.append(sentences[i])
# Calculate similarity between consecutive sentences
sim = cosine_similarity([sentence_embeddings[i]], [sentence_embeddings[i + 1]])[0][0]
            if sim < 0.7 or len(current_chunk) >= model_run_state.chunk_size:
                # Start a new chunk when similarity drops below the threshold
                # or the chunk already holds chunk_size sentences
chunks.append(" ".join(current_chunk))
current_chunk = []
        # The loop above stops one sentence short; include the final sentence before flushing
        if sentences:
            current_chunk.append(sentences[-1])
        # Add the final chunk
        if current_chunk:
            chunks.append(" ".join(current_chunk))
# Create document objects for the chunks
for chunk_number, chunk in enumerate(chunks, start=1):
document_objects = Document(
page_content=chunk,
metadata={
"source": title,
"document_id": document.get("document_id", "default_id"),
"chunk_number": chunk_number
}
)
combined_document_objects.append(document_objects)
return combined_document_objects
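# Example usage (a minimal sketch -- the real app_state / model_run_state objects are
# defined elsewhere in this project, so the attributes below are assumptions based only
# on what this module reads from them):
#
#   from types import SimpleNamespace
#   from langchain_openai.embeddings import OpenAIEmbeddings
#
#   app_state = SimpleNamespace(documents=[{
#       "title": "example",
#       "document_id": "doc-1",
#       "single_text_document": "Some text to index ...",
#   }])
#   model_run_state = SimpleNamespace(
#       chunking_strategy=None,  # anything unrecognized falls back to the recursive splitter
#       chunk_size=500,
#       chunk_overlap=50,
#       embedding_model=OpenAIEmbeddings(model="text-embedding-3-small"),
#   )
#   create_vector_store(app_state, model_run_state)
#   results = model_run_state.retriever.invoke("What is this document about?")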