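"""Document chunking and vector store creation.

Builds LangChain ``Document`` chunks from ``app_state.documents`` using one of
several chunking strategies (recursive, table-aware, section-based, semantic)
and loads them into an in-memory Qdrant vector store exposed as a retriever on
``model_run_state``.
"""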
import re

import numpy as np
import pdfplumber
import tiktoken
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Qdrant
from langchain_openai.embeddings import OpenAIEmbeddings
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

from utilities.constants import (
    CHUNKING_STRATEGY_TABLE_AWARE,
    CHUNKING_STRATEGY_SECTION_BASED,
    CHUNKING_STRATEGY_SEMANTIC,
)
from utilities.debugger import dprint
def create_vector_store(app_state, model_run_state, **kwargs):
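    """Apply any overrides from ``kwargs`` to ``model_run_state``, chunk the
    documents, and build an in-memory Qdrant vector store and retriever."""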
    # Apply any run-time overrides (e.g. chunk_size, chunk_overlap, chunking_strategy)
    for key, value in kwargs.items():
        if hasattr(model_run_state, key):
            setattr(model_run_state, key, value)
        else:
            print(f"Warning: {key} is not an attribute of the state object")

    dprint(app_state, f"Chunk size after update: {model_run_state.chunk_size}")
    create_chunked_documents(app_state, model_run_state)

    # Embed the chunks into an in-memory Qdrant collection and expose it as a retriever
    qdrant_vectorstore = Qdrant.from_documents(
        documents=model_run_state.combined_document_objects,
        embedding=model_run_state.embedding_model,
        location=":memory:",
    )
    model_run_state.retriever = qdrant_vectorstore.as_retriever()
    print("Vector store created")
def tiktoken_len(text):
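    """Return the number of tokens in ``text`` using the gpt-4o encoding.

    Used as the ``length_function`` of the recursive character splitter so that
    chunk sizes are measured in tokens rather than characters."""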
    tokens = tiktoken.encoding_for_model("gpt-4o").encode(text)
    return len(tokens)
def create_chunked_documents(app_state, model_run_state):
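    """Dispatch to the chunking function selected by
    ``model_run_state.chunking_strategy`` and store the resulting document
    objects on ``model_run_state.combined_document_objects``."""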
    dprint(app_state, f"Chunking strategy: {model_run_state.chunking_strategy}")
    if model_run_state.chunking_strategy == CHUNKING_STRATEGY_TABLE_AWARE:
        combined_document_objects = chunk_with_table_aware(app_state, model_run_state)
    elif model_run_state.chunking_strategy == CHUNKING_STRATEGY_SECTION_BASED:
        combined_document_objects = chunk_with_section_based(app_state, model_run_state)
    elif model_run_state.chunking_strategy == CHUNKING_STRATEGY_SEMANTIC:
        combined_document_objects = chunk_with_semantic_splitter(app_state, model_run_state)
    else:
        # Default: plain recursive character splitting
        combined_document_objects = chunk_with_recursive_splitter(app_state, model_run_state)
    model_run_state.combined_document_objects = combined_document_objects
    dprint(app_state, "Chunking completed successfully")
def chunk_with_recursive_splitter(app_state, model_run_state):
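    """Token-aware recursive character chunking: split each document's text into
    overlapping chunks and wrap every chunk in a LangChain ``Document``."""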
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=model_run_state.chunk_size,
        chunk_overlap=model_run_state.chunk_overlap,
        length_function=tiktoken_len,
    )
    combined_document_objects = []
    dprint(app_state, "Chunking documents and creating document objects")
    for document in app_state.documents:
        dprint(app_state, f"Processing document: {document['title']}")
        text = document["single_text_document"]
        dprint(app_state, text)
        title = document["title"]
        chunks_document = text_splitter.split_text(text)
        dprint(app_state, f"Number of chunks: {len(chunks_document)}")
        for chunk_number, chunk in enumerate(chunks_document, start=1):
            document_object = Document(
                page_content=chunk,
                metadata={
                    "source": title,
                    "document_id": document.get("document_id", "default_id"),
                    "chunk_number": chunk_number,  # Unique chunk number within the document
                },
            )
            combined_document_objects.append(document_object)
    return combined_document_objects
def chunk_with_table_aware(app_state, model_run_state):
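    """Table-aware chunking: extract tables from PDF documents as standalone
    chunks (via pdfplumber), then recursively chunk the full document text."""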
    combined_document_objects = []
    dprint(app_state, "Using Table-Aware Chunking for documents.")
    for document in app_state.documents:
        title = document["title"]
        text = document["single_text_document"]

        # If the document is a PDF, extract its tables as dedicated chunks
        if document.get("is_pdf", False):
            with pdfplumber.open(document["file_path"]) as pdf:
                for page in pdf.pages:
                    for table in page.extract_tables():
                        # Join the extracted rows into a single text block
                        table_content = "\n".join(str(row) for row in table)
                        document_object = Document(
                            page_content=table_content,
                            metadata={
                                "source": title,
                                "document_id": document.get("document_id", "default_id"),
                                "chunk_number": "table",
                            },
                        )
                        combined_document_objects.append(document_object)

        # Chunk the full document text as well
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=model_run_state.chunk_size,
            chunk_overlap=model_run_state.chunk_overlap,
        )
        chunks_document = text_splitter.split_text(text)
        for chunk_number, chunk in enumerate(chunks_document, start=1):
            document_object = Document(
                page_content=chunk,
                metadata={
                    "source": title,
                    "document_id": document.get("document_id", "default_id"),
                    "chunk_number": chunk_number,
                },
            )
            combined_document_objects.append(document_object)
    return combined_document_objects
def chunk_with_section_based(app_state, model_run_state):
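    """Section-based chunking: split each document on heading-like lines, then
    recursively chunk each section, recording the section number in metadata."""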
    combined_document_objects = []
    dprint(app_state, "Using Section-Based Chunking for documents.")
    for document in app_state.documents:
        text = document["single_text_document"]
        title = document["title"]

        # Split on heading-like lines (a line starting with a capital letter);
        # note that re.split drops the matched heading lines themselves
        sections = re.split(r"\n[A-Z].+?\n", text)

        # Chunk each section separately
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=model_run_state.chunk_size,
            chunk_overlap=model_run_state.chunk_overlap,
        )
        for section_number, section in enumerate(sections, start=1):
            chunks_document = text_splitter.split_text(section)
            for chunk_number, chunk in enumerate(chunks_document, start=1):
                document_object = Document(
                    page_content=chunk,
                    metadata={
                        "source": title,
                        "document_id": document.get("document_id", "default_id"),
                        "section_number": section_number,
                        "chunk_number": chunk_number,
                    },
                )
                combined_document_objects.append(document_object)
    return combined_document_objects
def chunk_with_semantic_splitter(app_state, model_run_state):
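    """Semantic chunking: embed sentences with a SentenceTransformer and start a
    new chunk whenever consecutive sentences fall below a similarity threshold
    (or the chunk reaches ``chunk_size`` sentences)."""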
    # Pre-trained sentence embedding model used to measure semantic similarity
    model = SentenceTransformer("all-MiniLM-L6-v2")
    combined_document_objects = []
    dprint(app_state, "Using Semantic-Based Chunking for documents.")
    for document in app_state.documents:
        text = document["single_text_document"]
        title = document["title"]

        # Naive sentence split; a proper sentence tokenizer would be more robust
        sentences = text.split(". ")
        sentence_embeddings = model.encode(sentences)

        # Group sentences into chunks based on semantic similarity
        chunks = []
        current_chunk = []
        for i in range(len(sentences) - 1):
            current_chunk.append(sentences[i])
            # Similarity between this sentence and the next one
            sim = cosine_similarity([sentence_embeddings[i]], [sentence_embeddings[i + 1]])[0][0]
            if sim < 0.7 or len(current_chunk) >= model_run_state.chunk_size:
                # Low similarity or size limit reached: close the current chunk
                # (note: chunk_size is counted in sentences here, not tokens)
                chunks.append(" ".join(current_chunk))
                current_chunk = []
        # The loop above never appends the last sentence, so add it here
        if sentences:
            current_chunk.append(sentences[-1])
        if current_chunk:
            chunks.append(" ".join(current_chunk))

        # Create document objects for the chunks
        for chunk_number, chunk in enumerate(chunks, start=1):
            document_object = Document(
                page_content=chunk,
                metadata={
                    "source": title,
                    "document_id": document.get("document_id", "default_id"),
                    "chunk_number": chunk_number,
                },
            )
            combined_document_objects.append(document_object)
    return combined_document_objects
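
# Example usage (hypothetical sketch): `app_state` and `model_run_state` are the
# application's state objects defined elsewhere; `app_state.documents` is assumed
# to hold dicts with "title" and "single_text_document" keys, and
# `model_run_state.embedding_model` is assumed to be set (e.g. OpenAIEmbeddings()).
#
#     create_vector_store(
#         app_state,
#         model_run_state,
#         chunk_size=500,
#         chunk_overlap=50,
#         chunking_strategy=CHUNKING_STRATEGY_SEMANTIC,
#     )
#     docs = model_run_state.retriever.get_relevant_documents("example query")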