# Tenant-Rights-Bot / vector_db.py
import logging
import os
import re

import chromadb
import fitz  # PyMuPDF
import numpy as np
import torch
from chromadb.utils import embedding_functions

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


class VectorDatabase:
    """Vector database for storing and retrieving tenant rights information from a PDF."""

    def __init__(self, persist_directory="./data/chroma_db"):
        """Initialize the vector database."""
        logging.info("Initializing VectorDatabase")
        logging.info(f"NumPy version: {np.__version__}")
        logging.info(f"PyTorch version: {torch.__version__}")
        self.persist_directory = persist_directory
        os.makedirs(persist_directory, exist_ok=True)
        try:
            logging.info("Creating embedding function")
            self.embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
                model_name="all-MiniLM-L6-v2"
            )
            logging.info("Initializing ChromaDB client")
            self.client = chromadb.PersistentClient(path=persist_directory)
            logging.info("Setting up collections")
            self.document_collection = self._get_or_create_collection("tenant_documents")
            self.state_collection = self._get_or_create_collection("tenant_states")
        except Exception as e:
            logging.error(f"Initialization failed: {str(e)}")
            raise
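
    # Two collections are maintained: "tenant_documents" holds overlapping text
    # chunks for fine-grained retrieval, while "tenant_states" holds one short
    # summary per state for coarse state-level matching.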

    def _get_or_create_collection(self, name):
        """Get or create a collection with the given name."""
        try:
            return self.client.get_collection(
                name=name,
                embedding_function=self.embedding_function
            )
        except Exception:
            return self.client.create_collection(
                name=name,
                embedding_function=self.embedding_function
            )
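
    # Note: newer chromadb clients also expose get_or_create_collection(), which
    # would collapse the try/except above into a single call; the fallback here
    # keeps compatibility with older client versions.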

    def extract_pdf_content(self, pdf_path):
        """Extract content from the PDF file and identify state sections."""
        logging.info(f"Extracting content from PDF: {pdf_path}")
        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")
        doc = fitz.open(pdf_path)
        full_text = ""
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            full_text += page.get_text("text") + "\n"
        doc.close()
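
        # Section headings are assumed to look like "California Landlord-Tenant Law"
        # or "New York Landlord Tenant Laws": a capitalized, possibly multi-word
        # state name at the start of a line, followed by "Landlord-Tenant Law(s)".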
        state_pattern = r"(?m)^\s*([A-Z][a-z]+(?:\s[A-Z][a-z]+)*)\s+Landlord(?:-|\s)Tenant\s+(?:Law|Laws)"
        state_matches = list(re.finditer(state_pattern, full_text))
        if not state_matches:
            logging.info("No state sections found. Treating as single document.")
            return {"Full Document": full_text.strip()}
        state_sections = {}
        for i, match in enumerate(state_matches):
            state_name = match.group(1).strip()
            start_pos = match.end()
            end_pos = state_matches[i + 1].start() if i + 1 < len(state_matches) else len(full_text)
            state_text = full_text[start_pos:end_pos].strip()
            if state_text:
                state_sections[state_name] = state_text
        logging.info(f"Extracted content for {len(state_sections)} states")
        return state_sections

    def process_and_load_pdf(self, pdf_path):
        """Process the PDF and load its content into the vector database."""
        state_sections = self.extract_pdf_content(pdf_path)
        # Clear any previously loaded content so reloads don't duplicate entries.
        existing_doc_ids = self.document_collection.get()["ids"]
        existing_state_ids = self.state_collection.get()["ids"]
        if existing_doc_ids:
            self.document_collection.delete(ids=existing_doc_ids)
        if existing_state_ids:
            self.state_collection.delete(ids=existing_state_ids)
        document_ids, document_texts, document_metadatas = [], [], []
        state_ids, state_texts, state_metadatas = [], [], []
        for state, text in state_sections.items():
            state_id = f"state_{state.lower().replace(' ', '_')}"
            summary = text[:1000].strip() if len(text) > 1000 else text
            state_ids.append(state_id)
            state_texts.append(summary)
            state_metadatas.append({"state": state, "type": "summary"})
            chunks = self._chunk_text(text, chunk_size=1000, overlap=200)
            for i, chunk in enumerate(chunks):
                doc_id = f"doc_{state.lower().replace(' ', '_')}_{i}"
                document_ids.append(doc_id)
                document_texts.append(chunk)
                document_metadatas.append({
                    "state": state,
                    "chunk_id": i,
                    "total_chunks": len(chunks),
                    "source": os.path.basename(pdf_path)
                })
        if document_ids:
            self.document_collection.add(
                ids=document_ids,
                documents=document_texts,
                metadatas=document_metadatas
            )
        if state_ids:
            self.state_collection.add(
                ids=state_ids,
                documents=state_texts,
                metadatas=state_metadatas
            )
        logging.info(f"Loaded {len(document_ids)} document chunks and {len(state_ids)} state summaries")
        return len(state_sections)
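
    # Chunks overlap so text near a boundary appears intact in at least one
    # chunk; 1000 characters with a 200-character overlap are heuristic defaults.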
    def _chunk_text(self, text, chunk_size=1000, overlap=200):
        """Split text into overlapping chunks."""
        if not text:
            return []
        chunks = []
        start = 0
        text_length = len(text)
        while start < text_length:
            end = min(start + chunk_size, text_length)
            if end < text_length:
                # Prefer to break at the last sentence or line boundary in the window.
                last_period = text.rfind(".", start, end)
                last_newline = text.rfind("\n", start, end)
                split_point = max(last_period, last_newline)
                if split_point > start:
                    end = split_point + 1
            chunks.append(text[start:end].strip())
            start = end - overlap if end - overlap > start else end
        return chunks

    def query(self, query_text, state=None, n_results=5):
        """Query the vector database for relevant tenant rights information."""
        state_filter = {"state": state} if state else None
        document_results = self.document_collection.query(
            query_texts=[query_text],
            n_results=n_results,
            where=state_filter
        )
        state_results = self.state_collection.query(
            query_texts=[query_text],
            n_results=n_results,
            where=state_filter
        )
        return {"document_results": document_results, "state_results": state_results}

    def get_states(self):
        """Get a sorted list of all states in the database."""
        results = self.state_collection.get()
        states = {meta["state"] for meta in results["metadatas"] if meta}
        return sorted(states)


if __name__ == "__main__":
    try:
        db = VectorDatabase()
        pdf_path = "data/tenant-landlord.pdf"
        db.process_and_load_pdf(pdf_path)
        states = db.get_states()
        print(f"Available states: {states}")
    except Exception as e:
        logging.error(f"Script execution failed: {str(e)}")
        raise