# LegalAlly / src / index.py
# Author: Rohil Bansal
# Chatbot working. (commit 4adc02d)
#%%
import sys
import os
from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import AzureOpenAIEmbeddings, AzureChatOpenAI
import time
from tenacity import retry, stop_after_attempt, wait_exponential
from tqdm import tqdm # Add this import for progress bar
# Load environment variables from a local .env file, then wire up the
# LangSmith tracing settings and the Azure OpenAI credentials this
# module needs. Exits the process on any failure, since nothing below
# can work without them.
load_dotenv()
try:
    tavily_api_key = os.getenv("TAVILY_API_KEY")
    langchain_api_key = os.getenv("LANGCHAIN_API_KEY")
    if langchain_api_key is None:
        # Fail with an explicit message instead of the opaque TypeError
        # that `os.environ[...] = None` would otherwise raise.
        raise ValueError("LANGCHAIN_API_KEY is not set")
    os.environ["LANGCHAIN_TRACING_V2"] = "true"
    os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
    os.environ["LANGCHAIN_API_KEY"] = langchain_api_key
    os.environ["LANGCHAIN_PROJECT"] = "legalairag"
    # Azure OpenAI connection settings, consumed by the client setup below.
    azure_endpoint = os.getenv("API_BASE")
    api_key = os.getenv("API_KEY")
    api_version = os.getenv("API_VERSION")
    print("Environment variables loaded successfully.")
except Exception as e:
    print(f"Error loading environment variables: {e}")
    sys.exit(1)
# Instantiate the Azure OpenAI clients: one embeddings client (used to
# build the vector store) and one chat model. Exit on failure — nothing
# downstream can run without them.
try:
    # Both clients authenticate against the same Azure OpenAI resource.
    _azure_auth = {
        "api_key": api_key,
        "api_version": api_version,
        "azure_endpoint": azure_endpoint,
    }
    embd = AzureOpenAIEmbeddings(**_azure_auth)
    llm = AzureChatOpenAI(
        azure_deployment="gpt-4o",
        temperature=0.3,
        **_azure_auth,
    )
    print("Azure OpenAI embeddings and model set up successfully.")
except Exception as e:
    print(f"Error setting up Azure OpenAI: {e}")
    sys.exit(1)
def vector_store_exists(persist_directory):
    """Return True if *persist_directory* is a directory with at least one entry.

    Used below to decide whether a persisted Chroma store can be
    reloaded instead of re-embedding the source documents.
    """
    # isdir (rather than exists) avoids an OSError from os.listdir when
    # the path exists but is a regular file rather than a directory.
    return os.path.isdir(persist_directory) and len(os.listdir(persist_directory)) > 0
# Load the source PDF and split it into overlapping, token-sized chunks
# ready for embedding. Exits on failure since the index cannot be built
# without the source documents.
try:
    print("Loading PDF document...")
    pages = PyPDFLoader("assets/data/IPC_and_Constitution.pdf").load()
    print("PDF loaded successfully.")

    print("Splitting documents...")
    # Token-based splitting keeps chunks within the embedding model's limits.
    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=500, chunk_overlap=100
    )
    doc_splits = splitter.split_documents(pages)
    print(f"Documents split into {len(doc_splits)} chunks.")
except Exception as e:
    print(f"Error processing documents: {e}")
    sys.exit(1)
def create_vector_store_batch(persist_directory, documents, embedding, batch_size=50):
    """Embed *documents* into a persistent Chroma store, *batch_size* docs at a time.

    The first batch creates the collection; subsequent batches are appended.
    Returns the populated ``Chroma`` vector store (``None`` if *documents*
    is empty).

    Retries are applied per batch rather than around the whole run: the
    original whole-function retry would, on a mid-run transient failure,
    restart from batch 0 and re-embed documents already written to the
    persisted store, duplicating them.
    """
    @retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=4, max=10))
    def _embed_batch(store, batch):
        # One embedding-API round trip; tenacity retries only this batch.
        if store is None:
            return Chroma.from_documents(
                documents=batch,
                collection_name="rag-chroma",
                embedding=embedding,
                persist_directory=persist_directory,
            )
        store.add_documents(batch)
        return store

    vectorstore = None
    for start in tqdm(range(0, len(documents), batch_size), desc="Processing batches"):
        vectorstore = _embed_batch(vectorstore, documents[start:start + batch_size])
        time.sleep(1)  # throttle between batches to stay under API rate limits
    return vectorstore
# Build the vector store — reloading the persisted collection when one
# already exists, otherwise embedding everything from scratch — and
# expose a retriever over it for the rest of the app.
try:
    persist_directory = './vectordb'
    if vector_store_exists(persist_directory):
        print("Loading existing vector store...")
        vectorstore = Chroma(
            collection_name="rag-chroma",
            embedding_function=embd,
            persist_directory=persist_directory,
        )
        print("Existing vector store loaded.")
    else:
        print("Creating new vector store...")
        vectorstore = create_vector_store_batch(persist_directory, doc_splits, embd)
        print("New vector store created and populated.")
    retriever = vectorstore.as_retriever()
    print("Retriever set up successfully.")
except Exception as e:
    print(f"Error with vector store operations: {e}")
    sys.exit(1)

print("Index setup completed successfully.")