"""
This is main logic file for the project responsible for the following:
1. Read the loaded file using langchains
2. Split the loaded data into chunks
3. Ingest the data in vector form
4. Conversational Retrieval logic on loaded data create conversational response
5. Return the response to the user (Output)
"""
# Importing the required libraries
import os
import sys
sys.path.append('../..')  # To import the langchain package from the parent directory

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import DocArrayInMemorySearch
from langchain.document_loaders import PyPDFLoader
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
# Function to load the data from the PDF file
def load_data(file_path):
    loader = PyPDFLoader(file_path)
    pages = loader.load()
    return pages
# Function to split the loaded data into overlapping chunks
def split_data(data):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=150,
    )
    chunks = splitter.split_documents(data)
    return chunks
# Function to ingest the chunks in vector form into an in-memory vector store
def ingest_data(chunks, embeddings):
    vector_store = DocArrayInMemorySearch.from_documents(chunks, embeddings)
    return vector_store
# Function to create the conversational response chain
def create_conversational_response(vector_store, chain_type, k):
    # Creating the retriever; this could also be a contextual compression retriever
    retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": k})  # search_type can be "similarity" or "mmr"

    # Creating memory for the chat history
    memory = ConversationBufferMemory(
        memory_key="chat_history",
        input_key="question",
        output_key="answer",
        return_messages=True)

    # Creating the LLM
    llm_name = "gpt-3.5-turbo"
    llm = ChatOpenAI(model=llm_name, temperature=0.5)

    # Creating the prompt template
    template = """
Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know; don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible.
{context}
Question: {question}
Helpful Answer:"""
    PROMPT = PromptTemplate(input_variables=["context", "question"], template=template)

    # Creating the conversational retrieval chain
    chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        chain_type=chain_type,  # chain_type can be "stuff", "refine", or "map_reduce"
        retriever=retriever,
        memory=memory,
        return_source_documents=True,  # The output then contains both the answer and the source documents, so the memory must specify its input and output keys (as above)
        combine_docs_chain_kwargs={"prompt": PROMPT}
    )
    return chain
# ConversationalResponse class that calls all of the defined functions in a single place
class ConversationalResponse:
    def __init__(self, file, api_key):
        self.file = file
        embeddings = OpenAIEmbeddings(openai_api_key=api_key)
        self.data = load_data(self.file)
        self.chunks = split_data(self.data)
        self.vector_store = ingest_data(self.chunks, embeddings)
        self.chain_type = "stuff"
        self.k = 5
        self.chain = create_conversational_response(self.vector_store, self.chain_type, self.k)

    def __call__(self, question, callbacks=None):
        # The chain only needs the "question" input; the chat history is supplied by the memory
        response = self.chain(question, callbacks=callbacks)
        return response['answer']
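
# Example usage: a minimal sketch, assuming the module is run directly. The PDF path and the
# OPENAI_API_KEY environment variable below are placeholder assumptions, not part of the module above.
if __name__ == "__main__":
    api_key = os.environ.get("OPENAI_API_KEY", "")  # assumed to be set in the environment
    chat = ConversationalResponse("example.pdf", api_key)  # hypothetical PDF path
    print(chat("What is this document about?"))
    print(chat("Summarise the previous answer in one sentence."))  # memory carries the chat history forward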