""" | |
This is main logic file for the project responsible for the following: | |
1. Read the loaded file using langchains | |
2. Split the loaded data into chunks | |
3. Ingest the data in vector form | |
4. Conversational Retrieval logic on loaded data create conversational response | |
5. Return the response to the user (Output) | |
""" | |
# Importing the required libraries
import datetime
import os
import sys

sys.path.append('../..')  # To import the langchain package from the parent directory
from dotenv import load_dotenv, find_dotenv
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import DocArrayInMemorySearch

_ = load_dotenv(find_dotenv())
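# Note: load_dotenv() copies settings such as OPENAI_API_KEY from a local .env
# file into os.environ; langchain's OpenAI wrappers read the key from there
# unless one is passed explicitly, as the ConversationalResponse class below does.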
# Function to load the data from the file
def load_data(file_path):
    loader = PyPDFLoader(file_path)
    pages = loader.load()  # One Document per PDF page
    return pages
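# Usage sketch (hypothetical file name): each page of the PDF becomes its own
# Document, with the source path and zero-based page number in its metadata:
#   pages = load_data("sample.pdf")
#   print(len(pages), pages[0].metadata)  # e.g. 12 {'source': 'sample.pdf', 'page': 0}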
# Function to split the data into chunks
def split_data(data):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=150,
    )
    chunks = splitter.split_documents(data)
    return chunks
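# Usage sketch: with chunk_size=1000 and chunk_overlap=150, consecutive chunks
# share roughly 150 characters, so text straddling a chunk boundary still
# appears intact in at least one chunk:
#   chunks = split_data(pages)
#   print(len(chunks), max(len(c.page_content) for c in chunks))  # count, <= 1000 chars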
# Function to ingest the data in vector form into an in-memory store.
# The embeddings object is passed in (it is created with the user's API key in
# ConversationalResponse below) rather than built at module level.
def ingest_data(chunks, embeddings):
    vector_store = DocArrayInMemorySearch.from_documents(chunks, embeddings)
    return vector_store
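# Usage sketch (assumes OPENAI_API_KEY is available): ingestion embeds every
# chunk once; similarity_search then embeds the query and returns the k most
# similar chunks:
#   store = ingest_data(chunks, OpenAIEmbeddings())
#   hits = store.similarity_search("What is the refund policy?", k=3)  # hypothetical query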
# Function to create the conversational response
def create_conversational_response(vector_store, chain_type, k):
    # Creating the retriever; this could also be a contextual compression retriever
    retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": k})  # search_type can be "similarity" or "mmr"
    # Creating memory; explicit input/output keys are required because the chain
    # returns more than one output (answer plus source documents)
    memory = ConversationBufferMemory(
        memory_key="chat_history",
        input_key="question",
        output_key="answer",
        return_messages=True)
    # Creating the LLM; use the dated snapshot until its deprecation date
    current_date = datetime.datetime.now().date()
    if current_date < datetime.date(2023, 9, 2):
        llm_name = "gpt-3.5-turbo-0301"
    else:
        llm_name = "gpt-3.5-turbo"
    llm = ChatOpenAI(model=llm_name, temperature=0)
    # Creating the prompt template
    template = """
    {chat_history}
    {context}
    Question: {question}
    Helpful Answer:"""
    PROMPT = PromptTemplate(input_variables=["chat_history", "context", "question"], template=template)
    # Creating the conversational retrieval chain
    chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        chain_type=chain_type,  # chain_type can be "stuff", "refine", or "map_reduce"
        retriever=retriever,
        memory=memory,
        return_source_documents=True,  # Output then contains "answer" and "source_documents",
                                       # which is why memory above specifies input and output keys
        combine_docs_chain_kwargs={"prompt": PROMPT},
    )
    return chain
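# Usage sketch for the chain on its own (hypothetical question). Because memory
# supplies "chat_history", "question" is the only key a caller passes; the
# result carries both the answer and the retrieved source documents:
#   chain = create_conversational_response(vector_store, "stuff", k=5)
#   result = chain({"question": "Summarise the introduction"})
#   print(result["answer"], len(result["source_documents"]))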
# ConversationalResponse class wiring all the functions above into a single call
class ConversationalResponse:
    def __init__(self, file, api_key):
        self.file = file
        embeddings = OpenAIEmbeddings(openai_api_key=api_key)
        self.data = load_data(self.file)
        self.chunks = split_data(self.data)
        self.vector_store = ingest_data(self.chunks, embeddings)
        self.chain_type = "stuff"
        self.k = 5
        self.chain = create_conversational_response(self.vector_store, self.chain_type, self.k)

    def __call__(self, question, callbacks=None):
        # "question" is the only input the caller supplies; memory fills in "chat_history"
        response = self.chain({"question": question}, callbacks=callbacks)
        return response['answer']
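
# Minimal end-to-end sketch, assuming a .env with OPENAI_API_KEY and a local
# "sample.pdf" (both hypothetical); guarded so importing this module has no
# side effects.
if __name__ == "__main__":
    api_key = os.environ.get("OPENAI_API_KEY", "")
    qa = ConversationalResponse("sample.pdf", api_key)
    print(qa("What is this document about?"))
    print(qa("Can you expand on the first point?"))  # memory carries the previous turn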