"""
This is main logic file for the project responsible for the following:
1. Read the loaded file using langchains
2. Split the loaded data into chunks
3. Ingest the data in vector form
4. Conversational Retrieval logic on loaded data create conversational response
5. Return the response to the user (Output) 
"""

# Importing the required libraries
import datetime
import sys

sys.path.append('../..')  # Make the langchain package importable from the parent directory

from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import DocArrayInMemorySearch

# Load environment variables (e.g. the OpenAI API key) from a .env file
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

# Load the PDF at file_path and return its pages as LangChain documents
def load_data(file_path):
    loader = PyPDFLoader(file_path)
    pages = loader.load()
    return pages

# Split the documents into overlapping chunks (1,000 characters, 150-character overlap)
def split_data(data):
    splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=150,
    )
    chunks = splitter.split_documents(data)
    return chunks


# Embed the chunks and store them in an in-memory vector store
def ingest_data(chunks, embeddings):
    vector_store = DocArrayInMemorySearch.from_documents(chunks, embeddings)
    return vector_store

# Build the conversational retrieval chain over the ingested data
def create_conversational_response(vector_store, chain_type, k):

    # Creating the retriever; this could also be a contextual compression retriever
    retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": k})  # search_type can be "similarity" or "mmr"

    # Conversation memory; explicit input/output keys are needed because the chain returns multiple outputs
    memory = ConversationBufferMemory(
        memory_key="chat_history",
        input_key="question",
        output_key="answer",
        return_messages=True)

    # Creating the LLM: pin the gpt-3.5-turbo-0301 snapshot before 2023-09-02,
    # afterwards fall back to the current gpt-3.5-turbo alias
    current_date = datetime.datetime.now().date()
    if current_date < datetime.date(2023, 9, 2):
        llm_name = "gpt-3.5-turbo-0301"
    else:
        llm_name = "gpt-3.5-turbo"

    llm = ChatOpenAI(model=llm_name, temperature=0)

    # Prompt template for the answer-generation (combine-documents) step
    template = """
    {chat_history}
    {context}
    Question: {question}
    Helpful Answer:"""

    PROMPT = PromptTemplate(
        input_variables=["chat_history", "context", "question"],
        template=template,
    )

    # Creating the conversational retrieval chain
    chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        chain_type=chain_type,  # chain_type can be "stuff", "refine", or "map_reduce"
        retriever=retriever,
        memory=memory,
        # With return_source_documents=True the output contains both "answer" and
        # "source_documents", which is why the memory's input and output keys
        # are specified explicitly above
        return_source_documents=True,
        combine_docs_chain_kwargs={"prompt": PROMPT}
    )
    return chain

# ConversationalResponse class: runs all the defined steps (load, split, ingest, chain) in a single call
class ConversationalResponse:
    def __init__(self, file, api_key):
        self.file = file
        embeddings = OpenAIEmbeddings(openai_api_key=api_key)
        self.data = load_data(self.file)
        self.chunks = split_data(self.data)
        self.vector_store = ingest_data(self.chunks, embeddings)
        self.chain_type = "stuff"
        self.k = 5
        self.chain = create_conversational_response(self.vector_store, self.chain_type, self.k)

    def __call__(self, question, callbacks=None):
        response = self.chain({"question": question}, callbacks=callbacks)
        return response['answer']
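
# A minimal usage sketch, assuming a local "example.pdf" (a hypothetical file
# name) and an OpenAI API key available via the .env file loaded above or the
# OPENAI_API_KEY environment variable. Illustrative only; adjust to your setup.
if __name__ == "__main__":
    import os

    qa = ConversationalResponse("example.pdf", api_key=os.environ.get("OPENAI_API_KEY"))
    print(qa("What is this document about?"))
    # Follow-up questions reuse the chat history kept in the chain's memory
    print(qa("Summarise that answer in one sentence."))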