"""
This is main logic file for the project responsible for the following:
1. Read the loaded file using langchains
2. Split the loaded data into chunks
3. Ingest the data in vector form
4. Conversational Retrieval logic on loaded data create conversational response
5. Return the response to the user (Output)
"""
# Importing the required libraries
import os
import sys
sys.path.append('../..')  # To import the langchain package from the parent directory

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import DocArrayInMemorySearch
from langchain.document_loaders import PyPDFLoader
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
# Function to load the data from the PDF file
def load_data(file_path):
    loader = PyPDFLoader(file_path)
    pages = loader.load()
    return pages
# Function to split the loaded data into overlapping chunks
def split_data(data):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=150,
    )
    chunks = splitter.split_documents(data)
    return chunks
# Function to ingest the chunks in vector form into an in-memory vector store
def ingest_data(chunks, embeddings):
    vector_store = DocArrayInMemorySearch.from_documents(chunks, embeddings)
    return vector_store
# Function to create the conversational response chain
def create_conversational_response(vector_store, chain_type, k):
    # Creating the retriever; this could also be a contextual compression retriever
    retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": k})  # search_type can be "similarity" or "mmr"

    # Creating memory for the chat history
    memory = ConversationBufferMemory(
        memory_key="chat_history",
        input_key="question",
        output_key="answer",
        return_messages=True)

    # Creating the LLM
    llm_name = "gpt-3.5-turbo"
    llm = ChatOpenAI(model=llm_name, temperature=0.5)

    # Creating the prompt template
    template = """
Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know; don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible.
{context}
Question: {question}
Helpful Answer:"""
    PROMPT = PromptTemplate(input_variables=["context", "question"], template=template)

    # Creating the conversational retrieval chain
    chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        chain_type=chain_type,  # chain_type can be "stuff", "refine", or "map_reduce"
        retriever=retriever,
        memory=memory,
        return_source_documents=True,  # The output then contains both the answer and the source documents, so the memory must specify its input and output keys (as above)
        combine_docs_chain_kwargs={"prompt": PROMPT}
    )
    return chain
# ConversationalResponse class that calls all of the defined functions in a single place
class ConversationalResponse:
    def __init__(self, file, api_key):
        self.file = file
        embeddings = OpenAIEmbeddings(openai_api_key=api_key)
        self.data = load_data(self.file)
        self.chunks = split_data(self.data)
        self.vector_store = ingest_data(self.chunks, embeddings)
        self.chain_type = "stuff"
        self.k = 5
        self.chain = create_conversational_response(self.vector_store, self.chain_type, self.k)

    def __call__(self, question, callbacks=None):
        # The chain only needs the "question" input; the chat history is supplied by the memory
        response = self.chain(question, callbacks=callbacks)
        return response['answer']
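
# Example usage: a minimal sketch, assuming the module is run directly. The PDF path and the
# OPENAI_API_KEY environment variable below are placeholder assumptions, not part of the module above.
if __name__ == "__main__":
    api_key = os.environ.get("OPENAI_API_KEY", "")  # assumed to be set in the environment
    chat = ConversationalResponse("example.pdf", api_key)  # hypothetical PDF path
    print(chat("What is this document about?"))
    print(chat("Summarise the previous answer in one sentence."))  # memory carries the chat history forward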