""" | |
This is main logic file for the project responsible for the following: | |
1. Read the loaded file using langchains | |
2. Split the loaded data into chunks | |
3. Ingest the data in vector form | |
4. Conversational Retrieval logic on loaded data create conversational response | |
5. Return the response to the user (Output) | |
""" | |
# Importing the required libraries
import datetime
import os
import sys

sys.path.append('../..')  # To import the langchain package from the parent directory
from dotenv import load_dotenv, find_dotenv
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import DocArrayInMemorySearch

_ = load_dotenv(find_dotenv())
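# Note: load_dotenv() copies settings such as OPENAI_API_KEY from a local .env
# file into os.environ; langchain's OpenAI wrappers read the key from there
# unless one is passed explicitly, as the ConversationalResponse class below does.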
# Function to load the data from the file
def load_data(file_path):
    loader = PyPDFLoader(file_path)
    pages = loader.load()  # One Document per PDF page
    return pages
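# Usage sketch (hypothetical file name): each page of the PDF becomes its own
# Document, with the source path and zero-based page number in its metadata:
#   pages = load_data("sample.pdf")
#   print(len(pages), pages[0].metadata)  # e.g. 12 {'source': 'sample.pdf', 'page': 0}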
# Function to split the data into chunks
def split_data(data):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=150,
    )
    chunks = splitter.split_documents(data)
    return chunks
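# Usage sketch: with chunk_size=1000 and chunk_overlap=150, consecutive chunks
# share roughly 150 characters, so text straddling a chunk boundary still
# appears intact in at least one chunk:
#   chunks = split_data(pages)
#   print(len(chunks), max(len(c.page_content) for c in chunks))  # count, <= 1000 chars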
# Function to ingest the data in vector form into an in-memory store.
# The embeddings object is passed in (it is created with the user's API key in
# ConversationalResponse below) rather than built at module level.
def ingest_data(chunks, embeddings):
    vector_store = DocArrayInMemorySearch.from_documents(chunks, embeddings)
    return vector_store
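# Usage sketch (assumes OPENAI_API_KEY is available): ingestion embeds every
# chunk once; similarity_search then embeds the query and returns the k most
# similar chunks:
#   store = ingest_data(chunks, OpenAIEmbeddings())
#   hits = store.similarity_search("What is the refund policy?", k=3)  # hypothetical query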
# Function to create the conversational response
def create_conversational_response(vector_store, chain_type, k):
    # Creating the retriever; this could also be a contextual compression retriever
    retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": k})  # search_type can be "similarity" or "mmr"
    # Creating memory; explicit input/output keys are required because the chain
    # returns more than one output (answer plus source documents)
    memory = ConversationBufferMemory(
        memory_key="chat_history",
        input_key="question",
        output_key="answer",
        return_messages=True)
    # Creating the LLM; use the dated snapshot until its deprecation date
    current_date = datetime.datetime.now().date()
    if current_date < datetime.date(2023, 9, 2):
        llm_name = "gpt-3.5-turbo-0301"
    else:
        llm_name = "gpt-3.5-turbo"
    llm = ChatOpenAI(model=llm_name, temperature=0)
    # Creating the prompt template
    template = """
    {chat_history}
    {context}
    Question: {question}
    Helpful Answer:"""
    PROMPT = PromptTemplate(input_variables=["chat_history", "context", "question"], template=template)
    # Creating the conversational retrieval chain
    chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        chain_type=chain_type,  # chain_type can be "stuff", "refine", or "map_reduce"
        retriever=retriever,
        memory=memory,
        return_source_documents=True,  # Output then contains "answer" and "source_documents",
                                       # which is why memory above specifies input and output keys
        combine_docs_chain_kwargs={"prompt": PROMPT},
    )
    return chain
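# Usage sketch for the chain on its own (hypothetical question). Because memory
# supplies "chat_history", "question" is the only key a caller passes; the
# result carries both the answer and the retrieved source documents:
#   chain = create_conversational_response(vector_store, "stuff", k=5)
#   result = chain({"question": "Summarise the introduction"})
#   print(result["answer"], len(result["source_documents"]))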
# ConversationalResponse class wiring all the functions above into a single call
class ConversationalResponse:
    def __init__(self, file, api_key):
        self.file = file
        embeddings = OpenAIEmbeddings(openai_api_key=api_key)
        self.data = load_data(self.file)
        self.chunks = split_data(self.data)
        self.vector_store = ingest_data(self.chunks, embeddings)
        self.chain_type = "stuff"
        self.k = 5
        self.chain = create_conversational_response(self.vector_store, self.chain_type, self.k)

    def __call__(self, question, callbacks=None):
        # "question" is the only input the caller supplies; memory fills in "chat_history"
        response = self.chain({"question": question}, callbacks=callbacks)
        return response['answer']
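
# Minimal end-to-end sketch, assuming a .env with OPENAI_API_KEY and a local
# "sample.pdf" (both hypothetical); guarded so importing this module has no
# side effects.
if __name__ == "__main__":
    api_key = os.environ.get("OPENAI_API_KEY", "")
    qa = ConversationalResponse("sample.pdf", api_key)
    print(qa("What is this document about?"))
    print(qa("Can you expand on the first point?"))  # memory carries the previous turn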