rag-tool

Sleeping

rag-tool / app.py

Update app.py

ae1abcc over 1 year ago

1.42 kB

	import gradio as gr
	from langchain.vectorstores import Chroma
	from langchain.document_loaders import PyPDFLoader
	from langchain.embeddings import HuggingFaceInstructEmbeddings

	# Initialize the instruction-tuned embedding model used for both indexing
	# the PDF and embedding incoming queries.
	#
	# HuggingFaceInstructEmbeddings loads its model through the INSTRUCTOR
	# library, so `model_name` has to be an INSTRUCTOR checkpoint. "gpt2" is a
	# plain causal language model, not an INSTRUCTOR checkpoint;
	# "hkunlp/instructor-large" is the wrapper's documented default.
	hf = HuggingFaceInstructEmbeddings(
	    model_name="hkunlp/instructor-large",
	    embed_instruction="Represent the document for retrieval: ",
	    query_instruction="Represent the query for retrieval: ",
	)
	# NOTE: no pad-token patching is done here. The langchain wrapper exposes
	# the loaded model as `client` and has no `tokenizer` attribute, so the
	# previous `hf.tokenizer.add_special_tokens(...)` call failed at import
	# time. That workaround was only there because GPT-2 lacks a pad token.

	# Parse the ReACT paper into langchain documents.
	# NOTE(review): an earlier commented-out variant passed a directory plus a
	# `glob` pattern to PyPDFLoader; only the single-file form is active —
	# confirm whether indexing every PDF in ./new_papers/ is still wanted.
	pdf_path = "./new_papers/ReACT.pdf"
	loader = PyPDFLoader(pdf_path)
	documents = loader.load()

	# Embed the loaded documents with `hf` and keep them in a Chroma collection.
	db = Chroma.from_documents(
	    documents=documents,
	    embedding=hf,
	    collection_name="my-collection",
	)

	class PDFRetrievalTool:
	    """Callable that returns the indexed PDF text most relevant to a query.

	    The instance wraps a retriever built from the module-level ``db`` vector
	    store, so it can be handed to a UI framework as a plain
	    ``str -> str`` function.
	    """

	    def __init__(self):
	        # k=1: ask the vector store for only the single closest match.
	        self.retriever = db.as_retriever(search_kwargs={"k": 1})

	    def __call__(self, query):
	        """Return the text of the documents retrieved for ``query``.

	        Args:
	            query: Free-text question to look up in the indexed PDF.

	        Returns:
	            The ``page_content`` of each retrieved document, separated by
	            blank lines. An empty string is returned when ``query`` is
	            empty/whitespace or when nothing is retrieved.
	        """
	        # The UI can invoke this with an empty text box; skip the lookup
	        # instead of embedding an empty string.
	        if not query or not query.strip():
	            return ""

	        # A vector-store retriever has no ``run`` method and does not return
	        # a dict with a 'result' key (that is the QA-chain API); it returns
	        # a list of Document objects.
	        matches = self.retriever.get_relevant_documents(query)
	        return "\n\n".join(match.page_content for match in matches)

	# Build the web UI: one text box in, one text box out, answered by a
	# PDFRetrievalTool instance.
	retrieval_fn = PDFRetrievalTool()
	tool = gr.Interface(
	    fn=retrieval_fn,
	    inputs=gr.Textbox(),
	    outputs=gr.Textbox(),
	    title="PDF Retrieval Tool",
	    description="This tool indexes PDF documents and retrieves relevant answers based on a given query.",
	    live=True,  # re-run automatically whenever the input changes
	)

	# Start serving the interface.
	tool.launch()