|
import gradio as gr |
|
from langchain.vectorstores import Chroma |
|
from langchain.document_loaders import PyPDFLoader |
|
from langchain.embeddings import HuggingFaceInstructEmbeddings |
|
|
|
|
|
# Embedding model shared by indexing and querying.
#
# HuggingFaceInstructEmbeddings loads an INSTRUCTOR sentence-embedding model,
# so it must be given an INSTRUCTOR checkpoint; "gpt2" is a causal language
# model with no pad token and is not suitable here.
#
# The wrapper keeps the loaded model in `hf.client` and exposes no
# `tokenizer` attribute, so the former
# `hf.tokenizer.add_special_tokens({'pad_token': '[PAD]'})` call raised
# AttributeError at import time. INSTRUCTOR checkpoints already define a pad
# token, so no tokenizer patching is required.
hf = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-large",
    # Instruction prepended to each document chunk before embedding.
    embed_instruction="Represent the document for retrieval: ",
    # Instruction prepended to each user query before embedding.
    query_instruction="Represent the query for retrieval: ",
)
|
|
|
|
|
# Read the source paper from disk; `load()` returns the parsed documents
# that are handed to the vector store below.
loader = PyPDFLoader(file_path="./new_papers/ReACT.pdf")
documents = loader.load()

# Embed every loaded document with `hf` and store the vectors in a Chroma
# collection that the retrieval tool queries.
db = Chroma.from_documents(
    documents=documents,
    embedding=hf,
    collection_name="my-collection",
)
|
|
|
class PDFRetrievalTool:
    """Callable that returns the indexed PDF passage closest to a query.

    Wraps a retriever built from the module-level ``db`` vector store so an
    instance can be used anywhere a plain ``query -> text`` function is
    expected.
    """

    def __init__(self):
        # k=1: ask the vector store for only the single closest document.
        self.retriever = db.as_retriever(search_kwargs={"k": 1})

    def __call__(self, query):
        """Return the text of the best-matching passage(s) for *query*.

        Args:
            query: Free-text search string.

        Returns:
            The ``page_content`` of each retrieved document, joined by blank
            lines. An empty string when *query* is empty or whitespace, and
            a short notice when the retriever finds nothing.
        """
        # Skip the embedding/search round-trip when there is nothing to
        # search for.
        if not query or not query.strip():
            return ""

        # Retrievers expose get_relevant_documents(); they have no run()
        # method, and the result is a list of Document objects rather than a
        # dict with a 'result' key.
        docs = self.retriever.get_relevant_documents(query)
        if not docs:
            return "No relevant passages found."
        return "\n\n".join(doc.page_content for doc in docs)
|
|
|
|
|
# The function behind the UI: a callable object that looks up the query in
# the PDF index.
retrieval_fn = PDFRetrievalTool()

# One free-text box for the query and one for the retrieved passage.
query_box = gr.Textbox()
answer_box = gr.Textbox()

# live=True: the interface is configured to re-run as the input changes
# instead of waiting for an explicit submit.
tool = gr.Interface(
    fn=retrieval_fn,
    inputs=query_box,
    outputs=answer_box,
    title="PDF Retrieval Tool",
    description="This tool indexes PDF documents and retrieves relevant answers based on a given query.",
    live=True,
)

# Start the web server for the interface.
tool.launch()
|
|