File size: 1,642 Bytes
c33d1d0
018fb30
e5633a7
842c848
 
018fb30
f7493dd
e5633a7
c33d1d0
 
1f5e9cb
018fb30
38e2fac
cbed288
18cb8f3
842c848
142d17f
 
 
6976271
 
 
ea07eae
 
 
 
 
 
 
 
c33d1d0
6976271
142d17f
68b31c9
 
 
 
 
018fb30
037c950
 
68b31c9
 
018fb30
 
037c950
842c848
018fb30
 
 
c33d1d0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import os
import gradio as gr
from dotenv import load_dotenv
from langchain.vectorstores.faiss import FAISS
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter

# Load environment variables
load_dotenv()

# Load the source PDF into LangChain Document objects
loader = PyPDFLoader("./new_papers/ALiBi.pdf")
documents = loader.load()

# Split the loaded documents into small, non-overlapping chunks for embedding
text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=0)
vdocuments = text_splitter.split_documents(documents)

# Extract the raw text from the Document objects.
# LangChain's Document keeps its text in `page_content`; it has no `text`
# attribute, so reading `doc.text` raises AttributeError.
docs_text = [doc.page_content for doc in vdocuments]

# Fail early with a clear message when the PDF yielded no text chunks,
# rather than handing an empty corpus to the vector store.
if not docs_text:
    raise ValueError("No text chunks could be extracted from the PDF; nothing to index.")

# Embedding model configuration (BGE base, English, run on CPU)
model = "BAAI/bge-base-en-v1.5"
encode_kwargs = {
    "normalize_embeddings": True
}  # set True to compute cosine similarity
embeddings = HuggingFaceBgeEmbeddings(
    model_name=model, encode_kwargs=encode_kwargs, model_kwargs={"device": "cpu"}
)

# Embed every chunk and build the FAISS vector store queried by the retriever
api_db = FAISS.from_texts(texts=docs_text, embedding=embeddings)

# Define the PDF retrieval function
def pdf_retrieval(query):
    """Look up ``query`` in the FAISS store and return the search result.

    The query is forwarded to ``api_db.similarity_search`` and whatever that
    call returns is handed back unchanged.
    """
    return api_db.similarity_search(query)

# Wrap the retriever in a Gradio UI: one text box for the query, one for the result
api_tool = gr.Interface(
    title="API PDF Retrieval Tool",
    description="This tool indexes PDF documents and retrieves relevant answers based on a given query (HuggingFaceBgeEmbeddings).",
    fn=pdf_retrieval,
    inputs=[gr.Textbox()],
    outputs=gr.Textbox(),
    # NOTE(review): per the Gradio docs, live=True re-runs `fn` whenever the
    # input changes instead of waiting for an explicit submit.
    live=True,
)

# Start serving the interface
api_tool.launch()