"""Gradio demo: index one PDF in a FAISS store and retrieve chunks similar to a query."""

import os

import gradio as gr
from dotenv import load_dotenv
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.faiss import FAISS

# Load environment variables
load_dotenv()

# Configuration for the indexing pipeline.
PDF_PATH = "./new_papers/ALiBi.pdf"
CHUNK_SIZE = 100
CHUNK_OVERLAP = 0
EMBEDDING_MODEL_NAME = "BAAI/bge-base-en-v1.5"

# Load and process the PDF file
loader = PyPDFLoader(PDF_PATH)
documents = loader.load()

# Split the documents into chunks
text_splitter = CharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
vdocuments = text_splitter.split_documents(documents)

# Extract the text from the Document objects. LangChain documents expose
# their text as `page_content`; they have no `text` attribute.
docs_text = [doc.page_content for doc in vdocuments]

model = EMBEDDING_MODEL_NAME
encode_kwargs = {"normalize_embeddings": True}  # set True to compute cosine similarity
embeddings = HuggingFaceBgeEmbeddings(
    model_name=model,
    encode_kwargs=encode_kwargs,
    model_kwargs={"device": "cpu"},
)

# Create FAISS vector store from the chunk texts
api_db = FAISS.from_texts(texts=docs_text, embedding=embeddings)


def pdf_retrieval(query):
    """Return the indexed chunks most similar to *query*.

    Args:
        query: Free-text search string.

    Returns:
        The list of documents produced by ``api_db.similarity_search`` with
        its default settings.
    """
    return api_db.similarity_search(query)


def _retrieve_as_text(query):
    """Run ``pdf_retrieval`` and render the hits as plain text for the UI.

    The output component is a Textbox, so the chunk texts are joined with
    blank lines instead of handing it a list of Document objects.
    """
    return "\n\n".join(doc.page_content for doc in pdf_retrieval(query))


# Create Gradio interface for the retriever
api_tool = gr.Interface(
    fn=_retrieve_as_text,
    inputs=[gr.Textbox()],
    outputs=gr.Textbox(),
    live=True,
    title="API PDF Retrieval Tool",
    description="This tool indexes PDF documents and retrieves relevant answers based on a given query (HuggingFaceBgeEmbeddings).",
)

# Launch the Gradio interface
api_tool.launch()