import os
import gradio as gr
from dotenv import load_dotenv
from langchain.vectorstores.faiss import FAISS
from langchain.embeddings import HuggingFaceBgeEmbeddings
from pypdf import PdfReader  # PdfReader is used in get_pdf_text(); assumes the pypdf package is installed
from langchain.text_splitter import CharacterTextSplitter
# Load environment variables
#load_dotenv()
def get_pdf_text(pdf_docs):
    """
    Extract text from a list of PDF documents.

    Parameters
    ----------
    pdf_docs : list
        List of PDF documents to extract text from.

    Returns
    -------
    str
        Extracted text from all the PDF documents.
    """
    text = ""
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # extract_text() may return None for pages with no extractable text
            text += page.extract_text() or ""
    return text

def get_text_chunks(text):
    """
    Split the input text into chunks.

    Parameters
    ----------
    text : str
        The input text to be split.

    Returns
    -------
    list
        List of text chunks.
    """
    text_splitter = CharacterTextSplitter(
        separator="\n", chunk_size=1500, chunk_overlap=300, length_function=len
    )
    chunks = text_splitter.split_text(text)
    return chunks

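# Rough sanity check for the splitter (hypothetical input, not part of the app):
# with chunk_size=1500 and chunk_overlap=300, consecutive chunks should share
# roughly 300 characters of context, e.g.
#   chunks = get_text_chunks("some line\n" * 1000)
#   print(len(chunks), [len(c) for c in chunks[:3]])
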
def get_vectorstore(text_chunks):
    """
    Generate a vector store from a list of text chunks using HuggingFace BgeEmbeddings.

    Parameters
    ----------
    text_chunks : list
        List of text chunks to be embedded.

    Returns
    -------
    FAISS
        A FAISS vector store containing the embeddings of the text chunks.
    """
    model = "BAAI/bge-base-en-v1.5"
    encode_kwargs = {
        "normalize_embeddings": True
    }  # set True to compute cosine similarity
    embeddings = HuggingFaceBgeEmbeddings(
        model_name=model, encode_kwargs=encode_kwargs, model_kwargs={"device": "cpu"}
    )
    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
    print("-----")
    # Sanity-check the store with a sample similarity search
    print(vectorstore.similarity_search("What is ALiBi?"))
    print("-----")
    return vectorstore

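# Note (sketch, not part of the original flow): the FAISS index is rebuilt from
# scratch on every run. LangChain's FAISS wrapper can persist it if desired:
#   vectorstore.save_local("faiss_index")                      # hypothetical folder name
#   vectorstore = FAISS.load_local("faiss_index", embeddings)
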
# Build the vector store from the ALiBi paper (get_pdf_text expects a list of PDFs)
pdf_text = get_pdf_text(["./new_papers/ALiBi.pdf"])
text_chunks = get_text_chunks(pdf_text)
api_db = get_vectorstore(text_chunks)

# Define the PDF retrieval function
def pdf_retrieval(query):
    # Run the query through the retriever
    docs = api_db.similarity_search(query)
    print(docs)
    # Return the matched chunks as plain text so the Gradio Textbox renders them cleanly
    return "\n\n".join(doc.page_content for doc in docs)

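# Quick local check (hypothetical query), bypassing the Gradio UI:
#   print(pdf_retrieval("What is ALiBi and how does it encode positions?"))
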
# Create Gradio interface for the API retriever
api_tool = gr.Interface(
    fn=pdf_retrieval,
    inputs=[gr.Textbox()],
    outputs=gr.Textbox(),
    live=True,
    title="API PDF Retrieval Tool",
    description="This tool indexes PDF documents and retrieves relevant answers based on a given query (HuggingFaceBgeEmbeddings).",
)

# Launch the Gradio interface
api_tool.launch()