Update app.py
Browse files
app.py
CHANGED
@@ -1,117 +1,98 @@
|
|
1 |
import os
|
2 |
import gradio as gr
|
3 |
-
from
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
# Log the working directory at startup so relative paths used below
# (e.g. "new_papers/...") can be debugged easily.
current_directory = os.getcwd()
print("Current Working Directory:", current_directory)
|
16 |
-
|
17 |
-
def get_pdf_text(pdf_docs):
    """
    Extract text from one or more PDF documents.

    Parameters
    ----------
    pdf_docs : str or list
        A single PDF path / file-like object, or a list of them.
        The original docstring promised a list but the code only
        handled a single document; both now work.

    Returns
    -------
    str
        Concatenated text extracted from every page of every document.
    """
    # Accept both a single document and a list so existing callers
    # (which pass one path) keep working.
    docs = pdf_docs if isinstance(pdf_docs, (list, tuple)) else [pdf_docs]
    text = ""
    for pdf in docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # extract_text() can yield None/empty for pages with no
            # extractable text; guard so concatenation never fails.
            text += page.extract_text() or ""
    return text
|
38 |
-
|
39 |
-
|
40 |
-
def get_text_chunks(text):
    """
    Break a long string into overlapping chunks for embedding.

    Parameters
    ----------
    text : str
        The raw document text to split.

    Returns
    -------
    list
        Chunks of at most 1500 characters, with a 300-character
        overlap between neighbours, split on newlines.
    """
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1500,
        chunk_overlap=300,
        length_function=len,
    )
    return splitter.split_text(text)
|
60 |
-
|
61 |
-
|
62 |
-
def get_vectorstore(text_chunks):
|
63 |
-
"""
|
64 |
-
Generate a vector store from a list of text chunks using HuggingFace BgeEmbeddings.
|
65 |
-
|
66 |
-
Parameters
|
67 |
-
----------
|
68 |
-
text_chunks : list
|
69 |
-
List of text chunks to be embedded.
|
70 |
-
|
71 |
-
Returns
|
72 |
-
-------
|
73 |
-
FAISS
|
74 |
-
A FAISS vector store containing the embeddings of the text chunks.
|
75 |
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
83 |
)
|
84 |
-
vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
|
85 |
-
print("-----")
|
86 |
-
print(vectorstore.similarity_search("What is ALiBi?"))
|
87 |
-
print("-----")
|
88 |
-
return vectorstore
|
89 |
|
90 |
-
|
91 |
-
# Import-time smoke test: load the bundled ALiBi paper.
# NOTE(review): runs on every import, not just as a script — confirm
# this side effect is intended.
pdf_path = r"new_papers/ALiBi.pdf"
pdf_text = get_pdf_text(pdf_path)
|
93 |
|
94 |
-
|
95 |
-
|
|
|
|
|
96 |
|
97 |
-
|
98 |
-
|
99 |
-
#
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import os
import gradio as gr
from rag_tool import RAGTool

# Initialize the RAG Tool with default settings
# NOTE(review): assumes ./documents exists at startup — confirm, since
# RAGTool presumably indexes it immediately at construction time.
rag_tool = RAGTool(
    documents_path="./documents",
    embedding_model="sentence-transformers/all-MiniLM-L6-v2",
    vector_store_type="faiss",
    chunk_size=1000,  # characters per chunk
    chunk_overlap=200,  # characters shared between neighbouring chunks
    persist_directory="./vector_store"
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
+
# Function to handle document uploads
def upload_documents(files, chunk_size, chunk_overlap, embedding_model, vector_store_type):
    """
    Save uploaded files to disk and rebuild the global RAG tool over them.

    Parameters
    ----------
    files : list
        Upload handles from gr.File. Depending on the Gradio version these
        are tempfile wrappers (with a ``.name`` path) or plain path strings.
    chunk_size, chunk_overlap : numeric
        Splitting configuration (sliders deliver floats; coerced to int).
    embedding_model : str
        HuggingFace model id to embed chunks with.
    vector_store_type : str
        Vector store backend, e.g. "faiss" or "chroma".

    Returns
    -------
    str
        Human-readable status message for the UI.
    """
    import shutil  # local import: only needed when files are uploaded

    if not files:
        # Nothing uploaded — leave the existing tool untouched instead of
        # crashing on an empty rebuild.
        return "No files were uploaded."

    # Create a directory for uploaded files
    os.makedirs("./uploaded_docs", exist_ok=True)

    # Copy uploads into place. The original called file.read() on the Gradio
    # handle, which breaks on newer Gradio versions (files arrive as path
    # strings) and can return empty bytes from an already-consumed tempfile.
    # Copying by path works for both shapes.
    for file in files:
        src_path = getattr(file, "name", file)
        dest_path = os.path.join("./uploaded_docs", os.path.basename(src_path))
        shutil.copyfile(src_path, dest_path)

    # Rebuild the module-level RAG tool over the uploaded documents.
    global rag_tool
    rag_tool = RAGTool(
        documents_path="./uploaded_docs",
        embedding_model=embedding_model,
        vector_store_type=vector_store_type,
        chunk_size=int(chunk_size),
        chunk_overlap=int(chunk_overlap),
        persist_directory="./uploaded_vector_store"
    )

    return f"Documents uploaded and processed. Vector store created with {embedding_model} model."
|
|
|
|
|
38 |
|
39 |
+
# Function to handle queries
def query_documents(query, top_k):
    """Answer *query* with the current RAG tool, returning up to top_k results."""
    # rag_tool is only read (never rebound) here, so no `global`
    # declaration is needed — module-level lookup finds it.
    limit = int(top_k)
    return rag_tool(query, top_k=limit)
|
43 |
|
44 |
+
# Gradio interface
with gr.Blocks(title="Advanced RAG Tool") as demo:
    gr.Markdown("# Advanced RAG Tool")
    gr.Markdown("Upload documents and query them using semantic search")

    # Tab 1: document ingestion plus splitting/embedding configuration.
    with gr.Tab("Upload & Configure"):
        with gr.Row():
            with gr.Column():
                files = gr.File(file_count="multiple", label="Upload Documents")
                chunk_size = gr.Slider(200, 2000, value=1000, step=100, label="Chunk Size")
                chunk_overlap = gr.Slider(0, 500, value=200, step=50, label="Chunk Overlap")

            with gr.Column():
                # Curated set of small/medium sentence-embedding models;
                # defaults mirror the module-level RAGTool configuration.
                embedding_models = [
                    "sentence-transformers/all-MiniLM-L6-v2",
                    "BAAI/bge-small-en-v1.5",
                    "BAAI/bge-base-en-v1.5",
                    "thenlper/gte-small",
                    "thenlper/gte-base"
                ]
                embedding_model = gr.Dropdown(
                    choices=embedding_models,
                    value="sentence-transformers/all-MiniLM-L6-v2",
                    label="Embedding Model"
                )
                vector_store_type = gr.Radio(
                    choices=["faiss", "chroma"],
                    value="faiss",
                    label="Vector Store Type"
                )

        upload_button = gr.Button("Upload and Process Documents")
        upload_result = gr.Textbox(label="Upload Result")

        # Wire the button to upload_documents; its status string is
        # shown in the result textbox.
        upload_button.click(
            upload_documents,
            inputs=[files, chunk_size, chunk_overlap, embedding_model, vector_store_type],
            outputs=upload_result
        )

    # Tab 2: semantic search over whatever the current rag_tool indexes.
    with gr.Tab("Query Documents"):
        query = gr.Textbox(label="Your Question", placeholder="What information are you looking for?")
        top_k = gr.Slider(1, 10, value=3, step=1, label="Number of Results")
        query_button = gr.Button("Search")
        answer = gr.Textbox(label="Results")

        query_button.click(
            query_documents,
            inputs=[query, top_k],
            outputs=answer
        )

# Launch the app
if __name__ == "__main__":
    demo.launch()
|