Spaces:

jchen8000
/

RAG_Demo

Sleeping

File size: 7,369 Bytes

7279c69
fc616b7
055baa9
aeb1340
7eb4640
 
 
 
 
 
 
 
 
33177b1
edb320d
0c5910e
 
aeb1340
 
 
9994f95
25656b2
 
 
 
9994f95
8883910
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30f179c
8883910
 
f16ca94
 
 
 
 
 
b051e7d
 
 
 
f16ca94
055baa9
 
 
 
 
 
 
 
 
 
 
 
0fad869
 
 
 
 
 
 
edb320d
0fad869
 
 
 
 
 
 
 
 
 
 
 
1593c74
0fad869
1593c74
25656b2
 
1593c74
 
25656b2
1593c74
edb320d
0fad869
25656b2
0fad869
515f14b
0fad869
 
055baa9
0fad869
 
 
25656b2
0fad869
 
 
 
c0a84b3
055baa9
0fad869
 
 
055baa9
0fad869
 
055baa9
0fad869
 
 
3b96e5f
0fad869
 
 
 
 
 
 
 
055baa9
0fad869
 
 
7190d80
 
515f14b
0fad869
a4018ab
0fad869
 
 
 
 
 
fb473f6
8599fab
 
8883910
04937cf
a4018ab
04937cf
8d193b1
b051e7d
8883910
911c1ac
04937cf
1593c74
8599fab
caa9983
 
 
 
 
 
 
8599fab
 
 
7d2c69b

import os
import sys
import random
import gradio as gr
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain_groq import ChatGroq
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough


# Log the interpreter version once at startup.
print(f"Python version {sys.version}.")

# Module-level FAISS vector store shared by the indexing and chat callbacks.
# It stays None until one of the indexing functions has built an index.
vector_store = None

# Sample PDF files that load_sample_pdf() indexes; the names are passed to the
# PDF loader as given, so they are resolved against the working directory.
sample_filenames = [
    "Attention Is All You Need.pdf",
    "Generative Adversarial Nets.pdf",
    "Parameter-Efficient Transfer Learning for NLP.pdf",
]

# Markdown rendered on the "Indexing" tab below the load button; it summarises
# the three sample papers. Trailing double spaces are Markdown line breaks.
sample_desc = """
### 1. Attention Is All You Need (Vaswani et al., 2017)
This groundbreaking paper introduced the **Transformer** architecture. It revolutionized natural language processing by enabling parallelization and significantly improving performance on tasks like translation, leading to models like *BERT* and *GPT*.

### 2. Generative Adversarial Nets (Goodfellow et al., 2014)
This paper proposed **GANs**, a novel framework for generative modeling using two neural networks—a generator and a discriminator—that compete in a zero-sum game.  

### 3. Parameter-Efficient Transfer Learning for NLP (Houlsby et al., 2019)
This paper introduces **adapter modules**, a method for fine-tuning large pre-trained language models with significantly fewer parameters.

It could take several minutes to load and index the files.
"""

# Markdown rendered at the top of the "Indexing" tab; a short introduction to
# what Retrieval-Augmented Generation is.
rag_desc = """
### This is a Demo of Retrieval-Augmented Generation (RAG)

**RAG** is an approach that combines retrieval-based and generative LLM models to improve the accuracy and relevance of generated text.  
It works by first retrieving relevant documents from an external knowledge source (like PDF files) and then using a LLM model to produce responses based on both the input query and the retrieved content.  
This method enhances factual correctness and allows the model to access up-to-date or domain-specific information without retraining.


"""


# Example prompts offered by the chat interface. Each example is wrapped in its
# own single-element list, one inner list per example row.
examples_questions = [
    [question]
    for question in (
        "What is Transformer?",
        "What is Attention?",
        "What is Scaled Dot-Product Attention?",
        "What are Encoder and Decoder?",
        "Describe more about the Transformer.",
        "Why use self-attention?",
        "Describe Parameter-Efficient fine-tuning?",
        "Describe Generative Adversarial Networks?",
        "How does GAN work?",
    )
]

# Prompt text for the RAG chain. The {context} and {question} placeholders are
# filled in when the chain is invoked.
template = (
    "Use the following pieces of context to answer the question at the end.\n"
    "If you don't know the answer, just say that you don't know, don't try to make up an answer.\n"
    'Always say "Thanks for asking!" at the end of the answer.\n'
    "\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
    "\n"
    "Answer:\n"
)

# Function to handle PDF upload and indexing
def index_pdf(pdf):
    """Index a single uploaded PDF into the module-level FAISS vector store.

    Args:
        pdf: The uploaded file. Either a filesystem path (``str`` or
            ``os.PathLike``) or an object exposing the path as ``.name``.
            ``None`` means that nothing was uploaded.

    Returns:
        A status message suitable for display in the UI.
    """
    global vector_store

    # Nothing uploaded: report it instead of failing with AttributeError.
    if pdf is None:
        return "Please upload a PDF file first."

    # Accept both plain paths and file wrappers. Path objects are handled
    # explicitly because their ``.name`` is only the final path component.
    if isinstance(pdf, (str, os.PathLike)):
        pdf_path = os.fspath(pdf)
    else:
        pdf_path = pdf.name

    # Load the PDF
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()

    # Split the documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    texts = text_splitter.split_documents(documents)

    # Building a FAISS index from zero chunks fails, so report it and keep
    # whatever index already exists.
    if not texts:
        return "No extractable text was found in the PDF; nothing was indexed."

    # Embed the chunks
    embeddings = HuggingFaceEmbeddings(model_name="bert-base-uncased", encode_kwargs={"normalize_embeddings": True})

    # Store the embeddings in the vector store
    vector_store = FAISS.from_documents(texts, embeddings)

    return "PDF indexed successfully!"

def load_sample_pdf():
    """Load, chunk, embed and index the PDFs listed in ``sample_filenames``.

    Entries are expected to be local file paths. Files that do not exist are
    skipped and named in the returned status instead of aborting the whole
    run. The module-level ``vector_store`` is replaced only when at least one
    chunk of text was produced.

    Returns:
        A status message suitable for display in the UI.
    """
    global vector_store
    documents = []
    missing = []

    # Load the PDFs
    for filename in sample_filenames:
        if not os.path.isfile(filename):
            missing.append(filename)
            print(f"{filename} was not found and is skipped!")
            continue
        loader = PyPDFLoader(filename)
        documents.extend(loader.load())
        print(f"{filename} is processed!")

    missing_note = f" Missing file(s): {', '.join(missing)}" if missing else ""

    if not documents:
        return "No sample PDFs could be loaded." + missing_note

    # Split the documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=64)
    texts = text_splitter.split_documents(documents)

    # Building a FAISS index from zero chunks fails, so report it instead.
    if not texts:
        return "No extractable text was found in the sample PDFs; nothing was indexed."

    # Embed the chunks
    embeddings = HuggingFaceEmbeddings(model_name="bert-base-uncased", encode_kwargs={"normalize_embeddings": True})

    # Store the embeddings in the vector store
    vector_store = FAISS.from_documents(texts, embeddings)

    if missing:
        return f"Sample PDFs indexed successfully! Skipped missing file(s): {', '.join(missing)}"
    return "Sample PDFs indexed successfully!"


def format_docs(docs):
    """Return the ``page_content`` of every item in *docs*, joined by blank lines."""
    separator = "\n\n"
    contents = [document.page_content for document in docs]
    return separator.join(contents)
    

def generate_response(query, history, model, temperature, max_tokens, top_p, seed):
    """Answer *query* using the indexed documents and a Groq-hosted chat model.

    Args:
        query: The user's question.
        history: Chat history supplied by the chat interface; not used here.
        model: Name of the Groq model to call.
        temperature: Sampling temperature forwarded to the model.
        max_tokens: Upper bound on generated tokens forwarded to the model.
        top_p: Nucleus-sampling probability mass forwarded to the model.
        seed: Sampling seed; ``0`` (or an empty value) selects a random seed.

    Returns:
        The model's answer as a string, or a hint to index documents first
        when no vector store has been built yet.
    """
    if vector_store is None:
        return "Please upload and index a PDF at the Indexing tab."

    # 0 means "random" in the UI; an emptied field (None) is treated the same.
    if not seed:
        seed = random.randint(1, 100000)

    # top_p and seed are not declared ChatGroq fields, so they are passed
    # through model_kwargs to the underlying chat-completions request.
    sampling_kwargs = {"seed": int(seed)}
    if top_p is not None:
        sampling_kwargs["top_p"] = top_p

    # Forward temperature/max_tokens only when set, so an emptied control
    # falls back to the library default rather than overriding it with None.
    llm_kwargs = {}
    if temperature is not None:
        llm_kwargs["temperature"] = temperature
    if max_tokens is not None:
        llm_kwargs["max_tokens"] = int(max_tokens)

    retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 16})
    llm = ChatGroq(
        groq_api_key=os.environ.get("GROQ_API_KEY"),
        model=model,
        model_kwargs=sampling_kwargs,
        **llm_kwargs,
    )
    custom_rag_prompt = PromptTemplate.from_template(template)

    # Retrieve context for the question, fill the prompt, call the model and
    # reduce the reply to plain text.
    rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | custom_rag_prompt
        | llm
        | StrOutputParser()
    )

    return rag_chain.invoke(query)





# Extra chat controls handed to the chat interface. Their order matches the
# trailing parameters of generate_response: model, temperature, max_tokens,
# top_p, seed.
additional_inputs = [
    gr.Dropdown(
        label="Model",
        choices=[
            "llama-3.3-70b-versatile",
            "llama-3.1-8b-instant",
            "llama3-70b-8192",
            "llama3-8b-8192",
            "mixtral-8x7b-32768",
            "gemma2-9b-it",
        ],
        value="gemma2-9b-it",
    ),
    gr.Slider(
        label="Temperature",
        minimum=0.0,
        maximum=1.0,
        step=0.01,
        value=0.5,
        info="Controls diversity of the generated text. Lower is more deterministic, higher is more creative.",
    ),
    gr.Slider(
        label="Max Tokens",
        minimum=1,
        maximum=8000,
        step=1,
        value=8000,
        info="The maximum number of tokens that the model can process in a single response.<br>Maximums: 8k for gemma 7b it, gemma2 9b it, llama 7b & 70b, 32k for mixtral 8x7b, 132k for llama 3.1.",
    ),
    gr.Slider(
        label="Top P",
        minimum=0.0,
        maximum=1.0,
        step=0.01,
        value=0.5,
        info="A method of text generation where a model will only consider the most probable next tokens that make up the probability p.",
    ),
    gr.Number(
        label="Seed",
        precision=0,
        value=0,
        info="A starting point to initiate generation, use 0 for random",
    ),
]

# Build the Gradio UI: an "Indexing" tab that loads the sample papers into the
# vector store and a "Chatbot" tab that answers questions via generate_response.
with gr.Blocks(theme="Nymbo/Alyx_Theme") as demo:
    with gr.Tab("Indexing"):
        gr.Markdown(rag_desc)
        # NOTE: the single-PDF upload flow (index_pdf) is currently disabled;
        # the commented-out lines below are kept as a reference for re-enabling it.
        # pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
        # pdf_input = gr.Textbox(label="PDF File")
        # index_button = gr.Button("Index PDF")
        # load_sample = gr.Button("Alternatively, Load and Index [Attention Is All You Need.pdf] as a Sample")
        load_sample = gr.Button("Load and Index the following three papers as a RAG Demo")
        sample_description = gr.Markdown(sample_desc)
        index_output = gr.Textbox(label="Indexing Status")
        # index_button.click(index_pdf, inputs=pdf_input, outputs=index_output)
        # The status string returned by load_sample_pdf is shown in index_output.
        load_sample.click(load_sample_pdf, inputs=None, outputs=index_output)
    
    with gr.Tab("Chatbot"):
        # The additional inputs supply the model and sampling arguments that
        # generate_response takes after (query, history).
        gr.ChatInterface(
            fn=generate_response, 
            chatbot=gr.Chatbot(show_label=False, show_share_button=False, show_copy_button=True, likeable=True, layout="panel"),
            examples=examples_questions,
            additional_inputs=additional_inputs,
            cache_examples=False,
        )       

# Launch the Gradio app. This runs at import time (there is no __main__ guard);
# share=True additionally requests a public share link from Gradio.
demo.launch(share=True)