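"""PDF Question Answering with RAG.

A minimal Gradio app: extract text from an uploaded PDF, split it into
overlapping chunks, embed the chunks with all-MiniLM-L6-v2, index them in an
in-memory ChromaDB collection, and answer questions with flan-t5-base over
the retrieved context.
"""
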
# Import necessary libraries
import PyPDF2
from langchain.text_splitter import CharacterTextSplitter
from sentence_transformers import SentenceTransformer
import chromadb
from transformers import pipeline
import gradio as gr

# Step 1: Extract text from uploaded PDF
def extract_text_from_pdf(pdf_file):
    reader = PyPDF2.PdfReader(pdf_file)
    text = ""
    for page in reader.pages:
        # extract_text() can return None for pages with no extractable text
        text += page.extract_text() or ""
    return text

# Step 2: Chunk the text
def chunk_text(text, chunk_size=500, overlap=50):
    splitter = CharacterTextSplitter(
        separator=" ",
        chunk_size=chunk_size,
        chunk_overlap=overlap,
        length_function=len
    )
    chunks = splitter.split_text(text)
    return chunks
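
# Illustrative sizing (approximate, not exact): with chunk_size=500 and
# overlap=50 the effective stride is ~450 characters, so a 10,000-character
# PDF yields on the order of 22 chunks; the overlap keeps sentences that
# straddle a chunk boundary retrievable from either side.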

# Step 3: Generate embeddings
def generate_embeddings(chunks, model):
    # The caller supplies the SentenceTransformer so the model is loaded once
    # and shared between chunk embedding and query embedding
    embeddings = model.encode(chunks, show_progress_bar=False)
    return embeddings
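
# Note: all-MiniLM-L6-v2 produces 384-dimensional vectors, so `embeddings`
# is an array of shape (num_chunks, 384).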

# Step 4: Store embeddings in a retriever
def create_retriever(chunks, embeddings):
    client = chromadb.Client()
    # Drop any collection left over from a previous upload; create_collection
    # raises if the name already exists in the shared in-memory client
    try:
        client.delete_collection("pdf_chunks")
    except Exception:
        pass
    collection = client.create_collection("pdf_chunks")
    # Batch-insert all chunks in a single call rather than one add() per chunk
    collection.add(
        ids=[str(i) for i in range(len(chunks))],
        documents=chunks,
        embeddings=[e.tolist() for e in embeddings],
    )
    return collection
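
# Note: chromadb.Client() is an in-memory (ephemeral) store scoped to this
# process; if the index should survive restarts, chromadb.PersistentClient
# (e.g. chromadb.PersistentClient(path="./chroma_db")) is the durable variant.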

# Step 5: Answer questions using RAG
def answer_question(question, retriever, embedding_model):
    query_embedding = embedding_model.encode([question])[0]
    results = retriever.query(query_embeddings=[query_embedding.tolist()], n_results=3)
    # Chroma returns parallel lists keyed by field, one inner list per query
    retrieved_docs = results["documents"][0]
    
    # Combine the retrieved chunks for context
    context = " ".join(retrieved_docs)
    
    # Use a language model to answer the question
    qa_model = pipeline("text2text-generation", model="google/flan-t5-base")
    answer = qa_model(f"Context: {context} Question: {question}", max_length=200)[0]['generated_text']
    return answer
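
# Note: flan-t5-base was trained with 512-token inputs, so a very long
# retrieved context degrades answers (or is truncated by the tokenizer);
# keeping chunk_size and n_results modest leaves room for the question itself.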

# Define the main function for the app
def process_pdf_and_answer_question(pdf_file, question):
    # Extract text from the uploaded PDF
    text = extract_text_from_pdf(pdf_file)
    if not text.strip():
        return "No extractable text was found in this PDF."
    
    # Chunk the text
    chunks = chunk_text(text)
    
    # Load the embedding model once and reuse it for chunks and the query
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    
    # Generate embeddings
    embeddings = generate_embeddings(chunks, embedding_model)
    
    # Create retriever
    retriever = create_retriever(chunks, embeddings)
    
    # Answer the question
    answer = answer_question(question, retriever, embedding_model)
    return answer
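
# Note: every click re-parses the PDF, rebuilds the index, and reloads the QA
# pipeline; a production version would typically load the models once at
# module level and rebuild the collection only when a new PDF is uploaded.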

# Gradio interface
with gr.Blocks() as app:
    gr.Markdown("# PDF Question Answering with RAG")
    with gr.Row():
        pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
        question_input = gr.Textbox(label="Enter your question", placeholder="What do you want to know?")
    answer_output = gr.Textbox(label="Answer")
    submit_button = gr.Button("Get Answer")
    
    submit_button.click(
        process_pdf_and_answer_question,
        inputs=[pdf_input, question_input],
        outputs=answer_output
    )

# Run the app
if __name__ == "__main__":
    app.launch()
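
# Programmatic smoke test (illustrative; "sample.pdf" is a placeholder path):
#     print(process_pdf_and_answer_question("sample.pdf", "What is this PDF about?"))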