# QA-pdf / app.py
# Import necessary libraries
import os
import PyPDF2
from langchain.text_splitter import CharacterTextSplitter
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.utils import embedding_functions
from transformers import pipeline
import gradio as gr
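# Assumed dependencies (versions are not pinned in the original source):
#   pip install PyPDF2 langchain sentence-transformers chromadb transformers gradio
# Note: newer LangChain releases moved CharacterTextSplitter to the separate
# langchain-text-splitters package; the import above targets older releases.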
# Step 1: Extract text from uploaded PDF
def extract_text_from_pdf(pdf_file):
    reader = PyPDF2.PdfReader(pdf_file)
    text = ""
    for page in reader.pages:
        # extract_text() can return None for image-only pages, so fall back to ""
        text += page.extract_text() or ""
    return text
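# Quick sanity check (hypothetical local file, not part of the app flow):
#   text = extract_text_from_pdf("example.pdf")
#   print(f"Extracted {len(text)} characters")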
# Step 2: Chunk the text
def chunk_text(text, chunk_size=500, overlap=50):
    splitter = CharacterTextSplitter(
        separator=" ",
        chunk_size=chunk_size,
        chunk_overlap=overlap,
        length_function=len
    )
    chunks = splitter.split_text(text)
    return chunks
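# Under the defaults above, the splitter cuts at spaces into roughly
# 500-character chunks, with consecutive chunks sharing ~50 characters of
# overlap so sentences that straddle a boundary stay retrievable, e.g.:
#   chunks = chunk_text("word " * 400)   # ~2000 characters of input
#   print(len(chunks), max(len(c) for c in chunks))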
# Step 3: Generate embeddings
def generate_embeddings(chunks, model=None):
    # Accept an optional preloaded model so callers can avoid reloading it
    if model is None:
        model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = model.encode(chunks, show_progress_bar=False)
    return embeddings
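# For all-MiniLM-L6-v2, encode() returns a numpy array with one
# 384-dimensional vector per chunk:
#   embs = generate_embeddings(["hello world"])
#   print(embs.shape)   # (1, 384)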
# Step 4: Store embeddings in a retriever
def create_retriever(chunks, embeddings):
    client = chromadb.Client()
    # Drop any collection left over from a previous upload so stale chunks
    # from another PDF can't be retrieved (create_collection raises if the
    # name already exists)
    try:
        client.delete_collection("pdf_chunks")
    except Exception:
        pass
    collection = client.create_collection("pdf_chunks")
    for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
        collection.add(
            ids=[str(i)],
            documents=[chunk],
            embeddings=[embedding.tolist()]  # Chroma expects plain Python lists
        )
    return collection
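# Illustrative direct query against the store (hypothetical question string;
# Chroma returns lists-of-lists, one inner list per query vector):
#   q = SentenceTransformer("all-MiniLM-L6-v2").encode(["some question"])[0]
#   hits = collection.query(query_embeddings=[q.tolist()], n_results=2)
#   print(hits["documents"][0])   # the two closest chunks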
# Step 5: Answer questions using RAG
def answer_question(question, retriever, embedding_model):
    query_embedding = embedding_model.encode([question])[0]
    # Don't request more results than the collection holds
    n_results = min(3, retriever.count())
    results = retriever.query(query_embeddings=[query_embedding.tolist()], n_results=n_results)
    # Chroma returns a dict of lists keyed by field, with one inner list per
    # query, so the matching chunks live at results["documents"][0]
    retrieved_docs = results["documents"][0]
    # Combine the retrieved chunks for context
    context = " ".join(retrieved_docs)
    # Use a language model to answer the question
    qa_model = pipeline("text2text-generation", model="google/flan-t5-base")
    answer = qa_model(f"Context: {context} Question: {question}", max_length=200)[0]["generated_text"]
    return answer
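# The single-string prompt above is one reasonable format for FLAN-T5; a
# hedged variation (untested, not from the original) adds an explicit
# instruction and newlines, which FLAN-style models often follow more closely:
#   prompt = f"Answer based only on the context.\nContext: {context}\nQuestion: {question}"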
# Define the main function for the app
def process_pdf_and_answer_question(pdf_file, question):
    if pdf_file is None or not question:
        return "Please upload a PDF and enter a question."
    # Extract text from the uploaded PDF
    text = extract_text_from_pdf(pdf_file)
    # Chunk the text
    chunks = chunk_text(text)
    # Load the embedding model once and reuse it for chunks and the query
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    # Generate embeddings
    embeddings = generate_embeddings(chunks, model=embedding_model)
    # Create retriever
    retriever = create_retriever(chunks, embeddings)
    # Answer the question
    answer = answer_question(question, retriever, embedding_model)
    return answer
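# End-to-end usage outside the UI (hypothetical local path):
#   print(process_pdf_and_answer_question("example.pdf", "What is the PDF about?"))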
# Gradio interface
with gr.Blocks() as app:
    gr.Markdown("# PDF Question Answering with RAG")
    with gr.Row():
        pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
        question_input = gr.Textbox(label="Enter your question", placeholder="What do you want to know?")
    answer_output = gr.Textbox(label="Answer")
    submit_button = gr.Button("Get Answer")
    submit_button.click(
        process_pdf_and_answer_question,
        inputs=[pdf_input, question_input],
        outputs=answer_output
    )
# Run the app
if __name__ == "__main__":
    app.launch()
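# launch() with no arguments serves locally (and is what Hugging Face Spaces
# expects); passing share=True instead would create a temporary public link.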