# PDF Question Answering with RAG — Hugging Face Spaces app
# Import necessary libraries
import os
from functools import lru_cache

import chromadb
import gradio as gr
import PyPDF2
from chromadb.utils import embedding_functions
from langchain.text_splitter import CharacterTextSplitter
from sentence_transformers import SentenceTransformer
from transformers import pipeline
# Step 1: Extract text from uploaded PDF | |
def extract_text_from_pdf(pdf_file): | |
reader = PyPDF2.PdfReader(pdf_file) | |
text = "" | |
for page in reader.pages: | |
text += page.extract_text() | |
return text | |
# Step 2: Chunk the text | |
def chunk_text(text, chunk_size=500, overlap=50): | |
splitter = CharacterTextSplitter( | |
separator=" ", | |
chunk_size=chunk_size, | |
chunk_overlap=overlap, | |
length_function=len | |
) | |
chunks = splitter.split_text(text) | |
return chunks | |
# Step 3: Generate embeddings | |
def generate_embeddings(chunks): | |
model = SentenceTransformer("all-MiniLM-L6-v2") | |
embeddings = model.encode(chunks, show_progress_bar=False) | |
return embeddings | |
# Step 4: Store embeddings in a retriever | |
def create_retriever(chunks, embeddings): | |
client = chromadb.Client() | |
collection = client.create_collection("pdf_chunks") | |
for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)): | |
collection.add( | |
ids=[str(i)], | |
documents=[chunk], | |
embeddings=[embedding] | |
) | |
return collection | |
# Step 5: Answer questions using RAG | |
def answer_question(question, retriever, embedding_model): | |
query_embedding = embedding_model.encode([question])[0] | |
results = retriever.query(query_embeddings=[query_embedding], n_results=3) | |
retrieved_docs = [doc["document"] for doc in results] | |
# Combine the retrieved chunks for context | |
context = " ".join(retrieved_docs) | |
# Use a language model to answer the question | |
qa_model = pipeline("text2text-generation", model="google/flan-t5-base") | |
answer = qa_model(f"Context: {context} Question: {question}", max_length=200)[0]['generated_text'] | |
return answer | |
# Define the main function for the app | |
def process_pdf_and_answer_question(pdf_file, question): | |
# Extract text from the uploaded PDF | |
text = extract_text_from_pdf(pdf_file) | |
# Chunk the text | |
chunks = chunk_text(text) | |
# Generate embeddings | |
embeddings = generate_embeddings(chunks) | |
# Create retriever | |
retriever = create_retriever(chunks, embeddings) | |
# Load embedding model | |
embedding_model = SentenceTransformer("all-MiniLM-L6-v2") | |
# Answer the question | |
answer = answer_question(question, retriever, embedding_model) | |
return answer | |
# Gradio interface | |
with gr.Blocks() as app: | |
gr.Markdown("# PDF Question Answering with RAG") | |
with gr.Row(): | |
pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"]) | |
question_input = gr.Textbox(label="Enter your question", placeholder="What do you want to know?") | |
answer_output = gr.Textbox(label="Answer") | |
submit_button = gr.Button("Get Answer") | |
submit_button.click( | |
process_pdf_and_answer_question, | |
inputs=[pdf_input, question_input], | |
outputs=answer_output | |
) | |
# Run the app | |
if __name__ == "__main__": | |
app.launch() | |