# PDF Question Answering with RAG — Hugging Face Spaces app
# Import necessary libraries
import os
from functools import lru_cache

import chromadb
import gradio as gr
import PyPDF2
from chromadb.utils import embedding_functions
from langchain.text_splitter import CharacterTextSplitter
from sentence_transformers import SentenceTransformer
from transformers import pipeline
# Step 1: Extract text from uploaded PDF | |
def extract_text_from_pdf(pdf_file): | |
reader = PyPDF2.PdfReader(pdf_file) | |
text = "" | |
for page in reader.pages: | |
text += page.extract_text() | |
return text | |
# Step 2: Chunk the text | |
def chunk_text(text, chunk_size=500, overlap=50): | |
splitter = CharacterTextSplitter( | |
separator=" ", | |
chunk_size=chunk_size, | |
chunk_overlap=overlap, | |
length_function=len | |
) | |
chunks = splitter.split_text(text) | |
return chunks | |
# Step 3: Generate embeddings | |
def generate_embeddings(chunks): | |
model = SentenceTransformer("all-MiniLM-L6-v2") | |
embeddings = model.encode(chunks, show_progress_bar=False) | |
return embeddings | |
# Step 4: Store embeddings in a retriever | |
def create_retriever(chunks, embeddings): | |
client = chromadb.Client() | |
collection = client.create_collection("pdf_chunks") | |
for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)): | |
collection.add( | |
ids=[str(i)], | |
documents=[chunk], | |
embeddings=[embedding] | |
) | |
return collection | |
# Step 5: Answer questions using RAG | |
def answer_question(question, retriever, embedding_model): | |
query_embedding = embedding_model.encode([question])[0] | |
results = retriever.query(query_embeddings=[query_embedding], n_results=3) | |
retrieved_docs = [doc["document"] for doc in results] | |
# Combine the retrieved chunks for context | |
context = " ".join(retrieved_docs) | |
# Use a language model to answer the question | |
qa_model = pipeline("text2text-generation", model="google/flan-t5-base") | |
answer = qa_model(f"Context: {context} Question: {question}", max_length=200)[0]['generated_text'] | |
return answer | |
# Define the main function for the app | |
def process_pdf_and_answer_question(pdf_file, question): | |
# Extract text from the uploaded PDF | |
text = extract_text_from_pdf(pdf_file) | |
# Chunk the text | |
chunks = chunk_text(text) | |
# Generate embeddings | |
embeddings = generate_embeddings(chunks) | |
# Create retriever | |
retriever = create_retriever(chunks, embeddings) | |
# Load embedding model | |
embedding_model = SentenceTransformer("all-MiniLM-L6-v2") | |
# Answer the question | |
answer = answer_question(question, retriever, embedding_model) | |
return answer | |
# Gradio interface | |
with gr.Blocks() as app: | |
gr.Markdown("# PDF Question Answering with RAG") | |
with gr.Row(): | |
pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"]) | |
question_input = gr.Textbox(label="Enter your question", placeholder="What do you want to know?") | |
answer_output = gr.Textbox(label="Answer") | |
submit_button = gr.Button("Get Answer") | |
submit_button.click( | |
process_pdf_and_answer_question, | |
inputs=[pdf_input, question_input], | |
outputs=answer_output | |
) | |
# Run the app | |
if __name__ == "__main__": | |
app.launch() | |