Spaces:

adnaniqbal001
/

QA-pdf

Sleeping

App Files Files Community

adnaniqbal001 committed on Jan 7

Commit

c068e17

verified ·

1 Parent(s): d5b3b40

Create app.py

Browse files

Files changed (1) hide show

app.py +100 -0

app.py ADDED Viewed

	@@ -0,0 +1,100 @@

+# Import necessary libraries
+import os
+import PyPDF2
+from langchain.text_splitter import CharacterTextSplitter
+from sentence_transformers import SentenceTransformer
+import chromadb
+from chromadb.utils import embedding_functions
+from transformers import pipeline
+import gradio as gr
+# Step 1: Extract text from uploaded PDF
+def extract_text_from_pdf(pdf_file):
+    reader = PyPDF2.PdfReader(pdf_file)
+    text = ""
+    for page in reader.pages:
+        text += page.extract_text()
+    return text
+# Step 2: Chunk the text
+def chunk_text(text, chunk_size=500, overlap=50):
+    splitter = CharacterTextSplitter(
+        separator=" ",
+        chunk_size=chunk_size,
+        chunk_overlap=overlap,
+        length_function=len
+    )
+    chunks = splitter.split_text(text)
+    return chunks
+# Step 3: Generate embeddings
+def generate_embeddings(chunks):
+    model = SentenceTransformer("all-MiniLM-L6-v2")
+    embeddings = model.encode(chunks, show_progress_bar=False)
+    return embeddings
+# Step 4: Store embeddings in a retriever
+def create_retriever(chunks, embeddings):
+    client = chromadb.Client()
+    collection = client.create_collection("pdf_chunks")
+    for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
+        collection.add(
+            ids=[str(i)],
+            documents=[chunk],
+            embeddings=[embedding]
+        )
+    return collection
+# Step 5: Answer questions using RAG
+def answer_question(question, retriever, embedding_model):
+    query_embedding = embedding_model.encode([question])[0]
+    results = retriever.query(query_embeddings=[query_embedding], n_results=3)
+    retrieved_docs = [doc["document"] for doc in results]
+    # Combine the retrieved chunks for context
+    context = " ".join(retrieved_docs)
+    # Use a language model to answer the question
+    qa_model = pipeline("text2text-generation", model="google/flan-t5-base")
+    answer = qa_model(f"Context: {context} Question: {question}", max_length=200)[0]['generated_text']
+    return answer
+# Define the main function for the app
+def process_pdf_and_answer_question(pdf_file, question):
+    # Extract text from the uploaded PDF
+    text = extract_text_from_pdf(pdf_file)
+    # Chunk the text
+    chunks = chunk_text(text)
+    # Generate embeddings
+    embeddings = generate_embeddings(chunks)
+    # Create retriever
+    retriever = create_retriever(chunks, embeddings)
+    # Load embedding model
+    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
+    # Answer the question
+    answer = answer_question(question, retriever, embedding_model)
+    return answer
+# Gradio interface
+with gr.Blocks() as app:
+    gr.Markdown("# PDF Question Answering with RAG")
+    with gr.Row():
+        pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
+        question_input = gr.Textbox(label="Enter your question", placeholder="What do you want to know?")
+    answer_output = gr.Textbox(label="Answer")
+    submit_button = gr.Button("Get Answer")
+    submit_button.click(
+        process_pdf_and_answer_question,
+        inputs=[pdf_input, question_input],
+        outputs=answer_output
+    )
+# Run the app
+if __name__ == "__main__":
+    app.launch()