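"""PDF Question Answering with RAG.

A minimal Gradio app: extract text from an uploaded PDF, split it into
overlapping chunks, embed the chunks with all-MiniLM-L6-v2, index them in an
in-memory ChromaDB collection, and answer questions with flan-t5-base over
the retrieved context.
"""
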
# Import necessary libraries
import PyPDF2
from langchain.text_splitter import CharacterTextSplitter
from sentence_transformers import SentenceTransformer
import chromadb
from transformers import pipeline
import gradio as gr

# Step 1: Extract text from uploaded PDF
def extract_text_from_pdf(pdf_file):
    reader = PyPDF2.PdfReader(pdf_file)
    text = ""
    for page in reader.pages:
        # extract_text() can return None for pages with no extractable text
        text += page.extract_text() or ""
    return text

# Step 2: Chunk the text
def chunk_text(text, chunk_size=500, overlap=50):
    splitter = CharacterTextSplitter(
        separator=" ",
        chunk_size=chunk_size,
        chunk_overlap=overlap,
        length_function=len
    )
    chunks = splitter.split_text(text)
    return chunks
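
# Illustrative sizing (approximate, not exact): with chunk_size=500 and
# overlap=50 the effective stride is ~450 characters, so a 10,000-character
# PDF yields on the order of 22 chunks; the overlap keeps sentences that
# straddle a chunk boundary retrievable from either side.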

# Step 3: Generate embeddings
def generate_embeddings(chunks, model):
    # The caller supplies the SentenceTransformer so the model is loaded once
    # and shared between chunk embedding and query embedding
    embeddings = model.encode(chunks, show_progress_bar=False)
    return embeddings
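
# Note: all-MiniLM-L6-v2 produces 384-dimensional vectors, so `embeddings`
# is an array of shape (num_chunks, 384).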

# Step 4: Store embeddings in a retriever
def create_retriever(chunks, embeddings):
    client = chromadb.Client()
    # Drop any collection left over from a previous upload; create_collection
    # raises if the name already exists in the shared in-memory client
    try:
        client.delete_collection("pdf_chunks")
    except Exception:
        pass
    collection = client.create_collection("pdf_chunks")
    # Batch-insert all chunks in a single call rather than one add() per chunk
    collection.add(
        ids=[str(i) for i in range(len(chunks))],
        documents=chunks,
        embeddings=[e.tolist() for e in embeddings],
    )
    return collection
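
# Note: chromadb.Client() is an in-memory (ephemeral) store scoped to this
# process; if the index should survive restarts, chromadb.PersistentClient
# (e.g. chromadb.PersistentClient(path="./chroma_db")) is the durable variant.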

# Step 5: Answer questions using RAG
def answer_question(question, retriever, embedding_model):
    query_embedding = embedding_model.encode([question])[0]
    results = retriever.query(query_embeddings=[query_embedding.tolist()], n_results=3)
    # Chroma returns parallel lists keyed by field, one inner list per query
    retrieved_docs = results["documents"][0]
    
    # Combine the retrieved chunks for context
    context = " ".join(retrieved_docs)
    
    # Use a language model to answer the question
    qa_model = pipeline("text2text-generation", model="google/flan-t5-base")
    answer = qa_model(f"Context: {context} Question: {question}", max_length=200)[0]['generated_text']
    return answer
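
# Note: flan-t5-base was trained with 512-token inputs, so a very long
# retrieved context degrades answers (or is truncated by the tokenizer);
# keeping chunk_size and n_results modest leaves room for the question itself.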

# Define the main function for the app
def process_pdf_and_answer_question(pdf_file, question):
    # Extract text from the uploaded PDF
    text = extract_text_from_pdf(pdf_file)
    if not text.strip():
        return "No extractable text was found in this PDF."
    
    # Chunk the text
    chunks = chunk_text(text)
    
    # Load the embedding model once and reuse it for chunks and the query
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    
    # Generate embeddings
    embeddings = generate_embeddings(chunks, embedding_model)
    
    # Create retriever
    retriever = create_retriever(chunks, embeddings)
    
    # Answer the question
    answer = answer_question(question, retriever, embedding_model)
    return answer
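
# Note: every click re-parses the PDF, rebuilds the index, and reloads the QA
# pipeline; a production version would typically load the models once at
# module level and rebuild the collection only when a new PDF is uploaded.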

# Gradio interface
with gr.Blocks() as app:
    gr.Markdown("# PDF Question Answering with RAG")
    with gr.Row():
        pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
        question_input = gr.Textbox(label="Enter your question", placeholder="What do you want to know?")
    answer_output = gr.Textbox(label="Answer")
    submit_button = gr.Button("Get Answer")
    
    submit_button.click(
        process_pdf_and_answer_question,
        inputs=[pdf_input, question_input],
        outputs=answer_output
    )

# Run the app
if __name__ == "__main__":
    app.launch()
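
# Programmatic smoke test (illustrative; "sample.pdf" is a placeholder path):
#     print(process_pdf_and_answer_question("sample.pdf", "What is this PDF about?"))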