adnaniqbal001 committed
Commit c068e17 · verified · 1 Parent(s): d5b3b40

Create app.py

Files changed (1): app.py (+100, -0)
app.py ADDED
# Import necessary libraries
import PyPDF2
from langchain.text_splitter import CharacterTextSplitter
from sentence_transformers import SentenceTransformer
import chromadb
from transformers import pipeline
import gradio as gr

# Step 1: Extract text from uploaded PDF
def extract_text_from_pdf(pdf_file):
    # gr.File may pass a file path string or an object exposing a .name path
    path = pdf_file.name if hasattr(pdf_file, "name") else pdf_file
    reader = PyPDF2.PdfReader(path)
    text = ""
    for page in reader.pages:
        # extract_text() returns None for pages with no extractable text
        text += page.extract_text() or ""
    return text

# Step 2: Chunk the text
def chunk_text(text, chunk_size=500, overlap=50):
    splitter = CharacterTextSplitter(
        separator=" ",
        chunk_size=chunk_size,
        chunk_overlap=overlap,
        length_function=len
    )
    chunks = splitter.split_text(text)
    return chunks

# Step 3: Generate embeddings
def generate_embeddings(chunks):
    model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = model.encode(chunks, show_progress_bar=False)
    return embeddings

# Step 4: Store embeddings in a retriever
def create_retriever(chunks, embeddings):
    client = chromadb.Client()
    # Drop any collection left over from a previous question so that
    # create_collection does not fail on repeat submissions
    try:
        client.delete_collection("pdf_chunks")
    except Exception:
        pass
    collection = client.create_collection("pdf_chunks")
    for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
        collection.add(
            ids=[str(i)],
            documents=[chunk],
            embeddings=[embedding.tolist()]
        )
    return collection

# Step 5: Answer questions using RAG
def answer_question(question, retriever, embedding_model):
    query_embedding = embedding_model.encode([question])[0]
    results = retriever.query(query_embeddings=[query_embedding.tolist()], n_results=3)
    # Chroma returns one list of documents per query embedding
    retrieved_docs = results["documents"][0]

    # Combine the retrieved chunks for context
    context = " ".join(retrieved_docs)

    # Use a language model to answer the question
    qa_model = pipeline("text2text-generation", model="google/flan-t5-base")
    answer = qa_model(f"Context: {context} Question: {question}", max_length=200)[0]["generated_text"]
    return answer

# Define the main function for the app
def process_pdf_and_answer_question(pdf_file, question):
    # Extract text from the uploaded PDF
    text = extract_text_from_pdf(pdf_file)

    # Chunk the text
    chunks = chunk_text(text)

    # Generate embeddings
    embeddings = generate_embeddings(chunks)

    # Create retriever
    retriever = create_retriever(chunks, embeddings)

    # Load embedding model for the query
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

    # Answer the question
    answer = answer_question(question, retriever, embedding_model)
    return answer

# Gradio interface
with gr.Blocks() as app:
    gr.Markdown("# PDF Question Answering with RAG")
    with gr.Row():
        pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
        question_input = gr.Textbox(label="Enter your question", placeholder="What do you want to know?")
    answer_output = gr.Textbox(label="Answer")
    submit_button = gr.Button("Get Answer")

    submit_button.click(
        process_pdf_and_answer_question,
        inputs=[pdf_input, question_input],
        outputs=answer_output
    )

# Run the app
if __name__ == "__main__":
    app.launch()
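
For reference, a minimal requirements list inferred from the file's imports (the commit itself does not ship a requirements file, and pins no versions; package names below are the conventional PyPI names for these imports):

PyPDF2
langchain
sentence-transformers
chromadb
transformers
gradio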