NaimaAqeel committed
Commit 58fc57a · verified · 1 Parent(s): b667739

Update app.py

Files changed (1)
  1. app.py +138 -51
app.py CHANGED
@@ -1,65 +1,152 @@
-import gradio as gr
-from PyPDF2 import PdfReader
+import os
+import numpy as np
+import faiss
+import pickle
+from sentence_transformers import SentenceTransformer
 from transformers import pipeline
+import gradio as gr
+import fitz # PyMuPDF for PDFs
+import docx # python-docx for Word files
+
+# Initialize global variables
+index_path = "faiss_index.pkl"
+document_texts_path = "document_texts.pkl"
+
+# Load or initialize FAISS index and document chunks
+if os.path.exists(index_path) and os.path.exists(document_texts_path):
+    with open(index_path, "rb") as f:
+        index = pickle.load(f)
+    with open(document_texts_path, "rb") as f:
+        document_texts = pickle.load(f)
+else:
+    # Use 384 dim for all-MiniLM-L6-v2 model
+    dim = 384
+    index = faiss.IndexFlatL2(dim)
+    document_texts = []
+
+# Load SentenceTransformer for embeddings
+embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+
+# Initialize QA pipeline with a text generation model
+qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-small")

-# Load QA pipeline
-qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")
-
-# Function to extract text from PDF
-def extract_text_from_pdf(file):
-    reader = PdfReader(file)
-    text = ''
-    for page in reader.pages:
-        content = page.extract_text()
-        if content:
-            text += content
+def extract_text_from_pdf(file_path):
+    doc = fitz.open(file_path)
+    text = ""
+    for page in doc:
+        text += page.get_text()
+    doc.close()
     return text

-# Store context globally
-document_context = {"text": ""}
-
-# Function to set context from PDF or text
-def set_context(pdf_file, text_input):
-    if pdf_file:
-        extracted = extract_text_from_pdf(pdf_file)
-        document_context["text"] = extracted
-        return "PDF uploaded and processed successfully!"
-    elif text_input.strip():
-        document_context["text"] = text_input.strip()
-        return "Text received and stored successfully!"
-    else:
-        return "Please upload a PDF or provide some text."
-
-# Function to answer questions based on stored context
-def answer_question(question):
-    context = document_context["text"]
-    if not context:
-        return "Please upload a document or enter some text first."
-    if not question.strip():
-        return "Please enter a question."
+def extract_text_from_docx(file_path):
+    doc = docx.Document(file_path)
+    fullText = []
+    for para in doc.paragraphs:
+        fullText.append(para.text)
+    return "\n".join(fullText)
+
+def chunk_text(text, max_len=500):
+    """Split text into chunks of max_len characters, trying to split at sentence boundaries."""
+    import re
+    sentences = re.split(r'(?<=[.!?]) +', text)
+    chunks = []
+    current_chunk = ""
+    for sent in sentences:
+        if len(current_chunk) + len(sent) + 1 <= max_len:
+            current_chunk += sent + " "
+        else:
+            chunks.append(current_chunk.strip())
+            current_chunk = sent + " "
+    if current_chunk:
+        chunks.append(current_chunk.strip())
+    return chunks
+
+def get_embeddings(texts, is_query=False):
+    if isinstance(texts, str):
+        texts = [texts]
+    embeddings = embedder.encode(texts, convert_to_numpy=True, normalize_embeddings=True)
+    return embeddings
+
+def upload_document(file):
+    global index, document_texts
+
+    ext = os.path.splitext(file.name)[-1].lower()
     try:
-        result = qa_pipeline(question=question, context=context)
-        return result["answer"]
+        if ext == ".pdf":
+            text = extract_text_from_pdf(file.file.name)
+        elif ext == ".docx":
+            text = extract_text_from_docx(file.file.name)
+        else:
+            return "Unsupported file type. Please upload a PDF or DOCX file."
     except Exception as e:
-        return f"Error during QA: {str(e)}"
+        return f"Failed to extract text: {str(e)}"

-# Gradio Interface
-with gr.Blocks() as demo:
-    gr.Markdown("# 📄 Ask Questions from a Document")
-    gr.Markdown("Upload a PDF or paste some text, then ask questions about it!")
+    if not text.strip():
+        return "Failed to extract any text from the document."

-    with gr.Row():
-        pdf_input = gr.File(label="Upload PDF (optional)", type="binary")
-        text_input = gr.Textbox(label="Or paste text here", lines=8, placeholder="Paste your document text...")
+    chunks = chunk_text(text)
+    embeddings = get_embeddings(chunks)
+
+    # Convert FAISS index to IDMap to allow adding new vectors incrementally
+    if not isinstance(index, faiss.IndexIDMap):
+        id_map = faiss.IndexIDMap(index)
+        index = id_map
+
+    start_id = len(document_texts)
+    ids = np.arange(start_id, start_id + len(chunks))
+
+    index.add_with_ids(embeddings.astype('float32'), ids)
+    document_texts.extend(chunks)

-    upload_btn = gr.Button("Submit Document")
-    upload_output = gr.Textbox(label="Status", interactive=False)
+    # Save index and texts
+    with open(index_path, "wb") as f:
+        pickle.dump(index, f)
+    with open(document_texts_path, "wb") as f:
+        pickle.dump(document_texts, f)

-    question_input = gr.Textbox(label="Ask a Question", placeholder="Type your question here...")
-    answer_output = gr.Textbox(label="Answer", interactive=False)
+    return f"Document uploaded and indexed successfully with {len(chunks)} chunks."

-    upload_btn.click(set_context, inputs=[pdf_input, text_input], outputs=upload_output)
-    question_input.change(answer_question, inputs=question_input, outputs=answer_output)
+def generate_answer_from_file(query, top_k=5):
+    global index, document_texts
+
+    if len(document_texts) == 0:
+        return "No document uploaded yet. Please upload a PDF or DOCX file first."
+
+    query_vec = get_embeddings(query, is_query=True).astype("float32")
+    scores, indices = index.search(query_vec, top_k)
+    retrieved_chunks = [document_texts[i] for i in indices[0] if i < len(document_texts)]
+
+    context = "\n\n".join(retrieved_chunks)
+
+    prompt = (
+        "You are a helpful assistant reading a document.\n\n"
+        "Context:\n"
+        f"{context}\n\n"
+        f"Question: {query}\n"
+        "Answer:"
+    )
+
+    # Generate answer with max length 256 tokens
+    result = qa_pipeline(prompt, max_length=256, do_sample=False)[0]['generated_text']
+
+    return result.strip()
+
+with gr.Blocks() as demo:
+    gr.Markdown("## Document Question Answering App\nUpload a PDF or DOCX file, then ask questions based on it.")
+
+    with gr.Row():
+        file_input = gr.File(label="Upload PDF or DOCX file", file_types=['.pdf', '.docx'])
+        upload_btn = gr.Button("Upload & Index Document")
+
+    upload_output = gr.Textbox(label="Upload Status", interactive=False)
+
+    question = gr.Textbox(label="Enter your question here")
+    answer = gr.Textbox(label="Answer", interactive=False)
+    ask_btn = gr.Button("Ask")
+
+    upload_btn.click(upload_document, inputs=file_input, outputs=upload_output)
+    ask_btn.click(generate_answer_from_file, inputs=question, outputs=answer)

 demo.launch()

+
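
As a quick sanity check of the new retrieval path, the snippet below sketches how the helpers introduced in this version of app.py (chunk_text, get_embeddings, the FAISS index, and the flan-t5 qa_pipeline) could be exercised outside Gradio. It assumes app.py has already been imported or run so those names exist; the sample text, the query, and the demo_index name are illustrative only and not part of the commit.

# Illustrative sketch only: exercises chunk_text / get_embeddings / qa_pipeline
# from the updated app.py against a throwaway FAISS index. Sample strings and
# the demo_index name are made up for this example.
import numpy as np
import faiss

sample_text = (
    "FAISS is a library for efficient similarity search. "
    "Sentence embeddings map text to vectors. "
    "The app retrieves the most similar chunks before answering."
)

chunks = chunk_text(sample_text, max_len=120)        # sentence-aware chunks
vectors = get_embeddings(chunks).astype("float32")   # shape: (n_chunks, 384)

demo_index = faiss.IndexIDMap(faiss.IndexFlatL2(vectors.shape[1]))
demo_index.add_with_ids(vectors, np.arange(len(chunks), dtype="int64"))

query = "What does the app retrieve before answering?"
query_vec = get_embeddings(query).astype("float32")
scores, ids = demo_index.search(query_vec, 2)        # top-2 nearest chunks
context = "\n\n".join(chunks[i] for i in ids[0] if i != -1)

prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"
print(qa_pipeline(prompt, max_length=64, do_sample=False)[0]["generated_text"])

Running either this sketch or the updated app itself requires numpy, faiss-cpu, sentence-transformers, transformers, gradio, PyMuPDF (imported as fitz), and python-docx to be installed.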