NaimaAqeel committed
Commit 9502a66 (verified) · Parent: d00c686

Update app.py

Files changed (1): app.py (+34, -28)
app.py CHANGED
@@ -8,9 +8,9 @@ from transformers import AutoModel, AutoTokenizer, pipeline
 import faiss
 import torch
 
-# =============================================
-# EMBEDDING MODEL SETUP
-# =============================================
+# ===============================
+# EMBEDDING MODEL
+# ===============================
 model_name = "sentence-transformers/all-MiniLM-L6-v2"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 embedding_model = AutoModel.from_pretrained(model_name)
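A note on this hunk: `get_embeddings` (unchanged, visible in the next hunk's context) pools with the CLS token (`last_hidden_state[:, 0]`), but all-MiniLM-L6-v2 was trained for mean pooling with L2 normalization, and the `IndexFlatIP` created later only behaves as cosine similarity on unit-length vectors. A minimal sketch of that pooling, reusing the `tokenizer` and `embedding_model` from this hunk (the function name is hypothetical):

```python
import torch
import torch.nn.functional as F

def get_embeddings_mean_pooled(texts):
    # Hypothetical alternative to the app's CLS-token get_embeddings.
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        outputs = embedding_model(**inputs)
    # Average token embeddings, masking out padding positions.
    mask = inputs["attention_mask"].unsqueeze(-1).float()
    mean = (outputs.last_hidden_state * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
    # Unit-normalize so IndexFlatIP's inner product equals cosine similarity.
    return F.normalize(mean, p=2, dim=1).cpu().numpy()
```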
@@ -23,9 +23,9 @@ def get_embeddings(texts):
     outputs = embedding_model(**inputs)
     return outputs.last_hidden_state[:, 0].cpu().numpy()
 
-# =============================================
+# ===============================
 # TEXT CHUNKING
-# =============================================
+# ===============================
 def chunk_text(text, chunk_size=500, overlap=50):
     chunks = []
     start = 0
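The chunker's loop body falls between hunks, but the `start += chunk_size - overlap` context in the next hunk shows it is a fixed-size sliding window over characters. Assuming the elided loop is `while start < len(text)`, the window arithmetic works out like this:

```python
# 1200 characters, 500-char windows, 50-char overlap:
# windows start at 0, 450, 900 -> chunks of 500, 500, and 300 characters.
chunks = chunk_text("x" * 1200, chunk_size=500, overlap=50)
print([len(c) for c in chunks])  # [500, 500, 300] under the assumption above
```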
@@ -35,14 +35,14 @@ def chunk_text(text, chunk_size=500, overlap=50):
         start += chunk_size - overlap
     return chunks
 
-# =============================================
+# ===============================
 # FAISS INDEX SETUP
-# =============================================
+# ===============================
 index_path = "faiss_index.pkl"
 document_texts_path = "document_texts.pkl"
 document_texts = []
-
-embedding_dim = 384 # for all-MiniLM-L6-v2
+embedding_dim = 384
+
 if os.path.exists(index_path) and os.path.exists(document_texts_path):
     try:
         with open(index_path, "rb") as f:
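A caveat on this hunk: `faiss_index.pkl` implies the index is persisted with `pickle`, but `faiss.Index` wraps a C++ object and plain pickling is not dependable across faiss versions. faiss ships native serializers; a sketch of safer persistence (the `faiss_index.bin` filename is invented for illustration):

```python
import pickle
import faiss

def save_state(index, texts):
    # faiss's own on-disk format; works where pickle may not.
    faiss.write_index(index, "faiss_index.bin")
    with open(document_texts_path, "wb") as f:
        pickle.dump(texts, f)  # a plain list of strings pickles fine

def load_state():
    index = faiss.read_index("faiss_index.bin")
    with open(document_texts_path, "rb") as f:
        texts = pickle.load(f)
    return index, texts
```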
@@ -55,9 +55,9 @@ if os.path.exists(index_path) and os.path.exists(document_texts_path):
 else:
     index = faiss.IndexFlatIP(embedding_dim)
 
-# =============================================
-# DOCUMENT PROCESSING
-# =============================================
+# ===============================
+# FILE EXTRACTORS
+# ===============================
 def extract_text_from_pdf(path):
     text = ""
     try:
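The body of `extract_text_from_pdf` is elided here, and the diff never shows which PDF library the app imports. For orientation only, a typical implementation with pypdf (an assumption, not the file's code) follows the same shape as the DOCX extractor in the next hunk:

```python
from pypdf import PdfReader

def extract_text_from_pdf_sketch(path):
    # Hypothetical stand-in for the elided extractor body.
    text = ""
    try:
        for page in PdfReader(path).pages:
            text += page.extract_text() or ""
    except Exception as e:
        print(f"PDF error: {e}")
    return text
```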
@@ -77,9 +77,9 @@ def extract_text_from_docx(path):
         print(f"DOCX error: {e}")
     return text
 
-# =============================================
-# UPLOAD AND INDEX FILE
-# =============================================
+# ===============================
+# UPLOAD HANDLER
+# ===============================
 def upload_document(file):
     ext = os.path.splitext(file.name)[-1].lower()
     if ext == ".pdf":
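The middle of `upload_document` also falls between hunks, but the surrounding context (the extractors above, the module-level `index` and `document_texts`, and the success message in the next hunk) implies an extract, chunk, embed, add flow. A sketch of that shape, not the file's exact code:

```python
# Inside upload_document, once the extension check has picked an extractor:
text = extract_text_from_pdf(file.name)    # or extract_text_from_docx(file.name)
chunks = chunk_text(text)
index.add(get_embeddings(chunks))          # vectors join the FAISS index
document_texts.extend(chunks)              # raw chunks kept for retrieval
```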
@@ -101,12 +101,12 @@ def upload_document(file):
 
     return "Document uploaded and indexed successfully."
 
-# =============================================
-# QA PIPELINE WITH FLAN-T5
-# =============================================
+# ===============================
+# GENERATION PIPELINE (FLAN-T5)
+# ===============================
 qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-base")
 
-def generate_answer_from_file(query, top_k=3):
+def generate_answer_from_file(query, top_k=5):
     if not document_texts:
         return "No documents indexed yet."
 
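The retrieval call itself is elided between this hunk and the next; the `indices[0]` context below implies a standard FAISS lookup. A sketch of that step (again an assumption, not the file's code):

```python
# Embed the query and fetch the top_k inner-product neighbours.
query_vec = get_embeddings([query])
scores, indices = index.search(query_vec, top_k)
```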
@@ -115,27 +115,33 @@ def generate_answer_from_file(query, top_k=3):
     retrieved_chunks = [document_texts[i] for i in indices[0]]
     context = " ".join(retrieved_chunks)
 
-    prompt = f"Context: {context}\n\nQuestion: {query}\nAnswer:"
-    result = qa_pipeline(prompt, max_length=200)[0]['generated_text']
-    return result
+    prompt = (
+        f"Use the following context from a textbook or academic document to answer the question accurately and in detail.\n\n"
+        f"Context:\n{context}\n\n"
+        f"Question: {query}\n\n"
+        f"Answer:"
+    )
 
-# =============================================
-# GRADIO UI
-# =============================================
+    result = qa_pipeline(prompt, max_length=512, do_sample=False)[0]['generated_text']
+    return result.strip()
+
+# ===============================
+# GRADIO INTERFACES
+# ===============================
 upload_interface = gr.Interface(
     fn=upload_document,
     inputs=gr.File(file_types=[".pdf", ".docx"]),
     outputs="text",
     title="Upload Document",
-    description="Upload a Word or PDF file to index it for question answering."
+    description="Upload your Word or PDF document for question answering."
 )
 
 search_interface = gr.Interface(
     fn=generate_answer_from_file,
-    inputs=gr.Textbox(placeholder="Ask a question about the uploaded document..."),
+    inputs=gr.Textbox(placeholder="Ask your question about the uploaded document..."),
     outputs="text",
-    title="Ask Your Document",
-    description="Ask any question. The chatbot will read the document and answer like ChatGPT."
+    title="Ask the Document",
+    description="Ask questions about the uploaded content. The chatbot will answer based on the document."
 )
 
 app = gr.TabbedInterface([upload_interface, search_interface], ["Upload", "Ask"])
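A closing note on the generation change: `max_length=512` bounds the output, while flan-t5-base's encoder input is also effectively capped at 512 tokens, so a prompt built from five ~500-character chunks can be silently truncated before the question is seen. A hedged guard that clips the context rather than the prompt's tail (`clip_context` is a made-up helper using the pipeline's own tokenizer):

```python
def clip_context(context, query, limit=512):
    tok = qa_pipeline.tokenizer
    # Tokens consumed by everything except the context itself.
    frame = (
        "Use the following context from a textbook or academic document "
        f"to answer the question accurately and in detail.\n\nContext:\n\n\n"
        f"Question: {query}\n\nAnswer:"
    )
    budget = max(limit - len(tok(frame)["input_ids"]), 1)
    ids = tok(context, truncation=True, max_length=budget)["input_ids"]
    return tok.decode(ids, skip_special_tokens=True)
```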
 