NaimaAqeel committed on
Commit
d87413b
·
verified ·
1 Parent(s): a028e27

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -33
app.py CHANGED
@@ -4,7 +4,7 @@ import numpy as np
4
  import gradio as gr
5
  import fitz # PyMuPDF
6
  from docx import Document
7
- from transformers import AutoModel, AutoTokenizer
8
  import faiss
9
  import torch
10
 
@@ -24,13 +24,25 @@ def get_embeddings(texts):
24
  return outputs.last_hidden_state[:, 0].cpu().numpy()
25
 
26
  # =============================================
27
- # DOCUMENT STORAGE SETUP
 
 
 
 
 
 
 
 
 
 
 
 
28
  # =============================================
29
  index_path = "faiss_index.pkl"
30
  document_texts_path = "document_texts.pkl"
31
  document_texts = []
32
 
33
- embedding_dim = 384 # Dimension for all-MiniLM-L6-v2
34
  if os.path.exists(index_path) and os.path.exists(document_texts_path):
35
  try:
36
  with open(index_path, "rb") as f:
@@ -44,29 +56,29 @@ else:
44
  index = faiss.IndexFlatIP(embedding_dim)
45
 
46
  # =============================================
47
- # DOCUMENT PROCESSING FUNCTIONS
48
  # =============================================
49
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in the PDF at *pdf_path*.

    Extraction is best-effort: any error is printed and whatever text was
    collected before the failure (possibly the empty string) is returned.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        The page texts joined together in page order.
    """
    page_texts = []
    try:
        # The context manager closes the document even if a page fails,
        # so the underlying file handle is not leaked.
        with fitz.open(pdf_path) as doc:
            for page in doc:
                page_texts.append(page.get_text())
    except Exception as e:
        print(f"PDF error: {e}")
    return "".join(page_texts)
58
 
59
def extract_text_from_docx(docx_path):
    """Return the paragraphs of the Word file at *docx_path*, newline-joined.

    Any error while opening or reading the file is printed and an empty
    string is returned instead.
    """
    try:
        paragraphs = Document(docx_path).paragraphs
        return "\n".join(paragraph.text for paragraph in paragraphs)
    except Exception as err:
        print(f"DOCX error: {err}")
        return ""
67
 
68
  # =============================================
69
- # DOCUMENT UPLOAD HANDLER
70
  # =============================================
71
  def upload_document(file):
72
  ext = os.path.splitext(file.name)[-1].lower()
@@ -75,53 +87,56 @@ def upload_document(file):
75
  elif ext == ".docx":
76
  text = extract_text_from_docx(file.name)
77
  else:
78
- return "Unsupported file type"
79
 
80
- embedding = get_embeddings(text)
81
- index.add(embedding)
82
- document_texts.append(text)
 
83
 
84
- # Save updated index and texts
85
  with open(index_path, "wb") as f:
86
  pickle.dump(index, f)
87
  with open(document_texts_path, "wb") as f:
88
  pickle.dump(document_texts, f)
89
 
90
- return "Document uploaded and indexed successfully!"
91
 
92
  # =============================================
93
- # SEMANTIC SEARCH HANDLER
94
  # =============================================
95
def search_documents(query):
    """Return a Markdown preview of the indexed document closest to *query*.

    Args:
        query: Text to embed and look up in the FAISS index.

    Returns:
        A Markdown string with the first 1000 characters of the best match,
        or a short message when nothing is indexed or no match exists.
    """
    if not document_texts:
        return "No documents indexed yet."

    query_vector = get_embeddings(query)
    _, indices = index.search(query_vector, k=1)
    best_match_idx = int(indices[0][0])

    # FAISS reports "no neighbour" as -1, which Python indexing would read
    # as "last document"; also guard against an index/text list mismatch.
    if not 0 <= best_match_idx < len(document_texts):
        return "No matching document found."

    return f"**Best Match:**\n\n{document_texts[best_match_idx][:1000]}..."
 
 
104
 
105
  # =============================================
106
- # GRADIO INTERFACE
107
  # =============================================
108
# Upload tab: passes the selected PDF/DOCX file to upload_document.
upload_interface = gr.Interface(
    upload_document,
    gr.File(file_types=[".pdf", ".docx"]),
    "text",
    title="Upload PDF/DOCX",
    description="Upload a PDF or Word document to be indexed for semantic search.",
)

# Search tab: passes the typed query to search_documents and renders Markdown.
search_interface = gr.Interface(
    search_documents,
    gr.Textbox(placeholder="Enter your question or search query here..."),
    "markdown",
    title="Semantic Search",
    description="Search for content in uploaded documents using natural language.",
)

# Both tabs are combined into a single tabbed application.
tabs = [upload_interface, search_interface]
tab_names = ["Upload Document", "Search Document"]
app = gr.TabbedInterface(tabs, tab_names)

if __name__ == "__main__":
    app.launch()
 
4
  import gradio as gr
5
  import fitz # PyMuPDF
6
  from docx import Document
7
+ from transformers import AutoModel, AutoTokenizer, pipeline
8
  import faiss
9
  import torch
10
 
 
24
  return outputs.last_hidden_state[:, 0].cpu().numpy()
25
 
26
  # =============================================
27
+ # TEXT CHUNKING
28
+ # =============================================
29
def chunk_text(text, chunk_size=500, overlap=50):
    """Split *text* into fixed-size character windows that overlap.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks; must be
            non-negative and smaller than ``chunk_size``.

    Returns:
        The chunks in order of appearance. Empty input yields ``[]``.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            outside ``[0, chunk_size)``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # A step of zero or less never advances, so the loop would not end.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(text), step):
        chunks.append(text[start:start + chunk_size])
        if start + chunk_size >= len(text):
            # This window already reaches the end of the text; another one
            # would only repeat characters the current chunk contains.
            break
    return chunks
37
+
38
+ # =============================================
39
+ # FAISS INDEX SETUP
40
  # =============================================
41
  index_path = "faiss_index.pkl"
42
  document_texts_path = "document_texts.pkl"
43
  document_texts = []
44
 
45
+ embedding_dim = 384 # for all-MiniLM-L6-v2
46
  if os.path.exists(index_path) and os.path.exists(document_texts_path):
47
  try:
48
  with open(index_path, "rb") as f:
 
56
  index = faiss.IndexFlatIP(embedding_dim)
57
 
58
  # =============================================
59
+ # DOCUMENT PROCESSING
60
  # =============================================
61
def extract_text_from_pdf(path):
    """Return the concatenated text of every page in the PDF at *path*.

    Extraction is best-effort: any error is printed and whatever text was
    collected before the failure (possibly the empty string) is returned.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The page texts joined together in page order.
    """
    page_texts = []
    try:
        # The context manager closes the document even if a page fails,
        # so the underlying file handle is not leaked.
        with fitz.open(path) as doc:
            for page in doc:
                page_texts.append(page.get_text())
    except Exception as e:
        print(f"PDF error: {e}")
    return "".join(page_texts)
70
 
71
def extract_text_from_docx(path):
    """Return the paragraphs of the Word file at *path*, joined by newlines.

    Any error while opening or reading the file is printed and an empty
    string is returned instead.
    """
    try:
        paragraphs = Document(path).paragraphs
        return "\n".join(paragraph.text for paragraph in paragraphs)
    except Exception as err:
        print(f"DOCX error: {err}")
        return ""
79
 
80
  # =============================================
81
+ # UPLOAD AND INDEX FILE
82
  # =============================================
83
  def upload_document(file):
84
  ext = os.path.splitext(file.name)[-1].lower()
 
87
  elif ext == ".docx":
88
  text = extract_text_from_docx(file.name)
89
  else:
90
+ return "Unsupported file type."
91
 
92
+ chunks = chunk_text(text)
93
+ chunk_embeddings = get_embeddings(chunks)
94
+ index.add(np.array(chunk_embeddings).astype('float32'))
95
+ document_texts.extend(chunks)
96
 
 
97
  with open(index_path, "wb") as f:
98
  pickle.dump(index, f)
99
  with open(document_texts_path, "wb") as f:
100
  pickle.dump(document_texts, f)
101
 
102
+ return "Document uploaded and indexed successfully."
103
 
104
  # =============================================
105
+ # QA PIPELINE WITH FLAN-T5
106
  # =============================================
107
+ qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-base")
108
+
109
def generate_answer_from_file(query, top_k=3):
    """Answer *query* using the most similar indexed chunks as context.

    The query is embedded, the nearest chunks are looked up in the FAISS
    index, and their text is placed in a prompt for the text2text pipeline.

    Args:
        query: The user's question.
        top_k: Maximum number of chunks to retrieve as context.

    Returns:
        The generated answer text, or a short message when nothing has been
        indexed yet.
    """
    if not document_texts:
        return "No documents indexed yet."

    # Never request more neighbours than there are stored chunks.
    k = min(top_k, len(document_texts))
    query_vector = get_embeddings(query).astype("float32")
    _, indices = index.search(query_vector, k=k)

    # FAISS pads missing results with -1, which Python indexing would read
    # as "last chunk"; keep only ids that address a stored chunk.
    retrieved_chunks = [
        document_texts[i] for i in indices[0] if 0 <= i < len(document_texts)
    ]
    context = " ".join(retrieved_chunks)

    prompt = f"Context: {context}\n\nQuestion: {query}\nAnswer:"
    result = qa_pipeline(prompt, max_length=200)[0]['generated_text']
    return result
121
 
122
  # =============================================
123
+ # GRADIO UI
124
  # =============================================
125
# Upload tab: passes the selected PDF/DOCX file to upload_document.
upload_interface = gr.Interface(
    upload_document,
    gr.File(file_types=[".pdf", ".docx"]),
    "text",
    title="Upload Document",
    description="Upload a Word or PDF file to index it for question answering.",
)

# Ask tab: passes the typed question to generate_answer_from_file.
search_interface = gr.Interface(
    generate_answer_from_file,
    gr.Textbox(placeholder="Ask a question about the uploaded document..."),
    "text",
    title="Ask Your Document",
    description="Ask any question. The chatbot will read the document and answer like ChatGPT.",
)

# Both tabs are combined into a single tabbed application and served.
tabs = [upload_interface, search_interface]
tab_names = ["Upload", "Ask"]
app = gr.TabbedInterface(tabs, tab_names)
app.launch()