NaimaAqeel committed on
Commit
f01a813
·
verified ·
1 Parent(s): ca47d69

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -17
app.py CHANGED
@@ -1,4 +1,4 @@
1
- import os
2
  import pickle
3
  import numpy as np
4
  import gradio as gr
@@ -25,9 +25,10 @@ def get_embeddings(texts, is_query=False):
25
  with torch.no_grad():
26
  model_output = embedding_model(**inputs)
27
 
28
- embeddings = model_output.last_hidden_state[:, 0] # CLS token embeddings
29
  return embeddings.cpu().numpy()
30
 
 
31
  # ===============================
32
  # TEXT CHUNKING
33
  # ===============================
@@ -46,7 +47,7 @@ def chunk_text(text, chunk_size=800, overlap=100):
46
  index_path = "faiss_index.pkl"
47
  document_texts_path = "document_texts.pkl"
48
  document_texts = []
49
- embedding_dim = 768 # E5-small-v2 embedding dimension
50
 
51
  if os.path.exists(index_path) and os.path.exists(document_texts_path):
52
  try:
@@ -87,21 +88,15 @@ def extract_text_from_docx(path):
87
  # ===============================
88
  def upload_document(file):
89
  ext = os.path.splitext(file.name)[-1].lower()
90
-
91
- # Save uploaded file temporarily
92
- temp_path = f"temp_upload{ext}"
93
- with open(temp_path, "wb") as f:
94
- f.write(file.read())
95
-
96
  if ext == ".pdf":
97
- text = extract_text_from_pdf(temp_path)
98
  elif ext == ".docx":
99
- text = extract_text_from_docx(temp_path)
100
  else:
101
  return "Unsupported file type."
102
 
103
  chunks = chunk_text(text)
104
- chunk_embeddings = get_embeddings(chunks, is_query=False)
105
  index.add(np.array(chunk_embeddings).astype('float32'))
106
  document_texts.extend(chunks)
107
 
@@ -110,12 +105,8 @@ def upload_document(file):
110
  with open(document_texts_path, "wb") as f:
111
  pickle.dump(document_texts, f)
112
 
113
- # Remove the temporary file after processing (optional)
114
- os.remove(temp_path)
115
-
116
  return "Document uploaded and indexed successfully."
117
 
118
-
119
  # ===============================
120
  # GENERATION PIPELINE (FLAN-T5)
121
  # ===============================
@@ -125,7 +116,7 @@ def generate_answer_from_file(query, top_k=10):
125
  if not document_texts:
126
  return "No documents indexed yet."
127
 
128
- query_vector = get_embeddings(query, is_query=True).astype("float32")
129
  scores, indices = index.search(query_vector, k=top_k)
130
  retrieved_chunks = [document_texts[i] for i in indices[0]]
131
  context = "\n\n".join(retrieved_chunks)
@@ -170,3 +161,5 @@ search_interface = gr.Interface(
170
 
171
  app = gr.TabbedInterface([upload_interface, search_interface], ["Upload", "Ask"])
172
  app.launch()
 
 
 
1
+ Now explain how this code works — I want to understand it deeply. import os
2
  import pickle
3
  import numpy as np
4
  import gradio as gr
 
25
  with torch.no_grad():
26
  model_output = embedding_model(**inputs)
27
 
28
+ embeddings = model_output.last_hidden_state[:, 0] # CLS token
29
  return embeddings.cpu().numpy()
30
 
31
+
32
  # ===============================
33
  # TEXT CHUNKING
34
  # ===============================
 
47
  index_path = "faiss_index.pkl"
48
  document_texts_path = "document_texts.pkl"
49
  document_texts = []
50
+ embedding_dim = 384
51
 
52
  if os.path.exists(index_path) and os.path.exists(document_texts_path):
53
  try:
 
88
  # ===============================
89
  def upload_document(file):
90
  ext = os.path.splitext(file.name)[-1].lower()
 
 
 
 
 
 
91
  if ext == ".pdf":
92
+ text = extract_text_from_pdf(file.name)
93
  elif ext == ".docx":
94
+ text = extract_text_from_docx(file.name)
95
  else:
96
  return "Unsupported file type."
97
 
98
  chunks = chunk_text(text)
99
+ chunk_embeddings = get_embeddings(chunks)
100
  index.add(np.array(chunk_embeddings).astype('float32'))
101
  document_texts.extend(chunks)
102
 
 
105
  with open(document_texts_path, "wb") as f:
106
  pickle.dump(document_texts, f)
107
 
 
 
 
108
  return "Document uploaded and indexed successfully."
109
 
 
110
  # ===============================
111
  # GENERATION PIPELINE (FLAN-T5)
112
  # ===============================
 
116
  if not document_texts:
117
  return "No documents indexed yet."
118
 
119
+ query_vector = get_embeddings(query).astype("float32")
120
  scores, indices = index.search(query_vector, k=top_k)
121
  retrieved_chunks = [document_texts[i] for i in indices[0]]
122
  context = "\n\n".join(retrieved_chunks)
 
161
 
162
  app = gr.TabbedInterface([upload_interface, search_interface], ["Upload", "Ask"])
163
  app.launch()
164
+
165
+