NaimaAqeel committed on
Commit
6a2ef85
·
verified ·
1 Parent(s): 2737463

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -10
app.py CHANGED
@@ -9,19 +9,24 @@ import faiss
9
  import torch
10
 
11
  # ===============================
12
- # EMBEDDING MODEL
13
  # ===============================
14
- model_name = "sentence-transformers/all-MiniLM-L6-v2"
15
  tokenizer = AutoTokenizer.from_pretrained(model_name)
16
  embedding_model = AutoModel.from_pretrained(model_name)
17
 
18
- def get_embeddings(texts):
19
  if isinstance(texts, str):
20
  texts = [texts]
 
 
 
21
  inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
22
  with torch.no_grad():
23
- outputs = embedding_model(**inputs)
24
- return outputs.last_hidden_state[:, 0].cpu().numpy()
 
 
25
 
26
  # ===============================
27
  # TEXT CHUNKING
@@ -41,7 +46,7 @@ def chunk_text(text, chunk_size=800, overlap=100):
41
  index_path = "faiss_index.pkl"
42
  document_texts_path = "document_texts.pkl"
43
  document_texts = []
44
- embedding_dim = 384
45
 
46
  if os.path.exists(index_path) and os.path.exists(document_texts_path):
47
  try:
@@ -90,7 +95,7 @@ def upload_document(file):
90
  return "Unsupported file type."
91
 
92
  chunks = chunk_text(text)
93
- chunk_embeddings = get_embeddings(chunks)
94
  index.add(np.array(chunk_embeddings).astype('float32'))
95
  document_texts.extend(chunks)
96
 
@@ -110,7 +115,7 @@ def generate_answer_from_file(query, top_k=10):
110
  if not document_texts:
111
  return "No documents indexed yet."
112
 
113
- query_vector = get_embeddings(query).astype("float32")
114
  scores, indices = index.search(query_vector, k=top_k)
115
  retrieved_chunks = [document_texts[i] for i in indices[0]]
116
  context = "\n\n".join(retrieved_chunks)
@@ -155,5 +160,3 @@ search_interface = gr.Interface(
155
 
156
  app = gr.TabbedInterface([upload_interface, search_interface], ["Upload", "Ask"])
157
  app.launch()
158
-
159
-
 
9
  import torch
10
 
11
  # ===============================
12
+ # EMBEDDING MODEL (E5)
13
  # ===============================
14
+ model_name = "intfloat/e5-small-v2"
15
  tokenizer = AutoTokenizer.from_pretrained(model_name)
16
  embedding_model = AutoModel.from_pretrained(model_name)
17
 
18
+ def get_embeddings(texts, is_query=False):
19
  if isinstance(texts, str):
20
  texts = [texts]
21
+ prefix = "query: " if is_query else "passage: "
22
+ texts = [prefix + t for t in texts]
23
+
24
  inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
25
  with torch.no_grad():
26
+ model_output = embedding_model(**inputs)
27
+
28
+ embeddings = model_output.last_hidden_state[:, 0] # CLS token embeddings
29
+ return embeddings.cpu().numpy()
30
 
31
  # ===============================
32
  # TEXT CHUNKING
 
46
  index_path = "faiss_index.pkl"
47
  document_texts_path = "document_texts.pkl"
48
  document_texts = []
49
+ embedding_dim = 384 # E5-small-v2 embedding dimension
50
 
51
  if os.path.exists(index_path) and os.path.exists(document_texts_path):
52
  try:
 
95
  return "Unsupported file type."
96
 
97
  chunks = chunk_text(text)
98
+ chunk_embeddings = get_embeddings(chunks, is_query=False)
99
  index.add(np.array(chunk_embeddings).astype('float32'))
100
  document_texts.extend(chunks)
101
 
 
115
  if not document_texts:
116
  return "No documents indexed yet."
117
 
118
+ query_vector = get_embeddings(query, is_query=True).astype("float32")
119
  scores, indices = index.search(query_vector, k=top_k)
120
  retrieved_chunks = [document_texts[i] for i in indices[0]]
121
  context = "\n\n".join(retrieved_chunks)
 
160
 
161
  app = gr.TabbedInterface([upload_interface, search_interface], ["Upload", "Ask"])
162
  app.launch()