Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -9,19 +9,24 @@ import faiss
|
|
9 |
import torch
|
10 |
|
11 |
# ===============================
|
12 |
-
# EMBEDDING MODEL
|
13 |
# ===============================
|
14 |
-
model_name = "
|
15 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
16 |
embedding_model = AutoModel.from_pretrained(model_name)
|
17 |
|
18 |
-
def get_embeddings(texts):
|
19 |
if isinstance(texts, str):
|
20 |
texts = [texts]
|
|
|
|
|
|
|
21 |
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
|
22 |
with torch.no_grad():
|
23 |
-
|
24 |
-
|
|
|
|
|
25 |
|
26 |
# ===============================
|
27 |
# TEXT CHUNKING
|
@@ -41,7 +46,7 @@ def chunk_text(text, chunk_size=800, overlap=100):
|
|
41 |
index_path = "faiss_index.pkl"
|
42 |
document_texts_path = "document_texts.pkl"
|
43 |
document_texts = []
|
44 |
-
embedding_dim =
|
45 |
|
46 |
if os.path.exists(index_path) and os.path.exists(document_texts_path):
|
47 |
try:
|
@@ -90,7 +95,7 @@ def upload_document(file):
|
|
90 |
return "Unsupported file type."
|
91 |
|
92 |
chunks = chunk_text(text)
|
93 |
-
chunk_embeddings = get_embeddings(chunks)
|
94 |
index.add(np.array(chunk_embeddings).astype('float32'))
|
95 |
document_texts.extend(chunks)
|
96 |
|
@@ -110,7 +115,7 @@ def generate_answer_from_file(query, top_k=10):
|
|
110 |
if not document_texts:
|
111 |
return "No documents indexed yet."
|
112 |
|
113 |
-
query_vector = get_embeddings(query).astype("float32")
|
114 |
scores, indices = index.search(query_vector, k=top_k)
|
115 |
retrieved_chunks = [document_texts[i] for i in indices[0]]
|
116 |
context = "\n\n".join(retrieved_chunks)
|
@@ -155,5 +160,3 @@ search_interface = gr.Interface(
|
|
155 |
|
156 |
app = gr.TabbedInterface([upload_interface, search_interface], ["Upload", "Ask"])
|
157 |
app.launch()
|
158 |
-
|
159 |
-
|
|
|
9 |
import torch
|
10 |
|
11 |
# ===============================
|
12 |
+
# EMBEDDING MODEL (E5)
|
13 |
# ===============================
|
14 |
+
model_name = "intfloat/e5-small-v2"
|
15 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
16 |
embedding_model = AutoModel.from_pretrained(model_name)
|
17 |
|
18 |
+
def get_embeddings(texts, is_query=False):
    """Embed one or more texts with the E5 model loaded at module level.

    Args:
        texts: A single string or a sequence of strings.
        is_query: When True each text is prefixed with "query: ",
            otherwise with "passage: ". E5 models expect these prefixes.

    Returns:
        A float32 numpy array with one row per input text. An empty input
        sequence yields an array with zero rows.
    """
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    if not texts:
        # Nothing to tokenize; return an empty batch with the model's width
        # instead of letting the tokenizer fail on an empty list.
        return np.empty((0, embedding_model.config.hidden_size), dtype=np.float32)

    prefix = "query: " if is_query else "passage: "
    inputs = tokenizer(
        [prefix + t for t in texts],
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=512,
    )
    with torch.no_grad():
        model_output = embedding_model(**inputs)

    # E5 is trained with average pooling over the real (non-padding) tokens,
    # not with the [CLS] vector, so weight the hidden states by the mask.
    hidden = model_output.last_hidden_state
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    embeddings = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
    # NOTE(review): the E5 model card also L2-normalizes the vectors; not done
    # here because the index metric is not visible from this block — confirm.
    return embeddings.cpu().numpy()
|
30 |
|
31 |
# ===============================
|
32 |
# TEXT CHUNKING
|
|
|
46 |
index_path = "faiss_index.pkl"
|
47 |
document_texts_path = "document_texts.pkl"
|
48 |
document_texts = []
|
49 |
+
embedding_dim = 384  # hidden size of intfloat/e5-small-v2 (not 768, which is the base-size models)
|
50 |
|
51 |
if os.path.exists(index_path) and os.path.exists(document_texts_path):
|
52 |
try:
|
|
|
95 |
return "Unsupported file type."
|
96 |
|
97 |
chunks = chunk_text(text)
|
98 |
+
chunk_embeddings = get_embeddings(chunks, is_query=False)
|
99 |
index.add(np.array(chunk_embeddings).astype('float32'))
|
100 |
document_texts.extend(chunks)
|
101 |
|
|
|
115 |
if not document_texts:
|
116 |
return "No documents indexed yet."
|
117 |
|
118 |
+
query_vector = get_embeddings(query, is_query=True).astype("float32")
|
119 |
scores, indices = index.search(query_vector, k=top_k)
|
120 |
retrieved_chunks = [document_texts[i] for i in indices[0]]
|
121 |
context = "\n\n".join(retrieved_chunks)
|
|
|
160 |
|
161 |
app = gr.TabbedInterface([upload_interface, search_interface], ["Upload", "Ask"])
|
162 |
app.launch()
|
|
|
|