Spaces:
Running
on
Zero
Running
on
Zero
Liam Dyer
committed on
idk if i could do this less efficiently
Browse files
app.py
CHANGED
@@ -153,7 +153,10 @@ def predict(queries, documents, max_characters) -> list[list[str]]:
|
|
153 |
# Getting a structure like [[chunk, ...]]
|
154 |
document_embeddings = [[] for _ in range(len(documents))]
|
155 |
total_chars = 0
|
156 |
-
while
|
|
|
|
|
|
|
157 |
for query, doc_scores in query_embeddings.items():
|
158 |
if len(doc_scores) == 0:
|
159 |
continue
|
@@ -176,6 +179,12 @@ def predict(queries, documents, max_characters) -> list[list[str]]:
|
|
176 |
document_embeddings[doc_idx].append(chunk_idx)
|
177 |
total_chars += len(chunk)
|
178 |
|
|
|
|
|
|
|
|
|
|
|
|
|
179 |
return document_embeddings
|
180 |
|
181 |
|
|
|
153 |
# Getting a structure like [[chunk, ...]]
|
154 |
document_embeddings = [[] for _ in range(len(documents))]
|
155 |
total_chars = 0
|
156 |
+
while (
|
157 |
+
total_chars < max_characters
|
158 |
+
and sum([len(x) for x in query_embeddings.values()]) > 0
|
159 |
+
):
|
160 |
for query, doc_scores in query_embeddings.items():
|
161 |
if len(doc_scores) == 0:
|
162 |
continue
|
|
|
179 |
document_embeddings[doc_idx].append(chunk_idx)
|
180 |
total_chars += len(chunk)
|
181 |
|
182 |
+
# Get the actual text for the chunks
|
183 |
+
document_embeddings = [
|
184 |
+
[chunked_docs[doc_idx][chunk_idx] for chunk_idx in chunks]
|
185 |
+
for doc_idx, chunks in enumerate(document_embeddings)
|
186 |
+
]
|
187 |
+
|
188 |
return document_embeddings
|
189 |
|
190 |
|