Spaces:
Running
on
Zero
Running
on
Zero
Liam Dyer
committed on
kerfuffles
Browse files
app.py
CHANGED
@@ -13,15 +13,6 @@ model = SentenceTransformer("Snowflake/snowflake-arctic-embed-m")
|
|
13 |
model.to(device="cuda")
|
14 |
|
15 |
|
16 |
-
def chunk(text, max_length=512):
|
17 |
-
chunks = []
|
18 |
-
while len(text) > max_length:
|
19 |
-
chunks.append(text[:max_length])
|
20 |
-
text = text[max_length:]
|
21 |
-
chunks.append(text)
|
22 |
-
return chunks
|
23 |
-
|
24 |
-
|
25 |
@spaces.GPU
|
26 |
def embed(queries, chunks) -> dict[str, list[tuple[str, float]]]:
|
27 |
query_embeddings = model.encode(queries, prompt_name="query")
|
@@ -118,6 +109,15 @@ def convert(input_file) -> str:
|
|
118 |
return convert_pandoc(input_file, input_file)
|
119 |
|
120 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
121 |
@spaces.GPU
|
122 |
def predict(queries, documents, max_characters) -> list[list[str]]:
|
123 |
queries = queries.split("\n")
|
@@ -131,7 +131,7 @@ def predict(queries, documents, max_characters) -> list[list[str]]:
|
|
131 |
return [[doc] for doc, _ in converted_docs]
|
132 |
|
133 |
# Embed the documents in 512 character chunks
|
134 |
-
chunked_docs = [
|
135 |
embedded_docs = [embed(queries, chunks) for chunks in chunked_docs]
|
136 |
|
137 |
# Get a structure like {query: [(doc_idx, chunk_idx, score), (doc_idx, chunk_idx, score), ...]}
|
|
|
13 |
model.to(device="cuda")
|
14 |
|
15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
@spaces.GPU
|
17 |
def embed(queries, chunks) -> dict[str, list[tuple[str, float]]]:
|
18 |
query_embeddings = model.encode(queries, prompt_name="query")
|
|
|
109 |
return convert_pandoc(input_file, input_file)
|
110 |
|
111 |
|
112 |
+
def chunk_to_length(text, max_length=512):
    """Split *text* into consecutive pieces of at most *max_length* characters.

    The last piece carries any remainder (and is the empty string when
    *text* is empty), so ``"".join(result) == text`` always holds and the
    result is never an empty list.
    """
    # Walk the string in fixed-width windows; the final slice is naturally
    # shorter when len(text) is not an exact multiple of max_length.
    pieces = [text[start:start + max_length]
              for start in range(0, len(text), max_length)]
    # range(0, 0) yields nothing, but the original contract returns [""]
    # for empty input, so fall back to a single (empty) piece.
    return pieces if pieces else [text]
|
119 |
+
|
120 |
+
|
121 |
@spaces.GPU
|
122 |
def predict(queries, documents, max_characters) -> list[list[str]]:
|
123 |
queries = queries.split("\n")
|
|
|
131 |
return [[doc] for doc, _ in converted_docs]
|
132 |
|
133 |
# Embed the documents in 512 character chunks
|
134 |
+
chunked_docs = [chunk_to_length(doc, 512) for doc in converted_docs]
|
135 |
embedded_docs = [embed(queries, chunks) for chunks in chunked_docs]
|
136 |
|
137 |
# Get a structure like {query: [(doc_idx, chunk_idx, score), (doc_idx, chunk_idx, score), ...]}
|