Update update_embeddings.py
Browse files
- update_embeddings.py +1 -45
update_embeddings.py
CHANGED
@@ -324,7 +324,7 @@ if BINARY:
|
|
324 |
|
325 |
# Function to convert dense vector to binary vector
|
326 |
def dense_to_binary(dense_vector):
|
327 |
-
return np.packbits(np.where(dense_vector >= 0, 1, 0))
|
328 |
|
329 |
# Create a folder to store binary embeddings
|
330 |
binary_folder = f"{year}-binary-embed"
|
@@ -358,50 +358,6 @@ else:
|
|
358 |
print("Not uploading Binary embeddings to the repo")
|
359 |
print("To upload embeddings, set UPLOAD and BINARY both to True")
|
360 |
|
361 |
-
|
362 |
-
################################################################################
|
363 |
-
|
364 |
-
# BMRL the data
|
365 |
-
if BMRL:
|
366 |
-
print(f"BMRL'ing the data for year: {year}")
|
367 |
-
print("Set BMRL = False to not binarise and MRL the embeddings")
|
368 |
-
|
369 |
-
# Function to chop a binary vector to a specific size
|
370 |
-
def binary_to_mrl(binary_vector, size=512):
|
371 |
-
return np.packbits(np.unpackbits(binary_vector)[:size])
|
372 |
-
|
373 |
-
# Create a folder to store binary embeddings
|
374 |
-
bmrl_folder = f"{year}-bmrl-embed"
|
375 |
-
os.makedirs(bmrl_folder, exist_ok=True)
|
376 |
-
|
377 |
-
# Convert the dense vectors to binary vectors
|
378 |
-
new_embeddings['vector'] = new_embeddings['vector'].progress_apply(binary_to_mrl)
|
379 |
-
|
380 |
-
# Save the binary embeddings to a parquet file
|
381 |
-
new_embeddings.to_parquet(f'{bmrl_folder}/{year}.parquet', index=False)
|
382 |
-
|
383 |
-
if BMRL and UPLOAD:
|
384 |
-
|
385 |
-
# Setup transaction details
|
386 |
-
repo_id = "bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_bmrl"
|
387 |
-
repo_type = "dataset"
|
388 |
-
|
389 |
-
api.create_repo(repo_id=repo_id, repo_type=repo_type, exist_ok=True)
|
390 |
-
|
391 |
-
# Subfolder in the repo of the dataset where the file is stored
|
392 |
-
folder_in_repo = "data"
|
393 |
-
|
394 |
-
print(f"Uploading binary embeddings to {repo_id} from folder {bmrl_folder}")
|
395 |
-
|
396 |
-
# Upload all files within the folder to the specified repository
|
397 |
-
api.upload_folder(repo_id=repo_id, folder_path=bmrl_folder, path_in_repo=folder_in_repo, repo_type=repo_type)
|
398 |
-
|
399 |
-
print("Upload complete")
|
400 |
-
|
401 |
-
else:
|
402 |
-
print("Not uploading BMRL embeddings to the repo")
|
403 |
-
print("To upload embeddings, set UPLOAD and BMRL both to True")
|
404 |
-
|
405 |
################################################################################
|
406 |
|
407 |
# Track time
|
|
|
324 |
|
325 |
# Function to convert dense vector to binary vector
|
326 |
def dense_to_binary(dense_vector):
|
327 |
+
return np.packbits(np.where(dense_vector >= 0, 1, 0)).tobytes()
|
328 |
|
329 |
# Create a folder to store binary embeddings
|
330 |
binary_folder = f"{year}-binary-embed"
|
|
|
358 |
print("Not uploading Binary embeddings to the repo")
|
359 |
print("To upload embeddings, set UPLOAD and BINARY both to True")
|
360 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
361 |
################################################################################
|
362 |
|
363 |
# Track time
|