bluuebunny committed on
Commit
b321e29
·
verified ·
1 Parent(s): b71255d

Update update_embeddings.py

Browse files
Files changed (1) hide show
  1. update_embeddings.py +1 -45
update_embeddings.py CHANGED
@@ -324,7 +324,7 @@ if BINARY:
324
 
325
  # Function to convert dense vector to binary vector
326
  def dense_to_binary(dense_vector):
327
- return np.packbits(np.where(dense_vector >= 0, 1, 0))
328
 
329
  # Create a folder to store binary embeddings
330
  binary_folder = f"{year}-binary-embed"
@@ -358,50 +358,6 @@ else:
358
  print("Not uploading Binary embeddings to the repo")
359
  print("To upload embeddings, set UPLOAD and BINARY both to True")
360
 
361
-
362
- ################################################################################
363
-
364
- # BMRL the data
365
- if BMRL:
366
- print(f"BMRL'ing the data for year: {year}")
367
- print("Set BMRL = False to not binarise and MRL the embeddings")
368
-
369
- # Function to chop a binary vector to a specific size
370
- def binary_to_mrl(binary_vector, size=512):
371
- return np.packbits(np.unpackbits(binary_vector)[:size])
372
-
373
- # Create a folder to store binary embeddings
374
- bmrl_folder = f"{year}-bmrl-embed"
375
- os.makedirs(bmrl_folder, exist_ok=True)
376
-
377
- # Convert the dense vectors to binary vectors
378
- new_embeddings['vector'] = new_embeddings['vector'].progress_apply(binary_to_mrl)
379
-
380
- # Save the binary embeddings to a parquet file
381
- new_embeddings.to_parquet(f'{bmrl_folder}/{year}.parquet', index=False)
382
-
383
- if BMRL and UPLOAD:
384
-
385
- # Setup transaction details
386
- repo_id = "bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_bmrl"
387
- repo_type = "dataset"
388
-
389
- api.create_repo(repo_id=repo_id, repo_type=repo_type, exist_ok=True)
390
-
391
- # Subfolder in the repo of the dataset where the file is stored
392
- folder_in_repo = "data"
393
-
394
- print(f"Uploading binary embeddings to {repo_id} from folder {bmrl_folder}")
395
-
396
- # Upload all files within the folder to the specified repository
397
- api.upload_folder(repo_id=repo_id, folder_path=bmrl_folder, path_in_repo=folder_in_repo, repo_type=repo_type)
398
-
399
- print("Upload complete")
400
-
401
- else:
402
- print("Not uploading BMRL embeddings to the repo")
403
- print("To upload embeddings, set UPLOAD and BMRL both to True")
404
-
405
  ################################################################################
406
 
407
  # Track time
 
324
 
325
  # Function to convert dense vector to binary vector
326
  def dense_to_binary(dense_vector):
327
+ return np.packbits(np.where(dense_vector >= 0, 1, 0)).tobytes()
328
 
329
  # Create a folder to store binary embeddings
330
  binary_folder = f"{year}-binary-embed"
 
358
  print("Not uploading Binary embeddings to the repo")
359
  print("To upload embeddings, set UPLOAD and BINARY both to True")
360
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
361
  ################################################################################
362
 
363
  # Track time