bluuebunny committed · verified
Commit 5cd10ec · 1 Parent(s): f861aa5

Update update_embeddings.py
Files changed (1): update_embeddings.py (+111 -12)
update_embeddings.py CHANGED

@@ -40,6 +40,14 @@ LOCAL = False
# Flag to upload the data to the Hugging Face Hub
UPLOAD = True

+ # Flag to binarise the data
+ BINARY = True
+
+ # Flag to BMRL the data
+ BMRL = True
+
+ ########################################
+
# Model to use for embedding
model_name = "mixedbread-ai/mxbai-embed-large-v1"

@@ -48,18 +56,6 @@ num_cores = cpu_count()-1

# Setup transaction details
repo_id = "bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus"
- repo_type = "dataset"
-
- # Subfolder in the repo of the dataset where the file is stored
- folder_in_repo = "data"
- allow_patterns = f"{folder_in_repo}/{year}.parquet"
-
- # Where to store the local copy of the dataset
- local_dir = repo_id
-
- # Create embed folder
- embed_folder = f"{year}-diff-embed"
- os.makedirs(embed_folder, exist_ok=True)

################################################################################
# Download the dataset
@@ -150,6 +146,7 @@ def embed(input_text):

    else:

+         # Avoid rate limit from api
        sleep(0.2)

        # Calculate embeddings by calling mxbai.embeddings()
@@ -169,6 +166,16 @@ def embed(input_text):
################################################################################
# Gather preexisting embeddings

+ # Subfolder in the repo of the dataset where the file is stored
+ folder_in_repo = "data"
+ allow_patterns = f"{folder_in_repo}/{year}.parquet"
+
+ # Where to store the local copy of the dataset
+ local_dir = repo_id
+
+ # Set repo type
+ repo_type = "dataset"
+
# Create local directory
os.makedirs(local_dir, exist_ok=True)

@@ -224,6 +231,10 @@ selected_columns = ['id', 'vector', '$meta']
# Merge previous embeddings and new embeddings
new_embeddings = pd.concat([previous_embeddings, new_papers[selected_columns]])

+ # Create embed folder
+ embed_folder = f"{year}-diff-embed"
+ os.makedirs(embed_folder, exist_ok=True)
+
# Save the embedded file
embed_filename = f'{embed_folder}/{year}.parquet'
print(f"Saving newly embedded dataframe to: {embed_filename}")
@@ -250,6 +261,94 @@ else:
print("To upload new embeddings, set UPLOAD to True")
################################################################################

+ # Binarise the data
+ if BINARY:
+
+     print(f"Binarising the data for year: {year}")
+     print("Set BINARY = False to not binarise the embeddings")
+
+     # Function to convert dense vector to binary vector
+     def dense_to_binary(dense_vector):
+         return np.packbits(np.where(dense_vector >= 0, 1, 0))
+
+     # Create a folder to store binary embeddings
+     binary_folder = f"{year}-binary-embed"
+     os.makedirs(binary_folder, exist_ok=True)
+
+     # Convert the dense vectors to binary vectors
+     new_embeddings['vector'] = new_embeddings['vector'].progress_apply(dense_to_binary)
+
+     # Save the binary embeddings to a parquet file
+     new_embeddings.to_parquet(f'{binary_folder}/{year}.parquet', index=False)
+
+ if BINARY and UPLOAD:
+
+     # Setup transaction details
+     repo_id = "bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary"
+     repo_type = "dataset"
+
+     api.create_repo(repo_id=repo_id, repo_type=repo_type, exist_ok=True)
+
+     # Subfolder in the repo of the dataset where the file is stored
+     folder_in_repo = "data"
+
+     print(f"Uploading binary embeddings to {repo_id} from folder {binary_folder}")
+
+     # Upload all files within the folder to the specified repository
+     api.upload_folder(repo_id=repo_id, folder_path=binary_folder, path_in_repo=folder_in_repo, repo_type=repo_type)
+
+     print("Upload complete")
+
+ else:
+     print("Not uploading binary embeddings to the repo")
+     print("To upload embeddings, set UPLOAD and BINARY both to True")
+
+ ################################################################################
+
+ # BMRL the data
+ if BMRL:
+
+     print(f"BMRL'ing the data for year: {year}")
+     print("Set BMRL = False to not binarise and MRL the embeddings")
+
+     # Function to chop a binary vector to a specific size
+     def binary_to_mrl(binary_vector, size=512):
+         return np.packbits(np.unpackbits(binary_vector)[:size])
+
+     # Create a folder to store BMRL embeddings
+     bmrl_folder = f"{year}-bmrl-embed"
+     os.makedirs(bmrl_folder, exist_ok=True)
+
+     # Truncate the binary vectors to the MRL size
+     new_embeddings['vector'] = new_embeddings['vector'].progress_apply(binary_to_mrl)
+
+     # Save the BMRL embeddings to a parquet file
+     new_embeddings.to_parquet(f'{bmrl_folder}/{year}.parquet', index=False)
+
+ if BMRL and UPLOAD:
+
+     # Setup transaction details
+     repo_id = "bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_bmrl"
+     repo_type = "dataset"
+
+     api.create_repo(repo_id=repo_id, repo_type=repo_type, exist_ok=True)
+
+     # Subfolder in the repo of the dataset where the file is stored
+     folder_in_repo = "data"
+
+     print(f"Uploading BMRL embeddings to {repo_id} from folder {bmrl_folder}")
+
+     # Upload all files within the folder to the specified repository
+     api.upload_folder(repo_id=repo_id, folder_path=bmrl_folder, path_in_repo=folder_in_repo, repo_type=repo_type)
+
+     print("Upload complete")
+
+ else:
+     print("Not uploading BMRL embeddings to the repo")
+     print("To upload embeddings, set UPLOAD and BMRL both to True")
+
+ ################################################################################
+
# Track time
end = time()

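Note on the new BINARY step: dense_to_binary sign-quantises each embedding dimension to a single bit and packs the bits into bytes, so a 1024-dimensional mxbai-embed-large-v1 vector shrinks from 4096 bytes of float32 to 128 bytes. A minimal sketch of the same logic on synthetic data (the random vector is made up for illustration):

import numpy as np

# Stand-in for one 1024-dimensional dense embedding
dense_vector = np.random.default_rng(0).standard_normal(1024).astype(np.float32)

# 1 where the component is non-negative, 0 otherwise, then pack 8 bits per byte
binary_vector = np.packbits(np.where(dense_vector >= 0, 1, 0))

print(dense_vector.nbytes)   # 4096 bytes
print(binary_vector.nbytes)  # 128 bytes, a 32x reduction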
 
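The relocated repo_type / folder_in_repo / allow_patterns variables feed the Hub transfer calls. A sketch of how these pieces typically fit together with huggingface_hub: the snapshot_download call is an assumption about the script's download step (it sits outside this diff's context lines), while create_repo and upload_folder mirror the calls visible in the diff. A write token is required for the upload calls, and year is a hypothetical value here:

from huggingface_hub import HfApi, snapshot_download

year = 2024  # hypothetical; the real script derives this elsewhere
repo_id = "bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus"
repo_type = "dataset"
folder_in_repo = "data"
allow_patterns = f"{folder_in_repo}/{year}.parquet"

# Download only this year's parquet file instead of the whole dataset
snapshot_download(repo_id=repo_id, repo_type=repo_type,
                  local_dir=repo_id, allow_patterns=allow_patterns)

# Upload mirrors the diff: ensure the repo exists, then push a folder
api = HfApi()
api.create_repo(repo_id=repo_id, repo_type=repo_type, exist_ok=True)
api.upload_folder(repo_id=repo_id, folder_path=f"{year}-diff-embed",
                  path_in_repo=folder_in_repo, repo_type=repo_type)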