Pringled committed
Commit 2ba6e60 · 1 Parent(s): f5eb405

Updated app with code for deduplication

Files changed (1)
  1. app.py +270 -12
app.py CHANGED
@@ -32,7 +32,7 @@ def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int
         embedding_matrix,
         threshold=threshold,
         batch_size=batch_size,
-        show_progressbar=False  # Disable internal progress bar
+        show_progressbar=True  # Allow internal progress bar
     )
 
     # Process duplicates
@@ -62,7 +62,7 @@ def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix
         embedding_matrix_2,
         threshold=threshold,
         batch_size=batch_size,
-        show_progressbar=False  # Disable internal progress bar
+        show_progressbar=True  # Allow internal progress bar
     )
 
     # Process duplicates
@@ -111,11 +111,8 @@ def perform_deduplication(
         texts = [example[dataset1_text_column] for example in ds]
 
         # Compute embeddings
-        embedding_matrix = model.encode(texts, show_progressbar=False)  # Disable internal progress bar
+        embedding_matrix = model.encode(texts, show_progressbar=True)  # Enable internal progress bar
 
-        # Show progress bar for embedding computation
-        embedding_matrix = progress.tqdm(embedding_matrix, desc="Computing embeddings")
-
         # Deduplicate
         deduplicated_indices, duplicate_to_original_mapping = deduplicate(embedding_matrix, threshold, progress=progress)
@@ -160,12 +157,8 @@ def perform_deduplication(
         texts2 = [example[dataset2_text_column] for example in ds2]
 
         # Compute embeddings
-        embedding_matrix1 = model.encode(texts1, show_progressbar=False)  # Disable internal progress bar
-        embedding_matrix2 = model.encode(texts2, show_progressbar=False)  # Disable internal progress bar
-
-        # Show progress bar for embedding computation
-        embedding_matrix1 = progress.tqdm(embedding_matrix1, desc="Computing embeddings for Dataset 1")
-        embedding_matrix2 = progress.tqdm(embedding_matrix2, desc="Computing embeddings for Dataset 2")
+        embedding_matrix1 = model.encode(texts1, show_progressbar=True)  # Enable internal progress bar
+        embedding_matrix2 = model.encode(texts2, show_progressbar=True)  # Enable internal progress bar
 
         # Deduplicate across datasets
         duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(embedding_matrix1, embedding_matrix2, threshold, progress=progress)
@@ -263,6 +256,271 @@ with gr.Blocks() as demo:
 demo.launch()
 
 
+# import gradio as gr
+# from datasets import load_dataset
+# import numpy as np
+# from model2vec import StaticModel
+# from reach import Reach
+# from difflib import ndiff
+# import sys
+# import tqdm
+
+# # Load the model at startup
+# model = StaticModel.from_pretrained("minishlab/M2V_base_output")
+
+# # Load the default datasets at startup
+# default_dataset1_name = "ag_news"
+# default_dataset1_split = "train"
+# default_dataset2_name = "ag_news"
+# default_dataset2_split = "test"
+
+# ds_default1 = load_dataset(default_dataset1_name, split=default_dataset1_split)
+# ds_default2 = load_dataset(default_dataset2_name, split=default_dataset2_split)
+
+# def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[np.ndarray, dict[int, int]]:
+#     """
+#     Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
+#     """
+#     reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
+
+#     deduplicated_indices = set(range(len(embedding_matrix)))
+#     duplicate_to_original_mapping = {}
+
+#     results = reach.nearest_neighbor_threshold(
+#         embedding_matrix,
+#         threshold=threshold,
+#         batch_size=batch_size,
+#         show_progressbar=False  # Disable internal progress bar
+#     )
+
+#     # Process duplicates
+#     for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates")):
+#         if i not in deduplicated_indices:
+#             continue
+
+#         similar_indices = [int(item[0]) for item in similar_items if int(item[0]) != i]
+
+#         for sim_idx in similar_indices:
+#             if sim_idx in deduplicated_indices:
+#                 deduplicated_indices.remove(sim_idx)
+#                 duplicate_to_original_mapping[sim_idx] = i
+
+#     return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
+
+# def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[list[int], dict[int, int]]:
+#     """
+#     Deduplicate embeddings across two datasets and return the indices of duplicates between them.
+#     """
+#     reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
+
+#     duplicate_indices_in_test = []
+#     duplicate_to_original_mapping = {}
+
+#     results = reach.nearest_neighbor_threshold(
+#         embedding_matrix_2,
+#         threshold=threshold,
+#         batch_size=batch_size,
+#         show_progressbar=False  # Disable internal progress bar
+#     )
+
+#     # Process duplicates
+#     for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets")):
+#         similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
+
+#         if similar_indices:
+#             duplicate_indices_in_test.append(i)
+#             duplicate_to_original_mapping[i] = similar_indices[0]
+
+#     return duplicate_indices_in_test, duplicate_to_original_mapping
+
+# def display_word_differences(x: str, y: str) -> str:
+#     diff = ndiff(x.split(), y.split())
+#     return " ".join([word for word in diff if word.startswith(('+', '-'))])
+
+# def perform_deduplication(
+#     deduplication_type,
+#     dataset1_name,
+#     dataset1_split,
+#     dataset1_text_column,
+#     dataset2_name="",
+#     dataset2_split="",
+#     dataset2_text_column="",
+#     threshold=0.8,
+#     progress=gr.Progress(track_tqdm=True)
+# ):
+#     # Monkey-patch tqdm
+#     original_tqdm = tqdm.tqdm
+#     tqdm.tqdm = progress.tqdm
+#     sys.modules['tqdm'].tqdm = progress.tqdm
+#     sys.modules['tqdm.auto'].tqdm = progress.tqdm
+
+#     try:
+#         # Convert threshold to float
+#         threshold = float(threshold)
+
+#         if deduplication_type == "Single dataset":
+#             # Check if the dataset is the default one
+#             if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
+#                 ds = ds_default1
+#             else:
+#                 ds = load_dataset(dataset1_name, split=dataset1_split)
+
+#             # Extract texts
+#             texts = [example[dataset1_text_column] for example in ds]
+
+#             # Compute embeddings
+#             embedding_matrix = model.encode(texts, show_progressbar=False)  # Disable internal progress bar
+
+#             # Show progress bar for embedding computation
+#             embedding_matrix = progress.tqdm(embedding_matrix, desc="Computing embeddings")
+
+#             # Deduplicate
+#             deduplicated_indices, duplicate_to_original_mapping = deduplicate(embedding_matrix, threshold, progress=progress)
+
+#             # Prepare the results
+#             num_duplicates = len(duplicate_to_original_mapping)
+#             num_total = len(texts)
+#             num_deduplicated = len(deduplicated_indices)
+
+#             result_text = f"**Total documents:** {num_total}\n"
+#             result_text += f"**Number of duplicates found:** {num_duplicates}\n"
+#             result_text += f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
+
+#             # Show deduplicated examples
+#             result_text += "**Examples of duplicates found:**\n\n"
+#             num_examples = min(5, num_duplicates)
+#             for duplicate_idx, original_idx in list(duplicate_to_original_mapping.items())[:num_examples]:
+#                 original_text = texts[original_idx]
+#                 duplicate_text = texts[duplicate_idx]
+#                 differences = display_word_differences(original_text, duplicate_text)
+#                 result_text += f"**Original text:**\n{original_text}\n\n"
+#                 result_text += f"**Duplicate text:**\n{duplicate_text}\n\n"
+#                 result_text += f"**Differences:**\n{differences}\n"
+#                 result_text += "-" * 50 + "\n\n"
+
+#             return result_text
+
+#         elif deduplication_type == "Cross-dataset":
+#             # Dataset 1
+#             if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
+#                 ds1 = ds_default1
+#             else:
+#                 ds1 = load_dataset(dataset1_name, split=dataset1_split)
+
+#             # Dataset 2
+#             if dataset2_name == default_dataset2_name and dataset2_split == default_dataset2_split:
+#                 ds2 = ds_default2
+#             else:
+#                 ds2 = load_dataset(dataset2_name, split=dataset2_split)
+
+#             # Extract texts
+#             texts1 = [example[dataset1_text_column] for example in ds1]
+#             texts2 = [example[dataset2_text_column] for example in ds2]
+
+#             # Compute embeddings
+#             embedding_matrix1 = model.encode(texts1, show_progressbar=False)  # Disable internal progress bar
+#             embedding_matrix2 = model.encode(texts2, show_progressbar=False)  # Disable internal progress bar
+
+#             # Show progress bar for embedding computation
+#             embedding_matrix1 = progress.tqdm(embedding_matrix1, desc="Computing embeddings for Dataset 1")
+#             embedding_matrix2 = progress.tqdm(embedding_matrix2, desc="Computing embeddings for Dataset 2")
+
+#             # Deduplicate across datasets
+#             duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(embedding_matrix1, embedding_matrix2, threshold, progress=progress)
+
+#             num_duplicates = len(duplicate_indices_in_ds2)
+#             num_total_ds2 = len(texts2)
+#             num_unique_ds2 = num_total_ds2 - num_duplicates
+
+#             result_text = f"**Total documents in {dataset2_name}/{dataset2_split}:** {num_total_ds2}\n"
+#             result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n"
+#             result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
+
+#             # Show deduplicated examples
+#             result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
+#             num_examples = min(5, num_duplicates)
+#             for duplicate_idx in duplicate_indices_in_ds2[:num_examples]:
+#                 original_idx = duplicate_to_original_mapping[duplicate_idx]
+#                 original_text = texts1[original_idx]
+#                 duplicate_text = texts2[duplicate_idx]
+#                 differences = display_word_differences(original_text, duplicate_text)
+#                 result_text += f"**Original text (Dataset 1):**\n{original_text}\n\n"
+#                 result_text += f"**Duplicate text (Dataset 2):**\n{duplicate_text}\n\n"
+#                 result_text += f"**Differences:**\n{differences}\n"
+#                 result_text += "-" * 50 + "\n\n"
+
+#             return result_text
+
+#     finally:
+#         # Restore original tqdm
+#         tqdm.tqdm = original_tqdm
+#         sys.modules['tqdm'].tqdm = original_tqdm
+#         sys.modules['tqdm.auto'].tqdm = original_tqdm
+
+# with gr.Blocks() as demo:
+#     gr.Markdown("# Semantic Deduplication")
+
+#     deduplication_type = gr.Radio(
+#         choices=["Single dataset", "Cross-dataset"],
+#         label="Deduplication Type",
+#         value="Single dataset"
+#     )
+
+#     with gr.Row():
+#         dataset1_name = gr.Textbox(value="ag_news", label="Dataset 1 Name")
+#         dataset1_split = gr.Textbox(value="train", label="Dataset 1 Split")
+#         dataset1_text_column = gr.Textbox(value="text", label="Text Column Name")
+
+#     dataset2_inputs = gr.Column(visible=False)
+#     with dataset2_inputs:
+#         gr.Markdown("### Dataset 2")
+#         with gr.Row():
+#             dataset2_name = gr.Textbox(value="ag_news", label="Dataset 2 Name")
+#             dataset2_split = gr.Textbox(value="test", label="Dataset 2 Split")
+#             dataset2_text_column = gr.Textbox(value="text", label="Text Column Name")
+
+#     threshold = gr.Slider(
+#         minimum=0.0,
+#         maximum=1.0,
+#         value=0.8,
+#         label="Similarity Threshold"
+#     )
+
+#     compute_button = gr.Button("Compute")
+
+#     output = gr.Markdown()
+
+#     # Function to update the visibility of dataset2_inputs
+#     def update_visibility(deduplication_type_value):
+#         if deduplication_type_value == "Cross-dataset":
+#             return gr.update(visible=True)
+#         else:
+#             return gr.update(visible=False)
+
+#     deduplication_type.change(
+#         update_visibility,
+#         inputs=deduplication_type,
+#         outputs=dataset2_inputs
+#     )
+
+#     compute_button.click(
+#         fn=perform_deduplication,
+#         inputs=[
+#             deduplication_type,
+#             dataset1_name,
+#             dataset1_split,
+#             dataset1_text_column,
+#             dataset2_name,
+#             dataset2_split,
+#             dataset2_text_column,
+#             threshold
+#         ],
+#         outputs=output
+#     )
+
+# demo.launch()
+
+
 # import gradio as gr
 # from datasets import load_dataset
 # import numpy as np