Pringled committed on
Commit 20f4a6e · 1 Parent(s): 6b0e834

Updated app with code for deduplication

Files changed (1)
  1. app.py +977 -630
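In outline, the updated app.py now embeds each dataset in small batches (so the Gradio progress bar advances per batch) and then drops near-duplicates by querying a Reach index of those embeddings against itself with a similarity threshold. Below is a minimal, self-contained sketch of that flow using the same model and calls as the app; the sample texts and the 0.9 threshold are chosen here purely for illustration.

import numpy as np
from model2vec import StaticModel
from reach import Reach

model = StaticModel.from_pretrained("minishlab/M2V_base_output")
texts = ["the cat sat on the mat", "the cat sat on the mat", "an unrelated sentence"]

# Encode in batches, as the app does, so a UI progress bar can track each step.
batch_size = 2
embedding_matrix = np.concatenate(
    [model.encode(texts[i:i + batch_size], show_progressbar=False)
     for i in range(0, len(texts), batch_size)],
    axis=0,
)

# Query the embeddings against themselves; any other item scoring >= threshold
# is treated as a duplicate of an earlier item.
reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
results = reach.nearest_neighbor_threshold(embedding_matrix, threshold=0.9, batch_size=1024, show_progressbar=False)

keep = set(range(len(embedding_matrix)))
for i, similar_items in enumerate(results):
    if i in keep:
        for idx in (int(item[0]) for item in similar_items if int(item[0]) != i):
            keep.discard(idx)

print(sorted(keep))  # indices of the texts kept after deduplication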
app.py CHANGED
@@ -26,79 +26,6 @@ def batch_iterable(iterable, batch_size):
      for i in range(0, len(iterable), batch_size):
          yield iterable[i:i + batch_size]
 
- def compute_embeddings(texts, batch_size, progress, desc="Computing embeddings"):
-     embeddings = []
-     for batch in progress.tqdm(batch_iterable(texts, batch_size), total=(len(texts) + batch_size - 1) // batch_size, desc=desc):
-         batch_embeddings = model.encode(batch, show_progressbar=False)
-         embeddings.append(batch_embeddings)
-     return np.concatenate(embeddings, axis=0)
-
- def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[np.ndarray, dict[int, int]]:
-     """
-     Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
-     """
-     # Building the index
-     progress(0, desc="Building search index...")
-     reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
-
-     deduplicated_indices = set(range(len(embedding_matrix)))
-     duplicate_to_original_mapping = {}
-
-     # Finding nearest neighbors
-     progress(0, desc="Finding nearest neighbors...")
-     results = reach.nearest_neighbor_threshold(
-         embedding_matrix,
-         threshold=threshold,
-         batch_size=batch_size,
-         show_progressbar=False  # Disable internal progress bar
-     )
-
-     # Processing duplicates with a progress bar
-     total_items = len(embedding_matrix)
-     for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=total_items)):
-         if i not in deduplicated_indices:
-             continue
-
-         similar_indices = [int(item[0]) for item in similar_items if int(item[0]) != i]
-
-         for sim_idx in similar_indices:
-             if sim_idx in deduplicated_indices:
-                 deduplicated_indices.remove(sim_idx)
-                 duplicate_to_original_mapping[sim_idx] = i
-
-     return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
-
- def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[list[int], dict[int, int]]:
-     """
-     Deduplicate embeddings across two datasets and return the indices of duplicates between them.
-     """
-     # Building the index from Dataset 1
-     progress(0, desc="Building search index from Dataset 1...")
-     reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
-
-     duplicate_indices_in_test = []
-     duplicate_to_original_mapping = {}
-
-     # Finding nearest neighbors between datasets
-     progress(0, desc="Finding nearest neighbors between datasets...")
-     results = reach.nearest_neighbor_threshold(
-         embedding_matrix_2,
-         threshold=threshold,
-         batch_size=batch_size,
-         show_progressbar=False  # Disable internal progress bar
-     )
-
-     total_items = len(embedding_matrix_2)
-     # Processing duplicates with a progress bar
-     for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets", total=total_items)):
-         similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
-
-         if similar_indices:
-             duplicate_indices_in_test.append(i)
-             duplicate_to_original_mapping[i] = similar_indices[0]
-
-     return duplicate_indices_in_test, duplicate_to_original_mapping
-
  def display_word_differences(x: str, y: str) -> str:
      diff = ndiff(x.split(), y.split())
      return " ".join([word for word in diff if word.startswith(('+', '-'))])
@@ -138,7 +65,13 @@ def perform_deduplication(
  # Compute embeddings
  status = "Computing embeddings for Dataset 1..."
  yield status, ""
- embedding_matrix = compute_embeddings(texts, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 1")
 
  # Deduplicate
  status = "Deduplicating embeddings..."
@@ -205,12 +138,23 @@ def perform_deduplication(
  # Compute embeddings for Dataset 1
  status = "Computing embeddings for Dataset 1..."
  yield status, ""
- embedding_matrix1 = compute_embeddings(texts1, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 1")
 
  # Compute embeddings for Dataset 2
  status = "Computing embeddings for Dataset 2..."
  yield status, ""
- embedding_matrix2 = compute_embeddings(texts2, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 2")
 
  # Deduplicate across datasets
  status = "Deduplicating embeddings across datasets..."
@@ -251,6 +195,72 @@ def perform_deduplication(
  yield f"An error occurred: {e}", ""
  raise e
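Both hunks above sit inside perform_deduplication, which Gradio runs as a generator: each `yield status, result_text` streams an intermediate update into the status and result components while the heavy steps keep going. A minimal sketch of that wiring follows; the function name and the toy job body are illustrative stand-ins for the app's real work.

import gradio as gr

def run_job(progress=gr.Progress(track_tqdm=True)):
    # Every yield sends an intermediate (status, result) pair to the two outputs.
    yield "Computing embeddings...", ""
    for _ in progress.tqdm(range(3), desc="Working"):
        pass  # the real app encodes batches and queries Reach here
    yield "Deduplication completed.", "**Number of duplicates found:** 0"

with gr.Blocks() as demo:
    status_output = gr.Markdown()
    result_output = gr.Markdown()
    gr.Button("Compute").click(fn=run_job, inputs=[], outputs=[status_output, result_output])

demo.launch()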
 
  with gr.Blocks() as demo:
      gr.Markdown("# Semantic Deduplication")
 
@@ -317,14 +327,12 @@ demo.launch()
 
 
 
-
  # import gradio as gr
  # from datasets import load_dataset
  # import numpy as np
  # from model2vec import StaticModel
  # from reach import Reach
  # from difflib import ndiff
- # import sys
  # import tqdm
 
  # # Load the model at startup
@@ -342,26 +350,41 @@ demo.launch()
342
  # ds_default1 = load_dataset(default_dataset1_name, split=default_dataset1_split)
343
  # ds_default2 = load_dataset(default_dataset2_name, split=default_dataset2_split)
344
 
345
- # def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024) -> tuple[np.ndarray, dict[int, int]]:
 
 
 
 
 
 
 
 
 
 
 
 
346
  # """
347
  # Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
348
  # """
349
  # # Building the index
 
350
  # reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
351
 
352
  # deduplicated_indices = set(range(len(embedding_matrix)))
353
  # duplicate_to_original_mapping = {}
354
 
355
  # # Finding nearest neighbors
 
356
  # results = reach.nearest_neighbor_threshold(
357
  # embedding_matrix,
358
  # threshold=threshold,
359
  # batch_size=batch_size,
360
- # show_progressbar=True # Allow internal progress bar
361
  # )
362
 
363
- # # Processing duplicates
364
- # for i, similar_items in enumerate(results):
 
365
  # if i not in deduplicated_indices:
366
  # continue
367
 
@@ -374,26 +397,29 @@ demo.launch()
374
 
375
  # return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
376
 
377
- # def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024) -> tuple[list[int], dict[int, int]]:
378
  # """
379
  # Deduplicate embeddings across two datasets and return the indices of duplicates between them.
380
  # """
381
  # # Building the index from Dataset 1
 
382
  # reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
383
 
384
  # duplicate_indices_in_test = []
385
  # duplicate_to_original_mapping = {}
386
 
387
  # # Finding nearest neighbors between datasets
 
388
  # results = reach.nearest_neighbor_threshold(
389
  # embedding_matrix_2,
390
  # threshold=threshold,
391
  # batch_size=batch_size,
392
- # show_progressbar=True # Allow internal progress bar
393
  # )
394
 
395
- # # Processing duplicates
396
- # for i, similar_items in enumerate(results):
 
397
  # similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
398
 
399
  # if similar_indices:
@@ -417,17 +443,10 @@ demo.launch()
417
  # threshold=default_threshold,
418
  # progress=gr.Progress(track_tqdm=True)
419
  # ):
420
- # # Deep Monkey-Patching of tqdm
421
- # original_tqdm = tqdm.tqdm
422
- # tqdm.tqdm = progress.tqdm
423
- # for mod_name in list(sys.modules.keys()):
424
- # if 'tqdm' in mod_name:
425
- # sys.modules[mod_name].tqdm = progress.tqdm
426
-
427
  # try:
428
  # # Convert threshold to float
429
  # threshold = float(threshold)
430
-
431
  # # Initialize status message
432
  # status = ""
433
 
@@ -439,33 +458,33 @@ demo.launch()
439
  # ds = ds_default1
440
  # else:
441
  # ds = load_dataset(dataset1_name, split=dataset1_split)
442
-
443
  # # Extract texts
444
  # status = "Extracting texts from Dataset 1..."
445
  # yield status, ""
446
  # texts = [example[dataset1_text_column] for example in ds]
447
-
448
  # # Compute embeddings
449
  # status = "Computing embeddings for Dataset 1..."
450
  # yield status, ""
451
- # embedding_matrix = model.encode(texts, show_progressbar=True) # Enable internal progress bar
452
-
453
  # # Deduplicate
454
  # status = "Deduplicating embeddings..."
455
  # yield status, ""
456
  # deduplicated_indices, duplicate_to_original_mapping = deduplicate(
457
- # embedding_matrix, threshold
458
  # )
459
-
460
  # # Prepare the results
461
  # num_duplicates = len(duplicate_to_original_mapping)
462
  # num_total = len(texts)
463
  # num_deduplicated = len(deduplicated_indices)
464
-
465
  # result_text = f"**Total documents:** {num_total}\n"
466
  # result_text += f"**Number of duplicates found:** {num_duplicates}\n"
467
  # result_text += f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
468
-
469
  # # Show deduplicated examples
470
  # if num_duplicates > 0:
471
  # result_text += "**Examples of duplicates found:**\n\n"
@@ -480,11 +499,11 @@ demo.launch()
480
  # result_text += "-" * 50 + "\n\n"
481
  # else:
482
  # result_text += "No duplicates found."
483
-
484
  # # Final status
485
  # status = "Deduplication completed."
486
  # yield status, result_text
487
-
488
  # elif deduplication_type == "Cross-dataset":
489
  # # Load Dataset 1
490
  # status = "Loading Dataset 1..."
@@ -493,7 +512,7 @@ demo.launch()
493
  # ds1 = ds_default1
494
  # else:
495
  # ds1 = load_dataset(dataset1_name, split=dataset1_split)
496
-
497
  # # Load Dataset 2
498
  # status = "Loading Dataset 2..."
499
  # yield status, ""
@@ -501,42 +520,42 @@ demo.launch()
501
  # ds2 = ds_default2
502
  # else:
503
  # ds2 = load_dataset(dataset2_name, split=dataset2_split)
504
-
505
  # # Extract texts from Dataset 1
506
  # status = "Extracting texts from Dataset 1..."
507
  # yield status, ""
508
  # texts1 = [example[dataset1_text_column] for example in ds1]
509
-
510
  # # Extract texts from Dataset 2
511
  # status = "Extracting texts from Dataset 2..."
512
  # yield status, ""
513
  # texts2 = [example[dataset2_text_column] for example in ds2]
514
-
515
  # # Compute embeddings for Dataset 1
516
  # status = "Computing embeddings for Dataset 1..."
517
  # yield status, ""
518
- # embedding_matrix1 = model.encode(texts1, show_progressbar=True)
519
-
520
  # # Compute embeddings for Dataset 2
521
  # status = "Computing embeddings for Dataset 2..."
522
  # yield status, ""
523
- # embedding_matrix2 = model.encode(texts2, show_progressbar=True)
524
-
525
  # # Deduplicate across datasets
526
  # status = "Deduplicating embeddings across datasets..."
527
  # yield status, ""
528
  # duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(
529
- # embedding_matrix1, embedding_matrix2, threshold
530
  # )
531
-
532
  # num_duplicates = len(duplicate_indices_in_ds2)
533
  # num_total_ds2 = len(texts2)
534
  # num_unique_ds2 = num_total_ds2 - num_duplicates
535
-
536
- # result_text = f"**Total documents in {dataset2_name}/{dataset2_split}:** {num_total_ds2}\n"
537
- # result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n"
538
  # result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
539
-
540
  # # Show deduplicated examples
541
  # if num_duplicates > 0:
542
  # result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
@@ -552,17 +571,14 @@ demo.launch()
552
  # result_text += "-" * 50 + "\n\n"
553
  # else:
554
  # result_text += "No duplicates found."
555
-
556
  # # Final status
557
  # status = "Deduplication completed."
558
  # yield status, result_text
559
 
560
- # finally:
561
- # # Restore original tqdm
562
- # tqdm.tqdm = original_tqdm
563
- # for mod_name in list(sys.modules.keys()):
564
- # if 'tqdm' in mod_name:
565
- # sys.modules[mod_name].tqdm = original_tqdm
566
 
567
  # with gr.Blocks() as demo:
568
  # gr.Markdown("# Semantic Deduplication")
@@ -614,605 +630,670 @@ demo.launch()
614
  # compute_button.click(
615
  # fn=perform_deduplication,
616
  # inputs=[
617
- # deduplication_type,
618
- # dataset1_name,
619
- # dataset1_split,
620
  # dataset1_text_column,
621
- # dataset2_name,
622
- # dataset2_split,
623
  # dataset2_text_column,
624
  # threshold
625
  # ],
626
  # outputs=[status_output, result_output]
627
  # )
628
-
629
  # demo.launch()
630
 
631
 
632
- # import gradio as gr
633
- # from datasets import load_dataset
634
- # import numpy as np
635
- # from model2vec import StaticModel
636
- # from reach import Reach
637
- # from difflib import ndiff
638
- # import sys
639
- # import tqdm
640
 
641
- # # Load the model at startup
642
- # model = StaticModel.from_pretrained("minishlab/M2V_base_output")
643
 
644
- # # Update default dataset to 'sst2' and set default threshold to 0.9
645
- # default_dataset1_name = "sst2"
646
- # default_dataset1_split = "train"
647
- # default_dataset2_name = "sst2"
648
- # default_dataset2_split = "validation"
649
- # default_text_column = "sentence"
650
- # default_threshold = 0.9
651
 
652
- # # Load the default datasets at startup
653
- # ds_default1 = load_dataset(default_dataset1_name, split=default_dataset1_split)
654
- # ds_default2 = load_dataset(default_dataset2_name, split=default_dataset2_split)
655
 
656
- # def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[np.ndarray, dict[int, int]]:
657
- # """
658
- # Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
659
- # """
660
- # # Update progress to indicate building the index
661
- # progress(0, desc="Building search index...")
662
- # reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
663
 
664
- # deduplicated_indices = set(range(len(embedding_matrix)))
665
- # duplicate_to_original_mapping = {}
666
 
667
- # # Finding nearest neighbors
668
- # progress(0, desc="Finding nearest neighbors...")
669
- # results = reach.nearest_neighbor_threshold(
670
- # embedding_matrix,
671
- # threshold=threshold,
672
- # batch_size=batch_size,
673
- # show_progressbar=True # Allow internal progress bar
674
- # )
675
 
676
- # # Processing duplicates with a progress bar
677
- # total_items = len(embedding_matrix)
678
- # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=total_items)):
679
- # if i not in deduplicated_indices:
680
- # continue
681
 
682
- # similar_indices = [int(item[0]) for item in similar_items if int(item[0]) != i]
683
 
684
- # for sim_idx in similar_indices:
685
- # if sim_idx in deduplicated_indices:
686
- # deduplicated_indices.remove(sim_idx)
687
- # duplicate_to_original_mapping[sim_idx] = i
688
 
689
- # return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
690
 
691
- # def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[list[int], dict[int, int]]:
692
- # """
693
- # Deduplicate embeddings across two datasets and return the indices of duplicates between them.
694
- # """
695
- # # Update progress to indicate building the index
696
- # progress(0, desc="Building search index from Dataset 1...")
697
- # reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
698
 
699
- # duplicate_indices_in_test = []
700
- # duplicate_to_original_mapping = {}
701
 
702
- # # Finding nearest neighbors between datasets
703
- # progress(0, desc="Finding nearest neighbors between datasets...")
704
- # results = reach.nearest_neighbor_threshold(
705
- # embedding_matrix_2,
706
- # threshold=threshold,
707
- # batch_size=batch_size,
708
- # show_progressbar=True # Allow internal progress bar
709
- # )
710
 
711
- # total_items = len(embedding_matrix_2)
712
- # # Processing duplicates with a progress bar
713
- # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets", total=total_items)):
714
- # similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
715
 
716
- # if similar_indices:
717
- # duplicate_indices_in_test.append(i)
718
- # duplicate_to_original_mapping[i] = similar_indices[0]
719
 
720
- # return duplicate_indices_in_test, duplicate_to_original_mapping
721
 
722
- # def display_word_differences(x: str, y: str) -> str:
723
- # diff = ndiff(x.split(), y.split())
724
- # return " ".join([word for word in diff if word.startswith(('+', '-'))])
725
 
726
- # def perform_deduplication(
727
- # deduplication_type,
728
- # dataset1_name,
729
- # dataset1_split,
730
- # dataset1_text_column,
731
- # dataset2_name="",
732
- # dataset2_split="",
733
- # dataset2_text_column="",
734
- # threshold=default_threshold,
735
- # progress=gr.Progress(track_tqdm=True)
736
- # ):
737
- # # Monkey-patch tqdm
738
- # original_tqdm = tqdm.tqdm
739
- # original_reach_tqdm = Reach.__dict__['tqdm'] if 'tqdm' in Reach.__dict__ else None
740
- # tqdm.tqdm = progress.tqdm
741
- # sys.modules['tqdm'].tqdm = progress.tqdm
742
- # sys.modules['tqdm.auto'].tqdm = progress.tqdm
743
- # Reach.tqdm = progress.tqdm # Monkey-patch reach's tqdm
744
 
745
- # try:
746
- # # Convert threshold to float
747
- # threshold = float(threshold)
748
 
749
- # if deduplication_type == "Single dataset":
750
- # # Load Dataset 1
751
- # progress(0, desc="Loading Dataset 1...")
752
- # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
753
- # ds = ds_default1
754
- # else:
755
- # ds = load_dataset(dataset1_name, split=dataset1_split)
 
 
 
 
756
 
757
- # # Extract texts
758
- # progress(0, desc="Extracting texts from Dataset 1...")
759
- # texts = [example[dataset1_text_column] for example in ds]
 
760
 
761
- # # Compute embeddings
762
- # progress(0, desc="Computing embeddings for Dataset 1...")
763
- # embedding_matrix = model.encode(texts, show_progressbar=True) # Enable internal progress bar
 
764
 
765
- # # Deduplicate
766
- # result_text = deduplicate_and_prepare_results_single(
767
- # embedding_matrix, texts, threshold, progress
768
- # )
 
 
 
 
 
 
 
 
 
 
 
769
 
770
- # return result_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
771
 
772
- # elif deduplication_type == "Cross-dataset":
773
- # # Load Dataset 1
774
- # progress(0, desc="Loading Dataset 1...")
775
- # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
776
- # ds1 = ds_default1
777
- # else:
778
- # ds1 = load_dataset(dataset1_name, split=dataset1_split)
 
779
 
780
- # # Load Dataset 2
781
- # progress(0, desc="Loading Dataset 2...")
782
- # if dataset2_name == default_dataset2_name and dataset2_split == default_dataset2_split:
783
- # ds2 = ds_default2
784
- # else:
785
- # ds2 = load_dataset(dataset2_name, split=dataset2_split)
 
786
 
787
- # # Extract texts from Dataset 1
788
- # progress(0, desc="Extracting texts from Dataset 1...")
789
- # texts1 = [example[dataset1_text_column] for example in ds1]
 
790
 
791
- # # Extract texts from Dataset 2
792
- # progress(0, desc="Extracting texts from Dataset 2...")
793
- # texts2 = [example[dataset2_text_column] for example in ds2]
 
794
 
795
- # # Compute embeddings for Dataset 1
796
- # progress(0, desc="Computing embeddings for Dataset 1...")
797
- # embedding_matrix1 = model.encode(texts1, show_progressbar=True)
 
798
 
799
- # # Compute embeddings for Dataset 2
800
- # progress(0, desc="Computing embeddings for Dataset 2...")
801
- # embedding_matrix2 = model.encode(texts2, show_progressbar=True)
 
802
 
803
- # # Deduplicate across datasets
804
- # result_text = deduplicate_and_prepare_results_cross(
805
- # embedding_matrix1, embedding_matrix2, texts1, texts2, threshold, progress, dataset2_name, dataset2_split
806
- # )
 
 
807
 
808
- # return result_text
809
-
810
- # finally:
811
- # # Restore original tqdm
812
- # tqdm.tqdm = original_tqdm
813
- # sys.modules['tqdm'].tqdm = original_tqdm
814
- # sys.modules['tqdm.auto'].tqdm = original_tqdm
815
-
816
- # # Restore reach's original tqdm
817
- # if original_reach_tqdm is not None:
818
- # Reach.tqdm = original_reach_tqdm
819
- # else:
820
- # del Reach.tqdm # If it wasn't originally in Reach's __dict__
821
-
822
- # def deduplicate_and_prepare_results_single(embedding_matrix, texts, threshold, progress):
823
- # # Deduplicate
824
- # deduplicated_indices, duplicate_to_original_mapping = deduplicate(
825
- # embedding_matrix, threshold, progress=progress
826
- # )
827
-
828
- # # Prepare the results
829
- # num_duplicates = len(duplicate_to_original_mapping)
830
- # num_total = len(texts)
831
- # num_deduplicated = len(deduplicated_indices)
832
-
833
- # result_text = f"**Total documents:** {num_total}\n"
834
- # result_text += f"**Number of duplicates found:** {num_duplicates}\n"
835
- # result_text += f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
836
-
837
- # # Show deduplicated examples
838
- # if num_duplicates > 0:
839
- # result_text += "**Examples of duplicates found:**\n\n"
840
- # num_examples = min(5, num_duplicates)
841
- # for duplicate_idx, original_idx in list(duplicate_to_original_mapping.items())[:num_examples]:
842
- # original_text = texts[original_idx]
843
- # duplicate_text = texts[duplicate_idx]
844
- # differences = display_word_differences(original_text, duplicate_text)
845
- # result_text += f"**Original text:**\n{original_text}\n\n"
846
- # result_text += f"**Duplicate text:**\n{duplicate_text}\n\n"
847
- # result_text += f"**Differences:**\n{differences}\n"
848
- # result_text += "-" * 50 + "\n\n"
849
- # else:
850
- # result_text += "No duplicates found."
851
-
852
- # return result_text
853
-
854
- # def deduplicate_and_prepare_results_cross(embedding_matrix1, embedding_matrix2, texts1, texts2, threshold, progress, dataset2_name, dataset2_split):
855
- # # Deduplicate across datasets
856
- # duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(
857
- # embedding_matrix1, embedding_matrix2, threshold, progress=progress
858
- # )
859
-
860
- # num_duplicates = len(duplicate_indices_in_ds2)
861
- # num_total_ds2 = len(texts2)
862
- # num_unique_ds2 = num_total_ds2 - num_duplicates
863
-
864
- # result_text = f"**Total documents in {dataset2_name}/{dataset2_split}:** {num_total_ds2}\n"
865
- # result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n"
866
- # result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
867
-
868
- # # Show deduplicated examples
869
- # if num_duplicates > 0:
870
- # result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
871
- # num_examples = min(5, num_duplicates)
872
- # for duplicate_idx in duplicate_indices_in_ds2[:num_examples]:
873
- # original_idx = duplicate_to_original_mapping[duplicate_idx]
874
- # original_text = texts1[original_idx]
875
- # duplicate_text = texts2[duplicate_idx]
876
- # differences = display_word_differences(original_text, duplicate_text)
877
- # result_text += f"**Original text (Dataset 1):**\n{original_text}\n\n"
878
- # result_text += f"**Duplicate text (Dataset 2):**\n{duplicate_text}\n\n"
879
- # result_text += f"**Differences:**\n{differences}\n"
880
- # result_text += "-" * 50 + "\n\n"
881
- # else:
882
- # result_text += "No duplicates found."
883
-
884
- # return result_text
885
-
886
- # with gr.Blocks() as demo:
887
- # gr.Markdown("# Semantic Deduplication")
888
 
889
- # deduplication_type = gr.Radio(
890
- # choices=["Single dataset", "Cross-dataset"],
891
- # label="Deduplication Type",
892
- # value="Single dataset"
893
- # )
 
894
 
895
- # with gr.Row():
896
- # dataset1_name = gr.Textbox(value=default_dataset1_name, label="Dataset 1 Name")
897
- # dataset1_split = gr.Textbox(value=default_dataset1_split, label="Dataset 1 Split")
898
- # dataset1_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
899
 
900
- # dataset2_inputs = gr.Column(visible=False)
901
- # with dataset2_inputs:
902
- # gr.Markdown("### Dataset 2")
903
- # with gr.Row():
904
- # dataset2_name = gr.Textbox(value=default_dataset2_name, label="Dataset 2 Name")
905
- # dataset2_split = gr.Textbox(value=default_dataset2_split, label="Dataset 2 Split")
906
- # dataset2_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
907
 
908
- # threshold = gr.Slider(
909
- # minimum=0.0,
910
- # maximum=1.0,
911
- # value=default_threshold,
912
- # label="Similarity Threshold"
913
- # )
914
 
915
- # compute_button = gr.Button("Compute")
 
 
 
 
 
 
916
 
917
- # output = gr.Markdown()
 
 
 
 
 
918
 
919
- # # Function to update the visibility of dataset2_inputs
920
- # def update_visibility(deduplication_type_value):
921
- # if deduplication_type_value == "Cross-dataset":
922
- # return gr.update(visible=True)
923
- # else:
924
- # return gr.update(visible=False)
925
 
926
- # deduplication_type.change(
927
- # update_visibility,
928
- # inputs=deduplication_type,
929
- # outputs=dataset2_inputs
930
- # )
931
 
932
- # compute_button.click(
933
- # fn=perform_deduplication,
934
- # inputs=[
935
- # deduplication_type,
936
- # dataset1_name,
937
- # dataset1_split,
938
- # dataset1_text_column,
939
- # dataset2_name,
940
- # dataset2_split,
941
- # dataset2_text_column,
942
- # threshold
943
- # ],
944
- # outputs=output
945
- # )
946
-
947
- # demo.launch()
948
 
 
 
 
 
 
949
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
950
 
951
 
952
- # import gradio as gr
953
- # from datasets import load_dataset
954
- # import numpy as np
955
- # from model2vec import StaticModel
956
- # from reach import Reach
957
- # from difflib import ndiff
958
- # import sys
959
- # import tqdm
960
 
961
- # # Load the model at startup
962
- # model = StaticModel.from_pretrained("minishlab/M2V_base_output")
963
 
964
- # # Load the default datasets at startup
965
- # default_dataset1_name = "ag_news"
966
- # default_dataset1_split = "train"
967
- # default_dataset2_name = "ag_news"
968
- # default_dataset2_split = "test"
 
 
969
 
970
- # ds_default1 = load_dataset(default_dataset1_name, split=default_dataset1_split)
971
- # ds_default2 = load_dataset(default_dataset2_name, split=default_dataset2_split)
 
972
 
973
- # def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[np.ndarray, dict[int, int]]:
974
- # """
975
- # Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
976
- # """
977
- # reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
 
 
978
 
979
- # deduplicated_indices = set(range(len(embedding_matrix)))
980
- # duplicate_to_original_mapping = {}
981
 
982
- # results = reach.nearest_neighbor_threshold(
983
- # embedding_matrix,
984
- # threshold=threshold,
985
- # batch_size=batch_size,
986
- # show_progressbar=True # Allow internal progress bar
987
- # )
 
 
988
 
989
- # # Process duplicates
990
- # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=len(embedding_matrix))):
991
- # if i not in deduplicated_indices:
992
- # continue
 
993
 
994
- # similar_indices = [int(item[0]) for item in similar_items if int(item[0]) != i]
995
 
996
- # for sim_idx in similar_indices:
997
- # if sim_idx in deduplicated_indices:
998
- # deduplicated_indices.remove(sim_idx)
999
- # duplicate_to_original_mapping[sim_idx] = i
1000
 
1001
- # return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
1002
 
1003
- # def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[list[int], dict[int, int]]:
1004
- # """
1005
- # Deduplicate embeddings across two datasets and return the indices of duplicates between them.
1006
- # """
1007
- # reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
 
 
1008
 
1009
- # duplicate_indices_in_test = []
1010
- # duplicate_to_original_mapping = {}
1011
 
1012
- # results = reach.nearest_neighbor_threshold(
1013
- # embedding_matrix_2,
1014
- # threshold=threshold,
1015
- # batch_size=batch_size,
1016
- # show_progressbar=True # Allow internal progress bar
1017
- # )
 
 
1018
 
1019
- # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets", total=len(embedding_matrix_2))):
1020
- # similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
 
 
1021
 
1022
- # if similar_indices:
1023
- # duplicate_indices_in_test.append(i)
1024
- # duplicate_to_original_mapping[i] = similar_indices[0]
1025
 
1026
- # return duplicate_indices_in_test, duplicate_to_original_mapping
1027
 
1028
- # def display_word_differences(x: str, y: str) -> str:
1029
- # diff = ndiff(x.split(), y.split())
1030
- # return " ".join([word for word in diff if word.startswith(('+', '-'))])
1031
 
1032
- # def perform_deduplication(
1033
- # deduplication_type,
1034
- # dataset1_name,
1035
- # dataset1_split,
1036
- # dataset1_text_column,
1037
- # dataset2_name="",
1038
- # dataset2_split="",
1039
- # dataset2_text_column="",
1040
- # threshold=0.8,
1041
- # progress=gr.Progress(track_tqdm=True)
1042
- # ):
1043
- # # Monkey-patch tqdm
1044
- # original_tqdm = tqdm.tqdm
1045
- # original_reach_tqdm = Reach.__dict__['tqdm'] if 'tqdm' in Reach.__dict__ else None
1046
- # tqdm.tqdm = progress.tqdm
1047
- # sys.modules['tqdm'].tqdm = progress.tqdm
1048
- # sys.modules['tqdm.auto'].tqdm = progress.tqdm
1049
- # Reach.tqdm = progress.tqdm # Monkey-patch reach's tqdm
1050
 
1051
- # try:
1052
- # # Convert threshold to float
1053
- # threshold = float(threshold)
1054
 
1055
- # if deduplication_type == "Single dataset":
1056
- # # Check if the dataset is the default one
1057
- # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
1058
- # ds = ds_default1
1059
- # else:
1060
- # ds = load_dataset(dataset1_name, split=dataset1_split)
1061
-
1062
- # # Extract texts
1063
- # texts = [example[dataset1_text_column] for example in ds]
1064
-
1065
- # # Compute embeddings
1066
- # embedding_matrix = model.encode(texts, show_progressbar=True) # Enable internal progress bar
1067
-
1068
- # # Deduplicate
1069
- # deduplicated_indices, duplicate_to_original_mapping = deduplicate(embedding_matrix, threshold, progress=progress)
1070
 
1071
- # # Prepare the results
1072
- # num_duplicates = len(duplicate_to_original_mapping)
1073
- # num_total = len(texts)
1074
- # num_deduplicated = len(deduplicated_indices)
1075
 
1076
- # result_text = f"**Total documents:** {num_total}\n"
1077
- # result_text += f"**Number of duplicates found:** {num_duplicates}\n"
1078
- # result_text += f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
1079
 
1080
- # # Show deduplicated examples
1081
- # result_text += "**Examples of duplicates found:**\n\n"
1082
- # num_examples = min(5, num_duplicates)
1083
- # for duplicate_idx, original_idx in list(duplicate_to_original_mapping.items())[:num_examples]:
1084
- # original_text = texts[original_idx]
1085
- # duplicate_text = texts[duplicate_idx]
1086
- # differences = display_word_differences(original_text, duplicate_text)
1087
- # result_text += f"**Original text:**\n{original_text}\n\n"
1088
- # result_text += f"**Duplicate text:**\n{duplicate_text}\n\n"
1089
- # result_text += f"**Differences:**\n{differences}\n"
1090
- # result_text += "-" * 50 + "\n\n"
1091
 
1092
- # return result_text
1093
 
1094
- # elif deduplication_type == "Cross-dataset":
1095
- # # Dataset 1
1096
- # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
1097
- # ds1 = ds_default1
1098
- # else:
1099
- # ds1 = load_dataset(dataset1_name, split=dataset1_split)
1100
-
1101
- # # Dataset 2
1102
- # if dataset2_name == default_dataset2_name and dataset2_split == default_dataset2_split:
1103
- # ds2 = ds_default2
1104
- # else:
1105
- # ds2 = load_dataset(dataset2_name, split=dataset2_split)
1106
 
1107
- # # Extract texts
1108
- # texts1 = [example[dataset1_text_column] for example in ds1]
1109
- # texts2 = [example[dataset2_text_column] for example in ds2]
 
 
 
1110
 
1111
- # # Compute embeddings
1112
- # embedding_matrix1 = model.encode(texts1, show_progressbar=True) # Enable internal progress bar
1113
- # embedding_matrix2 = model.encode(texts2, show_progressbar=True) # Enable internal progress bar
1114
 
1115
- # # Deduplicate across datasets
1116
- # duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(
1117
- # embedding_matrix1, embedding_matrix2, threshold, progress=progress)
1118
 
1119
- # num_duplicates = len(duplicate_indices_in_ds2)
1120
- # num_total_ds2 = len(texts2)
1121
- # num_unique_ds2 = num_total_ds2 - num_duplicates
1122
 
1123
- # result_text = f"**Total documents in {dataset2_name}/{dataset2_split}:** {num_total_ds2}\n"
1124
- # result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n"
1125
- # result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
1126
 
1127
- # # Show deduplicated examples
1128
- # result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
1129
- # num_examples = min(5, num_duplicates)
1130
- # for duplicate_idx in duplicate_indices_in_ds2[:num_examples]:
1131
- # original_idx = duplicate_to_original_mapping[duplicate_idx]
1132
- # original_text = texts1[original_idx]
1133
- # duplicate_text = texts2[duplicate_idx]
1134
- # differences = display_word_differences(original_text, duplicate_text)
1135
- # result_text += f"**Original text (Dataset 1):**\n{original_text}\n\n"
1136
- # result_text += f"**Duplicate text (Dataset 2):**\n{duplicate_text}\n\n"
1137
- # result_text += f"**Differences:**\n{differences}\n"
1138
- # result_text += "-" * 50 + "\n\n"
1139
 
1140
- # return result_text
1141
 
1142
- # finally:
1143
- # # Restore original tqdm
1144
- # tqdm.tqdm = original_tqdm
1145
- # sys.modules['tqdm'].tqdm = original_tqdm
1146
- # sys.modules['tqdm.auto'].tqdm = original_tqdm
1147
 
1148
- # # Restore reach's original tqdm
1149
- # if original_reach_tqdm is not None:
1150
- # Reach.tqdm = original_reach_tqdm
1151
- # else:
1152
- # del Reach.tqdm # If it wasn't originally in Reach's __dict__
1153
 
1154
- # with gr.Blocks() as demo:
1155
- # gr.Markdown("# Semantic Deduplication")
1156
-
1157
- # deduplication_type = gr.Radio(
1158
- # choices=["Single dataset", "Cross-dataset"],
1159
- # label="Deduplication Type",
1160
- # value="Single dataset"
1161
- # )
1162
 
1163
- # with gr.Row():
1164
- # dataset1_name = gr.Textbox(value="ag_news", label="Dataset 1 Name")
1165
- # dataset1_split = gr.Textbox(value="train", label="Dataset 1 Split")
1166
- # dataset1_text_column = gr.Textbox(value="text", label="Text Column Name")
1167
 
1168
- # dataset2_inputs = gr.Column(visible=False)
1169
- # with dataset2_inputs:
1170
- # gr.Markdown("### Dataset 2")
1171
- # with gr.Row():
1172
- # dataset2_name = gr.Textbox(value="ag_news", label="Dataset 2 Name")
1173
- # dataset2_split = gr.Textbox(value="test", label="Dataset 2 Split")
1174
- # dataset2_text_column = gr.Textbox(value="text", label="Text Column Name")
1175
 
1176
- # threshold = gr.Slider(
1177
- # minimum=0.0,
1178
- # maximum=1.0,
1179
- # value=0.8,
1180
- # label="Similarity Threshold"
1181
- # )
 
 
 
 
 
 
 
 
1182
 
1183
- # compute_button = gr.Button("Compute")
 
 
 
 
 
 
1184
 
1185
- # output = gr.Markdown()
 
 
1186
 
1187
- # # Function to update the visibility of dataset2_inputs
1188
- # def update_visibility(deduplication_type_value):
1189
- # if deduplication_type_value == "Cross-dataset":
1190
- # return gr.update(visible=True)
1191
- # else:
1192
- # return gr.update(visible=False)
1193
 
1194
- # deduplication_type.change(
1195
- # update_visibility,
1196
- # inputs=deduplication_type,
1197
- # outputs=dataset2_inputs
1198
- # )
 
 
 
 
 
 
 
 
 
 
1199
 
1200
- # compute_button.click(
1201
- # fn=perform_deduplication,
1202
- # inputs=[
1203
- # deduplication_type,
1204
- # dataset1_name,
1205
- # dataset1_split,
1206
- # dataset1_text_column,
1207
- # dataset2_name,
1208
- # dataset2_split,
1209
- # dataset2_text_column,
1210
- # threshold
1211
- # ],
1212
- # outputs=output
1213
- # )
1214
 
1215
- # demo.launch()
 
 
1216
 
1217
 
1218
  # # import gradio as gr
@@ -1253,7 +1334,7 @@ demo.launch()
1253
  # # )
1254
 
1255
  # # # Process duplicates
1256
- # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates")):
1257
  # # if i not in deduplicated_indices:
1258
  # # continue
1259
 
@@ -1282,8 +1363,7 @@ demo.launch()
1282
  # # show_progressbar=True # Allow internal progress bar
1283
  # # )
1284
 
1285
- # # # Process duplicates
1286
- # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets")):
1287
  # # similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
1288
 
1289
  # # if similar_indices:
@@ -1309,9 +1389,11 @@ demo.launch()
1309
  # # ):
1310
  # # # Monkey-patch tqdm
1311
  # # original_tqdm = tqdm.tqdm
 
1312
  # # tqdm.tqdm = progress.tqdm
1313
  # # sys.modules['tqdm'].tqdm = progress.tqdm
1314
  # # sys.modules['tqdm.auto'].tqdm = progress.tqdm
 
1315
 
1316
  # # try:
1317
  # # # Convert threshold to float
@@ -1378,7 +1460,8 @@ demo.launch()
1378
  # # embedding_matrix2 = model.encode(texts2, show_progressbar=True) # Enable internal progress bar
1379
 
1380
  # # # Deduplicate across datasets
1381
- # # duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(embedding_matrix1, embedding_matrix2, threshold, progress=progress)
 
1382
 
1383
  # # num_duplicates = len(duplicate_indices_in_ds2)
1384
  # # num_total_ds2 = len(texts2)
@@ -1409,6 +1492,12 @@ demo.launch()
1409
  # # sys.modules['tqdm'].tqdm = original_tqdm
1410
  # # sys.modules['tqdm.auto'].tqdm = original_tqdm
1411
 
 
 
 
 
 
 
1412
  # # with gr.Blocks() as demo:
1413
  # # gr.Markdown("# Semantic Deduplication")
1414
 
@@ -1471,3 +1560,261 @@ demo.launch()
1471
  # # )
1472
 
1473
  # # demo.launch()
      for i in range(0, len(iterable), batch_size):
          yield iterable[i:i + batch_size]
 
  def display_word_differences(x: str, y: str) -> str:
      diff = ndiff(x.split(), y.split())
      return " ".join([word for word in diff if word.startswith(('+', '-'))])
 
  # Compute embeddings
  status = "Computing embeddings for Dataset 1..."
  yield status, ""
+ embeddings = []
+ batch_size = 64
+ total_batches = (len(texts) + batch_size - 1) // batch_size
+ for batch_texts in progress.tqdm(batch_iterable(texts, batch_size), desc="Computing embeddings", total=total_batches):
+     batch_embeddings = model.encode(batch_texts, show_progressbar=False)
+     embeddings.append(batch_embeddings)
+ embedding_matrix = np.concatenate(embeddings, axis=0)
 
  # Deduplicate
  status = "Deduplicating embeddings..."
 
  # Compute embeddings for Dataset 1
  status = "Computing embeddings for Dataset 1..."
  yield status, ""
+ embeddings1 = []
+ batch_size = 64
+ total_batches1 = (len(texts1) + batch_size - 1) // batch_size
+ for batch_texts in progress.tqdm(batch_iterable(texts1, batch_size), desc="Computing embeddings for Dataset 1", total=total_batches1):
+     batch_embeddings = model.encode(batch_texts, show_progressbar=False)
+     embeddings1.append(batch_embeddings)
+ embedding_matrix1 = np.concatenate(embeddings1, axis=0)
 
  # Compute embeddings for Dataset 2
  status = "Computing embeddings for Dataset 2..."
  yield status, ""
+ embeddings2 = []
+ total_batches2 = (len(texts2) + batch_size - 1) // batch_size
+ for batch_texts in progress.tqdm(batch_iterable(texts2, batch_size), desc="Computing embeddings for Dataset 2", total=total_batches2):
+     batch_embeddings = model.encode(batch_texts, show_progressbar=False)
+     embeddings2.append(batch_embeddings)
+ embedding_matrix2 = np.concatenate(embeddings2, axis=0)
 
  # Deduplicate across datasets
  status = "Deduplicating embeddings across datasets..."
 
  yield f"An error occurred: {e}", ""
  raise e
 
+ def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[np.ndarray, dict[int, int]]:
+     """
+     Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
+     """
+     # Building the index
+     progress(0, desc="Building search index...")
+     reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
+
+     deduplicated_indices = set(range(len(embedding_matrix)))
+     duplicate_to_original_mapping = {}
+
+     # Finding nearest neighbors
+     progress(0, desc="Finding nearest neighbors...")
+     results = reach.nearest_neighbor_threshold(
+         embedding_matrix,
+         threshold=threshold,
+         batch_size=batch_size,
+         show_progressbar=False  # Disable internal progress bar
+     )
+
+     # Processing duplicates with a progress bar
+     total_items = len(embedding_matrix)
+     for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=total_items)):
+         if i not in deduplicated_indices:
+             continue
+
+         similar_indices = [int(item[0]) for item in similar_items if int(item[0]) != i]
+
+         for sim_idx in similar_indices:
+             if sim_idx in deduplicated_indices:
+                 deduplicated_indices.remove(sim_idx)
+                 duplicate_to_original_mapping[sim_idx] = i
+
+     return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
+
+ def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[list[int], dict[int, int]]:
+     """
+     Deduplicate embeddings across two datasets and return the indices of duplicates between them.
+     """
+     # Building the index from Dataset 1
+     progress(0, desc="Building search index from Dataset 1...")
+     reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
+
+     duplicate_indices_in_test = []
+     duplicate_to_original_mapping = {}
+
+     # Finding nearest neighbors between datasets
+     progress(0, desc="Finding nearest neighbors between datasets...")
+     results = reach.nearest_neighbor_threshold(
+         embedding_matrix_2,
+         threshold=threshold,
+         batch_size=batch_size,
+         show_progressbar=False  # Disable internal progress bar
+     )
+
+     total_items = len(embedding_matrix_2)
+     # Processing duplicates with a progress bar
+     for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets", total=total_items)):
+         similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
+
+         if similar_indices:
+             duplicate_indices_in_test.append(i)
+             duplicate_to_original_mapping[i] = similar_indices[0]
+
+     return duplicate_indices_in_test, duplicate_to_original_mapping
+
  with gr.Blocks() as demo:
      gr.Markdown("# Semantic Deduplication")
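The deduplicate helpers added above expect a Gradio-style progress object: something callable as progress(0, desc=...) that also exposes a .tqdm wrapper. Below is a rough sketch of exercising deduplicate outside the UI; the no-op stand-in class and the synthetic embeddings are assumptions made only for this example.

import numpy as np

class NoProgress:
    # Minimal stand-in for gr.Progress: callable, plus a pass-through .tqdm.
    def __call__(self, *args, **kwargs):
        pass
    def tqdm(self, iterable, **kwargs):
        return iterable

rng = np.random.default_rng(0)
embedding_matrix = rng.normal(size=(100, 64)).astype(np.float32)
embedding_matrix[1] = embedding_matrix[0]  # plant one exact duplicate

# deduplicate() is the function defined in app.py above.
kept_indices, dup_map = deduplicate(embedding_matrix, threshold=0.9, progress=NoProgress())
print(len(kept_indices))  # expected: 99
print(dup_map)            # expected: {1: 0}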
 
 
327
 
328
 
329
 
 
330
  # import gradio as gr
331
  # from datasets import load_dataset
332
  # import numpy as np
333
  # from model2vec import StaticModel
334
  # from reach import Reach
335
  # from difflib import ndiff
 
336
  # import tqdm
337
 
338
  # # Load the model at startup
 
350
  # ds_default1 = load_dataset(default_dataset1_name, split=default_dataset1_split)
351
  # ds_default2 = load_dataset(default_dataset2_name, split=default_dataset2_split)
352
 
353
+ # def batch_iterable(iterable, batch_size):
354
+ # """Helper function to create batches from an iterable."""
355
+ # for i in range(0, len(iterable), batch_size):
356
+ # yield iterable[i:i + batch_size]
357
+
358
+ # def compute_embeddings(texts, batch_size, progress, desc="Computing embeddings"):
359
+ # embeddings = []
360
+ # for batch in progress.tqdm(batch_iterable(texts, batch_size), total=(len(texts) + batch_size - 1) // batch_size, desc=desc):
361
+ # batch_embeddings = model.encode(batch, show_progressbar=False)
362
+ # embeddings.append(batch_embeddings)
363
+ # return np.concatenate(embeddings, axis=0)
364
+
365
+ # def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[np.ndarray, dict[int, int]]:
366
  # """
367
  # Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
368
  # """
369
  # # Building the index
370
+ # progress(0, desc="Building search index...")
371
  # reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
372
 
373
  # deduplicated_indices = set(range(len(embedding_matrix)))
374
  # duplicate_to_original_mapping = {}
375
 
376
  # # Finding nearest neighbors
377
+ # progress(0, desc="Finding nearest neighbors...")
378
  # results = reach.nearest_neighbor_threshold(
379
  # embedding_matrix,
380
  # threshold=threshold,
381
  # batch_size=batch_size,
382
+ # show_progressbar=False # Disable internal progress bar
383
  # )
384
 
385
+ # # Processing duplicates with a progress bar
386
+ # total_items = len(embedding_matrix)
387
+ # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=total_items)):
388
  # if i not in deduplicated_indices:
389
  # continue
390
 
 
397
 
398
  # return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
399
 
400
+ # def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[list[int], dict[int, int]]:
401
  # """
402
  # Deduplicate embeddings across two datasets and return the indices of duplicates between them.
403
  # """
404
  # # Building the index from Dataset 1
405
+ # progress(0, desc="Building search index from Dataset 1...")
406
  # reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
407
 
408
  # duplicate_indices_in_test = []
409
  # duplicate_to_original_mapping = {}
410
 
411
  # # Finding nearest neighbors between datasets
412
+ # progress(0, desc="Finding nearest neighbors between datasets...")
413
  # results = reach.nearest_neighbor_threshold(
414
  # embedding_matrix_2,
415
  # threshold=threshold,
416
  # batch_size=batch_size,
417
+ # show_progressbar=False # Disable internal progress bar
418
  # )
419
 
420
+ # total_items = len(embedding_matrix_2)
421
+ # # Processing duplicates with a progress bar
422
+ # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets", total=total_items)):
423
  # similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
424
 
425
  # if similar_indices:
 
443
  # threshold=default_threshold,
444
  # progress=gr.Progress(track_tqdm=True)
445
  # ):
 
 
 
 
 
 
 
446
  # try:
447
  # # Convert threshold to float
448
  # threshold = float(threshold)
449
+
450
  # # Initialize status message
451
  # status = ""
452
 
 
458
  # ds = ds_default1
459
  # else:
460
  # ds = load_dataset(dataset1_name, split=dataset1_split)
461
+
462
  # # Extract texts
463
  # status = "Extracting texts from Dataset 1..."
464
  # yield status, ""
465
  # texts = [example[dataset1_text_column] for example in ds]
466
+
467
  # # Compute embeddings
468
  # status = "Computing embeddings for Dataset 1..."
469
  # yield status, ""
470
+ # embedding_matrix = compute_embeddings(texts, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 1")
471
+
472
  # # Deduplicate
473
  # status = "Deduplicating embeddings..."
474
  # yield status, ""
475
  # deduplicated_indices, duplicate_to_original_mapping = deduplicate(
476
+ # embedding_matrix, threshold, progress=progress
477
  # )
478
+
479
  # # Prepare the results
480
  # num_duplicates = len(duplicate_to_original_mapping)
481
  # num_total = len(texts)
482
  # num_deduplicated = len(deduplicated_indices)
483
+
484
  # result_text = f"**Total documents:** {num_total}\n"
485
  # result_text += f"**Number of duplicates found:** {num_duplicates}\n"
486
  # result_text += f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
487
+
488
  # # Show deduplicated examples
489
  # if num_duplicates > 0:
490
  # result_text += "**Examples of duplicates found:**\n\n"
 
499
  # result_text += "-" * 50 + "\n\n"
500
  # else:
501
  # result_text += "No duplicates found."
502
+
503
  # # Final status
504
  # status = "Deduplication completed."
505
  # yield status, result_text
506
+
507
  # elif deduplication_type == "Cross-dataset":
508
  # # Load Dataset 1
509
  # status = "Loading Dataset 1..."
 
512
  # ds1 = ds_default1
513
  # else:
514
  # ds1 = load_dataset(dataset1_name, split=dataset1_split)
515
+
516
  # # Load Dataset 2
517
  # status = "Loading Dataset 2..."
518
  # yield status, ""
 
520
  # ds2 = ds_default2
521
  # else:
522
  # ds2 = load_dataset(dataset2_name, split=dataset2_split)
523
+
524
  # # Extract texts from Dataset 1
525
  # status = "Extracting texts from Dataset 1..."
526
  # yield status, ""
527
  # texts1 = [example[dataset1_text_column] for example in ds1]
528
+
529
  # # Extract texts from Dataset 2
530
  # status = "Extracting texts from Dataset 2..."
531
  # yield status, ""
532
  # texts2 = [example[dataset2_text_column] for example in ds2]
533
+
534
  # # Compute embeddings for Dataset 1
535
  # status = "Computing embeddings for Dataset 1..."
536
  # yield status, ""
537
+ # embedding_matrix1 = compute_embeddings(texts1, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 1")
538
+
539
  # # Compute embeddings for Dataset 2
540
  # status = "Computing embeddings for Dataset 2..."
541
  # yield status, ""
542
+ # embedding_matrix2 = compute_embeddings(texts2, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 2")
543
+
544
  # # Deduplicate across datasets
545
  # status = "Deduplicating embeddings across datasets..."
546
  # yield status, ""
547
  # duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(
548
+ # embedding_matrix1, embedding_matrix2, threshold, progress=progress
549
  # )
550
+
551
  # num_duplicates = len(duplicate_indices_in_ds2)
552
  # num_total_ds2 = len(texts2)
553
  # num_unique_ds2 = num_total_ds2 - num_duplicates
554
+
555
+ # result_text = f"**Total documents in {dataset2_name}/{dataset2_split}:** {num_total_ds2}\n\n"
556
+ # result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n\n"
557
  # result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
558
+
559
  # # Show deduplicated examples
560
  # if num_duplicates > 0:
561
  # result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
 
571
  # result_text += "-" * 50 + "\n\n"
572
  # else:
573
  # result_text += "No duplicates found."
574
+
575
  # # Final status
576
  # status = "Deduplication completed."
577
  # yield status, result_text
578
 
579
+ # except Exception as e:
580
+ # yield f"An error occurred: {e}", ""
581
+ # raise e
 
 
 
582
 
583
  # with gr.Blocks() as demo:
584
  # gr.Markdown("# Semantic Deduplication")
 
630
  # compute_button.click(
631
  # fn=perform_deduplication,
632
  # inputs=[
633
+ # deduplication_type,
634
+ # dataset1_name,
635
+ # dataset1_split,
636
  # dataset1_text_column,
637
+ # dataset2_name,
638
+ # dataset2_split,
639
  # dataset2_text_column,
640
  # threshold
641
  # ],
642
  # outputs=[status_output, result_output]
643
  # )
644
+
645
  # demo.launch()
667
+
668
+ # # import gradio as gr
669
+ # # from datasets import load_dataset
670
+ # # import numpy as np
671
+ # # from model2vec import StaticModel
672
+ # # from reach import Reach
673
+ # # from difflib import ndiff
674
+ # # import sys
675
+ # # import tqdm
676
+
677
+ # # # Load the model at startup
678
+ # # model = StaticModel.from_pretrained("minishlab/M2V_base_output")
679
+
680
+ # # # Update default dataset to 'sst2' and set default threshold to 0.9
681
+ # # default_dataset1_name = "sst2"
682
+ # # default_dataset1_split = "train"
683
+ # # default_dataset2_name = "sst2"
684
+ # # default_dataset2_split = "validation"
685
+ # # default_text_column = "sentence"
686
+ # # default_threshold = 0.9
687
+
688
+ # # # Load the default datasets at startup
689
+ # # ds_default1 = load_dataset(default_dataset1_name, split=default_dataset1_split)
690
+ # # ds_default2 = load_dataset(default_dataset2_name, split=default_dataset2_split)
691
+
692
+ # # def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024) -> tuple[np.ndarray, dict[int, int]]:
693
+ # # """
694
+ # # Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
695
+ # # """
696
+ # # # Building the index
697
+ # # reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
698
+
699
+ # # deduplicated_indices = set(range(len(embedding_matrix)))
700
+ # # duplicate_to_original_mapping = {}
701
+
702
+ # # # Finding nearest neighbors
703
+ # # results = reach.nearest_neighbor_threshold(
704
+ # # embedding_matrix,
705
+ # # threshold=threshold,
706
+ # # batch_size=batch_size,
707
+ # # show_progressbar=True # Allow internal progress bar
708
+ # # )
709
+
710
+ # # # Processing duplicates
711
+ # # for i, similar_items in enumerate(results):
712
+ # # if i not in deduplicated_indices:
713
+ # # continue
714
+
715
+ # # similar_indices = [int(item[0]) for item in similar_items if int(item[0]) != i]
716
+
717
+ # # for sim_idx in similar_indices:
718
+ # # if sim_idx in deduplicated_indices:
719
+ # # deduplicated_indices.remove(sim_idx)
720
+ # # duplicate_to_original_mapping[sim_idx] = i
721
+
722
+ # # return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
723
+
724
+ # # def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024) -> tuple[list[int], dict[int, int]]:
725
+ # # """
726
+ # # Deduplicate embeddings across two datasets and return the indices of duplicates between them.
727
+ # # """
728
+ # # # Building the index from Dataset 1
729
+ # # reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
730
+
731
+ # # duplicate_indices_in_test = []
732
+ # # duplicate_to_original_mapping = {}
733
+
734
+ # # # Finding nearest neighbors between datasets
735
+ # # results = reach.nearest_neighbor_threshold(
736
+ # # embedding_matrix_2,
737
+ # # threshold=threshold,
738
+ # # batch_size=batch_size,
739
+ # # show_progressbar=True # Allow internal progress bar
740
+ # # )
741
+
742
+ # # # Processing duplicates
743
+ # # for i, similar_items in enumerate(results):
744
+ # # similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
745
+
746
+ # # if similar_indices:
747
+ # # duplicate_indices_in_test.append(i)
748
+ # # duplicate_to_original_mapping[i] = similar_indices[0]
749
+
750
+ # # return duplicate_indices_in_test, duplicate_to_original_mapping
751
+
752
+ # # def display_word_differences(x: str, y: str) -> str:
753
+ # # diff = ndiff(x.split(), y.split())
754
+ # # return " ".join([word for word in diff if word.startswith(('+', '-'))])
755
+
756
+ # # def perform_deduplication(
757
+ # # deduplication_type,
758
+ # # dataset1_name,
759
+ # # dataset1_split,
760
+ # # dataset1_text_column,
761
+ # # dataset2_name="",
762
+ # # dataset2_split="",
763
+ # # dataset2_text_column="",
764
+ # # threshold=default_threshold,
765
+ # # progress=gr.Progress(track_tqdm=True)
766
+ # # ):
767
+ # # # Deep Monkey-Patching of tqdm
768
+ # # original_tqdm = tqdm.tqdm
769
+ # # tqdm.tqdm = progress.tqdm
770
+ # # for mod_name in list(sys.modules.keys()):
771
+ # # if 'tqdm' in mod_name:
772
+ # # sys.modules[mod_name].tqdm = progress.tqdm
773
+
774
+ # # try:
775
+ # # # Convert threshold to float
776
+ # # threshold = float(threshold)
777
 
778
+ # # # Initialize status message
779
+ # # status = ""
780
+
781
+ # # if deduplication_type == "Single dataset":
782
+ # # # Load Dataset 1
783
+ # # status = "Loading Dataset 1..."
784
+ # # yield status, ""
785
+ # # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
786
+ # # ds = ds_default1
787
+ # # else:
788
+ # # ds = load_dataset(dataset1_name, split=dataset1_split)
789
 
790
+ # # # Extract texts
791
+ # # status = "Extracting texts from Dataset 1..."
792
+ # # yield status, ""
793
+ # # texts = [example[dataset1_text_column] for example in ds]
794
 
795
+ # # # Compute embeddings
796
+ # # status = "Computing embeddings for Dataset 1..."
797
+ # # yield status, ""
798
+ # # embedding_matrix = model.encode(texts, show_progressbar=True) # Enable internal progress bar
799
 
800
+ # # # Deduplicate
801
+ # # status = "Deduplicating embeddings..."
802
+ # # yield status, ""
803
+ # # deduplicated_indices, duplicate_to_original_mapping = deduplicate(
804
+ # # embedding_matrix, threshold
805
+ # # )
806
+
807
+ # # # Prepare the results
808
+ # # num_duplicates = len(duplicate_to_original_mapping)
809
+ # # num_total = len(texts)
810
+ # # num_deduplicated = len(deduplicated_indices)
811
+
812
+ # # result_text = f"**Total documents:** {num_total}\n"
813
+ # # result_text += f"**Number of duplicates found:** {num_duplicates}\n"
814
+ # # result_text += f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
815
 
816
+ # # # Show deduplicated examples
817
+ # # if num_duplicates > 0:
818
+ # # result_text += "**Examples of duplicates found:**\n\n"
819
+ # # num_examples = min(5, num_duplicates)
820
+ # # for duplicate_idx, original_idx in list(duplicate_to_original_mapping.items())[:num_examples]:
821
+ # # original_text = texts[original_idx]
822
+ # # duplicate_text = texts[duplicate_idx]
823
+ # # differences = display_word_differences(original_text, duplicate_text)
824
+ # # result_text += f"**Original text:**\n{original_text}\n\n"
825
+ # # result_text += f"**Duplicate text:**\n{duplicate_text}\n\n"
826
+ # # result_text += f"**Differences:**\n{differences}\n"
827
+ # # result_text += "-" * 50 + "\n\n"
828
+ # # else:
829
+ # # result_text += "No duplicates found."
830
+
831
+ # # # Final status
832
+ # # status = "Deduplication completed."
833
+ # # yield status, result_text
834
 
835
+ # # elif deduplication_type == "Cross-dataset":
836
+ # # # Load Dataset 1
837
+ # # status = "Loading Dataset 1..."
838
+ # # yield status, ""
839
+ # # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
840
+ # # ds1 = ds_default1
841
+ # # else:
842
+ # # ds1 = load_dataset(dataset1_name, split=dataset1_split)
843
 
844
+ # # # Load Dataset 2
845
+ # # status = "Loading Dataset 2..."
846
+ # # yield status, ""
847
+ # # if dataset2_name == default_dataset2_name and dataset2_split == default_dataset2_split:
848
+ # # ds2 = ds_default2
849
+ # # else:
850
+ # # ds2 = load_dataset(dataset2_name, split=dataset2_split)
851
 
852
+ # # # Extract texts from Dataset 1
853
+ # # status = "Extracting texts from Dataset 1..."
854
+ # # yield status, ""
855
+ # # texts1 = [example[dataset1_text_column] for example in ds1]
856
 
857
+ # # # Extract texts from Dataset 2
858
+ # # status = "Extracting texts from Dataset 2..."
859
+ # # yield status, ""
860
+ # # texts2 = [example[dataset2_text_column] for example in ds2]
861
 
862
+ # # # Compute embeddings for Dataset 1
863
+ # # status = "Computing embeddings for Dataset 1..."
864
+ # # yield status, ""
865
+ # # embedding_matrix1 = model.encode(texts1, show_progressbar=True)
866
 
867
+ # # # Compute embeddings for Dataset 2
868
+ # # status = "Computing embeddings for Dataset 2..."
869
+ # # yield status, ""
870
+ # # embedding_matrix2 = model.encode(texts2, show_progressbar=True)
871
 
872
+ # # # Deduplicate across datasets
873
+ # # status = "Deduplicating embeddings across datasets..."
874
+ # # yield status, ""
875
+ # # duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(
876
+ # # embedding_matrix1, embedding_matrix2, threshold
877
+ # # )
878
 
879
+ # # num_duplicates = len(duplicate_indices_in_ds2)
880
+ # # num_total_ds2 = len(texts2)
881
+ # # num_unique_ds2 = num_total_ds2 - num_duplicates
882
+
883
+ # # result_text = f"**Total documents in {dataset2_name}/{dataset2_split}:** {num_total_ds2}\n"
884
+ # # result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n"
885
+ # # result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
886
+
887
+ # # # Show deduplicated examples
888
+ # # if num_duplicates > 0:
889
+ # # result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
890
+ # # num_examples = min(5, num_duplicates)
891
+ # # for duplicate_idx in duplicate_indices_in_ds2[:num_examples]:
892
+ # # original_idx = duplicate_to_original_mapping[duplicate_idx]
893
+ # # original_text = texts1[original_idx]
894
+ # # duplicate_text = texts2[duplicate_idx]
895
+ # # differences = display_word_differences(original_text, duplicate_text)
896
+ # # result_text += f"**Original text (Dataset 1):**\n{original_text}\n\n"
897
+ # # result_text += f"**Duplicate text (Dataset 2):**\n{duplicate_text}\n\n"
898
+ # # result_text += f"**Differences:**\n{differences}\n"
899
+ # # result_text += "-" * 50 + "\n\n"
900
+ # # else:
901
+ # # result_text += "No duplicates found."
902
+
903
+ # # # Final status
904
+ # # status = "Deduplication completed."
905
+ # # yield status, result_text
 
906
 
907
+ # # finally:
908
+ # # # Restore original tqdm
909
+ # # tqdm.tqdm = original_tqdm
910
+ # # for mod_name in list(sys.modules.keys()):
911
+ # # if 'tqdm' in mod_name:
912
+ # # sys.modules[mod_name].tqdm = original_tqdm
913
 
914
+ # # with gr.Blocks() as demo:
915
+ # # gr.Markdown("# Semantic Deduplication")
 
 
916
 
917
+ # # deduplication_type = gr.Radio(
918
+ # # choices=["Single dataset", "Cross-dataset"],
919
+ # # label="Deduplication Type",
920
+ # # value="Single dataset"
921
+ # # )
 
 
922
 
923
+ # # with gr.Row():
924
+ # # dataset1_name = gr.Textbox(value=default_dataset1_name, label="Dataset 1 Name")
925
+ # # dataset1_split = gr.Textbox(value=default_dataset1_split, label="Dataset 1 Split")
926
+ # # dataset1_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
 
 
927
 
928
+ # # dataset2_inputs = gr.Column(visible=False)
929
+ # # with dataset2_inputs:
930
+ # # gr.Markdown("### Dataset 2")
931
+ # # with gr.Row():
932
+ # # dataset2_name = gr.Textbox(value=default_dataset2_name, label="Dataset 2 Name")
933
+ # # dataset2_split = gr.Textbox(value=default_dataset2_split, label="Dataset 2 Split")
934
+ # # dataset2_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
935
 
936
+ # # threshold = gr.Slider(
937
+ # # minimum=0.0,
938
+ # # maximum=1.0,
939
+ # # value=default_threshold,
940
+ # # label="Similarity Threshold"
941
+ # # )
942
 
943
+ # # compute_button = gr.Button("Compute")
 
 
 
 
 
944
 
945
+ # # status_output = gr.Markdown()
946
+ # # result_output = gr.Markdown()
 
 
 
947
 
948
+ # # # Function to update the visibility of dataset2_inputs
949
+ # # def update_visibility(deduplication_type_value):
950
+ # # if deduplication_type_value == "Cross-dataset":
951
+ # # return gr.update(visible=True)
952
+ # # else:
953
+ # # return gr.update(visible=False)
 
 
 
 
 
 
 
 
 
 
954
 
955
+ # # deduplication_type.change(
956
+ # # update_visibility,
957
+ # # inputs=deduplication_type,
958
+ # # outputs=dataset2_inputs
959
+ # # )
960
 
961
+ # # compute_button.click(
962
+ # # fn=perform_deduplication,
963
+ # # inputs=[
964
+ # # deduplication_type,
965
+ # # dataset1_name,
966
+ # # dataset1_split,
967
+ # # dataset1_text_column,
968
+ # # dataset2_name,
969
+ # # dataset2_split,
970
+ # # dataset2_text_column,
971
+ # # threshold
972
+ # # ],
973
+ # # outputs=[status_output, result_output]
974
+ # # )
975
+
976
+ # # demo.launch()
977
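The draft above differs from the others mainly in how it reports progress: before running, it swaps the module-level tqdm (and reach's reference to it) for Gradio's progress.tqdm, and restores the originals in a finally block. Factored out of the handler, that pattern amounts to the helper below; the helper name and the work callable are illustrative, not part of the app:

import sys
import tqdm
import gradio as gr

def run_with_gradio_progress(work, progress: gr.Progress):
    # Route tqdm.tqdm (and tqdm.auto.tqdm, if it has been imported) to Gradio's
    # progress tracker for the duration of `work`, then restore the originals.
    original_tqdm = tqdm.tqdm
    auto_module = sys.modules.get("tqdm.auto")
    original_auto = getattr(auto_module, "tqdm", None)
    tqdm.tqdm = progress.tqdm
    if auto_module is not None:
        auto_module.tqdm = progress.tqdm
    try:
        return work()
    finally:
        tqdm.tqdm = original_tqdm
        if auto_module is not None and original_auto is not None:
            auto_module.tqdm = original_auto

Inside an event handler declared with progress=gr.Progress(track_tqdm=True), calling run_with_gradio_progress(lambda: model.encode(texts, show_progressbar=True), progress) would let any tqdm loop inside the call report into the UI.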
 
978
 
979
+ # # import gradio as gr
980
+ # # from datasets import load_dataset
981
+ # # import numpy as np
982
+ # # from model2vec import StaticModel
983
+ # # from reach import Reach
984
+ # # from difflib import ndiff
985
+ # # import sys
986
+ # # import tqdm
987
 
988
+ # # # Load the model at startup
989
+ # # model = StaticModel.from_pretrained("minishlab/M2V_base_output")
990
 
991
+ # # # Update default dataset to 'sst2' and set default threshold to 0.9
992
+ # # default_dataset1_name = "sst2"
993
+ # # default_dataset1_split = "train"
994
+ # # default_dataset2_name = "sst2"
995
+ # # default_dataset2_split = "validation"
996
+ # # default_text_column = "sentence"
997
+ # # default_threshold = 0.9
998
 
999
+ # # # Load the default datasets at startup
1000
+ # # ds_default1 = load_dataset(default_dataset1_name, split=default_dataset1_split)
1001
+ # # ds_default2 = load_dataset(default_dataset2_name, split=default_dataset2_split)
1002
 
1003
+ # # def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[np.ndarray, dict[int, int]]:
1004
+ # # """
1005
+ # # Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
1006
+ # # """
1007
+ # # # Update progress to indicate building the index
1008
+ # # progress(0, desc="Building search index...")
1009
+ # # reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
1010
 
1011
+ # # deduplicated_indices = set(range(len(embedding_matrix)))
1012
+ # # duplicate_to_original_mapping = {}
1013
 
1014
+ # # # Finding nearest neighbors
1015
+ # # progress(0, desc="Finding nearest neighbors...")
1016
+ # # results = reach.nearest_neighbor_threshold(
1017
+ # # embedding_matrix,
1018
+ # # threshold=threshold,
1019
+ # # batch_size=batch_size,
1020
+ # # show_progressbar=True # Allow internal progress bar
1021
+ # # )
1022
 
1023
+ # # # Processing duplicates with a progress bar
1024
+ # # total_items = len(embedding_matrix)
1025
+ # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=total_items)):
1026
+ # # if i not in deduplicated_indices:
1027
+ # # continue
1028
 
1029
+ # # similar_indices = [int(item[0]) for item in similar_items if int(item[0]) != i]
1030
 
1031
+ # # for sim_idx in similar_indices:
1032
+ # # if sim_idx in deduplicated_indices:
1033
+ # # deduplicated_indices.remove(sim_idx)
1034
+ # # duplicate_to_original_mapping[sim_idx] = i
1035
 
1036
+ # # return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
1037
 
1038
+ # # def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[list[int], dict[int, int]]:
1039
+ # # """
1040
+ # # Deduplicate embeddings across two datasets and return the indices of duplicates between them.
1041
+ # # """
1042
+ # # # Update progress to indicate building the index
1043
+ # # progress(0, desc="Building search index from Dataset 1...")
1044
+ # # reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
1045
 
1046
+ # # duplicate_indices_in_test = []
1047
+ # # duplicate_to_original_mapping = {}
1048
 
1049
+ # # # Finding nearest neighbors between datasets
1050
+ # # progress(0, desc="Finding nearest neighbors between datasets...")
1051
+ # # results = reach.nearest_neighbor_threshold(
1052
+ # # embedding_matrix_2,
1053
+ # # threshold=threshold,
1054
+ # # batch_size=batch_size,
1055
+ # # show_progressbar=True # Allow internal progress bar
1056
+ # # )
1057
 
1058
+ # # total_items = len(embedding_matrix_2)
1059
+ # # # Processing duplicates with a progress bar
1060
+ # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets", total=total_items)):
1061
+ # # similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
1062
 
1063
+ # # if similar_indices:
1064
+ # # duplicate_indices_in_test.append(i)
1065
+ # # duplicate_to_original_mapping[i] = similar_indices[0]
1066
 
1067
+ # # return duplicate_indices_in_test, duplicate_to_original_mapping
1068
 
1069
+ # # def display_word_differences(x: str, y: str) -> str:
1070
+ # # diff = ndiff(x.split(), y.split())
1071
+ # # return " ".join([word for word in diff if word.startswith(('+', '-'))])
1072
 
1073
+ # # def perform_deduplication(
1074
+ # # deduplication_type,
1075
+ # # dataset1_name,
1076
+ # # dataset1_split,
1077
+ # # dataset1_text_column,
1078
+ # # dataset2_name="",
1079
+ # # dataset2_split="",
1080
+ # # dataset2_text_column="",
1081
+ # # threshold=default_threshold,
1082
+ # # progress=gr.Progress(track_tqdm=True)
1083
+ # # ):
1084
+ # # # Monkey-patch tqdm
1085
+ # # original_tqdm = tqdm.tqdm
1086
+ # # original_reach_tqdm = Reach.__dict__['tqdm'] if 'tqdm' in Reach.__dict__ else None
1087
+ # # tqdm.tqdm = progress.tqdm
1088
+ # # sys.modules['tqdm'].tqdm = progress.tqdm
1089
+ # # sys.modules['tqdm.auto'].tqdm = progress.tqdm
1090
+ # # Reach.tqdm = progress.tqdm # Monkey-patch reach's tqdm
1091
 
1092
+ # # try:
1093
+ # # # Convert threshold to float
1094
+ # # threshold = float(threshold)
1095
 
1096
+ # # if deduplication_type == "Single dataset":
1097
+ # # # Load Dataset 1
1098
+ # # progress(0, desc="Loading Dataset 1...")
1099
+ # # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
1100
+ # # ds = ds_default1
1101
+ # # else:
1102
+ # # ds = load_dataset(dataset1_name, split=dataset1_split)
 
 
 
 
 
 
 
 
1103
 
1104
+ # # # Extract texts
1105
+ # # progress(0, desc="Extracting texts from Dataset 1...")
1106
+ # # texts = [example[dataset1_text_column] for example in ds]
 
1107
 
1108
+ # # # Compute embeddings
1109
+ # # progress(0, desc="Computing embeddings for Dataset 1...")
1110
+ # # embedding_matrix = model.encode(texts, show_progressbar=True) # Enable internal progress bar
1111
 
1112
+ # # # Deduplicate
1113
+ # # result_text = deduplicate_and_prepare_results_single(
1114
+ # # embedding_matrix, texts, threshold, progress
1115
+ # # )
 
 
 
 
 
 
 
1116
 
1117
+ # # return result_text
1118
 
1119
+ # # elif deduplication_type == "Cross-dataset":
1120
+ # # # Load Dataset 1
1121
+ # # progress(0, desc="Loading Dataset 1...")
1122
+ # # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
1123
+ # # ds1 = ds_default1
1124
+ # # else:
1125
+ # # ds1 = load_dataset(dataset1_name, split=dataset1_split)
 
 
 
 
 
1126
 
1127
+ # # # Load Dataset 2
1128
+ # # progress(0, desc="Loading Dataset 2...")
1129
+ # # if dataset2_name == default_dataset2_name and dataset2_split == default_dataset2_split:
1130
+ # # ds2 = ds_default2
1131
+ # # else:
1132
+ # # ds2 = load_dataset(dataset2_name, split=dataset2_split)
1133
 
1134
+ # # # Extract texts from Dataset 1
1135
+ # # progress(0, desc="Extracting texts from Dataset 1...")
1136
+ # # texts1 = [example[dataset1_text_column] for example in ds1]
1137
 
1138
+ # # # Extract texts from Dataset 2
1139
+ # # progress(0, desc="Extracting texts from Dataset 2...")
1140
+ # # texts2 = [example[dataset2_text_column] for example in ds2]
1141
 
1142
+ # # # Compute embeddings for Dataset 1
1143
+ # # progress(0, desc="Computing embeddings for Dataset 1...")
1144
+ # # embedding_matrix1 = model.encode(texts1, show_progressbar=True)
1145
 
1146
+ # # # Compute embeddings for Dataset 2
1147
+ # # progress(0, desc="Computing embeddings for Dataset 2...")
1148
+ # # embedding_matrix2 = model.encode(texts2, show_progressbar=True)
1149
 
1150
+ # # # Deduplicate across datasets
1151
+ # # result_text = deduplicate_and_prepare_results_cross(
1152
+ # # embedding_matrix1, embedding_matrix2, texts1, texts2, threshold, progress, dataset2_name, dataset2_split
1153
+ # # )
 
 
 
 
 
 
 
 
1154
 
1155
+ # # return result_text
1156
 
1157
+ # # finally:
1158
+ # # # Restore original tqdm
1159
+ # # tqdm.tqdm = original_tqdm
1160
+ # # sys.modules['tqdm'].tqdm = original_tqdm
1161
+ # # sys.modules['tqdm.auto'].tqdm = original_tqdm
1162
 
1163
+ # # # Restore reach's original tqdm
1164
+ # # if original_reach_tqdm is not None:
1165
+ # # Reach.tqdm = original_reach_tqdm
1166
+ # # else:
1167
+ # # del Reach.tqdm # If it wasn't originally in Reach's __dict__
1168
 
1169
+ # # def deduplicate_and_prepare_results_single(embedding_matrix, texts, threshold, progress):
1170
+ # # # Deduplicate
1171
+ # # deduplicated_indices, duplicate_to_original_mapping = deduplicate(
1172
+ # # embedding_matrix, threshold, progress=progress
1173
+ # # )
 
 
 
1174
 
1175
+ # # # Prepare the results
1176
+ # # num_duplicates = len(duplicate_to_original_mapping)
1177
+ # # num_total = len(texts)
1178
+ # # num_deduplicated = len(deduplicated_indices)
1179
 
1180
+ # # result_text = f"**Total documents:** {num_total}\n"
1181
+ # # result_text += f"**Number of duplicates found:** {num_duplicates}\n"
1182
+ # # result_text += f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
 
 
 
 
1183
 
1184
+ # # # Show deduplicated examples
1185
+ # # if num_duplicates > 0:
1186
+ # # result_text += "**Examples of duplicates found:**\n\n"
1187
+ # # num_examples = min(5, num_duplicates)
1188
+ # # for duplicate_idx, original_idx in list(duplicate_to_original_mapping.items())[:num_examples]:
1189
+ # # original_text = texts[original_idx]
1190
+ # # duplicate_text = texts[duplicate_idx]
1191
+ # # differences = display_word_differences(original_text, duplicate_text)
1192
+ # # result_text += f"**Original text:**\n{original_text}\n\n"
1193
+ # # result_text += f"**Duplicate text:**\n{duplicate_text}\n\n"
1194
+ # # result_text += f"**Differences:**\n{differences}\n"
1195
+ # # result_text += "-" * 50 + "\n\n"
1196
+ # # else:
1197
+ # # result_text += "No duplicates found."
1198
 
1199
+ # # return result_text
1200
+
1201
+ # # def deduplicate_and_prepare_results_cross(embedding_matrix1, embedding_matrix2, texts1, texts2, threshold, progress, dataset2_name, dataset2_split):
1202
+ # # # Deduplicate across datasets
1203
+ # # duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(
1204
+ # # embedding_matrix1, embedding_matrix2, threshold, progress=progress
1205
+ # # )
1206
 
1207
+ # # num_duplicates = len(duplicate_indices_in_ds2)
1208
+ # # num_total_ds2 = len(texts2)
1209
+ # # num_unique_ds2 = num_total_ds2 - num_duplicates
1210
 
1211
+ # # result_text = f"**Total documents in {dataset2_name}/{dataset2_split}:** {num_total_ds2}\n"
1212
+ # # result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n"
1213
+ # # result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
 
 
 
1214
 
1215
+ # # # Show deduplicated examples
1216
+ # # if num_duplicates > 0:
1217
+ # # result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
1218
+ # # num_examples = min(5, num_duplicates)
1219
+ # # for duplicate_idx in duplicate_indices_in_ds2[:num_examples]:
1220
+ # # original_idx = duplicate_to_original_mapping[duplicate_idx]
1221
+ # # original_text = texts1[original_idx]
1222
+ # # duplicate_text = texts2[duplicate_idx]
1223
+ # # differences = display_word_differences(original_text, duplicate_text)
1224
+ # # result_text += f"**Original text (Dataset 1):**\n{original_text}\n\n"
1225
+ # # result_text += f"**Duplicate text (Dataset 2):**\n{duplicate_text}\n\n"
1226
+ # # result_text += f"**Differences:**\n{differences}\n"
1227
+ # # result_text += "-" * 50 + "\n\n"
1228
+ # # else:
1229
+ # # result_text += "No duplicates found."
1230
 
1231
+ # # return result_text
1232
+
1233
+ # # with gr.Blocks() as demo:
1234
+ # # gr.Markdown("# Semantic Deduplication")
1235
+
1236
+ # # deduplication_type = gr.Radio(
1237
+ # # choices=["Single dataset", "Cross-dataset"],
1238
+ # # label="Deduplication Type",
1239
+ # # value="Single dataset"
1240
+ # # )
1241
+
1242
+ # # with gr.Row():
1243
+ # # dataset1_name = gr.Textbox(value=default_dataset1_name, label="Dataset 1 Name")
1244
+ # # dataset1_split = gr.Textbox(value=default_dataset1_split, label="Dataset 1 Split")
1245
+ # # dataset1_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
1246
+
1247
+ # # dataset2_inputs = gr.Column(visible=False)
1248
+ # # with dataset2_inputs:
1249
+ # # gr.Markdown("### Dataset 2")
1250
+ # # with gr.Row():
1251
+ # # dataset2_name = gr.Textbox(value=default_dataset2_name, label="Dataset 2 Name")
1252
+ # # dataset2_split = gr.Textbox(value=default_dataset2_split, label="Dataset 2 Split")
1253
+ # # dataset2_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
1254
+
1255
+ # # threshold = gr.Slider(
1256
+ # # minimum=0.0,
1257
+ # # maximum=1.0,
1258
+ # # value=default_threshold,
1259
+ # # label="Similarity Threshold"
1260
+ # # )
1261
+
1262
+ # # compute_button = gr.Button("Compute")
1263
+
1264
+ # # output = gr.Markdown()
1265
+
1266
+ # # # Function to update the visibility of dataset2_inputs
1267
+ # # def update_visibility(deduplication_type_value):
1268
+ # # if deduplication_type_value == "Cross-dataset":
1269
+ # # return gr.update(visible=True)
1270
+ # # else:
1271
+ # # return gr.update(visible=False)
1272
+
1273
+ # # deduplication_type.change(
1274
+ # # update_visibility,
1275
+ # # inputs=deduplication_type,
1276
+ # # outputs=dataset2_inputs
1277
+ # # )
1278
+
1279
+ # # compute_button.click(
1280
+ # # fn=perform_deduplication,
1281
+ # # inputs=[
1282
+ # # deduplication_type,
1283
+ # # dataset1_name,
1284
+ # # dataset1_split,
1285
+ # # dataset1_text_column,
1286
+ # # dataset2_name,
1287
+ # # dataset2_split,
1288
+ # # dataset2_text_column,
1289
+ # # threshold
1290
+ # # ],
1291
+ # # outputs=output
1292
+ # # )
1293
 
1294
+ # # demo.launch()
1295
+
1296
+
1297
 
1298
 
1299
  # # import gradio as gr
 
1334
  # # )
1335
 
1336
  # # # Process duplicates
1337
+ # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=len(embedding_matrix))):
1338
  # # if i not in deduplicated_indices:
1339
  # # continue
1340
 
 
1363
  # # show_progressbar=True # Allow internal progress bar
1364
  # # )
1365
 
1366
+ # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets", total=len(embedding_matrix_2))):
 
1367
  # # similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
1368
 
1369
  # # if similar_indices:
 
1389
  # # ):
1390
  # # # Monkey-patch tqdm
1391
  # # original_tqdm = tqdm.tqdm
1392
+ # # original_reach_tqdm = Reach.__dict__['tqdm'] if 'tqdm' in Reach.__dict__ else None
1393
  # # tqdm.tqdm = progress.tqdm
1394
  # # sys.modules['tqdm'].tqdm = progress.tqdm
1395
  # # sys.modules['tqdm.auto'].tqdm = progress.tqdm
1396
+ # # Reach.tqdm = progress.tqdm # Monkey-patch reach's tqdm
1397
 
1398
  # # try:
1399
  # # # Convert threshold to float
 
1460
  # # embedding_matrix2 = model.encode(texts2, show_progressbar=True) # Enable internal progress bar
1461
 
1462
  # # # Deduplicate across datasets
1463
+ # # duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(
1464
+ # # embedding_matrix1, embedding_matrix2, threshold, progress=progress)
1465
 
1466
  # # num_duplicates = len(duplicate_indices_in_ds2)
1467
  # # num_total_ds2 = len(texts2)
 
1492
  # # sys.modules['tqdm'].tqdm = original_tqdm
1493
  # # sys.modules['tqdm.auto'].tqdm = original_tqdm
1494
 
1495
+ # # # Restore reach's original tqdm
1496
+ # # if original_reach_tqdm is not None:
1497
+ # # Reach.tqdm = original_reach_tqdm
1498
+ # # else:
1499
+ # # del Reach.tqdm # If it wasn't originally in Reach's __dict__
1500
+
1501
  # # with gr.Blocks() as demo:
1502
  # # gr.Markdown("# Semantic Deduplication")
1503
 
 
1560
  # # )
1561
 
1562
  # # demo.launch()
1563
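Both of the drafts above and the oldest one below share the same cross-dataset variant: build the Reach index from Dataset 1, query Dataset 2 against it, and flag every Dataset 2 row whose best match clears the threshold. Reduced to the essentials, with toy lists standing in for the two dataset columns:

from model2vec import StaticModel
from reach import Reach

train_texts = ["the film was wonderful", "the weather is cold today"]
test_texts = ["the film was wonderful", "a completely unrelated sentence"]

model = StaticModel.from_pretrained("minishlab/M2V_base_output")
train_emb = model.encode(train_texts, show_progressbar=False)
test_emb = model.encode(test_texts, show_progressbar=False)

# Index the reference split, then query the other split against it.
reach = Reach(vectors=train_emb, items=[str(i) for i in range(len(train_texts))])
results = reach.nearest_neighbor_threshold(
    test_emb, threshold=0.9, batch_size=64, show_progressbar=False
)

duplicates = {}  # test index -> index of the matching train document
for i, hits in enumerate(results):
    matches = [int(item) for item, score in hits if score >= 0.9]
    if matches:
        duplicates[i] = matches[0]

print(duplicates)  # expected to contain 0 -> 0 for the repeated sentence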
+
1564
+
1565
+ # # # import gradio as gr
1566
+ # # # from datasets import load_dataset
1567
+ # # # import numpy as np
1568
+ # # # from model2vec import StaticModel
1569
+ # # # from reach import Reach
1570
+ # # # from difflib import ndiff
1571
+ # # # import sys
1572
+ # # # import tqdm
1573
+
1574
+ # # # # Load the model at startup
1575
+ # # # model = StaticModel.from_pretrained("minishlab/M2V_base_output")
1576
+
1577
+ # # # # Load the default datasets at startup
1578
+ # # # default_dataset1_name = "ag_news"
1579
+ # # # default_dataset1_split = "train"
1580
+ # # # default_dataset2_name = "ag_news"
1581
+ # # # default_dataset2_split = "test"
1582
+
1583
+ # # # ds_default1 = load_dataset(default_dataset1_name, split=default_dataset1_split)
1584
+ # # # ds_default2 = load_dataset(default_dataset2_name, split=default_dataset2_split)
1585
+
1586
+ # # # def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[np.ndarray, dict[int, int]]:
1587
+ # # # """
1588
+ # # # Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
1589
+ # # # """
1590
+ # # # reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
1591
+
1592
+ # # # deduplicated_indices = set(range(len(embedding_matrix)))
1593
+ # # # duplicate_to_original_mapping = {}
1594
+
1595
+ # # # results = reach.nearest_neighbor_threshold(
1596
+ # # # embedding_matrix,
1597
+ # # # threshold=threshold,
1598
+ # # # batch_size=batch_size,
1599
+ # # # show_progressbar=True # Allow internal progress bar
1600
+ # # # )
1601
+
1602
+ # # # # Process duplicates
1603
+ # # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates")):
1604
+ # # # if i not in deduplicated_indices:
1605
+ # # # continue
1606
+
1607
+ # # # similar_indices = [int(item[0]) for item in similar_items if int(item[0]) != i]
1608
+
1609
+ # # # for sim_idx in similar_indices:
1610
+ # # # if sim_idx in deduplicated_indices:
1611
+ # # # deduplicated_indices.remove(sim_idx)
1612
+ # # # duplicate_to_original_mapping[sim_idx] = i
1613
+
1614
+ # # # return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
1615
+
1616
+ # # # def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[list[int], dict[int, int]]:
1617
+ # # # """
1618
+ # # # Deduplicate embeddings across two datasets and return the indices of duplicates between them.
1619
+ # # # """
1620
+ # # # reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
1621
+
1622
+ # # # duplicate_indices_in_test = []
1623
+ # # # duplicate_to_original_mapping = {}
1624
+
1625
+ # # # results = reach.nearest_neighbor_threshold(
1626
+ # # # embedding_matrix_2,
1627
+ # # # threshold=threshold,
1628
+ # # # batch_size=batch_size,
1629
+ # # # show_progressbar=True # Allow internal progress bar
1630
+ # # # )
1631
+
1632
+ # # # # Process duplicates
1633
+ # # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets")):
1634
+ # # # similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
1635
+
1636
+ # # # if similar_indices:
1637
+ # # # duplicate_indices_in_test.append(i)
1638
+ # # # duplicate_to_original_mapping[i] = similar_indices[0]
1639
+
1640
+ # # # return duplicate_indices_in_test, duplicate_to_original_mapping
1641
+
1642
+ # # # def display_word_differences(x: str, y: str) -> str:
1643
+ # # # diff = ndiff(x.split(), y.split())
1644
+ # # # return " ".join([word for word in diff if word.startswith(('+', '-'))])
1645
+
1646
+ # # # def perform_deduplication(
1647
+ # # # deduplication_type,
1648
+ # # # dataset1_name,
1649
+ # # # dataset1_split,
1650
+ # # # dataset1_text_column,
1651
+ # # # dataset2_name="",
1652
+ # # # dataset2_split="",
1653
+ # # # dataset2_text_column="",
1654
+ # # # threshold=0.8,
1655
+ # # # progress=gr.Progress(track_tqdm=True)
1656
+ # # # ):
1657
+ # # # # Monkey-patch tqdm
1658
+ # # # original_tqdm = tqdm.tqdm
1659
+ # # # tqdm.tqdm = progress.tqdm
1660
+ # # # sys.modules['tqdm'].tqdm = progress.tqdm
1661
+ # # # sys.modules['tqdm.auto'].tqdm = progress.tqdm
1662
+
1663
+ # # # try:
1664
+ # # # # Convert threshold to float
1665
+ # # # threshold = float(threshold)
1666
+
1667
+ # # # if deduplication_type == "Single dataset":
1668
+ # # # # Check if the dataset is the default one
1669
+ # # # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
1670
+ # # # ds = ds_default1
1671
+ # # # else:
1672
+ # # # ds = load_dataset(dataset1_name, split=dataset1_split)
1673
+
1674
+ # # # # Extract texts
1675
+ # # # texts = [example[dataset1_text_column] for example in ds]
1676
+
1677
+ # # # # Compute embeddings
1678
+ # # # embedding_matrix = model.encode(texts, show_progressbar=True) # Enable internal progress bar
1679
+
1680
+ # # # # Deduplicate
1681
+ # # # deduplicated_indices, duplicate_to_original_mapping = deduplicate(embedding_matrix, threshold, progress=progress)
1682
+
1683
+ # # # # Prepare the results
1684
+ # # # num_duplicates = len(duplicate_to_original_mapping)
1685
+ # # # num_total = len(texts)
1686
+ # # # num_deduplicated = len(deduplicated_indices)
1687
+
1688
+ # # # result_text = f"**Total documents:** {num_total}\n"
1689
+ # # # result_text += f"**Number of duplicates found:** {num_duplicates}\n"
1690
+ # # # result_text += f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
1691
+
1692
+ # # # # Show deduplicated examples
1693
+ # # # result_text += "**Examples of duplicates found:**\n\n"
1694
+ # # # num_examples = min(5, num_duplicates)
1695
+ # # # for duplicate_idx, original_idx in list(duplicate_to_original_mapping.items())[:num_examples]:
1696
+ # # # original_text = texts[original_idx]
1697
+ # # # duplicate_text = texts[duplicate_idx]
1698
+ # # # differences = display_word_differences(original_text, duplicate_text)
1699
+ # # # result_text += f"**Original text:**\n{original_text}\n\n"
1700
+ # # # result_text += f"**Duplicate text:**\n{duplicate_text}\n\n"
1701
+ # # # result_text += f"**Differences:**\n{differences}\n"
1702
+ # # # result_text += "-" * 50 + "\n\n"
1703
+
1704
+ # # # return result_text
1705
+
1706
+ # # # elif deduplication_type == "Cross-dataset":
1707
+ # # # # Dataset 1
1708
+ # # # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
1709
+ # # # ds1 = ds_default1
1710
+ # # # else:
1711
+ # # # ds1 = load_dataset(dataset1_name, split=dataset1_split)
1712
+
1713
+ # # # # Dataset 2
1714
+ # # # if dataset2_name == default_dataset2_name and dataset2_split == default_dataset2_split:
1715
+ # # # ds2 = ds_default2
1716
+ # # # else:
1717
+ # # # ds2 = load_dataset(dataset2_name, split=dataset2_split)
1718
+
1719
+ # # # # Extract texts
1720
+ # # # texts1 = [example[dataset1_text_column] for example in ds1]
1721
+ # # # texts2 = [example[dataset2_text_column] for example in ds2]
1722
+
1723
+ # # # # Compute embeddings
1724
+ # # # embedding_matrix1 = model.encode(texts1, show_progressbar=True) # Enable internal progress bar
1725
+ # # # embedding_matrix2 = model.encode(texts2, show_progressbar=True) # Enable internal progress bar
1726
+
1727
+ # # # # Deduplicate across datasets
1728
+ # # # duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(embedding_matrix1, embedding_matrix2, threshold, progress=progress)
1729
+
1730
+ # # # num_duplicates = len(duplicate_indices_in_ds2)
1731
+ # # # num_total_ds2 = len(texts2)
1732
+ # # # num_unique_ds2 = num_total_ds2 - num_duplicates
1733
+
1734
+ # # # result_text = f"**Total documents in {dataset2_name}/{dataset2_split}:** {num_total_ds2}\n"
1735
+ # # # result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n"
1736
+ # # # result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
1737
+
1738
+ # # # # Show deduplicated examples
1739
+ # # # result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
1740
+ # # # num_examples = min(5, num_duplicates)
1741
+ # # # for duplicate_idx in duplicate_indices_in_ds2[:num_examples]:
1742
+ # # # original_idx = duplicate_to_original_mapping[duplicate_idx]
1743
+ # # # original_text = texts1[original_idx]
1744
+ # # # duplicate_text = texts2[duplicate_idx]
1745
+ # # # differences = display_word_differences(original_text, duplicate_text)
1746
+ # # # result_text += f"**Original text (Dataset 1):**\n{original_text}\n\n"
1747
+ # # # result_text += f"**Duplicate text (Dataset 2):**\n{duplicate_text}\n\n"
1748
+ # # # result_text += f"**Differences:**\n{differences}\n"
1749
+ # # # result_text += "-" * 50 + "\n\n"
1750
+
1751
+ # # # return result_text
1752
+
1753
+ # # # finally:
1754
+ # # # # Restore original tqdm
1755
+ # # # tqdm.tqdm = original_tqdm
1756
+ # # # sys.modules['tqdm'].tqdm = original_tqdm
1757
+ # # # sys.modules['tqdm.auto'].tqdm = original_tqdm
1758
+
1759
+ # # # with gr.Blocks() as demo:
1760
+ # # # gr.Markdown("# Semantic Deduplication")
1761
+
1762
+ # # # deduplication_type = gr.Radio(
1763
+ # # # choices=["Single dataset", "Cross-dataset"],
1764
+ # # # label="Deduplication Type",
1765
+ # # # value="Single dataset"
1766
+ # # # )
1767
+
1768
+ # # # with gr.Row():
1769
+ # # # dataset1_name = gr.Textbox(value="ag_news", label="Dataset 1 Name")
1770
+ # # # dataset1_split = gr.Textbox(value="train", label="Dataset 1 Split")
1771
+ # # # dataset1_text_column = gr.Textbox(value="text", label="Text Column Name")
1772
+
1773
+ # # # dataset2_inputs = gr.Column(visible=False)
1774
+ # # # with dataset2_inputs:
1775
+ # # # gr.Markdown("### Dataset 2")
1776
+ # # # with gr.Row():
1777
+ # # # dataset2_name = gr.Textbox(value="ag_news", label="Dataset 2 Name")
1778
+ # # # dataset2_split = gr.Textbox(value="test", label="Dataset 2 Split")
1779
+ # # # dataset2_text_column = gr.Textbox(value="text", label="Text Column Name")
1780
+
1781
+ # # # threshold = gr.Slider(
1782
+ # # # minimum=0.0,
1783
+ # # # maximum=1.0,
1784
+ # # # value=0.8,
1785
+ # # # label="Similarity Threshold"
1786
+ # # # )
1787
+
1788
+ # # # compute_button = gr.Button("Compute")
1789
+
1790
+ # # # output = gr.Markdown()
1791
+
1792
+ # # # # Function to update the visibility of dataset2_inputs
1793
+ # # # def update_visibility(deduplication_type_value):
1794
+ # # # if deduplication_type_value == "Cross-dataset":
1795
+ # # # return gr.update(visible=True)
1796
+ # # # else:
1797
+ # # # return gr.update(visible=False)
1798
+
1799
+ # # # deduplication_type.change(
1800
+ # # # update_visibility,
1801
+ # # # inputs=deduplication_type,
1802
+ # # # outputs=dataset2_inputs
1803
+ # # # )
1804
+
1805
+ # # # compute_button.click(
1806
+ # # # fn=perform_deduplication,
1807
+ # # # inputs=[
1808
+ # # # deduplication_type,
1809
+ # # # dataset1_name,
1810
+ # # # dataset1_split,
1811
+ # # # dataset1_text_column,
1812
+ # # # dataset2_name,
1813
+ # # # dataset2_split,
1814
+ # # # dataset2_text_column,
1815
+ # # # threshold
1816
+ # # # ],
1817
+ # # # outputs=output
1818
+ # # # )
1819
+
1820
+ # # # demo.launch()
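Every generation of the app wires the same Gradio skeleton: a radio button that reveals the Dataset 2 inputs only in cross-dataset mode, and a Compute button whose handler streams status text while it works. A stripped-down, runnable sketch of just that skeleton (the handler body here is a stand-in, not the real deduplication) is:

import gradio as gr

def fake_deduplication(deduplication_type, threshold):
    # Stand-in for perform_deduplication: a generator whose yields stream into
    # the two Markdown outputs below, one (status, result) pair at a time.
    yield "Loading data...", ""
    yield "Computing embeddings...", ""
    yield "Deduplication completed.", f"**Mode:** {deduplication_type}, **threshold:** {threshold}"

with gr.Blocks() as demo:
    gr.Markdown("# Semantic Deduplication")
    deduplication_type = gr.Radio(
        ["Single dataset", "Cross-dataset"], value="Single dataset", label="Deduplication Type"
    )
    dataset2_inputs = gr.Column(visible=False)
    with dataset2_inputs:
        gr.Markdown("### Dataset 2")
    threshold = gr.Slider(0.0, 1.0, value=0.9, label="Similarity Threshold")
    compute_button = gr.Button("Compute")
    status_output = gr.Markdown()
    result_output = gr.Markdown()

    # Show the Dataset 2 column only when cross-dataset mode is selected.
    deduplication_type.change(
        lambda choice: gr.update(visible=(choice == "Cross-dataset")),
        inputs=deduplication_type,
        outputs=dataset2_inputs,
    )
    compute_button.click(
        fn=fake_deduplication,
        inputs=[deduplication_type, threshold],
        outputs=[status_output, result_output],
    )

demo.launch()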