Pringled committed
Commit 471be58 · 1 Parent(s): 2ba6e60

Updated app with code for deduplication

Files changed (1)
  1. app.py +17 -245
app.py CHANGED
@@ -36,7 +36,7 @@ def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int
     )
 
     # Process duplicates
-    for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates")):
+    for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=len(embedding_matrix))):
         if i not in deduplicated_indices:
             continue
 
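A note on the `total=` argument added above: `reach.nearest_neighbor_threshold` appears to yield its results lazily, and `progress.tqdm` (like plain `tqdm`) cannot infer the length of a generator, so without an explicit `total` the bar has nothing to count against. A minimal sketch of the effect with plain `tqdm` and a hypothetical lazy result stream:

```python
from tqdm import tqdm

def lazy_results(n):
    # Hypothetical stand-in for a lazily evaluated result stream.
    for i in range(n):
        yield [(str(i), 1.0)]  # (index, score) pairs, shaped like reach's output

# A generator has no len(); passing total= lets the bar render actual progress.
for i, similar_items in enumerate(tqdm(lazy_results(1000), desc="Processing duplicates", total=1000)):
    pass
```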
@@ -65,8 +65,7 @@ def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix
         show_progressbar=True # Allow internal progress bar
     )
 
-    # Process duplicates
-    for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets")):
+    for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets", total=len(embedding_matrix_2))):
         similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
 
         if similar_indices:
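For reference, the comprehension kept in this hunk filters on the score component: reach returns each neighbour as an `(item, score)` pair, where `item` is one of the string ids passed to the `Reach` constructor. A tiny illustration with made-up values:

```python
similar_items = [("12", 0.93), ("47", 0.81), ("3", 0.64)]  # hypothetical (id, score) pairs
threshold = 0.8
similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
assert similar_indices == [12, 47]
```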
@@ -92,9 +91,11 @@ def perform_deduplication(
 ):
     # Monkey-patch tqdm
     original_tqdm = tqdm.tqdm
+    original_reach_tqdm = Reach.__dict__['tqdm'] if 'tqdm' in Reach.__dict__ else None
     tqdm.tqdm = progress.tqdm
     sys.modules['tqdm'].tqdm = progress.tqdm
     sys.modules['tqdm.auto'].tqdm = progress.tqdm
+    Reach.tqdm = progress.tqdm  # Monkey-patch reach's tqdm
 
     try:
         # Convert threshold to float
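The patch above routes every `tqdm` call through Gradio's `progress.tqdm` so that third-party progress bars (reach's included) report into the UI. One caveat worth noting: a module that did `from tqdm import tqdm` at import time keeps its own reference, which is presumably why the commit also patches `Reach.tqdm` directly. A standalone sketch of the pattern, with a no-op stand-in for `progress.tqdm`:

```python
import sys
import tqdm

def fake_tqdm(iterable=None, *args, **kwargs):
    # Stand-in for progress.tqdm: pass the iterable through untouched.
    return iterable

original_tqdm = tqdm.tqdm
tqdm.tqdm = fake_tqdm
sys.modules['tqdm'].tqdm = fake_tqdm  # same object as the line above; kept for symmetry with the diff
try:
    for _ in tqdm.tqdm(range(3)):  # resolved at call time, so it hits fake_tqdm
        pass
finally:
    # Restore in a finally block so an exception cannot leave tqdm patched.
    tqdm.tqdm = original_tqdm
    sys.modules['tqdm'].tqdm = original_tqdm
```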
@@ -161,7 +162,8 @@ def perform_deduplication(
         embedding_matrix2 = model.encode(texts2, show_progressbar=True) # Enable internal progress bar
 
         # Deduplicate across datasets
-        duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(embedding_matrix1, embedding_matrix2, threshold, progress=progress)
+        duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(
+            embedding_matrix1, embedding_matrix2, threshold, progress=progress)
 
         num_duplicates = len(duplicate_indices_in_ds2)
         num_total_ds2 = len(texts2)
@@ -192,6 +194,12 @@ def perform_deduplication(
         sys.modules['tqdm'].tqdm = original_tqdm
         sys.modules['tqdm.auto'].tqdm = original_tqdm
 
+        # Restore reach's original tqdm
+        if original_reach_tqdm is not None:
+            Reach.tqdm = original_reach_tqdm
+        else:
+            del Reach.tqdm  # If it wasn't originally in Reach's __dict__
+
 with gr.Blocks() as demo:
     gr.Markdown("# Semantic Deduplication")
 
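The restore logic added above checks `Reach.__dict__` rather than using `getattr`, so it can tell whether `tqdm` was ever defined on the class itself; if it was not, the injected attribute is deleted instead of being overwritten. An equivalent sketch with a hypothetical class:

```python
class Reach:  # hypothetical stand-in for reach.Reach
    pass

def patched_tqdm(x):
    return x

original = Reach.__dict__.get('tqdm')  # None when the class has no own 'tqdm'
Reach.tqdm = patched_tqdm
try:
    pass  # patched section would run here
finally:
    if original is not None:
        Reach.tqdm = original
    else:
        del Reach.tqdm  # remove the attribute we injected
```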
@@ -290,7 +298,7 @@ demo.launch()
 #         embedding_matrix,
 #         threshold=threshold,
 #         batch_size=batch_size,
-#         show_progressbar=False # Disable internal progress bar
+#         show_progressbar=True # Allow internal progress bar
 #     )
 
 #     # Process duplicates
@@ -320,7 +328,7 @@ demo.launch()
 #         embedding_matrix_2,
 #         threshold=threshold,
 #         batch_size=batch_size,
-#         show_progressbar=False # Disable internal progress bar
+#         show_progressbar=True # Allow internal progress bar
 #     )
 
 #     # Process duplicates
@@ -369,11 +377,8 @@ demo.launch()
 #     texts = [example[dataset1_text_column] for example in ds]
 
 #     # Compute embeddings
-#     embedding_matrix = model.encode(texts, show_progressbar=False) # Disable internal progress bar
+#     embedding_matrix = model.encode(texts, show_progressbar=True) # Enable internal progress bar
 
-#     # Show progress bar for embedding computation
-#     embedding_matrix = progress.tqdm(embedding_matrix, desc="Computing embeddings")
-
 #     # Deduplicate
 #     deduplicated_indices, duplicate_to_original_mapping = deduplicate(embedding_matrix, threshold, progress=progress)
 
@@ -418,12 +423,8 @@ demo.launch()
 #     texts2 = [example[dataset2_text_column] for example in ds2]
 
 #     # Compute embeddings
-#     embedding_matrix1 = model.encode(texts1, show_progressbar=False) # Disable internal progress bar
-#     embedding_matrix2 = model.encode(texts2, show_progressbar=False) # Disable internal progress bar
-
-#     # Show progress bar for embedding computation
-#     embedding_matrix1 = progress.tqdm(embedding_matrix1, desc="Computing embeddings for Dataset 1")
-#     embedding_matrix2 = progress.tqdm(embedding_matrix2, desc="Computing embeddings for Dataset 2")
+#     embedding_matrix1 = model.encode(texts1, show_progressbar=True) # Enable internal progress bar
+#     embedding_matrix2 = model.encode(texts2, show_progressbar=True) # Enable internal progress bar
 
 #     # Deduplicate across datasets
 #     duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(embedding_matrix1, embedding_matrix2, threshold, progress=progress)
@@ -519,232 +520,3 @@ demo.launch()
 #     )
 
 # demo.launch()
-
-
-# import gradio as gr
-# from datasets import load_dataset
-# import numpy as np
-# from model2vec import StaticModel
-# from reach import Reach
-# from difflib import ndiff
-
-# def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024, progress=gr.Progress(track_tqdm=True)) -> tuple[np.ndarray, dict[int, int]]:
-#     """
-#     Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
-#     """
-#     reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
-
-#     # Use a set for deduplicated indices and keep track of duplicates
-#     deduplicated_indices = set(range(len(embedding_matrix))) # Start with all indices as deduplicated
-#     duplicate_to_original_mapping = {}
-
-#     results = reach.nearest_neighbor_threshold(
-#         embedding_matrix,
-#         threshold=threshold,
-#         batch_size=batch_size,
-#         show_progressbar=True
-#     )
-
-#     # Process duplicates
-#     for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates")):
-#         if i not in deduplicated_indices:
-#             continue # Skip already marked duplicates
-
-#         # Similar items are returned as (index, score), we are only interested in the index
-#         similar_indices = [int(item[0]) for item in similar_items if int(item[0]) != i]
-
-#         # Mark similar documents as duplicates and map them to the original
-#         for sim_idx in similar_indices:
-#             if sim_idx in deduplicated_indices:
-#                 deduplicated_indices.remove(sim_idx)
-#                 duplicate_to_original_mapping[sim_idx] = i # Map duplicate to original
-
-#     return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
-
-# def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024, progress=gr.Progress(track_tqdm=True)) -> tuple[list[int], dict[int, int]]:
-#     """
-#     Deduplicate embeddings across two datasets and return the indices of duplicates between them.
-#     """
-#     reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
-
-#     # Keep track of duplicates in the second dataset
-#     duplicate_indices_in_test = []
-#     duplicate_to_original_mapping = {}
-
-#     # Find nearest neighbors from the test set in the train set
-#     results = reach.nearest_neighbor_threshold(
-#         embedding_matrix_2,
-#         threshold=threshold,
-#         batch_size=batch_size,
-#         show_progressbar=True
-#     )
-
-#     # Process duplicates
-#     for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets")):
-#         # Similar items are returned as (index, score), we are only interested in the index
-#         similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold] # Keep those above the threshold
-
-#         # If we find a similar item in the train set, mark it as a duplicate
-#         if similar_indices:
-#             duplicate_indices_in_test.append(i)
-#             duplicate_to_original_mapping[i] = similar_indices[0] # Map duplicate in test to original in train
-
-#     return duplicate_indices_in_test, duplicate_to_original_mapping
-
-# def display_word_differences(x: str, y: str) -> str:
-#     diff = ndiff(x.split(), y.split())
-#     return " ".join([word for word in diff if word.startswith(('+', '-'))])
-
-# def perform_deduplication(
-#     deduplication_type,
-#     dataset1_name,
-#     dataset1_split,
-#     dataset1_text_column,
-#     dataset2_name="",
-#     dataset2_split="",
-#     dataset2_text_column="",
-#     threshold=0.8,
-#     progress=gr.Progress(track_tqdm=True)
-# ):
-#     # Convert threshold to float
-#     threshold = float(threshold)
-
-#     if deduplication_type == "Single dataset":
-#         # Load the dataset
-#         ds = load_dataset(dataset1_name, split=dataset1_split)
-
-#         # Extract texts
-#         texts = [example[dataset1_text_column] for example in ds]
-
-#         # Compute embeddings
-#         model = StaticModel.from_pretrained("minishlab/M2V_base_output")
-#         embedding_matrix = model.encode(texts, show_progressbar=True)
-
-#         # Deduplicate
-#         deduplicated_indices, duplicate_to_original_mapping = deduplicate(embedding_matrix, threshold, progress=progress)
-
-#         # Prepare the results
-#         num_duplicates = len(duplicate_to_original_mapping)
-#         num_total = len(texts)
-#         num_deduplicated = len(deduplicated_indices)
-
-#         result_text = f"**Total documents:** {num_total}\n"
-#         result_text += f"**Number of duplicates found:** {num_duplicates}\n"
-#         result_text += f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
-
-#         # Show deduplicated examples
-#         result_text += "**Examples of duplicates found:**\n\n"
-#         num_examples = min(5, num_duplicates)
-#         for duplicate_idx, original_idx in list(duplicate_to_original_mapping.items())[:num_examples]:
-#             original_text = texts[original_idx]
-#             duplicate_text = texts[duplicate_idx]
-#             differences = display_word_differences(original_text, duplicate_text)
-#             result_text += f"**Original text:**\n{original_text}\n\n"
-#             result_text += f"**Duplicate text:**\n{duplicate_text}\n\n"
-#             result_text += f"**Differences:**\n{differences}\n"
-#             result_text += "-" * 50 + "\n\n"
-
-#         return result_text
-
-#     elif deduplication_type == "Cross-dataset":
-#         # Load datasets
-#         ds1 = load_dataset(dataset1_name, split=dataset1_split)
-#         ds2 = load_dataset(dataset2_name, split=dataset2_split)
-
-#         # Extract texts
-#         texts1 = [example[dataset1_text_column] for example in ds1]
-#         texts2 = [example[dataset2_text_column] for example in ds2]
-
-#         # Compute embeddings
-#         model = StaticModel.from_pretrained("minishlab/M2V_base_output")
-#         embedding_matrix1 = model.encode(texts1, show_progressbar=True)
-#         embedding_matrix2 = model.encode(texts2, show_progressbar=True)
-
-#         # Deduplicate across datasets
-#         duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(embedding_matrix1, embedding_matrix2, threshold, progress=progress)
-
-#         num_duplicates = len(duplicate_indices_in_ds2)
-#         num_total_ds2 = len(texts2)
-#         num_unique_ds2 = num_total_ds2 - num_duplicates
-
-#         result_text = f"**Total documents in {dataset2_name}/{dataset2_split}:** {num_total_ds2}\n"
-#         result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n"
-#         result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
-
-#         # Show deduplicated examples
-#         result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
-#         num_examples = min(5, num_duplicates)
-#         for duplicate_idx in duplicate_indices_in_ds2[:num_examples]:
-#             original_idx = duplicate_to_original_mapping[duplicate_idx]
-#             original_text = texts1[original_idx]
-#             duplicate_text = texts2[duplicate_idx]
-#             differences = display_word_differences(original_text, duplicate_text)
-#             result_text += f"**Original text (Dataset 1):**\n{original_text}\n\n"
-#             result_text += f"**Duplicate text (Dataset 2):**\n{duplicate_text}\n\n"
-#             result_text += f"**Differences:**\n{differences}\n"
-#             result_text += "-" * 50 + "\n\n"
-
-#         return result_text
-
-# with gr.Blocks() as demo:
-#     gr.Markdown("# Semantic Deduplication")
-
-#     deduplication_type = gr.Radio(
-#         choices=["Single dataset", "Cross-dataset"],
-#         label="Deduplication Type",
-#         value="Single dataset"
-#     )
-
-#     with gr.Row():
-#         dataset1_name = gr.Textbox(value="ag_news", label="Dataset 1 Name")
-#         dataset1_split = gr.Textbox(value="train", label="Dataset 1 Split")
-#         dataset1_text_column = gr.Textbox(value="text", label="Text Column Name")
-
-#     dataset2_inputs = gr.Column(visible=False)
-#     with dataset2_inputs:
-#         gr.Markdown("### Dataset 2")
-#         with gr.Row():
-#             dataset2_name = gr.Textbox(value="ag_news", label="Dataset 2 Name")
-#             dataset2_split = gr.Textbox(value="test", label="Dataset 2 Split")
-#             dataset2_text_column = gr.Textbox(value="text", label="Text Column Name")
-
-#     threshold = gr.Slider(
-#         minimum=0.0,
-#         maximum=1.0,
-#         value=0.8,
-#         label="Similarity Threshold"
-#     )
-
-#     compute_button = gr.Button("Compute")
-
-#     output = gr.Markdown()
-
-#     # Function to update the visibility of dataset2_inputs
-#     def update_visibility(deduplication_type_value):
-#         if deduplication_type_value == "Cross-dataset":
-#             return gr.update(visible=True)
-#         else:
-#             return gr.update(visible=False)
-
-#     deduplication_type.change(
-#         update_visibility,
-#         inputs=deduplication_type,
-#         outputs=dataset2_inputs
-#     )
-
-#     compute_button.click(
-#         fn=perform_deduplication,
-#         inputs=[
-#             deduplication_type,
-#             dataset1_name,
-#             dataset1_split,
-#             dataset1_text_column,
-#             dataset2_name,
-#             dataset2_split,
-#             dataset2_text_column,
-#             threshold
-#         ],
-#         outputs=output
-#     )
-
-# demo.launch()
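Taken together, the deduplication core that this commit wires into the Gradio progress machinery can be exercised on its own. A condensed sketch assembled from the calls visible in this diff; the `train[:1000]` slice is illustrative only:

```python
from datasets import load_dataset
from model2vec import StaticModel
from reach import Reach

ds = load_dataset("ag_news", split="train[:1000]")  # small slice for a quick test
texts = [example["text"] for example in ds]

model = StaticModel.from_pretrained("minishlab/M2V_base_output")
embedding_matrix = model.encode(texts, show_progressbar=True)

reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
results = reach.nearest_neighbor_threshold(
    embedding_matrix,
    threshold=0.8,
    batch_size=1024,
    show_progressbar=True
)

# Same marking logic as deduplicate() in app.py.
deduplicated_indices = set(range(len(embedding_matrix)))
duplicate_to_original_mapping = {}
for i, similar_items in enumerate(results):
    if i not in deduplicated_indices:
        continue
    for item in similar_items:
        sim_idx = int(item[0])
        if sim_idx != i and sim_idx in deduplicated_indices:
            deduplicated_indices.remove(sim_idx)
            duplicate_to_original_mapping[sim_idx] = i

print(f"Found {len(duplicate_to_original_mapping)} duplicates out of {len(texts)} documents")
```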