Pringled committed on
Commit
8879099
·
1 Parent(s): 225d3fb
Files changed (1)
  1. app.py +0 -744
app.py CHANGED
@@ -248,747 +248,3 @@ with gr.Blocks(css="#status_output { height: 50px; overflow: auto; }") as demo:
 
 
   demo.launch()
-
- # import gradio as gr
- # from datasets import load_dataset
- # import numpy as np
- # from model2vec import StaticModel
- # from reach import Reach
- # from difflib import ndiff
-
- # # Load the model
- # model = StaticModel.from_pretrained("minishlab/M2V_base_output")
-
- # # Default parameters
- # default_dataset_name = "sst2"
- # default_dataset_split = "train"
- # default_text_column = "sentence"
- # default_threshold = 0.9
-
- # def deduplicate_embeddings(
- # embeddings_a: np.ndarray,
- # embeddings_b: np.ndarray = None,
- # threshold: float = 0.9,
- # batch_size: int = 1024,
- # progress=None
- # ) -> tuple[np.ndarray, dict[int, int]]:
- # """
- # Deduplicate embeddings within one dataset or across two datasets.
-
- # :param embeddings_a: Embeddings of Dataset 1.
- # :param embeddings_b: Optional, embeddings of Dataset 2.
- # :param threshold: Similarity threshold for deduplication.
- # :param batch_size: Batch size for similarity computation.
- # :param progress: Gradio progress tracker for feedback.
- # :return: Deduplicated indices and a mapping of removed indices to their original counterparts.
- # """
- # if embeddings_b is None:
- # reach = Reach(vectors=embeddings_a, items=[str(i) for i in range(len(embeddings_a))])
- # duplicate_to_original = {}
- # results = reach.nearest_neighbor_threshold(
- # embeddings_a, threshold=threshold, batch_size=batch_size, show_progressbar=False
- # )
- # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=len(embeddings_a))):
- # for sim_idx, _ in similar_items:
- # sim_idx = int(sim_idx)
- # if sim_idx != i and sim_idx not in duplicate_to_original:
- # duplicate_to_original[sim_idx] = i
- # deduplicated_indices = set(range(len(embeddings_a))) - set(duplicate_to_original.keys())
- # return deduplicated_indices, duplicate_to_original
- # else:
- # reach = Reach(vectors=embeddings_a, items=[str(i) for i in range(len(embeddings_a))])
- # duplicate_indices_in_b = []
- # duplicate_to_original = {}
- # results = reach.nearest_neighbor_threshold(
- # embeddings_b, threshold=threshold, batch_size=batch_size, show_progressbar=False
- # )
- # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=len(embeddings_b))):
- # if similar_items:
- # duplicate_indices_in_b.append(i)
- # duplicate_to_original[i] = int(similar_items[0][0])
- # return duplicate_indices_in_b, duplicate_to_original
-
- # def display_word_differences(x: str, y: str) -> str:
- # """
- # Display the word-level differences between two texts, formatted to avoid
- # misinterpretation of Markdown syntax.
-
- # :param x: First text.
- # :param y: Second text.
- # :return: A string showing word-level differences, wrapped in a code block.
- # """
- # diff = ndiff(x.split(), y.split())
- # formatted_diff = "\n".join(word for word in diff if word.startswith(("+", "-")))
- # return f"```\n{formatted_diff}\n```"
-
- # def load_dataset_texts(dataset_name: str, dataset_split: str, text_column: str) -> list[str]:
- # """
- # Load texts from a specified dataset and split.
-
- # :param dataset_name: Name of the dataset.
- # :param dataset_split: Split of the dataset (e.g., 'train', 'validation').
- # :param text_column: Name of the text column.
- # :return: A list of texts from the dataset.
- # """
- # ds = load_dataset(dataset_name, split=dataset_split)
- # return [example[text_column] for example in ds]
-
- # def perform_deduplication(
- # deduplication_type: str,
- # dataset1_name: str,
- # dataset1_split: str,
- # dataset1_text_column: str,
- # dataset2_name: str = "",
- # dataset2_split: str = "",
- # dataset2_text_column: str = "",
- # threshold: float = default_threshold,
- # progress: gr.Progress = gr.Progress(track_tqdm=True)
- # ):
- # """
- # Perform deduplication on one or two datasets based on the deduplication type.
-
- # :param deduplication_type: 'Single dataset' or 'Cross-dataset'.
- # :param dataset1_name: Name of the first dataset.
- # :param dataset1_split: Split of the first dataset.
- # :param dataset1_text_column: Text column of the first dataset.
- # :param dataset2_name: Optional, name of the second dataset (for cross-dataset deduplication).
- # :param dataset2_split: Optional, split of the second dataset.
- # :param dataset2_text_column: Optional, text column of the second dataset.
- # :param threshold: Similarity threshold for deduplication.
- # :param progress: Gradio progress tracker.
- # :return: Status updates and result text for the Gradio interface.
- # """
- # try:
- # threshold = float(threshold)
-
- # # Load and process Dataset 1
- # yield "Loading Dataset 1...", ""
- # texts1 = load_dataset_texts(dataset1_name, dataset1_split, dataset1_text_column)
- # yield "Computing embeddings for Dataset 1...", ""
- # embeddings1 = model.encode(texts1, show_progressbar=True)
-
- # if deduplication_type == "Single dataset":
- # # Deduplicate within Dataset 1
- # yield "Deduplicating within Dataset 1...", ""
- # deduplicated_indices, duplicate_mapping = deduplicate_embeddings(
- # embeddings1, threshold=threshold, progress=progress
- # )
-
- # num_duplicates = len(duplicate_mapping)
- # result_text = (
- # f"**Total documents:** {len(texts1)}\n\n"
- # f"**Duplicates found:** {num_duplicates}\n\n"
- # f"**Unique documents after deduplication:** {len(deduplicated_indices)}\n\n"
- # )
-
- # if num_duplicates > 0:
- # result_text += "**Sample duplicates:**\n\n"
- # for dup_idx, orig_idx in list(duplicate_mapping.items())[:5]:
- # orig_text = texts1[orig_idx]
- # dup_text = texts1[dup_idx]
- # differences = display_word_differences(orig_text, dup_text)
- # result_text += (
- # f"**Original:**\n{orig_text}\n\n"
- # f"**Duplicate:**\n{dup_text}\n\n"
- # f"**Differences:**\n{differences}\n"
- # + "-" * 50 + "\n\n"
- # )
- # else:
- # result_text += "No duplicates found."
-
- # yield "Deduplication completed.", result_text
-
- # else:
- # # Load and process Dataset 2
- # yield "Loading Dataset 2...", ""
- # texts2 = load_dataset_texts(dataset2_name, dataset2_split, dataset2_text_column)
- # yield "Computing embeddings for Dataset 2...", ""
- # embeddings2 = model.encode(texts2, show_progressbar=True)
-
- # # Deduplicate Dataset 2 against Dataset 1
- # yield "Deduplicating Dataset 2 against Dataset 1...", ""
- # duplicate_indices, duplicate_mapping = deduplicate_embeddings(
- # embeddings1, embeddings_b=embeddings2, threshold=threshold, progress=progress
- # )
-
- # num_duplicates = len(duplicate_indices)
- # result_text = (
- # f"**Total documents in {dataset2_name}/{dataset2_split}:** {len(texts2)}\n\n"
- # f"**Duplicates found in Dataset 2:** {num_duplicates}\n\n"
- # f"**Unique documents after deduplication:** {len(texts2) - num_duplicates}\n\n"
- # )
-
- # if num_duplicates > 0:
- # result_text += "**Sample duplicates from Dataset 2:**\n\n"
- # for idx in duplicate_indices[:5]:
- # orig_text = texts1[duplicate_mapping[idx]]
- # dup_text = texts2[idx]
- # differences = display_word_differences(orig_text, dup_text)
- # result_text += (
- # f"**Original (Dataset 1):**\n{orig_text}\n\n"
- # f"**Duplicate (Dataset 2):**\n{dup_text}\n\n"
- # f"**Differences:**\n{differences}\n"
- # + "-" * 50 + "\n\n"
- # )
- # else:
- # result_text += "No duplicates found."
-
- # yield "Deduplication completed.", result_text
-
- # except Exception as e:
- # yield f"An error occurred: {e}", ""
- # raise e
-
- # # Gradio app with stop button support
- # with gr.Blocks(css="#status_output { height: 50px; overflow: auto; }") as demo:
- # gr.Markdown("# Semantic Deduplication")
- # gr.Markdown("""
- # This demo showcases semantic deduplication using Model2Vec for HuggingFace datasets.
- # It can be used to identify duplicate texts within a single dataset or across two datasets.
- # You can adjust the similarity threshold to control the strictness of the deduplication.\n
- # NOTE: this demo runs on a free CPU backend, so it may be slow for large datasets. For faster results, please run the code locally.
- # """)
-
- # deduplication_type = gr.Radio(
- # choices=["Cross-dataset", "Single dataset"], # Swapped "Cross-dataset" to the left
- # label="Deduplication Type",
- # value="Cross-dataset", # Set "Cross-dataset" as the default value
- # )
-
- # with gr.Row():
- # dataset1_name = gr.Textbox(value=default_dataset_name, label="Dataset 1 Name")
- # dataset1_split = gr.Textbox(value=default_dataset_split, label="Dataset 1 Split")
- # dataset1_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
-
- # dataset2_inputs = gr.Column(visible=True) # Make dataset2_inputs visible by default
- # with dataset2_inputs:
- # gr.Markdown("### Dataset 2")
- # with gr.Row():
- # dataset2_name = gr.Textbox(value=default_dataset_name, label="Dataset 2 Name")
- # dataset2_split = gr.Textbox(value=default_dataset_split, label="Dataset 2 Split")
- # dataset2_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
-
- # threshold = gr.Slider(0.0, 1.0, value=default_threshold, label="Similarity Threshold")
-
- # with gr.Row(): # Placing the button in the same row for better alignment
- # compute_button = gr.Button("Deduplicate")
-
- # status_output = gr.Markdown(elem_id="status_output")
- # result_output = gr.Markdown()
-
- # def update_visibility(choice: str):
- # return gr.update(visible=choice == "Cross-dataset")
-
- # deduplication_type.change(update_visibility, inputs=deduplication_type, outputs=dataset2_inputs)
-
- # compute_button.click(
- # fn=perform_deduplication,
- # inputs=[
- # deduplication_type,
- # dataset1_name,
- # dataset1_split,
- # dataset1_text_column,
- # dataset2_name,
- # dataset2_split,
- # dataset2_text_column,
- # threshold,
- # ],
- # outputs=[status_output, result_output],
- # )
-
-
- # demo.launch()
-
- # # import gradio as gr
- # # from datasets import load_dataset
- # # import numpy as np
- # # from model2vec import StaticModel
- # # from reach import Reach
- # # from difflib import ndiff
-
- # # # Load the model
- # # model = StaticModel.from_pretrained("minishlab/M2V_base_output")
-
- # # # Default parameters
- # # default_dataset_name = "sst2"
- # # default_dataset_split = "train"
- # # default_text_column = "sentence"
- # # default_threshold = 0.9
-
- # # def deduplicate_embeddings(
- # # embeddings_a: np.ndarray,
- # # embeddings_b: np.ndarray = None,
- # # threshold: float = 0.9,
- # # batch_size: int = 1024,
- # # progress=None
- # # ) -> tuple[np.ndarray, dict[int, int]]:
- # # """
- # # Deduplicate embeddings within one dataset or across two datasets.
-
- # # :param embeddings_a: Embeddings of Dataset 1.
- # # :param embeddings_b: Optional, embeddings of Dataset 2.
- # # :param threshold: Similarity threshold for deduplication.
- # # :param batch_size: Batch size for similarity computation.
- # # :param progress: Gradio progress tracker for feedback.
- # # :return: Deduplicated indices and a mapping of removed indices to their original counterparts.
- # # """
- # # if embeddings_b is None:
- # # reach = Reach(vectors=embeddings_a, items=[str(i) for i in range(len(embeddings_a))])
- # # duplicate_to_original = {}
- # # results = reach.nearest_neighbor_threshold(
- # # embeddings_a, threshold=threshold, batch_size=batch_size, show_progressbar=False
- # # )
- # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=len(embeddings_a))):
- # # for sim_idx, _ in similar_items:
- # # sim_idx = int(sim_idx)
- # # if sim_idx != i and sim_idx not in duplicate_to_original:
- # # duplicate_to_original[sim_idx] = i
- # # deduplicated_indices = set(range(len(embeddings_a))) - set(duplicate_to_original.keys())
- # # return deduplicated_indices, duplicate_to_original
- # # else:
- # # reach = Reach(vectors=embeddings_a, items=[str(i) for i in range(len(embeddings_a))])
- # # duplicate_indices_in_b = []
- # # duplicate_to_original = {}
- # # results = reach.nearest_neighbor_threshold(
- # # embeddings_b, threshold=threshold, batch_size=batch_size, show_progressbar=False
- # # )
- # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=len(embeddings_b))):
- # # if similar_items:
- # # duplicate_indices_in_b.append(i)
- # # duplicate_to_original[i] = int(similar_items[0][0])
- # # return duplicate_indices_in_b, duplicate_to_original
-
- # # def display_word_differences(x: str, y: str) -> str:
- # # """
- # # Display the word-level differences between two texts, formatted to avoid
- # # misinterpretation of Markdown syntax.
-
- # # :param x: First text.
- # # :param y: Second text.
- # # :return: A string showing word-level differences, wrapped in a code block.
- # # """
- # # diff = ndiff(x.split(), y.split())
- # # formatted_diff = "\n".join(word for word in diff if word.startswith(("+", "-")))
- # # return f"```\n{formatted_diff}\n```"
-
- # # def load_dataset_texts(dataset_name: str, dataset_split: str, text_column: str) -> list[str]:
- # # """
- # # Load texts from a specified dataset and split.
-
- # # :param dataset_name: Name of the dataset.
- # # :param dataset_split: Split of the dataset (e.g., 'train', 'validation').
- # # :param text_column: Name of the text column.
- # # :return: A list of texts from the dataset.
- # # """
- # # ds = load_dataset(dataset_name, split=dataset_split)
- # # return [example[text_column] for example in ds]
-
- # # def perform_deduplication(
- # # deduplication_type: str,
- # # dataset1_name: str,
- # # dataset1_split: str,
- # # dataset1_text_column: str,
- # # dataset2_name: str = "",
- # # dataset2_split: str = "",
- # # dataset2_text_column: str = "",
- # # threshold: float = default_threshold,
- # # progress: gr.Progress = gr.Progress(track_tqdm=True)
- # # ):
- # # """
- # # Perform deduplication on one or two datasets based on the deduplication type.
-
- # # :param deduplication_type: 'Single dataset' or 'Cross-dataset'.
- # # :param dataset1_name: Name of the first dataset.
- # # :param dataset1_split: Split of the first dataset.
- # # :param dataset1_text_column: Text column of the first dataset.
- # # :param dataset2_name: Optional, name of the second dataset (for cross-dataset deduplication).
- # # :param dataset2_split: Optional, split of the second dataset.
- # # :param dataset2_text_column: Optional, text column of the second dataset.
- # # :param threshold: Similarity threshold for deduplication.
- # # :param progress: Gradio progress tracker.
- # # :return: Status updates and result text for the Gradio interface.
- # # """
- # # try:
- # # threshold = float(threshold)
-
- # # # Load and process Dataset 1
- # # yield "Loading Dataset 1...", ""
- # # texts1 = load_dataset_texts(dataset1_name, dataset1_split, dataset1_text_column)
- # # yield "Computing embeddings for Dataset 1...", ""
- # # embeddings1 = model.encode(texts1, show_progressbar=True)
-
- # # if deduplication_type == "Single dataset":
- # # # Deduplicate within Dataset 1
- # # yield "Deduplicating within Dataset 1...", ""
- # # deduplicated_indices, duplicate_mapping = deduplicate_embeddings(
- # # embeddings1, threshold=threshold, progress=progress
- # # )
-
- # # num_duplicates = len(duplicate_mapping)
- # # result_text = (
- # # f"**Total documents:** {len(texts1)}\n\n"
- # # f"**Duplicates found:** {num_duplicates}\n\n"
- # # f"**Unique documents after deduplication:** {len(deduplicated_indices)}\n\n"
- # # )
-
- # # if num_duplicates > 0:
- # # result_text += "**Sample duplicates:**\n\n"
- # # for dup_idx, orig_idx in list(duplicate_mapping.items())[:5]:
- # # orig_text = texts1[orig_idx]
- # # dup_text = texts1[dup_idx]
- # # differences = display_word_differences(orig_text, dup_text)
- # # result_text += (
- # # f"**Original:**\n{orig_text}\n\n"
- # # f"**Duplicate:**\n{dup_text}\n\n"
- # # f"**Differences:**\n{differences}\n"
- # # + "-" * 50 + "\n\n"
- # # )
- # # else:
- # # result_text += "No duplicates found."
-
- # # yield "Deduplication completed.", result_text
-
- # # else:
- # # # Load and process Dataset 2
- # # yield "Loading Dataset 2...", ""
- # # texts2 = load_dataset_texts(dataset2_name, dataset2_split, dataset2_text_column)
- # # yield "Computing embeddings for Dataset 2...", ""
- # # embeddings2 = model.encode(texts2, show_progressbar=True)
-
- # # # Deduplicate Dataset 2 against Dataset 1
- # # yield "Deduplicating Dataset 2 against Dataset 1...", ""
- # # duplicate_indices, duplicate_mapping = deduplicate_embeddings(
- # # embeddings1, embeddings_b=embeddings2, threshold=threshold, progress=progress
- # # )
-
- # # num_duplicates = len(duplicate_indices)
- # # result_text = (
- # # f"**Total documents in {dataset2_name}/{dataset2_split}:** {len(texts2)}\n\n"
- # # f"**Duplicates found in Dataset 2:** {num_duplicates}\n\n"
- # # f"**Unique documents after deduplication:** {len(texts2) - num_duplicates}\n\n"
- # # )
-
- # # if num_duplicates > 0:
- # # result_text += "**Sample duplicates from Dataset 2:**\n\n"
- # # for idx in duplicate_indices[:5]:
- # # orig_text = texts1[duplicate_mapping[idx]]
- # # dup_text = texts2[idx]
- # # differences = display_word_differences(orig_text, dup_text)
- # # result_text += (
- # # f"**Original (Dataset 1):**\n{orig_text}\n\n"
- # # f"**Duplicate (Dataset 2):**\n{dup_text}\n\n"
- # # f"**Differences:**\n{differences}\n"
- # # + "-" * 50 + "\n\n"
- # # )
- # # else:
- # # result_text += "No duplicates found."
-
- # # yield "Deduplication completed.", result_text
-
- # # except Exception as e:
- # # yield f"An error occurred: {e}", ""
- # # raise e
-
- # # # Gradio app with stop button support
- # # with gr.Blocks(css="#status_output { height: 50px; overflow: auto; }") as demo:
- # # gr.Markdown("# Semantic Deduplication")
- # # gr.Markdown("""
- # # This demo showcases semantic deduplication using Model2Vec for HuggingFace datasets.
- # # It can be used to identify duplicate texts within a single dataset or across two datasets.
- # # You can adjust the similarity threshold to control the strictness of the deduplication.\n
- # # NOTE: this demo runs on a free CPU backend, so it may be slow for large datasets. For faster results, please run the code locally.
- # # """)
-
- # # deduplication_type = gr.Radio(
- # # choices=["Single dataset", "Cross-dataset"],
- # # label="Deduplication Type",
- # # value="Cross-dataset", # Set "Cross-dataset" as the default value
- # # )
-
- # # with gr.Row():
- # # dataset1_name = gr.Textbox(value=default_dataset_name, label="Dataset 1 Name")
- # # dataset1_split = gr.Textbox(value=default_dataset_split, label="Dataset 1 Split")
- # # dataset1_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
-
- # # dataset2_inputs = gr.Column(visible=True) # Make dataset2_inputs visible by default
- # # with dataset2_inputs:
- # # gr.Markdown("### Dataset 2")
- # # with gr.Row():
- # # dataset2_name = gr.Textbox(value=default_dataset_name, label="Dataset 2 Name")
- # # dataset2_split = gr.Textbox(value=default_dataset_split, label="Dataset 2 Split")
- # # dataset2_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
-
- # # threshold = gr.Slider(0.0, 1.0, value=default_threshold, label="Similarity Threshold")
- # # compute_button = gr.Button("Deduplicate")
- # # status_output = gr.Markdown(elem_id="status_output")
- # # result_output = gr.Markdown()
-
- # # def update_visibility(choice: str):
- # # return gr.update(visible=choice == "Cross-dataset")
-
- # # deduplication_type.change(update_visibility, inputs=deduplication_type, outputs=dataset2_inputs)
-
- # # compute_button.click(
- # # fn=perform_deduplication,
- # # inputs=[
- # # deduplication_type,
- # # dataset1_name,
- # # dataset1_split,
- # # dataset1_text_column,
- # # dataset2_name,
- # # dataset2_split,
- # # dataset2_text_column,
- # # threshold,
- # # ],
- # # outputs=[status_output, result_output],
- # # )
-
-
- # # demo.launch()
-
- # # # import gradio as gr
- # # # from datasets import load_dataset
- # # # import numpy as np
- # # # from model2vec import StaticModel
- # # # from reach import Reach
- # # # from difflib import ndiff
-
- # # # # Load the model
- # # # model = StaticModel.from_pretrained("minishlab/M2V_base_output")
-
- # # # # Default parameters
- # # # default_dataset_name = "sst2"
- # # # default_dataset_split = "train"
- # # # default_text_column = "sentence"
- # # # default_threshold = 0.9
-
- # # # def deduplicate_embeddings(
- # # # embeddings_a: np.ndarray,
- # # # embeddings_b: np.ndarray = None,
- # # # threshold: float = 0.9,
- # # # batch_size: int = 1024,
- # # # progress=None
- # # # ) -> tuple[np.ndarray, dict[int, int]]:
- # # # """
- # # # Deduplicate embeddings within one dataset or across two datasets.
-
- # # # :param embeddings_a: Embeddings of Dataset 1.
- # # # :param embeddings_b: Optional, embeddings of Dataset 2.
- # # # :param threshold: Similarity threshold for deduplication.
- # # # :param batch_size: Batch size for similarity computation.
- # # # :param progress: Gradio progress tracker for feedback.
- # # # :return: Deduplicated indices and a mapping of removed indices to their original counterparts.
- # # # """
- # # # if embeddings_b is None:
- # # # reach = Reach(vectors=embeddings_a, items=[str(i) for i in range(len(embeddings_a))])
- # # # duplicate_to_original = {}
- # # # results = reach.nearest_neighbor_threshold(
- # # # embeddings_a, threshold=threshold, batch_size=batch_size, show_progressbar=False
- # # # )
- # # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=len(embeddings_a))):
- # # # for sim_idx, _ in similar_items:
- # # # sim_idx = int(sim_idx)
- # # # if sim_idx != i and sim_idx not in duplicate_to_original:
- # # # duplicate_to_original[sim_idx] = i
- # # # deduplicated_indices = set(range(len(embeddings_a))) - set(duplicate_to_original.keys())
- # # # return deduplicated_indices, duplicate_to_original
- # # # else:
- # # # reach = Reach(vectors=embeddings_a, items=[str(i) for i in range(len(embeddings_a))])
- # # # duplicate_indices_in_b = []
- # # # duplicate_to_original = {}
- # # # results = reach.nearest_neighbor_threshold(
- # # # embeddings_b, threshold=threshold, batch_size=batch_size, show_progressbar=False
- # # # )
- # # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=len(embeddings_b))):
- # # # if similar_items:
- # # # duplicate_indices_in_b.append(i)
- # # # duplicate_to_original[i] = int(similar_items[0][0])
- # # # return duplicate_indices_in_b, duplicate_to_original
-
- # # # def display_word_differences(x: str, y: str) -> str:
- # # # """
- # # # Display the word-level differences between two texts, formatted to avoid
- # # # misinterpretation of Markdown syntax.
-
- # # # :param x: First text.
- # # # :param y: Second text.
- # # # :return: A string showing word-level differences, wrapped in a code block.
- # # # """
- # # # diff = ndiff(x.split(), y.split())
- # # # formatted_diff = "\n".join(word for word in diff if word.startswith(("+", "-")))
- # # # return f"```\n{formatted_diff}\n```"
-
- # # # def load_dataset_texts(dataset_name: str, dataset_split: str, text_column: str) -> list[str]:
- # # # """
- # # # Load texts from a specified dataset and split.
-
- # # # :param dataset_name: Name of the dataset.
- # # # :param dataset_split: Split of the dataset (e.g., 'train', 'validation').
- # # # :param text_column: Name of the text column.
- # # # :return: A list of texts from the dataset.
- # # # """
- # # # ds = load_dataset(dataset_name, split=dataset_split)
- # # # return [example[text_column] for example in ds]
-
- # # # def perform_deduplication(
- # # # deduplication_type: str,
- # # # dataset1_name: str,
- # # # dataset1_split: str,
- # # # dataset1_text_column: str,
- # # # dataset2_name: str = "",
- # # # dataset2_split: str = "",
- # # # dataset2_text_column: str = "",
- # # # threshold: float = default_threshold,
- # # # progress: gr.Progress = gr.Progress(track_tqdm=True)
- # # # ):
- # # # """
- # # # Perform deduplication on one or two datasets based on the deduplication type.
-
- # # # :param deduplication_type: 'Single dataset' or 'Cross-dataset'.
- # # # :param dataset1_name: Name of the first dataset.
- # # # :param dataset1_split: Split of the first dataset.
- # # # :param dataset1_text_column: Text column of the first dataset.
- # # # :param dataset2_name: Optional, name of the second dataset (for cross-dataset deduplication).
- # # # :param dataset2_split: Optional, split of the second dataset.
- # # # :param dataset2_text_column: Optional, text column of the second dataset.
- # # # :param threshold: Similarity threshold for deduplication.
- # # # :param progress: Gradio progress tracker.
- # # # :return: Status updates and result text for the Gradio interface.
- # # # """
- # # # try:
- # # # threshold = float(threshold)
-
- # # # # Load and process Dataset 1
- # # # yield "Loading Dataset 1...", ""
- # # # texts1 = load_dataset_texts(dataset1_name, dataset1_split, dataset1_text_column)
- # # # yield "Computing embeddings for Dataset 1...", ""
- # # # embeddings1 = model.encode(texts1, show_progressbar=True)
-
- # # # if deduplication_type == "Single dataset":
- # # # # Deduplicate within Dataset 1
- # # # yield "Deduplicating within Dataset 1...", ""
- # # # deduplicated_indices, duplicate_mapping = deduplicate_embeddings(
- # # # embeddings1, threshold=threshold, progress=progress
- # # # )
-
- # # # num_duplicates = len(duplicate_mapping)
- # # # result_text = (
- # # # f"**Total documents:** {len(texts1)}\n\n"
- # # # f"**Duplicates found:** {num_duplicates}\n\n"
- # # # f"**Unique documents after deduplication:** {len(deduplicated_indices)}\n\n"
- # # # )
-
- # # # if num_duplicates > 0:
- # # # result_text += "**Sample duplicates:**\n\n"
- # # # for dup_idx, orig_idx in list(duplicate_mapping.items())[:5]:
- # # # orig_text = texts1[orig_idx]
- # # # dup_text = texts1[dup_idx]
- # # # differences = display_word_differences(orig_text, dup_text)
- # # # result_text += (
- # # # f"**Original:**\n{orig_text}\n\n"
- # # # f"**Duplicate:**\n{dup_text}\n\n"
- # # # f"**Differences:**\n{differences}\n"
- # # # + "-" * 50 + "\n\n"
- # # # )
- # # # else:
- # # # result_text += "No duplicates found."
-
- # # # yield "Deduplication completed.", result_text
-
- # # # else:
- # # # # Load and process Dataset 2
- # # # yield "Loading Dataset 2...", ""
- # # # texts2 = load_dataset_texts(dataset2_name, dataset2_split, dataset2_text_column)
- # # # yield "Computing embeddings for Dataset 2...", ""
- # # # embeddings2 = model.encode(texts2, show_progressbar=True)
-
- # # # # Deduplicate Dataset 2 against Dataset 1
- # # # yield "Deduplicating Dataset 2 against Dataset 1...", ""
- # # # duplicate_indices, duplicate_mapping = deduplicate_embeddings(
- # # # embeddings1, embeddings_b=embeddings2, threshold=threshold, progress=progress
- # # # )
-
- # # # num_duplicates = len(duplicate_indices)
- # # # result_text = (
- # # # f"**Total documents in {dataset2_name}/{dataset2_split}:** {len(texts2)}\n\n"
- # # # f"**Duplicates found in Dataset 2:** {num_duplicates}\n\n"
- # # # f"**Unique documents after deduplication:** {len(texts2) - num_duplicates}\n\n"
- # # # )
-
- # # # if num_duplicates > 0:
- # # # result_text += "**Sample duplicates from Dataset 2:**\n\n"
- # # # for idx in duplicate_indices[:5]:
- # # # orig_text = texts1[duplicate_mapping[idx]]
- # # # dup_text = texts2[idx]
- # # # differences = display_word_differences(orig_text, dup_text)
- # # # result_text += (
- # # # f"**Original (Dataset 1):**\n{orig_text}\n\n"
- # # # f"**Duplicate (Dataset 2):**\n{dup_text}\n\n"
- # # # f"**Differences:**\n{differences}\n"
- # # # + "-" * 50 + "\n\n"
- # # # )
- # # # else:
- # # # result_text += "No duplicates found."
-
- # # # yield "Deduplication completed.", result_text
-
- # # # except Exception as e:
- # # # yield f"An error occurred: {e}", ""
- # # # raise e
-
- # # # # Gradio app with stop button support
- # # # with gr.Blocks(css="#status_output { height: 50px; overflow: auto; }") as demo:
- # # # gr.Markdown("# Semantic Deduplication")
- # # # gr.Markdown("""
- # # # This demo showcases semantic deduplication using Model2Vec for HuggingFace datasets.
- # # # It can be used to identify duplicate texts within a single dataset or across two datasets.
- # # # You can adjust the similarity threshold to control the strictness of the deduplication.\n
- # # # NOTE: this demo runs on a free CPU backend, so it may be slow for large datasets. For faster results, please run the code locally.
- # # # """)
-
- # # # deduplication_type = gr.Radio(
- # # # choices=["Single dataset", "Cross-dataset"],
- # # # label="Deduplication Type",
- # # # value="Single dataset",
- # # # )
-
- # # # with gr.Row():
- # # # dataset1_name = gr.Textbox(value=default_dataset_name, label="Dataset 1 Name")
- # # # dataset1_split = gr.Textbox(value=default_dataset_split, label="Dataset 1 Split")
- # # # dataset1_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
-
- # # # dataset2_inputs = gr.Column(visible=False)
- # # # with dataset2_inputs:
- # # # gr.Markdown("### Dataset 2")
- # # # with gr.Row():
- # # # dataset2_name = gr.Textbox(value=default_dataset_name, label="Dataset 2 Name")
- # # # dataset2_split = gr.Textbox(value=default_dataset_split, label="Dataset 2 Split")
- # # # dataset2_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
-
- # # # threshold = gr.Slider(0.0, 1.0, value=default_threshold, label="Similarity Threshold")
- # # # compute_button = gr.Button("Deduplicate")
- # # # status_output = gr.Markdown(elem_id="status_output")
- # # # result_output = gr.Markdown()
-
- # # # def update_visibility(choice: str):
- # # # return gr.update(visible=choice == "Cross-dataset")
-
- # # # deduplication_type.change(update_visibility, inputs=deduplication_type, outputs=dataset2_inputs)
-
- # # # compute_button.click(
- # # # fn=perform_deduplication,
- # # # inputs=[
- # # # deduplication_type,
- # # # dataset1_name,
- # # # dataset1_split,
- # # # dataset1_text_column,
- # # # dataset2_name,
- # # # dataset2_split,
- # # # dataset2_text_column,
- # # # threshold,
- # # # ],
- # # # outputs=[status_output, result_output],
- # # # )
-
-
- # # # demo.launch()