Spaces:

minishlab
/

semantic-deduplication

Running

App Files Community

Pringled committed on Oct 12, 2024

Commit

471be58

1 Parent(s): 2ba6e60

Updated app with code for deduplication

Browse files

Files changed (1) hide show

app.py +17 -245

app.py CHANGED Viewed

@@ -36,7 +36,7 @@ def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int
     )
     # Process duplicates
-    for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates")):
         if i not in deduplicated_indices:
             continue
@@ -65,8 +65,7 @@ def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix
         show_progressbar=True  # Allow internal progress bar
     )
-    # Process duplicates
-    for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets")):
         similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
         if similar_indices:
@@ -92,9 +91,11 @@ def perform_deduplication(
 ):
     # Monkey-patch tqdm
     original_tqdm = tqdm.tqdm
     tqdm.tqdm = progress.tqdm
     sys.modules['tqdm'].tqdm = progress.tqdm
     sys.modules['tqdm.auto'].tqdm = progress.tqdm
     try:
         # Convert threshold to float
@@ -161,7 +162,8 @@ def perform_deduplication(
             embedding_matrix2 = model.encode(texts2, show_progressbar=True)  # Enable internal progress bar
             # Deduplicate across datasets
-            duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(embedding_matrix1, embedding_matrix2, threshold, progress=progress)
             num_duplicates = len(duplicate_indices_in_ds2)
             num_total_ds2 = len(texts2)
@@ -192,6 +194,12 @@ def perform_deduplication(
         sys.modules['tqdm'].tqdm = original_tqdm
         sys.modules['tqdm.auto'].tqdm = original_tqdm
 with gr.Blocks() as demo:
     gr.Markdown("# Semantic Deduplication")
@@ -290,7 +298,7 @@ demo.launch()
 #         embedding_matrix,
 #         threshold=threshold,
 #         batch_size=batch_size,
-#         show_progressbar=False  # Disable internal progress bar
 #     )
 #     # Process duplicates
@@ -320,7 +328,7 @@ demo.launch()
 #         embedding_matrix_2,
 #         threshold=threshold,
 #         batch_size=batch_size,
-#         show_progressbar=False  # Disable internal progress bar
 #     )
 #     # Process duplicates
@@ -369,11 +377,8 @@ demo.launch()
 #             texts = [example[dataset1_text_column] for example in ds]
 #             # Compute embeddings
-#             embedding_matrix = model.encode(texts, show_progressbar=False)  # Disable internal progress bar
-#             # Show progress bar for embedding computation
-#             embedding_matrix = progress.tqdm(embedding_matrix, desc="Computing embeddings")
 #             # Deduplicate
 #             deduplicated_indices, duplicate_to_original_mapping = deduplicate(embedding_matrix, threshold, progress=progress)
@@ -418,12 +423,8 @@ demo.launch()
 #             texts2 = [example[dataset2_text_column] for example in ds2]
 #             # Compute embeddings
-#             embedding_matrix1 = model.encode(texts1, show_progressbar=False)  # Disable internal progress bar
-#             embedding_matrix2 = model.encode(texts2, show_progressbar=False)  # Disable internal progress bar
-#             # Show progress bar for embedding computation
-#             embedding_matrix1 = progress.tqdm(embedding_matrix1, desc="Computing embeddings for Dataset 1")
-#             embedding_matrix2 = progress.tqdm(embedding_matrix2, desc="Computing embeddings for Dataset 2")
 #             # Deduplicate across datasets
 #             duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(embedding_matrix1, embedding_matrix2, threshold, progress=progress)
@@ -519,232 +520,3 @@ demo.launch()
 #     )
 # demo.launch()
-# import gradio as gr
-# from datasets import load_dataset
-# import numpy as np
-# from model2vec import StaticModel
-# from reach import Reach
-# from difflib import ndiff
-# def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024, progress=gr.Progress(track_tqdm=True)) -> tuple[np.ndarray, dict[int, int]]:
-#     """
-#     Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
-#     """
-#     reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
-#     # Use a set for deduplicated indices and keep track of duplicates
-#     deduplicated_indices = set(range(len(embedding_matrix)))  # Start with all indices as deduplicated
-#     duplicate_to_original_mapping = {}
-#     results = reach.nearest_neighbor_threshold(
-#         embedding_matrix,
-#         threshold=threshold,
-#         batch_size=batch_size,
-#         show_progressbar=True
-#     )
-#     # Process duplicates
-#     for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates")):
-#         if i not in deduplicated_indices:
-#             continue  # Skip already marked duplicates
-#         # Similar items are returned as (index, score), we are only interested in the index
-#         similar_indices = [int(item[0]) for item in similar_items if int(item[0]) != i]
-#         # Mark similar documents as duplicates and map them to the original
-#         for sim_idx in similar_indices:
-#             if sim_idx in deduplicated_indices:
-#                 deduplicated_indices.remove(sim_idx)
-#                 duplicate_to_original_mapping[sim_idx] = i  # Map duplicate to original
-#     return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
-# def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024, progress=gr.Progress(track_tqdm=True)) -> tuple[list[int], dict[int, int]]:
-#     """
-#     Deduplicate embeddings across two datasets and return the indices of duplicates between them.
-#     """
-#     reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
-#     # Keep track of duplicates in the second dataset
-#     duplicate_indices_in_test = []
-#     duplicate_to_original_mapping = {}
-#     # Find nearest neighbors from the test set in the train set
-#     results = reach.nearest_neighbor_threshold(
-#         embedding_matrix_2,
-#         threshold=threshold,
-#         batch_size=batch_size,
-#         show_progressbar=True
-#     )
-#     # Process duplicates
-#     for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets")):
-#         # Similar items are returned as (index, score), we are only interested in the index
-#         similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]  # Keep those above the threshold
-#         # If we find a similar item in the train set, mark it as a duplicate
-#         if similar_indices:
-#             duplicate_indices_in_test.append(i)
-#             duplicate_to_original_mapping[i] = similar_indices[0]  # Map duplicate in test to original in train
-#     return duplicate_indices_in_test, duplicate_to_original_mapping
-# def display_word_differences(x: str, y: str) -> str:
-#     diff = ndiff(x.split(), y.split())
-#     return " ".join([word for word in diff if word.startswith(('+', '-'))])
-# def perform_deduplication(
-#     deduplication_type,
-#     dataset1_name,
-#     dataset1_split,
-#     dataset1_text_column,
-#     dataset2_name="",
-#     dataset2_split="",
-#     dataset2_text_column="",
-#     threshold=0.8,
-#     progress=gr.Progress(track_tqdm=True)
-# ):
-#     # Convert threshold to float
-#     threshold = float(threshold)
-#     if deduplication_type == "Single dataset":
-#         # Load the dataset
-#         ds = load_dataset(dataset1_name, split=dataset1_split)
-#         # Extract texts
-#         texts = [example[dataset1_text_column] for example in ds]
-#         # Compute embeddings
-#         model = StaticModel.from_pretrained("minishlab/M2V_base_output")
-#         embedding_matrix = model.encode(texts, show_progressbar=True)
-#         # Deduplicate
-#         deduplicated_indices, duplicate_to_original_mapping = deduplicate(embedding_matrix, threshold, progress=progress)
-#         # Prepare the results
-#         num_duplicates = len(duplicate_to_original_mapping)
-#         num_total = len(texts)
-#         num_deduplicated = len(deduplicated_indices)
-#         result_text = f"**Total documents:** {num_total}\n"
-#         result_text += f"**Number of duplicates found:** {num_duplicates}\n"
-#         result_text += f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
-#         # Show deduplicated examples
-#         result_text += "**Examples of duplicates found:**\n\n"
-#         num_examples = min(5, num_duplicates)
-#         for duplicate_idx, original_idx in list(duplicate_to_original_mapping.items())[:num_examples]:
-#             original_text = texts[original_idx]
-#             duplicate_text = texts[duplicate_idx]
-#             differences = display_word_differences(original_text, duplicate_text)
-#             result_text += f"**Original text:**\n{original_text}\n\n"
-#             result_text += f"**Duplicate text:**\n{duplicate_text}\n\n"
-#             result_text += f"**Differences:**\n{differences}\n"
-#             result_text += "-" * 50 + "\n\n"
-#         return result_text
-#     elif deduplication_type == "Cross-dataset":
-#         # Load datasets
-#         ds1 = load_dataset(dataset1_name, split=dataset1_split)
-#         ds2 = load_dataset(dataset2_name, split=dataset2_split)
-#         # Extract texts
-#         texts1 = [example[dataset1_text_column] for example in ds1]
-#         texts2 = [example[dataset2_text_column] for example in ds2]
-#         # Compute embeddings
-#         model = StaticModel.from_pretrained("minishlab/M2V_base_output")
-#         embedding_matrix1 = model.encode(texts1, show_progressbar=True)
-#         embedding_matrix2 = model.encode(texts2, show_progressbar=True)
-#         # Deduplicate across datasets
-#         duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(embedding_matrix1, embedding_matrix2, threshold, progress=progress)
-#         num_duplicates = len(duplicate_indices_in_ds2)
-#         num_total_ds2 = len(texts2)
-#         num_unique_ds2 = num_total_ds2 - num_duplicates
-#         result_text = f"**Total documents in {dataset2_name}/{dataset2_split}:** {num_total_ds2}\n"
-#         result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n"
-#         result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
-#         # Show deduplicated examples
-#         result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
-#         num_examples = min(5, num_duplicates)
-#         for duplicate_idx in duplicate_indices_in_ds2[:num_examples]:
-#             original_idx = duplicate_to_original_mapping[duplicate_idx]
-#             original_text = texts1[original_idx]
-#             duplicate_text = texts2[duplicate_idx]
-#             differences = display_word_differences(original_text, duplicate_text)
-#             result_text += f"**Original text (Dataset 1):**\n{original_text}\n\n"
-#             result_text += f"**Duplicate text (Dataset 2):**\n{duplicate_text}\n\n"
-#             result_text += f"**Differences:**\n{differences}\n"
-#             result_text += "-" * 50 + "\n\n"
-#         return result_text
-# with gr.Blocks() as demo:
-#     gr.Markdown("# Semantic Deduplication")
-#     deduplication_type = gr.Radio(
-#         choices=["Single dataset", "Cross-dataset"],
-#         label="Deduplication Type",
-#         value="Single dataset"
-#     )
-#     with gr.Row():
-#         dataset1_name = gr.Textbox(value="ag_news", label="Dataset 1 Name")
-#         dataset1_split = gr.Textbox(value="train", label="Dataset 1 Split")
-#         dataset1_text_column = gr.Textbox(value="text", label="Text Column Name")
-#     dataset2_inputs = gr.Column(visible=False)
-#     with dataset2_inputs:
-#         gr.Markdown("### Dataset 2")
-#         with gr.Row():
-#             dataset2_name = gr.Textbox(value="ag_news", label="Dataset 2 Name")
-#             dataset2_split = gr.Textbox(value="test", label="Dataset 2 Split")
-#             dataset2_text_column = gr.Textbox(value="text", label="Text Column Name")
-#     threshold = gr.Slider(
-#         minimum=0.0,
-#         maximum=1.0,
-#         value=0.8,
-#         label="Similarity Threshold"
-#     )
-#     compute_button = gr.Button("Compute")
-#     output = gr.Markdown()
-#     # Function to update the visibility of dataset2_inputs
-#     def update_visibility(deduplication_type_value):
-#         if deduplication_type_value == "Cross-dataset":
-#             return gr.update(visible=True)
-#         else:
-#             return gr.update(visible=False)
-#     deduplication_type.change(
-#         update_visibility,
-#         inputs=deduplication_type,
-#         outputs=dataset2_inputs
-#     )
-#     compute_button.click(
-#         fn=perform_deduplication,
-#         inputs=[
-#             deduplication_type,
-#             dataset1_name,
-#             dataset1_split,
-#             dataset1_text_column,
-#             dataset2_name,
-#             dataset2_split,
-#             dataset2_text_column,
-#             threshold
-#         ],
-#         outputs=output
-#     )
-# demo.launch()

     )
     # Process duplicates
+    for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=len(embedding_matrix))):
         if i not in deduplicated_indices:
             continue
         show_progressbar=True  # Allow internal progress bar
     )
+    for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets", total=len(embedding_matrix_2))):
         similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
         if similar_indices:
 ):
     # Monkey-patch tqdm
     original_tqdm = tqdm.tqdm
+    original_reach_tqdm = Reach.__dict__['tqdm'] if 'tqdm' in Reach.__dict__ else None
     tqdm.tqdm = progress.tqdm
     sys.modules['tqdm'].tqdm = progress.tqdm
     sys.modules['tqdm.auto'].tqdm = progress.tqdm
+    Reach.tqdm = progress.tqdm  # Monkey-patch reach's tqdm
     try:
         # Convert threshold to float
             embedding_matrix2 = model.encode(texts2, show_progressbar=True)  # Enable internal progress bar
             # Deduplicate across datasets
+            duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(
+                embedding_matrix1, embedding_matrix2, threshold, progress=progress)
             num_duplicates = len(duplicate_indices_in_ds2)
             num_total_ds2 = len(texts2)
         sys.modules['tqdm'].tqdm = original_tqdm
         sys.modules['tqdm.auto'].tqdm = original_tqdm
+        # Restore reach's original tqdm
+        if original_reach_tqdm is not None:
+            Reach.tqdm = original_reach_tqdm
+        else:
+            del Reach.tqdm  # If it wasn't originally in Reach's __dict__
 with gr.Blocks() as demo:
     gr.Markdown("# Semantic Deduplication")
 #         embedding_matrix,
 #         threshold=threshold,
 #         batch_size=batch_size,
+#         show_progressbar=True  # Allow internal progress bar
 #     )
 #     # Process duplicates
 #         embedding_matrix_2,
 #         threshold=threshold,
 #         batch_size=batch_size,
+#         show_progressbar=True  # Allow internal progress bar
 #     )
 #     # Process duplicates
 #             texts = [example[dataset1_text_column] for example in ds]
 #             # Compute embeddings
+#             embedding_matrix = model.encode(texts, show_progressbar=True)  # Enable internal progress bar
 #             # Deduplicate
 #             deduplicated_indices, duplicate_to_original_mapping = deduplicate(embedding_matrix, threshold, progress=progress)
 #             texts2 = [example[dataset2_text_column] for example in ds2]
 #             # Compute embeddings
+#             embedding_matrix1 = model.encode(texts1, show_progressbar=True)  # Enable internal progress bar
+#             embedding_matrix2 = model.encode(texts2, show_progressbar=True)  # Enable internal progress bar
 #             # Deduplicate across datasets
 #             duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(embedding_matrix1, embedding_matrix2, threshold, progress=progress)
 #     )
 # demo.launch()